Create a dictionary with:
- Frequency of words
- Type of words
- Length of words
- Lowercase
- Only words (a-z)

# Option 1

In [2]:
# Get imports
import re
from collections import Counter

In [3]:
# Function to process text and create the dictionary
def create_dictionary(text):
    """Summarise the words found in ``text``.

    The text is lowercased and split into words made only of the letters a-z.
    Apostrophes are dropped so a contraction stays in one piece
    ("don't" -> "dont"). Every other character outside a-z (punctuation,
    digits, whitespace, ...) acts as a word separator, so "hello,world" or
    "well-known" produce two words instead of one fused token.

    Args:
        text: The input string to analyse.

    Returns:
        A dict with three entries:
            'frequency': {word: number of occurrences}, in first-seen order.
            'types': the number of distinct words.
            'lengths': {word: number of characters}, in first-seen order.
    """
    # Lowercase first so that counting is case-insensitive.
    lowered = text.lower()

    # Remove apostrophes (straight and curly) *before* splitting, otherwise
    # "don't" would be cut into "don" and "t".
    without_apostrophes = re.sub(r"['’]", '', lowered)

    # Every maximal run of a-z is a word; anything else separates words.
    words = re.findall(r'[a-z]+', without_apostrophes)

    # Count the frequency of each word (Counter keeps first-seen order).
    word_frequency = Counter(words)

    # One length entry per distinct word; iterating the counter avoids
    # recomputing the length for every repeated occurrence.
    word_lengths = {word: len(word) for word in word_frequency}

    return {
        'frequency': dict(word_frequency),
        'types': len(word_frequency),
        'lengths': word_lengths,
    }

In [34]:
# Example text: a short multi-sentence sample in which "this", "is" and "text"
# each occur twice, so the frequency counts are not all 1.
text = """
    This is a sample text. This text is just to check how the function behaves. 
    It should calculate frequency, types, and word lengths.
"""

In [5]:
# Run the Option 1 function on the example text; the result holds the
# 'frequency', 'types' and 'lengths' entries built by create_dictionary.
word_data = create_dictionary(text)

In [6]:
# Show the resulting dictionary (bare last expression of the cell).
word_data

{'frequency': {'this': 2,
  'is': 2,
  'a': 1,
  'sample': 1,
  'text': 2,
  'just': 1,
  'to': 1,
  'check': 1,
  'how': 1,
  'the': 1,
  'function': 1,
  'behaves': 1,
  'it': 1,
  'should': 1,
  'calculate': 1,
  'frequency': 1,
  'types': 1,
  'and': 1,
  'word': 1,
  'lengths': 1},
 'types': 20,
 'lengths': {'this': 4,
  'is': 2,
  'a': 1,
  'sample': 6,
  'text': 4,
  'just': 4,
  'to': 2,
  'check': 5,
  'how': 3,
  'the': 3,
  'function': 8,
  'behaves': 7,
  'it': 2,
  'should': 6,
  'calculate': 9,
  'frequency': 9,
  'types': 5,
  'and': 3,
  'word': 4,
  'lengths': 7}}

# Option 2

In [38]:
# Get imports
import re
from collections import Counter
import nltk
from nltk import word_tokenize, pos_tag

# Download the NLTK data packages this notebook relies on. The recorded output
# shows the call reporting "already up-to-date" when a package is present.
# 'punkt' is presumably fetched for word_tokenize (imported above but not
# called in the cells shown here); 'averaged_perceptron_tagger' presumably
# backs pos_tag.
# NOTE(review): resource names can differ between NLTK versions — confirm
# these two match the installed release.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# POS tag -> human-readable word type, expanded from (label, tags) groups.
# Groups are listed in the order their tags should appear in the dictionary;
# tags missing from this table are handled by the caller's fallback.
pos_map = {
    tag: label
    for label, tags in (
        ('noun', ('NN', 'NNS')),
        ('proper noun', ('NNP', 'NNPS')),
        ('verb', ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')),
        ('adjective', ('JJ', 'JJR', 'JJS')),
        ('adverb', ('RB', 'RBR', 'RBS')),
        ('preposition', ('IN',)),
        ('determiner', ('DT',)),
        ('pronoun', ('PRP', 'PRP$')),
        ('preposition', ('TO',)),
        ('conjunction', ('CC',)),
        ('interjection', ('UH',)),
    )
    for tag in tags
}

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/timursalakhetdinov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/timursalakhetdinov/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [48]:
# Translate a POS tag into its descriptive word type
def get_pos_description(tag):
    """Return the descriptive type registered in ``pos_map`` for ``tag``.

    Tags that have no entry in ``pos_map`` are reported as ``'other'``.
    """
    if tag in pos_map:
        return pos_map[tag]
    return 'other'

# Function to process text and create the dictionary
def process_text(text):
    """Build a per-word summary of ``text``: frequency, word type and length.

    Args:
        text: The input string to analyse.

    Returns:
        A dict mapping each distinct lowercase word (in first-seen order) to
        ``{'frequency': int, 'type': str, 'length': int}``. ``'type'`` is the
        ``get_pos_description`` label of the POS tag assigned most often to
        that word; when several tags tie, the one seen first wins. An input
        without any words yields an empty dict.
    """
    # Letters only, with apostrophes allowed strictly *inside* a word, so
    # contractions survive ("don't") while quote marks around a word or a
    # stray "'" no longer end up inside a token or as a token of their own.
    words = re.findall(r"[a-z]+(?:'[a-z]+)*", text.lower())

    # Count word frequencies (Counter keeps first-seen order).
    word_frequency = Counter(words)

    # Tag the whole word sequence once, then tally the tags per word so a
    # repeated word is not simply labelled by whatever its last occurrence got.
    tags_per_word = {}
    for word, tag in pos_tag(words):
        tags_per_word.setdefault(word, Counter())[tag] += 1

    # Create the desired dictionary with 'word': {'frequency', 'type', 'length'}
    word_data = {}
    for word, frequency in word_frequency.items():
        # most_common(1) returns the first-seen tag among equally frequent ones.
        dominant_tag = tags_per_word[word].most_common(1)[0][0]
        word_data[word] = {
            'frequency': frequency,
            'type': get_pos_description(dominant_tag),
            'length': len(word),
        }

    return word_data

In [None]:
# Example text (same sample as in Option 1): "this", "is" and "text" each
# occur twice, so the frequency counts are not all 1.
# NOTE(review): this cell shows no execution count — re-run the notebook top
# to bottom to confirm `text` is defined by this cell when it is used below.
text = """
    This is a sample text. This text is just to check how the function behaves. 
    It should calculate frequency, types, and word lengths.
"""

In [50]:
# Run the Option 2 function on the example text.
# NOTE: this rebinds `word_data`, which Option 1 used for a differently shaped
# dictionary ('frequency' / 'types' / 'lengths' instead of one entry per word).
word_data = process_text(text)

# Render the per-word dictionary as the cell output.
display(word_data)

{'this': {'frequency': 2, 'type': 'determiner', 'length': 4},
 'is': {'frequency': 2, 'type': 'verb', 'length': 2},
 'a': {'frequency': 1, 'type': 'determiner', 'length': 1},
 'sample': {'frequency': 1, 'type': 'adjective', 'length': 6},
 'text': {'frequency': 2, 'type': 'noun', 'length': 4},
 'just': {'frequency': 1, 'type': 'adverb', 'length': 4},
 'to': {'frequency': 1, 'type': 'preposition', 'length': 2},
 'check': {'frequency': 1, 'type': 'verb', 'length': 5},
 'how': {'frequency': 1, 'type': 'other', 'length': 3},
 'the': {'frequency': 1, 'type': 'determiner', 'length': 3},
 'function': {'frequency': 1, 'type': 'noun', 'length': 8},
 'behaves': {'frequency': 1, 'type': 'verb', 'length': 7},
 'it': {'frequency': 1, 'type': 'pronoun', 'length': 2},
 'should': {'frequency': 1, 'type': 'other', 'length': 6},
 'calculate': {'frequency': 1, 'type': 'verb', 'length': 9},
 'frequency': {'frequency': 1, 'type': 'noun', 'length': 9},
 'types': {'frequency': 1, 'type': 'noun', 'length': 5},