# 1. Python Basics - Dictionary creation

Create a dictionary that stores, for each word in a text:
- Frequency of the word
- Type of the word (part of speech)
- Length of the word

Words are normalized before counting:
- Lowercase
- Only words (a-z)

In [2]:
# Example input text: a short English abstract that the processing cells below
# take as input. It is kept verbatim (including its original spelling) because
# the word counts shown in the cell outputs are computed from this exact string.
text = """
    In academia and political debates, the notions of 'degrowth' has gained traction since the dawn of the 21st century. 
    While some uncertainty around its exact definition remains, research on degrowth revolves around the idea of reducing resource and energy throughput as a unifying theme.
    We employ a mixed-methods design to systematicaly review the scientific peer-reviewed English literature from 2008 to 2022 that refers to 'degrowth' or 'post-growth' in title, keywords or abstract (N = 951).
    We find a lack of concrete distributional and monetary policy proposals in the same analyzed, and a low overall degree of collaboration among authors in relation to degrowth's age and sieze.
"""

## Basic Option

In [4]:
# Import packages
import re
from collections import Counter

In [5]:
# Create function to check types of words
def guess_word_type(word):
    """Guess a coarse part-of-speech label for a single word.

    The guess is purely rule based: suffix rules are tried first, in order,
    and the first matching rule wins. Words that match no suffix rule are
    looked up in a small table of determiners and conjunctions; anything
    else is labelled as a noun.

    Args:
        word: The word to classify (the rules are written for lowercase input).

    Returns:
        One of 'verb', 'adverb', 'noun', 'adjective', 'determiner' or
        'conjunction'.
    """
    # Ordered (suffixes, label) rules; earlier rules take precedence.
    suffix_rules = (
        (('ing', 'ed'), 'verb'),
        (('ly',), 'adverb'),
        (('tion', 'ment'), 'noun'),
        (('ous', 'ive', 'able'), 'adjective'),
    )
    for suffixes, label in suffix_rules:
        if word.endswith(suffixes):
            return label

    # Closed-class words recognised by exact match; default to noun.
    closed_class = {
        'a': 'determiner',
        'the': 'determiner',
        'an': 'determiner',
        'and': 'conjunction',
        'or': 'conjunction',
    }
    return closed_class.get(word, 'noun')

In [6]:
# Create function to process text and create the dictionary
def process_text(text):
    """Build a per-word summary dictionary from raw text.

    The text is lower-cased, tokens made of ASCII letters (optionally
    containing apostrophes) are extracted, and a trailing possessive ``'s``
    plus any remaining apostrophes are stripped from each token.

    Args:
        text: The input string to analyse.

    Returns:
        A dict whose keys are the distinct cleaned words in alphabetical
        order. Each value is a dict with the keys ``'frequency'`` (number of
        occurrences), ``'type'`` (the label returned by ``guess_word_type``)
        and ``'length'`` (number of characters in the cleaned word).
    """
    # Find words containing only alphabetic characters and apostrophes
    words = re.findall(r"\b[a-zA-Z']+\b", text.lower())

    # Remove 's and apostrophes. A token such as the "'s" captured from
    # "1990's" is reduced to an empty string by this step, so empty results
    # are dropped instead of being reported as a zero-length word.
    cleaned_words = [
        cleaned
        for cleaned in (re.sub(r"'s$|'", '', word) for word in words)
        if cleaned
    ]

    # Count word frequencies
    word_frequency = Counter(cleaned_words)

    # Build one entry per distinct word, directly in alphabetical order
    return {
        word: {
            'frequency': word_frequency[word],
            'type': guess_word_type(word),  # Function to check word types
            'length': len(word),
        }
        for word in sorted(word_frequency)
    }

In [7]:
# Processing the text: build the word dictionary with the basic (suffix-rule) option
dictionary = process_text(text)

# Displaying the result (the full dictionary is rendered as the cell output)
display(dictionary)

{'a': {'frequency': 4, 'type': 'determiner', 'length': 1},
 'abstract': {'frequency': 1, 'type': 'noun', 'length': 8},
 'academia': {'frequency': 1, 'type': 'noun', 'length': 8},
 'age': {'frequency': 1, 'type': 'noun', 'length': 3},
 'among': {'frequency': 1, 'type': 'noun', 'length': 5},
 'analyzed': {'frequency': 1, 'type': 'verb', 'length': 8},
 'and': {'frequency': 5, 'type': 'conjunction', 'length': 3},
 'around': {'frequency': 2, 'type': 'noun', 'length': 6},
 'as': {'frequency': 1, 'type': 'noun', 'length': 2},
 'authors': {'frequency': 1, 'type': 'noun', 'length': 7},
 'century': {'frequency': 1, 'type': 'noun', 'length': 7},
 'collaboration': {'frequency': 1, 'type': 'noun', 'length': 13},
 'concrete': {'frequency': 1, 'type': 'noun', 'length': 8},
 'dawn': {'frequency': 1, 'type': 'noun', 'length': 4},
 'debates': {'frequency': 1, 'type': 'noun', 'length': 7},
 'definition': {'frequency': 1, 'type': 'noun', 'length': 10},
 'degree': {'frequency': 1, 'type': 'noun', 'length':

## Option with NLTK

In [9]:
# Get imports
import re
from collections import Counter
import nltk
from nltk import word_tokenize, pos_tag

# Download the NLTK data packages requested by this section. The calls report
# "already up-to-date" when the packages are present locally.
# NOTE(review): depending on the installed NLTK version, pos_tag may look for a
# differently named tagger resource (e.g. 'averaged_perceptron_tagger_eng') — confirm.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/timursalakhetdinov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/timursalakhetdinov/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [10]:
# Lookup table from POS tag to a human-readable word type.
# Tags are grouped by the description they share; the comprehension flattens
# the groups into a plain {tag: description} dictionary.
pos_map = {
    tag: description
    for description, tags in (
        ('noun', ('NN', 'NNS')),
        ('proper noun', ('NNP', 'NNPS')),
        ('verb', ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')),
        ('adjective', ('JJ', 'JJR', 'JJS')),
        ('adverb', ('RB', 'RBR', 'RBS')),
        ('preposition', ('IN',)),
        ('determiner', ('DT',)),
        ('pronoun', ('PRP', 'PRP$')),
        ('preposition', ('TO',)),
        ('conjunction', ('CC',)),
        ('interjection', ('UH',)),
    )
    for tag in tags
}

# Function to process text and create dictionary
def process_text_with_nltk(text):
    """Build a per-word summary dictionary, using a POS tagger for word types.

    The text is lower-cased, tokens made of ASCII letters (optionally
    containing apostrophes) are extracted, and a trailing possessive ``'s``
    plus any remaining apostrophes are stripped from each token. The cleaned
    token sequence is then passed to ``pos_tag`` and each returned tag is
    translated through ``pos_map``.

    Args:
        text: The input string to analyse.

    Returns:
        A dict whose keys are the distinct cleaned words in alphabetical
        order. Each value is a dict with the keys ``'frequency'`` (number of
        occurrences), ``'type'`` (the ``pos_map`` description of the tag, or
        ``'other'`` for tags missing from ``pos_map``) and ``'length'``
        (number of characters in the cleaned word). If a word receives
        different tags at different positions, the tag of its last occurrence
        is the one kept.
    """
    # Find words containing only alphabetic characters and apostrophes
    words = re.findall(r"\b[a-zA-Z']+\b", text.lower())

    # Remove possessive 's (like in "degrowth's") and standalone apostrophes.
    # A token such as the "'s" captured from "1990's" is reduced to an empty
    # string by this step, so empty results are dropped: they are not words
    # and must neither be tagged nor reported.
    cleaned_words = [
        cleaned
        for cleaned in (re.sub(r"'s$|'", '', word) for word in words)
        if cleaned
    ]

    # Perform POS tagging on the whole token sequence
    pos_tags = pos_tag(cleaned_words)

    # Count word frequencies
    word_frequency = Counter(cleaned_words)

    # Create the desired dictionary with word: {frequency, type, length}
    word_data = {}
    for word, tag in pos_tags:
        # A later occurrence of the same word overwrites the earlier entry
        word_data[word] = {
            'frequency': word_frequency[word],
            'type': pos_map.get(tag, 'other'),  # Get the descriptive type
            'length': len(word),
        }

    # Sort the dictionary alphabetically
    return {key: word_data[key] for key in sorted(word_data)}

In [11]:
# Processing the text with the NLTK-based option.
# Note: this rebinds `dictionary`, replacing the result of the basic option.
dictionary = process_text_with_nltk(text)

# Output result (the full dictionary is rendered as the cell output)
display(dictionary)

{'a': {'frequency': 4, 'type': 'determiner', 'length': 1},
 'abstract': {'frequency': 1, 'type': 'adjective', 'length': 8},
 'academia': {'frequency': 1, 'type': 'noun', 'length': 8},
 'age': {'frequency': 1, 'type': 'noun', 'length': 3},
 'among': {'frequency': 1, 'type': 'preposition', 'length': 5},
 'analyzed': {'frequency': 1, 'type': 'adjective', 'length': 8},
 'and': {'frequency': 5, 'type': 'conjunction', 'length': 3},
 'around': {'frequency': 2, 'type': 'preposition', 'length': 6},
 'as': {'frequency': 1, 'type': 'preposition', 'length': 2},
 'authors': {'frequency': 1, 'type': 'noun', 'length': 7},
 'century': {'frequency': 1, 'type': 'noun', 'length': 7},
 'collaboration': {'frequency': 1, 'type': 'noun', 'length': 13},
 'concrete': {'frequency': 1, 'type': 'adjective', 'length': 8},
 'dawn': {'frequency': 1, 'type': 'noun', 'length': 4},
 'debates': {'frequency': 1, 'type': 'noun', 'length': 7},
 'definition': {'frequency': 1, 'type': 'noun', 'length': 10},
 'degree': {'freq