# Emotion and Reason in Political Language: Examining the UN General speeches
## Script 1: Preprocessing & Token Frequencies
### by Sarah Franzen

### Description: 
#### - Extract documents from their original txt documents and store them as one csv
#### - Data Cleaning and Pre-Processing
#### - Count word frequencies and weight them


## Setup, Installation and Verification of required Packages and Libraries

In [118]:
InstallPackages = False # Set this to True to install the following packages 

if InstallPackages:
    import sys
    !{sys.executable} -m pip install pandas
    !{sys.executable} -m pip install nltk
    !{sys.executable} -m pip install spacy
    !{sys.executable} -m pip install numpy
    !{sys.executable} -m pip install gensim
    !{sys.executable} -m pip install pycountry
    !{sys.executable} -m pip install wordcloud matplotlib

#########################
# Check if all packages are included
##########################

In [120]:
# == Import standard and third-party libraries for data processing, NLP, and visualization ==

import gensim
import joblib
import nltk
import os
import pandas as pd
import pycountry
import random
import re
import spacy
import time
import pickle

from collections import Counter
from itertools import chain
#from matplotlib.colors import ListedColormap
from multiprocessing import Pool, freeze_support
# from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOPWORDS
from string import punctuation
from tqdm import tqdm

# === Initialize NLP Tools ===

# Translator to remove punctuation
translator = str.maketrans('', '', punctuation)

# POS tagger (not used by SpaCy, but optionally available via NLTK)
tagger = nltk.perceptron.PerceptronTagger()

# Load SpaCy English model with unnecessary components disabled
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

stemmer = SnowballStemmer("english")


# === Set Working Directory ===

# Set your working directory (adjust this as needed)
wd = r"C:\Users\sarah\OneDrive\Dokumente\Masterarbeit"
os.chdir(wd)

# === Define Folder Paths ===

# Make sure that you have these folders in your working directory
data_c = os.path.join(wd, 'data')
data_temp = os.path.join(data_c, 'temp')
data_freq = os.path.join(data_c, 'freq')
data_dict = os.path.join(data_c, 'dictionaries')
data_preprocessed = os.path.join(data_c, 'preprocessed')
data_tokenized = os.path.join(data_c, 'tokenized')
fig = os.path.join(wd, 'Code/0_descriptives/fig')

In [121]:
print("Current working directory:", os.getcwd())

Current working directory: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit


In [122]:
# Set DownloadAdditions to True if you need to download these additional resources.

DownloadAdditions = False
if DownloadAdditions:
    spacy.cli.download('en_core_web_lg')         # Download spaCy English model (large)

## Load and Prepare Corpus

This chunk can be skipped at the moment

In [133]:
# == Load and Save Sample from UN General Debate Corpus ==             ######################## ADJUST LATER

# Set Folder path containing the original TXT files    
base_folder = r".\data\data_original\UN General Debate Corpus\UNGDC_1946-2023\TXT"

#  Gather all relevant txt-files
all_txt_files = []
for root, dirs, files in os.walk(base_folder):
    for file in files:
        if file.endswith('.txt') and not file.startswith('._'):
            all_txt_files.append(os.path.join(root, file))

print(f"ðŸ§¾ Total speeches found: {len(all_txt_files)}")

# Randomly pick 800 files from the full collection   ###################################################### REMOVE AT LATER POINT
sampled_files = random.sample(all_txt_files,10761)

# Read the selected files into a list
raw_data = []
for filepath in sampled_files:
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()
        raw_data.append({'filename': os.path.basename(filepath), 'speech': content})

# Create DataFrame from the collected speeches
df_raw = pd.DataFrame(raw_data)

df_raw = df_raw[df_raw['filename'] != '.DS_Store-to-UTF-8.txt'].copy()

# Save df_raw as a pickle file for quick future loading
raw_pickle_path = r".\data\un_corpus_raw.pkl"
df_raw.to_pickle(raw_pickle_path)

# Export df as CSV 
raw_output_path = r".\data\un_corpus_raw.csv"
df_raw.to_csv(raw_output_path, index=False, sep=';', encoding='utf-8')

print(f"\nâœ… Saved raw data with {len(df_raw)} speeches to '{raw_output_path}'")


ðŸ§¾ Total speeches found: 10761

âœ… Saved raw data with 10760 speeches to '.\data\un_corpus_raw.csv'


In [134]:
# == Check if everything worked ==

# Load df_raw
df_raw = pd.read_pickle(r".\data\un_corpus_raw.pkl")

# View df to check structure
df_raw.head()         


Unnamed: 0,filename,speech
0,GIN_77_2022.txt,"At the outset of my remarks, I wish to convey ..."
1,ITA_43_1988.txt,"ï»¿\nMr. President, on behalf of the Italian Gov..."
2,SWE_26_1971.txt,"1 Mr. President, it gives me great pleasure to..."
3,BRB_61_2006.txt,I am pleased to \njoin with preceding speakers...
4,LBY_15_1960.txt,"Once again, Mr. President, allow me to congrat..."


In [143]:
# Extract the year (may include NaNs if no match)
df_raw['year'] = df_raw['filename'].str.extract(r'_(\d{4})\.txt$')

# Print rows where year is NaN
na_rows = df_raw[df_raw['year'].isna()]
print("Rows with missing years:\n", na_rows[['filename']])

# Optional: check how many rows are affected
print(f"Number of rows with missing year: {len(na_rows)}")


Rows with missing years:
 Empty DataFrame
Columns: [filename]
Index: []
Number of rows with missing year: 0


In [145]:
# == Create new variables: year, country_code and country_name ==

# Extract country code (first 3 letters) and year (last 4 digits before .txt)
df_raw['country_code'] = df_raw['filename'].str.extract(r'^([A-Z]{2,3})')
df_raw['year'] = df_raw['filename'].str.extract(r'_(\d{4})\.txt$').astype(int)

print("Min year:", df_raw['year'].min())
print("Max year:", df_raw['year'].max())

# Match country codes to country names
code_to_name = {country.alpha_3: country.name for country in pycountry.countries}

# Add custom short names and legacy codes
custom_names = {
    "BOL": "Bolivia",
    "COD": "The Democratic Republic of the Congo",
    "IRN": "Iran",
    "LAO": "Laos",
    "MDA": "Moldova",
    "PRK": "North Korea",
    "PSE": "Palestine",
    "RUS": "Russia",
    "SYR": "Syria",
    "TZA": "Tanzania",
    "VAT": "Vatican City State",
    "VEN": "Venezuela",
    "VNM": "Vietnam",
    #"YD":  "South Yemen",
    "YMD": "Soth Yemen",
    "YUG": "Yugoslavia",
    "DDR": "East Germany",
    "SUN": "Soviet Union",
    "EU": "European Union",
    "CSK": "Czechoslovakia",
    "FSM": "Micronesia",
    "KOR": "South Korea"
    
}

# Update the main mapping with custom names
code_to_name.update(custom_names)

# Map with updated dictionary
df_raw['country_name'] = df_raw['country_code'].map(code_to_name)

# Check missing mappings
missing = df_raw.loc[df_raw['country_name'].isna(), 'country_code'].unique()
print("Missing codes:", missing)

# Check structure of the df
df_raw.head() 

save_path = os.path.join(data_c, 'un_corpus_raw.pkl')
df_raw.to_pickle(save_path)
print(f"df_raw saved to {save_path}")

Min year: 1946
Max year: 2023
Missing codes: []
df_raw saved to C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\un_corpus_raw.pkl


In [147]:
# == Check the country names 

pd.set_option('display.max_rows', None)
print(df_raw[['country_code', 'country_name']].drop_duplicates().sort_values('country_code').reset_index(drop=True))
# Reset to default afterward
pd.reset_option('display.max_rows')

    country_code                          country_name
0            AFG                           Afghanistan
1            AGO                                Angola
2            ALB                               Albania
3            AND                               Andorra
4            ARE                  United Arab Emirates
5            ARG                             Argentina
6            ARM                               Armenia
7            ATG                   Antigua and Barbuda
8            AUS                             Australia
9            AUT                               Austria
10           AZE                            Azerbaijan
11           BDI                               Burundi
12           BEL                               Belgium
13           BEN                                 Benin
14           BFA                          Burkina Faso
15           BGD                            Bangladesh
16           BGR                              Bulgaria
17        

## Pre-processing

### Cleaning

In [151]:
# == Clean text by removing empty spaces, line breaks, hyphenation, stray characters, and escape quote ==

# Define cleaning function
def clean_text(content):
    if pd.isna(content):
        return ""
    
    # Remove line breaks and carriage returns
    content = content.replace('\n', ' ').replace('\r', ' ')

    # Collapse multiple spaces
    content = ' '.join(content.split())

    # Fix punctuation spacing (e.g. "word,another" â†’ "word, another")
    content = re.sub(r'(?<=[.,])(?=[^\s])', r' ', content)

    # Remove hyphenation at line breaks (e.g. "inter- national" â†’ "international")
    content = re.sub(r'-\s', '', content)

    ############NEW

     # Replace hyphen between letters with a space to prevent merging words (e.g. "russian-and" â†’ "russian and")
    content = re.sub(r'(?<=\w)-(?=\w)', ' ', content)
    #################NEW

    # Remove stray backslashes
    content = content.replace("\\", "")

    # Escape double quotes for CSV safety
    content = content.replace('"', '""')

    return content

# Apply cleaning to each speech
df_raw['speech'] = df_raw['speech'].astype(str)  # Ensure column is string type
df_clean = df_raw.copy()
df_clean['speech'] = df_clean['speech'].apply(clean_text)

# Drop rows with empty speeches after cleaning
df_clean = df_clean[df_clean['speech'].str.strip().astype(bool)].reset_index(drop=True)


In [152]:
# == Split cleaned data into chunks and save as separate files ==

# Convert cleaned DataFrame to list of lists
clean_data = df_clean[['filename', 'speech']].values.tolist()

# Split cleaned data into 4 equal chunks
data_id1 = clean_data[:int(len(clean_data)/4)]
data_id2 = clean_data[int(len(clean_data)/4): int(2*len(clean_data)/4)]
data_id3 = clean_data[int(2*len(clean_data)/4): int(3*len(clean_data)/4)]
data_id4 = clean_data[int(3*len(clean_data)/4):]

# Change directory to the temp folder
os.chdir(data_temp)  # make sure `data_temp` exists and is defined

# Save each chunk with joblib
joblib.dump(data_id1, 'cleanspeeches_indexed1.pkl')
joblib.dump(data_id2, 'cleanspeeches_indexed2.pkl')
joblib.dump(data_id3, 'cleanspeeches_indexed3.pkl')
joblib.dump(data_id4, 'cleanspeeches_indexed4.pkl')

# Store list of cleaned data chunk paths to feed into preprocessing function later
data_files = [
    os.path.join(data_temp, 'cleanspeeches_indexed1.pkl'),
    os.path.join(data_temp, 'cleanspeeches_indexed2.pkl'),
    os.path.join(data_temp, 'cleanspeeches_indexed3.pkl'),
    os.path.join(data_temp, 'cleanspeeches_indexed4.pkl')
]

print(f"âœ… Saved clean speeches chunks in '{data_temp}'")

âœ… Saved clean speeches chunks in 'C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp'


### Advanced Text Pre-Processing

In [154]:
# == Define function to Tokenize, eliminate digits, remove stopwords, lemmatize, POS-Tagging ==

def pro1(lista):
    # Remove punctuation
    return [[row[0], row[1].translate(translator)] for row in lista]

def pro2(lista):
    # Tokenize and lowercase with gensim
    return [[row[0], gensim.utils.simple_preprocess(row[1])] for row in lista]

def pro3(lista):
    # Remove digits
        return [[row[0], [w for w in row[1] if not any(char.isdigit() for char in w)]] for row in lista]

def pro4(lista):
    # Drop short words
    return [[row[0], [w for w in row[1] if len(w) > 2]] for row in lista]

def tags_spacy(lista):
    texts = [' '.join(row[1]) for row in lista]
    docs = list(nlp.pipe(texts, batch_size=20, n_process=1))
    result = []
    for i, doc in enumerate(docs):
        filtered_tokens = [token.text for token in doc if token.tag_.startswith(('N', 'V', 'J'))]
        result.append([lista[i][0], filtered_tokens])
    return result


def pro5(lista):
    # Remove stopwords using SpaCy stopword list
    return [[row[0], [w for w in row[1] if w not in SPACY_STOPWORDS]] for row in lista]

def pro6(lista):
      return [
        [row[0], [stemmer.stem(token) for token in row[1]]]
        for row in lista
    ]
   # texts = [' '.join(row[1]) for row in lista]
   # docs = list(nlp.pipe(texts, batch_size=20, n_process=1))
   # result = []
   # for i, doc in enumerate(docs):
    # lemmatized = [token.lemma_ for token in doc]
     #    result.append([lista[i][0], lemmatized])
  #  return result

########################## Question for Max: They removed procedural words in the paper

def dropnull(lista):
    # Drop empty speeches
    return [row for row in lista if len(' '.join(row[1])) > 0]

In [155]:
# == Create full pre-processing function and call it

def preprocessing(data_name):
    t0 = time.time()
    print(f"Starting preprocessing for {data_name}...")

    data = joblib.load(data_name)
    data = pro1(data)
    data = pro2(data)
    data = pro3(data)
    data = pro4(data)

    print(f"[{data_name}] Before tagging: {time.time() - t0:.2f}s")
    data = tags_spacy(data)
    print(f"[{data_name}] After tagging: {time.time() - t0:.2f}s")

    data = pro5(data)
   # data = pro6(data)
    data = dropnull(data)

    # out_name = data_name.replace('cleanspeeches_', 'preprocessed_speeches_').replace('.pkl', '_temp.pkl')

    # original filename replacement
    filename_wordcloud = data_name.replace('cleanspeeches_', 'wordcloud_speeches_').replace('.pkl', '.pkl')

    # full path in data_preprocessed folder
    out_name_wordcloud = os.path.join(data_preprocessed, os.path.basename(filename_wordcloud))

  #  joblib.dump(data, out_name)
    joblib.dump(data, out_name_wordcloud)

    data_stemmed = pro6(data)

    filename_preprocessed = data_name.replace('cleanspeeches_', 'preprocessed_speeches_').replace('.pkl', '.pkl')
    out_preprocessed = os.path.join(data_preprocessed, os.path.basename(filename_preprocessed))
    joblib.dump(data_stemmed, out_preprocessed)
    print(f"[{data_name}] Saved stemmed version: {out_preprocessed}")

    print(f"[{data_name}] Done. Total time: {time.time() - t0:.2f}s\n")

def main():
    for fname in data_files:
        preprocessing(fname)

if __name__ == "__main__":
    main()

Starting preprocessing for C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed1.pkl...
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed1.pkl] Before tagging: 24.05s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed1.pkl] After tagging: 465.79s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed1.pkl] Saved stemmed version: C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed1.pkl
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed1.pkl] Done. Total time: 546.51s

Starting preprocessing for C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed2.pkl...
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed2.pkl] Before tagging: 21.03s
[C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp\cleanspeeches_indexed2.pkl] After tagging: 445.62s
[C:\Users\sara

In [156]:
# Store the pre-processed data
preprocessed_files = [
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed1.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed2.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed3.pkl'),
    os.path.join(data_preprocessed, 'preprocessed_speeches_indexed4.pkl')
]

wordcloud_files = [
    os.path.join(data_preprocessed, 'wordcloud_speeches_indexed1.pkl'),
    os.path.join(data_preprocessed, 'wordcloud_speeches_indexed2.pkl'),
    os.path.join(data_preprocessed, 'wordcloud_speeches_indexed3.pkl'),
    os.path.join(data_preprocessed, 'wordcloud_speeches_indexed4.pkl')
]

In [157]:
print(os.getcwd())

C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\temp


## Word-Frequencies

### Count frequencies of all tokens and display the most common words

In [160]:
#== Count token frequencies ==

def count_frequencies(filenames):
    total_freq = Counter()
    for fname in tqdm(filenames):
        data = joblib.load(fname)
        tokens = chain.from_iterable(row[1] for row in data if isinstance(row[1], list))
        total_freq.update(tokens)
    return total_freq

def remove_rare_words(filenames, freqs, min_count=10):
    for fname in filenames:
        data = joblib.load(fname)
        filtered_data = []
        for doc_id, tokens in data:
            filtered_tokens = [w for w in tokens if freqs.get(w, 0) >= min_count]
            filtered_data.append([doc_id, filtered_tokens])
        joblib.dump(filtered_data, fname)  # overwrite or save as new file
        print(f"Processed {fname}: removed words with freq < {min_count}")

# === Count for preprocessed (stemmed) speeches ===
word_counts_stemmed = count_frequencies(preprocessed_files)

remove_rare_words(preprocessed_files, word_counts_stemmed, min_count=10)

print("\n[Stemmed] Top 100 most common words:")
for word, count in word_counts_stemmed.most_common(100):
    print(f"{word}: {count}")

print("\n[Stemmed] Top 300 least common words:")
for word, count in word_counts_stemmed.most_common()[:-301:-1]:
    print(f"{word}: {count}")

# Save stemmed word counts
save_path_stemmed = os.path.join(data_freq, 'word_counts_stemmed.pkl')
joblib.dump(word_counts_stemmed, save_path_stemmed)

# === Count for wordcloud (unstemmed) speeches ===
word_counts_wordcloud = count_frequencies(wordcloud_files)

print("\n[Wordcloud] Top 100 most common words:")
for word, count in word_counts_wordcloud.most_common(100):
    print(f"{word}: {count}")

print("\n[Wordcloud] Top 300 least common words:")
for word, count in word_counts_wordcloud.most_common()[:-301:-1]:
    print(f"{word}: {count}")

# Save unstemmed word counts
save_path_wordcloud = os.path.join(data_freq, 'word_counts_wordcloud.pkl')
joblib.dump(word_counts_wordcloud, save_path_wordcloud)


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 4/4 [00:29<00:00,  7.28s/it]


Processed C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed1.pkl: removed words with freq < 10
Processed C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed2.pkl: removed words with freq < 10
Processed C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed3.pkl: removed words with freq < 10
Processed C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data\preprocessed\preprocessed_speeches_indexed4.pkl: removed words with freq < 10

[Stemmed] Top 100 most common words:
nation: 232332
unit: 186036
countri: 176251
intern: 159425
develop: 145364
peac: 133382
world: 131630
state: 128681
peopl: 126148
secur: 84579
general: 76542
govern: 74954
econom: 72977
organ: 68141
right: 65874
year: 65829
assembl: 63469
new: 58990
effort: 56971
problem: 56698
human: 56072
support: 54788
continu: 53001
communiti: 48791
region: 48258
polit: 48075
time: 47283
member: 42574
africa: 42

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 4/4 [00:29<00:00,  7.34s/it]



[Wordcloud] Top 100 most common words:
nations: 185541
united: 183294
international: 152545
world: 128865
countries: 117093
peace: 110332
states: 90068
development: 85218
people: 80324
security: 78421
general: 74810
economic: 72599
assembly: 62091
new: 58990
country: 57215
government: 56448
organization: 50483
human: 48889
efforts: 46335
political: 45844
peoples: 45773
community: 44912
rights: 44034
support: 40749
africa: 40620
council: 39845
session: 39784
time: 38904
war: 36554
state: 34415
south: 34165
great: 33644
problems: 33302
republic: 33286
years: 33171
national: 33042
year: 32566
order: 32067
nuclear: 32003
developing: 31781
situation: 31337
global: 30966
work: 30014
social: 29765
conference: 28324
hope: 27770
charter: 27624
president: 27511
today: 26394
continue: 26269
important: 26160
need: 25962
region: 25912
african: 25701
relations: 25428
progress: 25116
principles: 24764
east: 24565
action: 24210
respect: 23983
weapons: 23796
problem: 23396
future: 23232
secretary: 232

['C:\\Users\\sarah\\OneDrive\\Dokumente\\Masterarbeit\\data\\freq\\word_counts_wordcloud.pkl']

In [161]:
os.chdir(data_c)
print(os.getcwd())

C:\Users\sarah\OneDrive\Dokumente\Masterarbeit\data


### Count the frequency of the dictionary words

In [163]:
# == Count dictionary words

# Load dictionaries            ##### How did they come up with this dictionary? Why did they exclude words?

affect_path = os.path.join(data_dict, 'dictionary_affect.pkl')
cognition_path = os.path.join(data_dict, 'dictionary_cognition.pkl')

with open(affect_path, 'rb') as f:
    affect_dict = pickle.load(f)
print("Contents of affect dictionary:")
print(affect_dict)
print("Number of words in affect dictionary:", len(affect_dict))

with open(cognition_path, 'rb') as f:
    cognition_dict = pickle.load(f)
print("Contents of cognition dictionary:")
print(cognition_dict)
print("Number of words in cognition dictionary:", len(cognition_dict))

affect = joblib.load(affect_path)
cognition = joblib.load(cognition_path)

a = [[i, word_counts_stemmed[i]] for i in affect if i in word_counts_stemmed]
c = [[i, word_counts_stemmed[i]] for i in cognition if i in word_counts_stemmed]

a = sorted(a, key=lambda x: x[1], reverse=True)
c = sorted(c, key=lambda x: x[1], reverse=True)

a = [[i[0], f"({i[1]}),"] for i in a]
c = [[i[0], f"({i[1]}),"] for i in c]

a1 = ' '.join(str(r) for v in a for r in v)
c1 = ' '.join(str(r) for v in c for r in v)

affect_out_path = os.path.join(data_freq, "affect_words.txt")
cog_out_path = os.path.join(data_freq, "cog_words.txt")

os.makedirs(data_freq, exist_ok=True)  # ensure directory exists

with open(affect_out_path, "w") as output:
    output.write(a1)

with open(cog_out_path, "w") as output:
    output.write(c1)

Contents of affect dictionary:
['forbid', 'unattract', 'cruelti', 'crappi', 'apathi', 'scari', 'unimpress', 'sin', 'dumbest', 'eas', 'agit', 'sob', 'shocker', 'tragedi', 'fabul', 'strongest', 'giver', 'sigh', 'aw', 'witch', 'hurtl', 'fucktard', 'cruel', 'glamor', 'funni', 'smarter', 'brillianc', 'irrate', 'alright', 'honest', 'profit', 'fearless', 'grievous', 'relax', 'isolationist', 'hah', 'shyness', 'poorest', 'cruelest', 'troublemak', 'disagre', 'agon', 'terror', 'fight', 'pleas', 'poor', 'crazi', 'hostil', 'stupid', 'damnat', 'vain', 'jade', 'heartless', 'nag', 'gloomi', 'damn', 'dishearten', 'pleaser', 'credit', 'warmth', 'greatest', 'whine', 'shame', 'angriest', 'envious', 'grin', 'blameless', 'sweeter', 'laidback', 'stupidest', 'unprotect', 'whiner', 'unlov', 'shake', 'boredom', 'fairer', 'weaker', 'wellb', 'bold', 'sucki', 'unsuccess', 'mourner', 'liken', 'defens', 'invigor', 'tedious', 'paranoid', 'cynic', 'dignifi', 'paranoia', 'sweetest', 'contented', 'humili', 'crush', 'ter

In [167]:
# == Calculate weighted frequencies for all words

 # STEMMED OR NOT?
# - downweights very common words by giving more importance to rare ones
word_counts_stemmed = joblib.load(os.path.join(data_freq, 'word_counts_stemmed.pkl'))

l = sum(word_counts_stemmed.values())

a = 0.001
word_counts_weighted = {k: a / (a + (v / l)) for k, v in word_counts.items()}
#for key in word_counts.keys():
 #   word_counts[key] = a / (a + (word_counts[key] / l))

joblib.dump(word_counts_weighted, os.path.join(data_freq, 'word_counts_weighted.pkl'))

################################################################################ ISSUE##################
# To print top 100 by weighted values, sort the dictionary by value descending:
top_100_weighted = sorted(word_counts_weighted.items(), key=lambda x: x[1], reverse=True)[:100]

print("Top 100 words by weighted frequency:")
for word, weight in top_100_weighted:
    print(f"{word}: {weight}")


Top 100 words by weighted frequency:
disloy: 0.9993014671495019
prick: 0.9993014671495019
litr: 0.9993014671495019
chen: 0.9993014671495019
dualism: 0.9993014671495019
praia: 0.9993014671495019
lexicon: 0.9993014671495019
insuffer: 0.9993014671495019
siad: 0.9993014671495019
nothing: 0.9993014671495019
neat: 0.9993014671495019
sojourn: 0.9993014671495019
therefor: 0.9993014671495019
cento: 0.9993014671495019
lish: 0.9993014671495019
pyrrhic: 0.9993014671495019
tenth: 0.9993014671495019
mondlan: 0.9993014671495019
resolu: 0.9993014671495019
tive: 0.9993014671495019
kiloton: 0.9993014671495019
atyp: 0.9993014671495019
dan: 0.9993014671495019
midrand: 0.9993014671495019
mahamadou: 0.9993014671495019
shrank: 0.9993014671495019
unorthodox: 0.9993014671495019
foci: 0.9993014671495019
finer: 0.9993014671495019
gust: 0.9993014671495019
underworld: 0.9993014671495019
bluster: 0.9993014671495019
sneer: 0.9993014671495019
shirt: 0.9993014671495019
stealth: 0.9993014671495019
brigand: 0.9993014671