# Development of Emotion and Reasoning in the General Speeches of the United Nations: A text-based machine learning approach
## MAIN - File

### Description: 
It automatically runs all notebooks in the correct order, handles optional additional analyses, and can install required Python packages and download necessary resources. All outputs, including figures, tables, and results, are saved to their respective folders.

It runs all notebooks in the correct order. If InstallPackages = True, the script installs the required packages; set InstallPackages = False to skip this step. It does not overwrite existing packages; it only installs packages that are missing. If DownloadAdditions = True, the script also downloads the necessary resources for NLTK and spaCy. This includes tokenizers, taggers, and the en_core_web_lg spaCy model. Existing resources are not overwritten.

The Additional Analysis is optional: set RUN_ADDITIONAL_ANALYSIS = True to run it, or RUN_ADDITIONAL_ANALYSIS = False to skip it.
All figures, tables, and results are saved automatically in the corresponding folders.

##  Installation of required Packages

In [None]:
# Set this to True to install the packages listed below.
# Only packages that are not installed yet are installed; existing ones are left untouched.

InstallPackages = False


def install_missing_packages(packages):
    """
    Install every package in `packages` that cannot be found yet.

    Each name is looked up with importlib.util.find_spec. When no module of
    that name is found, it is installed with pip through the interpreter
    that runs this code (sys.executable).

    Parameters
    ----------
    packages : iterable of str
        Names used both as the import name and as the pip package name.

    Returns
    -------
    list of str
        The packages installed by this call, in input order.

    Raises
    ------
    subprocess.CalledProcessError
        If pip exits with a non-zero status.
    """
    # Imported here so the helper does not depend on imports made by other cells.
    import importlib.util
    import subprocess
    import sys

    newly_installed = []
    for package in packages:
        if importlib.util.find_spec(package) is None:
            print(f"Installing package: {package}")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
            newly_installed.append(package)
        else:
            print(f"Package already installed: {package}")
    return newly_installed


if InstallPackages:
    packages = [
        "gensim",
        "joblib",
        "matplotlib",
        "nbconvert",
        "nltk",
        "numpy",
        "pandas",
        "pycountry",
        "rapidfuzz",
        "scipy",
        "seaborn",
        "spacy",
        "tableone",
        "tabulate",
        "tqdm",
    ]

    install_missing_packages(packages)

# Switch for fetching the NLTK / spaCy resources below.
# Only resources that cannot be found locally are downloaded.
DownloadAdditions = False

if DownloadAdditions:
    import subprocess
    import sys

    import nltk
    import spacy

    # --- NLTK resources ---
    nltk_packages = ["punkt", "averaged_perceptron_tagger"]
    for resource in nltk_packages:
        # "punkt" is looked up under tokenizers/, everything else under taggers/.
        if resource == "punkt":
            lookup_path = f"tokenizers/{resource}"
        else:
            lookup_path = f"taggers/{resource}"

        try:
            nltk.data.find(lookup_path)
            print(f"NLTK resource already exists: {resource}")
        except LookupError:
            # nltk.data.find raised: the resource is not available locally yet.
            print(f"Downloading NLTK resource: {resource}")
            nltk.download(resource)

    # --- spaCy model ---
    spacy_model = "en_core_web_lg"
    try:
        # Loading the model is used as the existence check.
        spacy.load(spacy_model)
        print(f"spaCy model already exists: {spacy_model}")
    except OSError:
        print(f"Downloading spaCy model: {spacy_model}")
        download_command = [sys.executable, "-m", "spacy", "download", spacy_model]
        subprocess.check_call(download_command)

## Run Notebooks

In [None]:
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
from pathlib import Path

In [7]:
def run_notebook(notebook_path, timeout=20000):
    """
    Execute one Jupyter notebook from start to finish.

    The file is read as an nbformat version 4 notebook and handed to
    nbconvert's ExecutePreprocessor, which runs it on the "python3" kernel
    with the notebook's own folder as the execution path. The executed
    notebook object is not written back to disk by this function.

    Parameters
    ----------
    notebook_path : str or Path
        Location of the .ipynb file to execute.
    timeout : int
        Forwarded unchanged to ExecutePreprocessor as its timeout.

    Raises
    ------
    FileNotFoundError
        If no file exists at notebook_path.
    """
    nb_file = Path(notebook_path)
    if not nb_file.exists():
        raise FileNotFoundError(f"Notebook {nb_file} not found.")

    print(f"Running notebook: {nb_file.name} ...")

    with nb_file.open("r", encoding="utf-8") as handle:
        notebook = nbformat.read(handle, as_version=4)

    # The 'path' entry makes the notebook run relative to its own folder.
    resources = {'metadata': {'path': nb_file.parent}}
    executor = ExecutePreprocessor(timeout=timeout, kernel_name="python3")
    executor.preprocess(notebook, resources)

    print(f"Finished notebook: {nb_file.name}\n")


# Main pipeline notebooks, listed in the order in which they are executed.
notebooks = [
    "notebooks/0_data_creation.ipynb",
    "notebooks/1_model_training_centroids_scoring.ipynb",
    "notebooks/2_figures.ipynb",
    "notebooks/3_tables.ipynb",
]

# Run them one after another; an exception in any notebook stops the loop.
for notebook_file in notebooks:
    run_notebook(notebook_file)

print("All notebooks executed successfully!")

Running notebook: 0_data_creation.ipynb ...


CellExecutionError: An error occurred while executing the following cell:
------------------
# == New variable: Speech length of the preprocessed corpus ==

# Count tokens in preprocessed speech
df_clean["speech_length_preprocessed"] = dfclean["speech_preprocessed"].apply(
    lambda x: len(x) if isinstance(x, list) else 0
)

print(df_clean[["filename", "speech_length_preprocessed"]].head())
all_tokens = [token for speech in df_clean["speech_preprocessed"].dropna() for token in speech]
unique_tokens = set(all_tokens)
print("Total unique tokens:", len(unique_tokens))

# Average length of preprocessed speeches
average_length = df_clean["speech_length_preprocessed"].mean()

print(f"Average number of tokens per speech: {average_length:.2f}")
------------------


[1;31m---------------------------------------------------------------------------[0m
[1;31mNameError[0m                                 Traceback (most recent call last)
Cell [1;32mIn[28], line 4[0m
[0;32m      1[0m [38;5;66;03m# == New variable: Speech length of the preprocessed corpus ==[39;00m
[0;32m      2[0m 
[0;32m      3[0m [38;5;66;03m# Count tokens in preprocessed speech[39;00m
[1;32m----> 4[0m df_clean[[38;5;124m"[39m[38;5;124mspeech_length_preprocessed[39m[38;5;124m"[39m] [38;5;241m=[39m dfclean[[38;5;124m"[39m[38;5;124mspeech_preprocessed[39m[38;5;124m"[39m][38;5;241m.[39mapply(
[0;32m      5[0m     [38;5;28;01mlambda[39;00m x: [38;5;28mlen[39m(x) [38;5;28;01mif[39;00m [38;5;28misinstance[39m(x, [38;5;28mlist[39m) [38;5;28;01melse[39;00m [38;5;241m0[39m
[0;32m      6[0m )
[0;32m      8[0m [38;5;28mprint[39m(df_clean[[[38;5;124m"[39m[38;5;124mfilename[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124mspeech_length_preprocessed[39m[38;5;124m"[39m]][38;5;241m.[39mhead())
[0;32m      9[0m all_tokens [38;5;241m=[39m [token [38;5;28;01mfor[39;00m speech [38;5;129;01min[39;00m df_clean[[38;5;124m"[39m[38;5;124mspeech_preprocessed[39m[38;5;124m"[39m][38;5;241m.[39mdropna() [38;5;28;01mfor[39;00m token [38;5;129;01min[39;00m speech]

[1;31mNameError[0m: name 'dfclean' is not defined


## Additional Analysis

In [None]:
# === Optional: Run Additional Analysis ===
# To also run the additional analysis, set the flag below to True.
RUN_ADDITIONAL_ANALYSIS = False  # Set to True to execute additional analysis notebooks

if RUN_ADDITIONAL_ANALYSIS:
    # Executes the additional analysis notebooks automatically.
    # Each notebook is loaded and run through run_notebook, in list order.
    additional_notebooks = [
        # Different Calculation Weighted Frequencies
        "notebooks/Additional_Analysis/Different_Calculation_Weighted_Frequencies/0_data_creation_changed_weighted_freq.ipynb",
        "notebooks/Additional_Analysis/Different_Calculation_Weighted_Frequencies/1_model_training_centroids_scoring_changed_weighted_freq.ipynb",
        "notebooks/Additional_Analysis/Different_Calculation_Weighted_Frequencies/2_figures_changed_weighted_freq.ipynb",

        # Individual Stopwords
        "notebooks/Additional_Analysis/Individual_Stopwords/0_data_creation_ind_stopwords.ipynb",
        "notebooks/Additional_Analysis/Individual_Stopwords/1_model_training_centroids_scoring_ind_stopwords.ipynb",
        "notebooks/Additional_Analysis/Individual_Stopwords/2_figures_ind_stopwords.ipynb",

        # Figure Comparison Emotionality Score for the different calculations
        "notebooks/Additional_Analysis/2_figure_comparison_emotionality_score.ipynb",
    ]

    print("Running Additional Analysis Notebooks...")
    for extra_notebook in additional_notebooks:
        run_notebook(extra_notebook)

    print("All Notebooks for Additional analysis executed successfully.")