# Development of Emotion and Reasoning in the General Speeches of the United Nations: A text-based machine learning approach
## Script 3: Tables
### Author: Sarah Franzen

Run Scripts 0 to 2 beforehand to create the required folder structure and prepare the data.

### Description: 
#### This file creates the following figures and tables

Summary Statistics
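- Summary statistics of the covariates (`Thesis_Summary_Table.html`, `Thesis_Summary_Table.tex`)
- Summary statistics of the score for the full, gender and position samples (`Thesis_Score_Summary_Full_Gender_Position.html`)
- Summary statistics of the score by decade (`Thesis_Decade_Summary_Table.html`)
- Summary statistics of the score by group (`Thesis_Conditional_Variables_Summary.html`)
- One-sample t-tests of the subsample score means (`Thesis_TTest_Summary_Full_Gender_Position.html`)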


In [40]:
# == Import libraries for data processing and visualization ==
import matplotlib.pyplot as plt
#from matplotlib.colors import ListedColormap
import seaborn as sns
import os
import joblib
import pandas as pd
from scipy import stats
from tabulate import tabulate
import numpy as np

# === Set Working Directory ===

# The project root is read from the UN_SPEECHES_WD environment variable when it
# is set, so the notebook can run on another machine without editing this cell.
# If the variable is not set, the original hard-coded location is used.
wd = os.environ.get(
    "UN_SPEECHES_WD",
    r"C:\Users\sarah\OneDrive\Dokumente\Masterarbeit",
)

# === Define Folder Paths ===
# All folders live directly below the working directory; the two data
# sub-folders ('temp' and 'freq') live below 'data'.
wd_results = os.path.join(wd, 'results')
data_c = os.path.join(wd, 'data')
data_temp = os.path.join(data_c, 'temp')
data_freq = os.path.join(data_c, 'freq')
tables_dir = os.path.join(wd, 'tables')

In [42]:
# === Load data ===

# FIXME (author's note, details not specified here): "un_corpus_scored also when loading"

# Read the scored corpus first: the path is absolute, so a wrong working
# directory fails here before any folder is created or entered.
un_corpus_scored = pd.read_csv(
    os.path.join(wd_results, "un_corpus_scored.csv"),
    sep=';',
    encoding='utf-8'
)

# The tables below are written with relative file names, so the tables folder
# becomes the current directory. It is created if it does not exist yet, because
# os.chdir would otherwise raise FileNotFoundError.
os.makedirs(tables_dir, exist_ok=True)
os.chdir(tables_dir)

# Keep only rows where 'score' is not NA. The explicit copy makes the filtered
# frame independent of the frame returned by read_csv.
rows_before = len(un_corpus_scored)
un_corpus_scored = un_corpus_scored[un_corpus_scored['score'].notna()].copy()
print(f"Rows dropped due to missing score: {rows_before - len(un_corpus_scored)}")

Rows dropped due to missing score: 0


## Summary Statistics

In [44]:
# Build one 0/1 indicator column per speaker position. Rows without a position
# get <NA> in every indicator (not 0), so they drop out of counts and means.
position_nonmissing = un_corpus_scored['position'].notna()

position_dummies = (
    pd.get_dummies(un_corpus_scored.loc[position_nonmissing, 'position'])
    .astype(int)
    # Re-align to the full frame: rows that had no position become NaN here ...
    .reindex(un_corpus_scored.index)
    # ... and the nullable integer dtype turns those NaN into <NA> while the
    # remaining values stay integers.
    .astype("Int64")
)

# Drop indicator columns left over from an earlier run of this cell before
# attaching the new ones; otherwise re-running the cell would add duplicate
# column names and break the column selections in the cells below.
un_corpus_scored = pd.concat(
    [
        un_corpus_scored.drop(columns=position_dummies.columns, errors='ignore'),
        position_dummies,
    ],
    axis=1,
)

In [97]:
# Covariate summary table (Obs, Mean, SD, Min, Max), written as HTML.
# The fixed covariates come first, followed by one indicator per position.
base_vars = ['year', 'speech_length_words', 'english_official_language',
             'security_council_permanent', 'gender_dummy']
all_numeric_vars = base_vars + list(position_dummies.columns)

summary_table = pd.DataFrame({
    "Variable": all_numeric_vars,
    "Obs": un_corpus_scored[all_numeric_vars].count().astype(int),
    "Mean": un_corpus_scored[all_numeric_vars].mean().round(3),
    "SD": un_corpus_scored[all_numeric_vars].std().round(3),
    "Min": un_corpus_scored[all_numeric_vars].min(),
    "Max": un_corpus_scored[all_numeric_vars].max()
})

# Section header row that introduces the position indicators; it has no statistics.
position_header = pd.DataFrame({
    "Variable": ["Position"],
    "Obs": [""],
    "Mean": [""],
    "SD": [""],
    "Min": [""],
    "Max": [""]
})

# Insert the header directly after the fixed covariates. The position is derived
# from the list length so it stays correct if base_vars changes.
insert_idx = len(base_vars)
summary_table = pd.concat([summary_table.iloc[:insert_idx],
                           position_header,
                           summary_table.iloc[insert_idx:]]).reset_index(drop=True)

# Display labels; position rows are indented with HTML non-breaking spaces.
var_labels = {
    "year": "Year",
    "speech_length_words": "Number of Words",
    "english_official_language": "English as Official Language (Yes = 1)",
    "security_council_permanent": "Permanent Member of the Security Council (Yes = 1)",
    "gender_dummy": "Gender (Female = 1)",
    "(Deputy) Minister for Foreign Affairs": "&nbsp;&nbsp;&nbsp;&nbsp;(Deputy) Minister for Foreign Affairs",
    "(Deputy) Prime Minister": "&nbsp;&nbsp;&nbsp;&nbsp;(Deputy) Prime Minister",
    "(Vice-) President": "&nbsp;&nbsp;&nbsp;&nbsp;(Vice-) President",
    "Diplomatic Representative": "&nbsp;&nbsp;&nbsp;&nbsp;Diplomatic Representative",
    "Others": "&nbsp;&nbsp;&nbsp;&nbsp;Others"
}
summary_table['Variable'] = summary_table['Variable'].replace(var_labels)

# The empty strings of the header row become missing values so that Min/Max can
# be stored as nullable integers.
numeric_cols = ['Mean', 'SD', 'Min', 'Max']
summary_table[numeric_cols] = summary_table[numeric_cols].replace("", pd.NA)

summary_table[['Min', 'Max']] = summary_table[['Min', 'Max']].astype('Int64')

table_styles = [
    {'selector': 'th', 'props': [
        ('border-bottom', '3px solid black'),  # thick black line instead of grey background
        ('color', 'black'),
        ('font-weight', 'bold'),
        ('text-align', 'center'),
        ('background-color', 'white')
    ]},
    {'selector': 'th.col0', 'props': [('text-align', 'left')]},
    {'selector': 'td.col0', 'props': [('text-align', 'left')]},
] + [
    # same minimum width for every statistics column (columns 1 to 5)
    {'selector': f'td.col{col_no}', 'props': [('min-width', '80px')]}
    for col_no in range(1, 6)
]

styled_table = (
    summary_table.style
    .hide(axis="index")
    .set_table_styles(table_styles)
    .set_properties(**{'text-align': 'center'}, subset=['Obs', 'Mean', 'SD', 'Min', 'Max'])
    # na_rep="" keeps the statistic cells of the "Position" header row blank;
    # without it the missing values are rendered as the text "<NA>".
    .format({"Mean": "{:.3f}", "SD": "{:.3f}"}, na_rep="")
)

styled_table.to_html("Thesis_Summary_Table.html")

In [99]:
# LaTeX export of the covariate summary table.
# The labels in summary_table carry HTML indentation ("&nbsp;"), and "&" is the
# column separator in LaTeX, so the labels are converted before export.
_HTML_INDENT = "&nbsp;&nbsp;&nbsp;&nbsp;"
_LATEX_ESCAPES = {"&": r"\&", "%": r"\%", "$": r"\$", "#": r"\#", "_": r"\_"}


def _latex_label(label):
    """Return `label` as LaTeX text, turning the HTML indent into a \\quad."""
    text = str(label)
    indented = text.startswith(_HTML_INDENT)
    text = text.replace("&nbsp;", "")
    for char, escaped in _LATEX_ESCAPES.items():
        text = text.replace(char, escaped)
    return (r"\quad " + text) if indented else text


latex_summary = summary_table.copy()
latex_summary['Variable'] = latex_summary['Variable'].map(_latex_label)

latex_table = latex_summary.to_latex(
    index=False,
    float_format="%.3f",
    na_rep="",        # header rows without statistics stay blank
    escape=False,     # labels are already escaped above; keeps \quad intact
    caption="Summary Statistics",
    label="tab:summary_stats",
    column_format="lccccc"
)

with open("Thesis_Summary_Table.tex", "w", encoding="utf-8") as f:
    f.write(latex_table)


### Summary Statistic: Score

In [12]:
# Score summary for the full sample and for the rows where gender / position
# is known, written as an HTML table and printed.
summary_vars = ['gender_dummy', 'position']
var_labels = {
    'gender_dummy': 'Gender Sample',
    'position': 'Position Sample'
}


def _score_row(label, scores):
    """One table row: observation count plus mean/SD/min/max rounded to 3 decimals."""
    return {
        'Variable': label,
        'Obs': scores.count(),
        'Mean': round(scores.mean(), 3),
        'SD': round(scores.std(), 3),
        'Min': round(scores.min(), 3),
        'Max': round(scores.max(), 3)
    }


overall = un_corpus_scored['score']
summary_list = [_score_row('Full sample', overall)]

for var in summary_vars:
    # scores of the rows where this variable is not missing
    subset = un_corpus_scored.loc[~un_corpus_scored[var].isna(), 'score']
    summary_list.append(_score_row(var_labels.get(var, var), subset))

summary_df = pd.DataFrame(summary_list)

header_style = {'selector': 'th', 'props': [('background-color', '#d3d3d3'),
                                            ('color', 'black'),
                                            ('font-weight', 'bold'),
                                            ('text-align', 'center')]}
left_align = [{'selector': f'{cell}.col0', 'props': [('text-align', 'left')]}
              for cell in ('th', 'td')]
min_widths = [{'selector': f'td.col{col_no}', 'props': [('min-width', '80px')]}
              for col_no in range(1, 6)]

styled_table = summary_df.style.hide(axis="index")
styled_table = styled_table.set_table_styles([header_style] + left_align + min_widths)
styled_table = styled_table.set_properties(
    **{'text-align': 'center'}, subset=['Obs', 'Mean', 'SD', 'Min', 'Max']
)
styled_table = styled_table.format({
    "Obs": "{:.0f}",
    "Mean": "{:.3f}",
    "SD": "{:.3f}",
    "Min": "{:.3f}",
    "Max": "{:.3f}"
}, na_rep="-")

styled_table.to_html("Thesis_Score_Summary_Full_Gender_Position.html")

print(summary_df)

          Variable    Obs   Mean     SD    Min    Max
0      Full sample  10952  0.765  0.207  0.312  1.635
1    Gender Sample   4704  0.752  0.200  0.312  1.517
2  Position Sample   6273  0.796  0.212  0.312  1.635


In [14]:
# Score summary by ten-year window. Windows are counted from 1946
# (1946-1955, 1956-1965, ...); the end of the last window is capped at 2024.
decade_start = (np.floor((un_corpus_scored['year'] - 1946) / 10) * 10 + 1946).astype(int)
decade_end = (decade_start + 9).clip(upper=2024)

# "\u2013" is an en dash. It is written as an escape so the label cannot be
# corrupted by a wrong file encoding (the literal had turned into mojibake).
un_corpus_scored['Decade'] = decade_start.astype(str) + "\u2013" + decade_end.astype(str)

decade_summary = un_corpus_scored.groupby('Decade')['score'].agg(
    Obs='count',
    Mean='mean',
    SD='std',
    Min='min',
    Max='max'
).reset_index()

# Round once; the statistics are shown with three decimals everywhere.
numeric_cols = ['Mean', 'SD', 'Min', 'Max']
decade_summary[numeric_cols] = decade_summary[numeric_cols].round(3)

print(decade_summary)

table_styles = [
    {'selector': 'th', 'props': [('background-color', '#d3d3d3'),
                                 ('color', 'black'),
                                 ('font-weight', 'bold'),
                                 ('text-align', 'center')]},
    {'selector': 'th.col0', 'props': [('text-align', 'left')]},
    {'selector': 'td.col0', 'props': [('text-align', 'left')]},
] + [
    {'selector': f'td.col{col_no}', 'props': [('min-width', '80px')]}
    for col_no in range(1, 6)
]

styled_decade_table = (
    decade_summary.style
    .hide(axis="index")
    .set_table_styles(table_styles)
    .set_properties(**{'text-align': 'center'}, subset=['Obs', 'Mean', 'SD', 'Min', 'Max'])
    # Three decimals for all statistics. A window with a single speech has no
    # SD; it is shown as "-" like in the other score tables.
    .format({col: "{:.3f}" for col in numeric_cols}, na_rep="-")
)

styled_decade_table.to_html("Thesis_Decade_Summary_Table.html")

NameError: name 'np' is not defined

In [None]:
### Summary Statistic: Score by Group

In [None]:
# Score summary conditional on each categorical variable and each position
# indicator, written as an HTML table and printed.
group_vars = ['english_official_language', 'security_council_permanent', 'gender_dummy'] + list(position_dummies.columns)

var_labels = {
    'english_official_language': 'English as Official Language',
    'security_council_permanent': 'Permanent Member of the Security Council',
    'gender_dummy': 'Gender',
}

value_labels = {
    'english_official_language': {1: 'Yes (=1)', 0: 'No (=0)'},
    'security_council_permanent': {1: 'Yes (=1)', 0: 'No (=0)'},
    'gender_dummy': {1: 'Female (=1)', 0: 'Male (=0)'}
}

_ROW_INDENT = "&nbsp;&nbsp;&nbsp;&nbsp;"   # HTML indentation of sub-rows


def _header_row(label):
    """Section header row: a label without statistics.

    The statistics are NaN (not "") so the columns stay numeric and can be
    rounded; they are rendered as blanks by the formatter below.
    """
    return {'Variable': label, 'Obs': np.nan, 'Mean': np.nan,
            'SD': np.nan, 'Min': np.nan, 'Max': np.nan}


def _stats_row(label, scores):
    """Indented row with count, mean, SD, min and max of `scores`."""
    return {
        'Variable': f"{_ROW_INDENT}{label}",
        'Obs': scores.count(),
        'Mean': scores.mean(),
        'SD': scores.std(),
        'Min': scores.min(),
        'Max': scores.max()
    }


# Collect plain records and build the frame once at the end.
summary_list = []

position_header_inserted = False

for var in group_vars:
    if var in value_labels:
        # Labelled categorical variable: header row, then one row per value.
        summary_list.append(_header_row(var_labels[var]))
        for val in sorted(un_corpus_scored[var].dropna().unique()):
            subset = un_corpus_scored[un_corpus_scored[var] == val]
            # fall back to the raw value instead of raising KeyError on an
            # unexpected code
            label = value_labels[var].get(val, str(val))
            summary_list.append(_stats_row(label, subset['score']))
    else:
        # Position indicators: one shared "Position" header, then the scores of
        # the rows where the indicator equals 1.
        if not position_header_inserted:
            summary_list.append(_header_row('Position'))
            position_header_inserted = True
        subset = un_corpus_scored[un_corpus_scored[var] == 1]
        summary_list.append(_stats_row(var_labels.get(var, var), subset['score']))

score_summary_table = pd.DataFrame(summary_list)

numeric_cols = ['Mean', 'SD', 'Min', 'Max']
score_summary_table[numeric_cols] = score_summary_table[numeric_cols].round(3)

table_styles = [
    {'selector': 'th', 'props': [('background-color', '#d3d3d3'),
                                 ('color', 'black'),
                                 ('font-weight', 'bold'),
                                 ('text-align', 'center')]},
    {'selector': 'th.col0', 'props': [('text-align', 'left')]},
    {'selector': 'td.col0', 'props': [('text-align', 'left')]},
] + [
    {'selector': f'td.col{col_no}', 'props': [('min-width', '80px')]}
    for col_no in range(1, 6)
]

styled_score_table = (
    score_summary_table.style
    .hide(axis="index")
    .set_table_styles(table_styles)
    .set_properties(**{'text-align': 'center'}, subset=['Obs', 'Mean', 'SD', 'Min', 'Max'])
    # Obs holds NaN in header rows and is therefore a float column: show it
    # without decimals. na_rep="" leaves the header rows blank.
    .format({"Obs": "{:.0f}", **{col: "{:.3f}" for col in numeric_cols}}, na_rep="")
)

styled_score_table.to_html("Thesis_Conditional_Variables_Summary.html")

print(score_summary_table)

### T-Test

In [18]:
# Score summary with a one-sample t-test per subsample: the scores of the rows
# where gender / position is known are tested against the full-sample mean.
# NOTE(review): each subsample is itself part of the full sample and the
# full-sample mean is treated as a fixed constant - confirm this is the
# intended test design.
test_vars = ['gender_dummy', 'position']
var_labels = {
    'gender_dummy': 'Gender',
    'position': 'Position'
}


def _ttest_row(label, scores, t_value, p_value):
    """One table row: descriptive statistics (3 decimals) plus the given test result."""
    return {
        'Variable': label,
        'Obs': scores.count(),
        'Mean': round(scores.mean(), 3),
        'SD': round(scores.std(), 3),
        'Min': round(scores.min(), 3),
        'Max': round(scores.max(), 3),
        't-test': t_value,
        'p-value': p_value
    }


overall = un_corpus_scored['score']
overall_mean = overall.mean()

# The full sample is the reference, so it carries no test statistic.
summary_list = [_ttest_row('Full sample', overall, np.nan, np.nan)]

for var in test_vars:
    subset = un_corpus_scored.loc[~un_corpus_scored[var].isna(), 'score']

    # a t-test needs at least two observations
    if subset.count() > 1:
        t_stat, p_val = stats.ttest_1samp(subset, overall_mean, nan_policy='omit')
    else:
        t_stat, p_val = np.nan, np.nan

    summary_list.append(
        _ttest_row(var_labels.get(var, var), subset, round(t_stat, 3), round(p_val, 3))
    )

summary_df = pd.DataFrame(summary_list)

table_styles = [
    {'selector': 'th', 'props': [('background-color', '#d3d3d3'),
                                 ('color', 'black'),
                                 ('font-weight', 'bold'),
                                 ('text-align', 'center')]},
    {'selector': 'td', 'props': [('text-align', 'center')]},
    {'selector': 'td.col0', 'props': [('text-align', 'left')]}
]

three_decimals = {col: "{:.3f}" for col in ("Mean", "SD", "Min", "Max", "t-test", "p-value")}

styled_table = summary_df.style.hide(axis="index")
styled_table = styled_table.set_table_styles(table_styles)
styled_table = styled_table.format({"Obs": "{:.0f}", **three_decimals}, na_rep="-")

styled_table.to_html("Thesis_TTest_Summary_Full_Gender_Position.html")

NameError: name 'np' is not defined