# Development of Emotion and Reasoning in the General Speeches of the United Nations: A text-based machine learning approach
## Script 3: Tables
### Author: Sarah Franzen

### Instructions BEFORE running this script:
- Ensure you have run Scripts 0-2 completely beforehand, so that the required folder structure and data exist

### Description: 
#### This file creates the following figures and tables

Tables
- Summary Statistics of the given variables
- Summary Statistics Emotionality Scoring - per Decade
- Summary Statistics Emotionality Scoring - by Categorical Variables
- T-Test for supplementary data on Gender and Position of the Speaker


In [1]:
# == Import libraries for data processing and visualization ==
import joblib
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from scipy import stats
from tabulate import tabulate

# === Set Working Directory ===
# The project root can be supplied through the UN_SPEECH_WD environment
# variable, so the script runs on another machine without editing the source.
# When the variable is not set, the original hard-coded location is used.
wd = os.environ.get("UN_SPEECH_WD", r"C:\Users\sarah\OneDrive\Dokumente\Masterarbeit")

# === Define Folder Paths ===
wd_results = os.path.join(wd, 'results')
data_c = os.path.join(wd, 'data')
data_temp = os.path.join(data_c, 'temp')
data_freq = os.path.join(data_c, 'freq')
tables_dir = os.path.join(wd, 'tables')

# === Load data ===

# The exports in this script are written with bare file names, so the tables
# folder must become the current directory. Fail with an actionable message
# instead of the bare OS error when the folder structure is missing.
if not os.path.isdir(tables_dir):
    raise FileNotFoundError(
        f"Tables folder not found: {tables_dir}. "
        "Check the working directory and run Scripts 0-2 first."
    )
os.chdir(tables_dir)

un_corpus_scored = pd.read_csv(
    os.path.join(wd_results, "un_corpus_scored.csv"),
    sep=';',
    encoding='utf-8'
)

# Ensure no missings in the emotionality scoring: rows without a score are
# dropped, and the number of dropped rows is reported.
rows_before = len(un_corpus_scored)
un_corpus_scored = un_corpus_scored[un_corpus_scored['score'].notna()]
print(f"Rows dropped due to missing score: {rows_before - len(un_corpus_scored)}")

Rows dropped due to missing score: 0


## Summary Statistics of the given variables
Create Table and export as tex-file

In [72]:
# Create separate 0/1 indicator columns from the position variable so that
# every position gets its own row in the summary table.

position_nonmissing = un_corpus_scored['position'].notna()

position_dummies = (
    pd.get_dummies(un_corpus_scored.loc[position_nonmissing, 'position'])
    .astype(int)
    # Re-align with the full corpus: rows without a position are re-introduced
    # here and are missing in every indicator column.
    .reindex(un_corpus_scored.index)
    # Nullable integers keep those rows as <NA> instead of forcing floats.
    .astype("Int64")
)

# Attach the indicator columns to the corpus.
un_corpus_scored = pd.concat([un_corpus_scored, position_dummies], axis=1)

In [73]:
# Variables reported in the table: the base covariates first, followed by one
# indicator column per speaker position.
base_vars = ['year', 'speech_length_words', 'english_official_language',
             'security_council_permanent', 'gender_dummy']
all_numeric_vars = base_vars + list(position_dummies.columns)

summary_source = un_corpus_scored[all_numeric_vars]
summary_table = pd.DataFrame({
    "Variable": all_numeric_vars,
    "Obs": summary_source.count().astype(int),
    "Mean": summary_source.mean().round(3),
    "SD": summary_source.std().round(3),
    "Min": summary_source.min(),
    "Max": summary_source.max()
})

# Label-only row that introduces the block of position indicators.
position_header = pd.DataFrame({
    "Variable": ["Position"],
    "Obs": [""],
    "Mean": [""],
    "SD": [""],
    "Min": [""],
    "Max": [""]
})

# The header goes directly after the base covariates; deriving the index from
# the list keeps it correct if base variables are added or removed.
insert_idx = len(base_vars)
summary_table = pd.concat([summary_table.iloc[:insert_idx],
                           position_header,
                           summary_table.iloc[insert_idx:]]).reset_index(drop=True)

# Display labels; position categories are indented (HTML non-breaking spaces)
# so they read as sub-items of the "Position" header.
var_labels = {
    "year": "Year",
    "speech_length_words": "Number of Words",
    "english_official_language": "English as Official Language (Yes = 1)",
    "security_council_permanent": "Permanent Member Security Council (Yes = 1)",
    "gender_dummy": "Gender (Female = 1)",
    "(Deputy) Minister for Foreign Affairs": "&nbsp;&nbsp;&nbsp;&nbsp;(Deputy) Minister for Foreign Affairs",
    "(Deputy) Prime Minister": "&nbsp;&nbsp;&nbsp;&nbsp;(Deputy) Prime Minister",
    "(Vice-) President": "&nbsp;&nbsp;&nbsp;&nbsp;(Vice-) President",
    "Diplomatic Representative": "&nbsp;&nbsp;&nbsp;&nbsp;Diplomatic Representative",
    "Others": "&nbsp;&nbsp;&nbsp;&nbsp;Others"
}
summary_table['Variable'] = summary_table['Variable'].replace(var_labels)

# Turn the empty-string placeholders of the header row into missing values so
# that Min/Max can be stored as nullable integers.
numeric_cols = ['Mean','SD','Min','Max']
summary_table[numeric_cols] = summary_table[numeric_cols].replace("", pd.NA)

summary_table[['Min', 'Max']] = summary_table[['Min', 'Max']].astype('Int64')

styled_table = summary_table.style \
    .hide(axis="index") \
    .set_table_styles([
        {'selector': 'th', 'props': [
            ('border-bottom', '3px solid black'), 
            ('color', 'black'),
            ('font-weight', 'bold'),
            ('text-align', 'center'),
            ('background-color', 'white')
        ]},
        {'selector': 'th.col0', 'props': [('text-align', 'left')]},
        {'selector': 'td.col0', 'props': [('text-align', 'left')]},
        {'selector': 'td.col1', 'props': [('min-width', '80px')]},
        {'selector': 'td.col2', 'props': [('min-width', '80px')]},
        {'selector': 'td.col3', 'props': [('min-width', '80px')]},
        {'selector': 'td.col4', 'props': [('min-width', '80px')]},
        {'selector': 'td.col5', 'props': [('min-width', '80px')]}
    ]) \
    .set_properties(**{'text-align': 'center'}, subset=['Obs','Mean','SD','Min','Max']) \
    .format({"Mean": "{:.3f}", "SD": "{:.3f}"}, na_rep="")  # na_rep: keep the "Position" row blank

# --- EXPORT HTML ---
styled_table.to_html("Summary_Statistics_Table.html")

# --- EXPORT LaTeX ---
latex_ready = summary_table.copy()
# Swap the HTML indentation for its LaTeX counterpart. A plain string
# replacement is used (no `re`, which this script does not import), producing
# a single \hspace per indented label.
latex_ready["Variable"] = latex_ready["Variable"].apply(
    lambda x: x.replace("&nbsp;&nbsp;&nbsp;&nbsp;", "\\hspace*{1em}") if isinstance(x, str) else x
)

latex_table = latex_ready.to_latex(
    index=False,
    na_rep="",
    float_format="%.3f",
    column_format="lrrrrr",
    caption="Summary Statistics",
    label="tab:summary_stats",
    header=["Variable", "Obs", "Mean", "SD", "Min", "Max"],
    bold_rows=False,
    escape=False  # labels contain LaTeX commands that must not be escaped
)

# Replace the booktabs rules with plain \hline rules.
latex_table = latex_table.replace(
    "\\toprule",
    "\\hline\\hline"
).replace(
    "\\midrule",
    "\\hline"
).replace(
    "\\bottomrule",
    "\\hline\\hline"
)

# Save LaTeX file
with open("Summary_Variables.tex", "w", encoding="utf-8") as f:
    f.write(latex_table)

### Summary Statistics Emotionality Scoring - Subsamples

In [75]:
test_vars = ['gender_dummy', 'position']
var_labels = {
    'gender_dummy': 'Gender Sample',
    'position': 'Position Sample'
}

# Full-sample scores; their mean is the reference value of every t-test below.
overall = un_corpus_scored['score']
overall_mean = overall.mean()


def _score_row(label, scores, t_stat=np.nan, p_val=np.nan):
    """Build one table row with the descriptive statistics of `scores`.

    `t_stat` and `p_val` default to NaN for rows without a test.
    """
    return {
        'Variable': label,
        'Obs': scores.count(),
        'Mean': round(scores.mean(), 3),
        'SD': round(scores.std(), 3),
        'Min': round(scores.min(), 3),
        'Max': round(scores.max(), 3),
        't-test': round(t_stat, 3),
        'p-value': round(p_val, 3)
    }


# First row: the full sample (no test against itself).
summary_list = [_score_row('Full sample', overall)]

# One row per subsample, i.e. the speeches for which the variable is recorded.
for var in test_vars:
    scores = un_corpus_scored.loc[un_corpus_scored[var].notna(), 'score']

    # One-sample t-test of the subsample against the full-sample mean;
    # skipped when fewer than two observations are available.
    if scores.count() > 1:
        t_stat, p_val = stats.ttest_1samp(scores, overall_mean, nan_policy='omit')
    else:
        t_stat, p_val = np.nan, np.nan

    summary_list.append(_score_row(var_labels.get(var, var), scores, t_stat, p_val))

summary_df = pd.DataFrame(summary_list)

# ---- STYLING ----
header_props = [
    ('border-bottom', '3px solid black'),
    ('color', 'black'),
    ('font-weight', 'bold'),
    ('text-align', 'center'),
    ('background-color', 'white')
]
table_styles = [
    {'selector': 'th', 'props': header_props},
    {'selector': 'th.col0', 'props': [('text-align', 'left')]},
    {'selector': 'td.col0', 'props': [('text-align', 'left')]},
    {'selector': 'td', 'props': [('text-align', 'center')]},
]
# Whole numbers for the observation count, three decimals everywhere else.
cell_formats = {
    "Obs": "{:.0f}",
    **{col: "{:.3f}" for col in ["Mean", "SD", "Min", "Max", "t-test", "p-value"]}
}

styled_table = summary_df.style.hide(axis="index")
styled_table = styled_table.set_table_styles(table_styles)
styled_table = styled_table.set_properties(**{'text-align': 'center'})
styled_table = styled_table.format(cell_formats, na_rep="-")

# --- EXPORT HTML ---
styled_table.to_html("TTest_Scoring_Full_Gender_Position.html")

# --- EXPORT LaTeX ---
latex_ready = summary_df.copy()

latex_table = latex_ready.to_latex(
    index=False,
    na_rep="",
    float_format="%.3f",
    column_format="lrrrrrrr",
    caption="Emotionality Scoring of the Subsamples with T-Tests",
    label="tab:summary_stats_ttest",
    header=["Variable", "Obs", "Mean", "SD", "Min", "Max", "t-test", "p-value"],
    bold_rows=False,
    escape=False
)

# Replace the booktabs rules with plain \hline rules.
for booktabs_rule, plain_rule in (("\\toprule", "\\hline\\hline"),
                                  ("\\midrule", "\\hline"),
                                  ("\\bottomrule", "\\hline\\hline")):
    latex_table = latex_table.replace(booktabs_rule, plain_rule)

with open("TTest_Scoring_Full_Gender_Position.tex", "w", encoding="utf-8") as f:
    f.write(latex_table)


### Summary Statistics Emotionality Scoring - per Decade

In [77]:
# Assign every speech to a ten-year bin counted from 1946 (1946-1955,
# 1956-1965, ...); the label of the last bin is capped at 2024.
decades_since_1946 = np.floor((un_corpus_scored['year'] - 1946) / 10)
decade_start = (decades_since_1946 * 10 + 1946).astype(int)
decade_end = (decade_start + 9).clip(upper=2024)

un_corpus_scored['Decade'] = decade_start.astype(str) + "–" + decade_end.astype(str)

# Descriptive statistics of the score within each decade.
score_aggregations = {'Obs': 'count', 'Mean': 'mean', 'SD': 'std', 'Min': 'min', 'Max': 'max'}
decade_summary = (
    un_corpus_scored.groupby('Decade')['score']
    .agg(**score_aggregations)
    .reset_index()
)

numeric_cols = ['Mean', 'SD', 'Min', 'Max']
decade_summary[numeric_cols] = decade_summary[numeric_cols].round(3)

# ---- STYLING ----
decade_table_styles = [
    {'selector': 'th', 'props': [
        ('border-bottom', '3px solid black'),
        ('color', 'black'),
        ('font-weight', 'bold'),
        ('text-align', 'center'),
        ('background-color', 'white')
    ]},
    {'selector': 'th.col0', 'props': [('text-align', 'left')]},
    {'selector': 'td.col0', 'props': [('text-align', 'left')]},
]
# Same minimum width for each of the five statistic columns.
decade_table_styles += [
    {'selector': f'td.col{col_pos}', 'props': [('min-width', '80px')]}
    for col_pos in range(1, 6)
]

decade_formats = {'Obs': '{:.0f}'}
decade_formats.update({col: '{:.3f}' for col in numeric_cols})

styled_decade_table = decade_summary.style.hide(axis="index")
styled_decade_table = styled_decade_table.set_table_styles(decade_table_styles)
styled_decade_table = styled_decade_table.set_properties(
    **{'text-align': 'center'}, subset=['Obs', 'Mean', 'SD', 'Min', 'Max']
)
styled_decade_table = styled_decade_table.format(decade_formats, na_rep='-')

# --- EXPORT HTML ---
styled_decade_table.to_html("Scoring_per_Decade.html")

# --- EXPORT LaTeX ---
latex_table = decade_summary.to_latex(
    index=False,
    na_rep="",
    float_format="%.3f",
    column_format="lrrrrr",
    caption="Emotionality Scoring by Decade",
    label="tab:summary_decade",
    header=["Decade", "Obs", "Mean", "SD", "Min", "Max"],
    bold_rows=False,
    escape=False
)

# Replace the booktabs rules with plain \hline rules.
for booktabs_rule, plain_rule in (("\\toprule", "\\hline\\hline"),
                                  ("\\midrule", "\\hline"),
                                  ("\\bottomrule", "\\hline\\hline")):
    latex_table = latex_table.replace(booktabs_rule, plain_rule)

with open("Scoring_per_Decade.tex", "w", encoding="utf-8") as f:
    f.write(latex_table)

### Summary Statistics Emotionality Scoring for categorical variables

In [79]:
group_vars = ['english_official_language', 'security_council_permanent', 'gender_dummy'] + list(position_dummies.columns)

# Header labels of the binary variables.
var_labels = {
    'english_official_language': 'English as Official Language',
    'security_council_permanent': 'Permanent Member of the Security Council',
    'gender_dummy': 'Gender',
}

# Row labels for the two values of each binary variable. Variables without an
# entry here are treated as position indicators.
value_labels = {
    'english_official_language': {1: 'Yes (=1)', 0: 'No (=0)'},
    'security_council_permanent': {1: 'Yes (=1)', 0: 'No (=0)'},
    'gender_dummy': {1: 'Female (=1)', 0: 'Male (=0)'}
}


def _header_row(label):
    """Return a label-only row: empty observation count, missing statistics."""
    return {'Variable': label, 'Obs': "",
            'Mean': np.nan, 'SD': np.nan, 'Min': np.nan, 'Max': np.nan}


def _stats_row(label, scores):
    """Return an indented row with the descriptive statistics of `scores`."""
    return {
        'Variable': f"&nbsp;&nbsp;&nbsp;&nbsp;{label}",
        'Obs': scores.count(),
        'Mean': scores.mean(),
        'SD': scores.std(),
        'Min': scores.min(),
        'Max': scores.max()
    }


summary_list = []
position_header_inserted = False

for var in group_vars:
    if var in value_labels:
        # Binary variable: a header row followed by one row per observed value.
        summary_list.append(_header_row(var_labels[var]))
        for val in sorted(un_corpus_scored[var].dropna().unique()):
            subset = un_corpus_scored[un_corpus_scored[var] == val]
            summary_list.append(_stats_row(value_labels[var][val], subset['score']))
        continue

    # Position indicator: insert the shared "Position" header once, then one
    # row for the speeches where the indicator equals 1.
    if not position_header_inserted:
        summary_list.append(_header_row('Position'))
        position_header_inserted = True
    subset = un_corpus_scored[un_corpus_scored[var] == 1]
    summary_list.append(_stats_row(var_labels.get(var, var), subset['score']))


score_summary_table = pd.DataFrame(
    summary_list, columns=['Variable', 'Obs', 'Mean', 'SD', 'Min', 'Max']
)

# Header rows carry NaN (not "") in the statistic columns, so these columns
# are numeric and the rounding below actually takes effect.
numeric_cols = ['Mean', 'SD', 'Min', 'Max']
score_summary_table[numeric_cols] = score_summary_table[numeric_cols].round(3)

styled_score_table = (
    score_summary_table.style
    .hide(axis="index")
    .set_table_styles([
        {'selector': 'th', 'props': [
            ('border-bottom', '3px solid black'),
            ('color', 'black'),
            ('font-weight', 'bold'),
            ('text-align', 'center'),
            ('background-color', 'white')
        ]},
        {'selector': 'th.col0', 'props': [('text-align', 'left')]},
        {'selector': 'td.col0', 'props': [('text-align', 'left')]},
        {'selector': 'td.col1', 'props': [('min-width', '80px')]},
        {'selector': 'td.col2', 'props': [('min-width', '80px')]},
        {'selector': 'td.col3', 'props': [('min-width', '80px')]},
        {'selector': 'td.col4', 'props': [('min-width', '80px')]},
        {'selector': 'td.col5', 'props': [('min-width', '80px')]}
    ])
    .set_properties(**{'text-align': 'center'}, subset=['Obs','Mean','SD','Min','Max'])
    # na_rep keeps the statistic cells of the header rows blank.
    .format({col: "{:.3f}" for col in numeric_cols}, na_rep="")
)

# --- EXPORT HTML ---
styled_score_table.to_html("Scoring_categorial_variables.html")

# --- EXPORT LaTeX ---
latex_table = score_summary_table.copy()

# Swap the HTML indentation for its LaTeX counterpart.
latex_table['Variable'] = latex_table['Variable'].apply(
    lambda x: str(x).replace("&nbsp;&nbsp;&nbsp;&nbsp;", "\\hspace*{1em}") if isinstance(x, str) else x
)

latex_str = latex_table.to_latex(
    index=False,
    na_rep="",
    float_format="%.3f",
    column_format="lrrrrr",
    caption="Emotionality Scoring for the categorial variables",
    label="tab:conditional_vars",
    escape=False  # labels contain LaTeX commands that must not be escaped
)

# Replace the booktabs rules with plain \hline rules.
latex_str = latex_str.replace("\\toprule", "\\hline\\hline") \
                     .replace("\\midrule", "\\hline") \
                     .replace("\\bottomrule", "\\hline\\hline")

with open("Scoring_categorial_variable.tex", "w", encoding="utf-8") as f:
    f.write(latex_str)

Top 5 speeches by score:
                           country_name  year  \
9819                            Eritrea  2002   
10666                             Libya  2023   
3231                              Haiti  2003   
4631   Saint Vincent and the Grenadines  2004   
3384                              Haiti  1994   

                                                  speech     score  
9819   ﻿Let me start by\ncongratulating the President...  1.635238  
10666  I address the General Assembly on behalf of th...  1.624794  
3231   ﻿It is my\nhonour to greet the President on be...  1.624352  
4631   The fifty-ninth session of the General Assembl...  1.559744  
3384    It gives me great pleasure to greet all of yo...  1.517082  


Bottom 5 speeches by score:
      country_name  year                                             speech  \
3712      Slovakia  1995  Mr. President, it is\nundoubtedly a great hono...   
594          Spain  1993  I should like first of all, Mr. President, to\...   

Speech from Eritrea in 2002:

﻿Let me start by
congratulating the President on his unanimous election
to preside  over the fifty-seventh session of the United
Nations General Assembly. His election is an honour to
him and to his country, the Czech Republic. The
delegation of the State of Eritrea assures you of its full
cooperation and support.
Allow me also to extend to his predecessor, Mr.
Han Seung-soo of the Republic of Korea, my
delegation's deep appreciation for a job well done
during the fifty-sixth session of the General Assembly.
Profound appreciation is also due to our Secretary-
General, Mr. Kofi Annan, for his tireless efforts to
make our United Nations more responsive and
effective.
On this happy occasion, my delegation welcomes
the Swiss Confederation to the family of nations with
great pleasure. We commend the decision taken by the
Swiss people in allowing their great country to become
a full Member of the United Nations. The same words
go to the soon-to-be 191st Member o

Speech from Slovakia in 1995:

Mr. President, it is
undoubtedly a great honour to be the President of the
United Nations General Assembly session in this very
important year. Please accept my sincere congratulations on
your election and my very best wishes for a successful
course of its fiftieth session.
I should also like to thank your predecessor, His
Excellency Mr. Amara Essy, for his significant contribution
to the positive results achieved at last year’s session.
By the end of 1995, the Slovak Republic will have
concluded the third year of its sovereign existence.
Although my country is relatively new in the international
community, it has during the past three years confirmed
that its activities in the international scene are based on
respect for international law and for the principles of
democratic coexistence between nations.
The basic coordinate of Slovakia’s foreign policy is its
integration into the geopolitical and economic sphere with
which we not only have numerous histo

Top 5 speeches by score (Security Council permanent members):
        country_name  year                                             speech  \
5989   United States  2001  ﻿We meet in a Hall devoted to\npeace; in a cit...   
4228   United States  1998  Let me begin by thanking the\nAssembly for its...   
847    United States  2003  ﻿Twenty-four months ago, and\nyesterday in the...   
5650   United States  2017  I welcome everyone to New York. It is a profou...   
10940  United States  2020  It is my profound honor to address the United ...   

          score  
5989   1.404797  
4228   1.394045  
847    1.387432  
5650   1.356830  
10940  1.352940  


Bottom 5 speeches by score (Security Council permanent members):
       country_name  year                                             speech  \
8510         France  1994  France is especially pleased, Mr. President, t...   
2257         Russia  2007  Traditionally, the General Assembly sums up \n...   
8863  United States  1949  Mr. ACHES