# Development of Emotion and Reasoning in the General Speeches of the United Nations: A text-based machine learning approach
## Script 3: Tables
### Author: Sarah Franzen

### Instructions BEFORE running this script:
- Ensure you have run Scripts 0 to 2 completely beforehand, so that the proper folder structure exists and the required data is available

### Description: 
#### This file creates the following tables:

Descriptive Summary Statistics
- Summary Statistics
- Frequency of Categorical Variables

Emotionality Scoring
- Emotionality Scoring per Decade
- Emotionality Scoring - Subsamples
- Emotionality Scoring - Categorical Variables
- Emotionality Scoring - Position (1994-2024)

Other Tables
- T-Test Subsamples
- Years with more than 5 female speakers
- Number of (Unique) Tokens
- Speeches with the highest and lowest emotionality score
- Speeches of permanent members of the security council with the highest and lowest emotionality score

Appendix
- Yearly Emotionality Scores
- Years with a change in the emotionality score of more than 0.08
- Two speeches with the lowest and highest score (Fully printed)

In [1]:
# == Import libraries for data processing and visualization ==
import joblib
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from scipy import stats
from tabulate import tabulate
import re
import ast

# === Project root ===
# Machine-specific: point this at your own copy of the project folder.
wd = r"C:\Users\sarah\OneDrive\Dokumente\Masterarbeit"

# === Folder layout below the project root ===
data_c = os.path.join(wd, 'data')
data_results, data_temp, data_freq = (
    os.path.join(data_c, subfolder) for subfolder in ('results', 'temp', 'freq')
)
tables_dir = os.path.join(wd, 'tables')

# === Load data ===

# All tables below are written with relative file names, so they end up in tables_dir.
os.chdir(tables_dir)

scored_corpus_path = os.path.join(data_results, "un_corpus_scored.csv")
un_corpus_scored = pd.read_csv(scored_corpus_path, sep=';', encoding='utf-8')

Rows dropped due to missing score: 0


----

## Descriptive Summary Statistics

In [3]:
# Create separate dummies for the position variable to get a nice summary table.
# Speeches without a position get <NA> (not 0) in every dummy column, so counts
# and means of the dummies only refer to speeches whose position is known.

position_nonmissing = un_corpus_scored['position'].notna()

# Build the dummies on the non-missing rows only, store them as nullable integers,
# and re-align to the full index; rows added by the reindex are filled with <NA>.
position_dummies = (
    pd.get_dummies(un_corpus_scored.loc[position_nonmissing, 'position'])
    .astype("Int64")
    .reindex(un_corpus_scored.index)
)

# Make the cell safe to re-run: without this, a second execution would append the
# dummy columns again and un_corpus_scored[<position>] would return a DataFrame.
un_corpus_scored = un_corpus_scored.drop(columns=position_dummies.columns, errors='ignore')

un_corpus_scored = pd.concat([un_corpus_scored, position_dummies], axis=1)

### Table: Summary Statistics (All Variables) (Obs, Mean, SD, Min, Max)

In [273]:
# One indicator column per permanent member of the Security Council.
perm_members = ["RUS", "FRA", "GBR", "USA", "CHN"]
perm_dummies = [f"perm_{code}" for code in perm_members]
for code, column in zip(perm_members, perm_dummies):
    un_corpus_scored[column] = un_corpus_scored["country_code"].eq(code).astype(int)

# Row order of the table: the country indicators sit directly below the
# Security Council dummy, the position dummies come last.
all_numeric_vars = [
    'year', 'speech_length_words', 'english_official_language',
    'security_council_permanent', *perm_dummies,
    'gender_dummy', *position_dummies.columns,
]

numeric_data = un_corpus_scored[all_numeric_vars]
summary_table = pd.DataFrame({
    "Variable": all_numeric_vars,
    "N": numeric_data.count().astype(int),
    "Mean": numeric_data.mean().round(3),
    "SD": numeric_data.std().round(3),
    "Min": numeric_data.min(),
    "Max": numeric_data.max()
})


def _section_header(title):
    """Return a one-row frame holding only a section title; all statistic cells are empty strings."""
    cells = {"Variable": [title]}
    cells.update({column: [""] for column in ("N", "Mean", "SD", "Min", "Max")})
    return pd.DataFrame(cells)


position_header = _section_header("Position")
sc_header = _section_header("Permanent Members of the Security Council")

# Section titles go right below the Security Council row and right below the gender row.
variables = list(summary_table["Variable"])
sc_loc = variables.index("security_council_permanent") + 1
gender_loc = variables.index("gender_dummy") + 1

summary_table = pd.concat([
    summary_table.iloc[:sc_loc],
    sc_header,
    summary_table.iloc[sc_loc:gender_loc],
    position_header,
    summary_table.iloc[gender_loc:],
]).reset_index(drop=True)

var_labels = {
    "year": "Year",
    "speech_length_words": "Number of Words",
    "english_official_language": "English as Official Language (Yes = 1)",
    "security_council_permanent": "Permanent Membership of Security Council (Yes = 1)",
    "gender_dummy": "Gender (Female = 1)",
    "(Deputy) Minister for Foreign Affairs": "&nbsp;&nbsp;&nbsp;&nbsp;(Deputy) Minister for Foreign Affairs",
    "(Deputy) Prime Minister": "&nbsp;&nbsp;&nbsp;&nbsp;(Deputy) Prime Minister",
    "(Vice-) President": "&nbsp;&nbsp;&nbsp;&nbsp;(Vice-) President",
    "Diplomatic Representative": "&nbsp;&nbsp;&nbsp;&nbsp;Diplomatic Representative",
    "Others": "&nbsp;&nbsp;&nbsp;&nbsp;Others",

    "perm_RUS": "&nbsp;&nbsp;&nbsp;&nbsp;Russia",
    "perm_FRA": "&nbsp;&nbsp;&nbsp;&nbsp;France",
    "perm_GBR": "&nbsp;&nbsp;&nbsp;&nbsp;United Kingdom",
    "perm_USA": "&nbsp;&nbsp;&nbsp;&nbsp;United States",
    "perm_CHN": "&nbsp;&nbsp;&nbsp;&nbsp;China",

    "Permanent Members of the Security Council": "Permanent Members of the Security Council"
}

summary_table['Variable'] = summary_table['Variable'].replace(var_labels)

numeric_cols = ['Mean','SD','Min','Max']
summary_table[numeric_cols] = summary_table[numeric_cols].replace("", pd.NA)

summary_table[['Min', 'Max']] = summary_table[['Min', 'Max']].astype('Int64')

styled_table = summary_table.style \
    .hide(axis="index") \
    .set_table_styles([
        {'selector': 'th', 'props': [
            ('border-bottom', '3px solid black'), 
            ('color', 'black'),
            ('font-weight', 'bold'),
            ('text-align', 'center'),
            ('background-color', 'white')
        ]},
        {'selector': 'th.col0', 'props': [('text-align', 'left')]},
        {'selector': 'td.col0', 'props': [('text-align', 'left')]},
        {'selector': 'td.col1', 'props': [('min-width', '80px')]},
        {'selector': 'td.col2', 'props': [('min-width', '80px')]},
        {'selector': 'td.col3', 'props': [('min-width', '80px')]},
        {'selector': 'td.col4', 'props': [('min-width', '80px')]},
        {'selector': 'td.col5', 'props': [('min-width', '80px')]}
    ]) \
    .set_properties(**{'text-align': 'center'}, subset=['N','Mean','SD','Min','Max']) \
    .format({"Mean": "{:.3f}", "SD": "{:.3f}"})

note_text = "Note: Gender and position information comes from supplementary data and is not available for all speeches."

# --- EXPORT HTML ---
html_table = styled_table.to_html()
html_table_with_note = html_table + f'<p style="text-align:center; font-style:italic;">{note_text}</p>'
with open("Summary_Statistics_Table.html", "w", encoding="utf-8") as f:
    f.write(html_table_with_note)

# --- EXPORT LaTeX ---
latex_ready = summary_table.copy()
# Replace every run of HTML non-breaking spaces by ONE \hspace*{1em}.
# The non-capturing group repeats the whole entity; the former pattern "&nbsp;+"
# only repeated the ';', so each of the four entities produced its own \hspace
# (4em of indentation instead of the 1em used by the other tables).
latex_ready["Variable"] = latex_ready["Variable"].apply(
    lambda x: re.sub(r"(?:&nbsp;)+", r"\\hspace*{1em}", x) if isinstance(x, str) else x
)

latex_table = latex_ready.to_latex(
    index=False,
    na_rep="",
    float_format="%.3f",
    column_format="lrrrrr",
    caption="Summary Statistics",
    label="tab:summary_stats",
    header=["Variable", "Obs", "Mean", "SD", "Min", "Max"],
    bold_rows=False,
    escape=False  
)

# Use plain \hline rules instead of the booktabs rules emitted by to_latex.
latex_table = latex_table.replace(
    "\\toprule",
    "\\hline\\hline"
).replace(
    "\\midrule",
    "\\hline"
).replace(
    "\\bottomrule",
    "\\hline\\hline"
)

note_text = "Note: Gender and position information comes from supplementary data and is not available for all speeches."

# Append the note directly after the tabular environment.
latex_table_with_note = latex_table.replace(
    r"\end{tabular}",
    rf"\end{{tabular}}\\[1ex] {{\centering \textit{{{note_text}}} \\}}"
)


with open("Summary_Variables.tex", "w", encoding="utf-8") as f:
    f.write(latex_table_with_note)

styled_table

Variable,N,Mean,SD,Min,Max
Year,10952.0,1993.296,20.186,1946.0,2024.0
Number of Words,10952.0,2913.75,1502.019,423.0,22003.0
English as Official Language (Yes = 1),10952.0,0.239,0.426,0.0,1.0
Permanent Membership of Security Council (Yes = 1),10952.0,0.035,0.185,0.0,1.0
Permanent Members of the Security Council,,,,,
Russia,10952.0,0.007,0.084,0.0,1.0
France,10952.0,0.007,0.082,0.0,1.0
United Kingdom,10952.0,0.007,0.085,0.0,1.0
United States,10952.0,0.007,0.085,0.0,1.0
China,10952.0,0.007,0.084,0.0,1.0


### Table: Counts and Shares of Categorical Variables

In [275]:
# --- Ensure numeric dummies ---
#un_corpus_scored['gender_dummy'] = pd.to_numeric(un_corpus_scored['gender_dummy'], errors='coerce')
#un_corpus_scored['english_official_language'] = pd.to_numeric(un_corpus_scored['english_official_language'], errors='coerce')
#un_corpus_scored['security_council_permanent'] = pd.to_numeric(un_corpus_scored['security_council_permanent'], errors='coerce')

# Indicator columns for the permanent members of the Security Council
perm_members = ["RUS", "FRA", "GBR", "USA", "CHN"]
perm_labels = ["Russia", "France", "United Kingdom", "United States", "China"]
for code in perm_members:
    un_corpus_scored[f"perm_{code}"] = (un_corpus_scored["country_code"] == code).astype(int)


def _binary_rows(title, column, label_zero, label_one):
    """Build the header row plus the 0/1 rows for a dummy column.

    Returns the list of table rows and the number of non-missing observations,
    which is also the denominator of the shares.
    """
    values = un_corpus_scored[column]
    available = values.notna().sum()
    counts = values.value_counts(dropna=True)
    block = [[title, '', available, '']]
    for code, label in ((0, label_zero), (1, label_one)):
        n = counts.get(code, 0)
        block.append(['', label, n, f"{n/available:.1%}"])
    return block, available


rows = []

# Gender
gender_rows, gender_available = _binary_rows('Gender', 'gender_dummy', 'Male', 'Female')
rows += gender_rows

# Position: a speech counts as available when none of its position dummies is missing
positions = list(position_dummies.columns)
position_flags = un_corpus_scored[positions]
position_available = position_flags.notna().all(axis=1).sum()
position_counts = position_flags.sum()
rows.append(['Position', '', position_available, ''])
rows.extend(
    ['', pos, position_counts[pos], f"{position_counts[pos]/position_available:.1%}"]
    for pos in positions
)

# Permanent membership (yes / no)
sc_rows, sc_available = _binary_rows(
    'Permanent Membership of the Security Council', 'security_council_permanent', 'No', 'Yes'
)
rows += sc_rows

# Individual permanent members; shares are relative to all speeches with a membership value
sc_country_counts = [un_corpus_scored.get(f'perm_{c}', pd.Series(0)).sum() for c in perm_members]
total_p5_count = sum(sc_country_counts)
rows.append(['Permanent Members of the Security Council', '', total_p5_count, ''])
rows.extend(
    ['', label, cnt, f"{cnt/sc_available:.1%}"]
    for label, cnt in zip(perm_labels, sc_country_counts)
)

# English as official language
eng_rows, eng_available = _binary_rows(
    'English as Official Language', 'english_official_language', 'No', 'Yes'
)
rows += eng_rows

summary_hierarchical = pd.DataFrame(rows, columns=['Category', 'Subcategory', 'N', 'Share'])

# --- HTML ---
def top_border_html(row):
    """Return one CSS string per cell of *row*.

    Rows that open a new category (non-empty 'Category') get a top border on
    every cell; all other rows get no extra styling.
    """
    starts_category = row['Category'] != ''
    css = 'border-top:1px solid black;' if starts_category else ''
    return [css] * len(row)

styled_table = summary_hierarchical.style \
    .hide(axis="index") \
    .set_table_styles([
        {'selector':'th','props':[('border-bottom','3px solid black'),
                                  ('text-align','center'),
                                  ('font-weight','bold')]},
        {'selector':'td.col0','props':[('text-align','left')]},
        {'selector':'td.col1','props':[('text-align','left')]},
        {'selector':'td','props':[('text-align','center')]}
    ]) \
    .apply(top_border_html, axis=1)


html_file = "Frequencies_Cat_Var_.html"
styled_table.to_html(html_file)

# --- LaTeX ---
latex_ready = summary_hierarchical.copy()

latex_ready['Subcategory'] = latex_ready.apply(
    lambda x: '\\hspace*{1em}' + str(x['Subcategory']) if x['Category']=='' else x['Subcategory'], axis=1
)
latex_ready['Share'] = latex_ready['Share'].str.replace('%', r'\%', regex=False)

latex_table = latex_ready.to_latex(
    index=False,
    na_rep='',
    column_format='p{6cm}lrr',
    caption="Frequencies of Categorial Variables",
    label="tab:frequencies_cat_var_summary",
    escape=False
)

latex_table = latex_table.replace("\\toprule","\\hline\\hline") \
                         .replace("\\midrule","") \
                         .replace("\\bottomrule","\\hline\\hline")

# Draw a horizontal rule above every row that opens a new category.
lines = latex_table.splitlines()
category_rows = summary_hierarchical[summary_hierarchical['Category'] != ''].index.tolist()
for idx in category_rows[::-1]:
    category = summary_hierarchical.loc[idx, 'Category']
    for i, l in enumerate(lines):
        # Compare the first table cell literally. The category text is not a
        # regular expression (parentheses, '+', '.' would be misread), and a
        # prefix match could hit a different row whose label merely starts
        # with the same words.
        if "&" in l and l.split("&", 1)[0].strip() == category:
            lines.insert(i, '\\hline')  # insert above
            break
latex_table = "\n".join(lines)

note_text = "Note: Gender, position, and Security Council information comes from supplementary data and is not available for all speeches."
latex_table_with_note = latex_table.replace(
    "\\end{tabular}",
    f"\\end{{tabular}}\n\\\\[1ex] {{\\centering \\textit{{{note_text}}} \\\\}}"
)

latex_file = "Frequencies_Cat_Var_Table.tex"
with open(latex_file, "w", encoding="utf-8") as f:
    f.write(latex_table_with_note)

styled_table

Category,Subcategory,N,Share
Gender,,4704,
,Male,4521,96.1%
,Female,183,3.9%
Position,,6273,
,(Deputy) Minister for Foreign Affairs,2387,38.1%
,(Deputy) Prime Minister,1239,19.8%
,(Vice-) President,2060,32.8%
,Diplomatic Representative,339,5.4%
,Others,248,4.0%
Permanent Membership of the Security Council,,10952,


-----

## Tables Emotionality Scoring

### Table: Emotionality Scoring - per Decade

In [249]:
# Ten-year bins anchored at 1946; the last bin is cut off at 2024.
years_since_1946 = un_corpus_scored['year'] - 1946
decade_start = (years_since_1946 // 10 * 10 + 1946).astype(int)
decade_end = (decade_start + 9).clip(upper=2024)

un_corpus_scored['Decade'] = decade_start.astype(str).str.cat(decade_end.astype(str), sep="–")

# Descriptive statistics of the score within each bin
decade_summary = (
    un_corpus_scored
    .groupby('Decade')['score']
    .agg(**{'Obs': 'count', 'Mean': 'mean', 'SD': 'std', 'Min': 'min', 'Max': 'max'})
    .reset_index()
)

numeric_cols = ['Mean', 'SD', 'Min', 'Max']
decade_summary[numeric_cols] = decade_summary[numeric_cols].round(3)

# --- Styling for notebook display and HTML export ---
header_style = {'selector': 'th', 'props': [
    ('border-bottom', '3px solid black'),
    ('color', 'black'),
    ('font-weight', 'bold'),
    ('text-align', 'center'),
    ('background-color', 'white')
]}
left_aligned_first_column = [
    {'selector': f'{tag}.col0', 'props': [('text-align', 'left')]} for tag in ('th', 'td')
]
min_width_columns = [
    {'selector': f'td.col{i}', 'props': [('min-width', '80px')]} for i in range(1, 6)
]

display_formats = {'Obs': '{:.0f}'}
display_formats.update({col: '{:.3f}' for col in numeric_cols})

styled_decade_table = (
    decade_summary.style
    .hide(axis="index")
    .set_table_styles([header_style, *left_aligned_first_column, *min_width_columns])
    .set_properties(**{'text-align': 'center'}, subset=['Obs', 'Mean', 'SD', 'Min', 'Max'])
    .format(display_formats, na_rep='-')
)

# --- EXPORT HTML ---
styled_decade_table.to_html("Scoring_per_Decade.html")

# --- EXPORT LaTeX ---
latex_table = decade_summary.to_latex(
    index=False,
    na_rep="",
    float_format="%.3f",
    column_format="lrrrrr",
    escape=False
)

# Swap the booktabs rules for plain \hline rules
for booktabs_rule, plain_rule in (
    ("\\toprule", "\\hline\\hline"),
    ("\\midrule", "\\hline"),
    ("\\bottomrule", "\\hline\\hline"),
):
    latex_table = latex_table.replace(booktabs_rule, plain_rule)

latex_str = "".join([
    "\\begin{table}[htbp]\n",
    "\\centering\n",
    "\\caption{Emotionality Scoring by Decade}\n",
    "\\label{tab:summary_decade}\n",
    latex_table,
    "\n\\end{table}",
])

with open("Scoring_per_Decade.tex", "w", encoding="utf-8") as f:
    f.write(latex_str)

styled_decade_table

Decade,Obs,Mean,SD,Min,Max
1946–1955,421,0.847,0.177,0.5,1.416
1956–1965,833,0.85,0.187,0.44,1.358
1966–1975,1132,0.866,0.203,0.426,1.554
1976–1985,1436,0.839,0.185,0.435,1.527
1986–1995,1618,0.766,0.186,0.364,1.51
1996–2005,1840,0.76,0.209,0.31,1.66
2006–2015,1927,0.789,0.214,0.316,1.512
2016–2024,1745,0.829,0.197,0.345,1.515


### Table: Emotionality Scoring - Subsamples

In [104]:
entire = un_corpus_scored["score"]

gender_sample = un_corpus_scored.loc[
    un_corpus_scored["gender_dummy"].notna(),
    "score"
]

position_sample = un_corpus_scored.loc[
    un_corpus_scored["position"].notna(),
    "score"
]

p5_sample = un_corpus_scored.loc[
    un_corpus_scored["security_council_permanent"] == 1,
    "score"
]

def summarize(series, name):
    """Return a one-row DataFrame describing *series*.

    Columns: Sample (the given *name*), N (non-missing count) and Mean, SD,
    Min, Max, each rounded to three decimals.
    """
    cells = {"Sample": name, "N": series.count()}
    for label, value in (
        ("Mean", series.mean()),
        ("SD", series.std()),
        ("Min", series.min()),
        ("Max", series.max()),
    ):
        cells[label] = round(value, 3)
    return pd.DataFrame({column: [value] for column, value in cells.items()})

summary = pd.concat([
    summarize(entire, "Entire Sample"),
    summarize(gender_sample, "Gender Sample"),
    summarize(position_sample, "Position Sample"),
    summarize(p5_sample, "Permanent Members of the Security Council Sample")
], ignore_index=True)

summary = summary[["Sample", "N", "Mean", "SD", "Min", "Max"]]


# --- EXPORT HTML ---

html_file = "Scoring_Subsamples.html"

styled = summary.style \
    .hide(axis="index") \
    .set_table_styles([
        {'selector': 'th', 'props': [
            ('border-bottom', '3px solid black'),
            ('font-weight', 'bold'),
            ('text-align', 'center')
        ]},
        {'selector': 'td.col0', 'props': [('text-align', 'left')]},  # left-align Sample column
        {'selector': 'td', 'props': [('text-align', 'center')]}      # center numeric columns
    ]) \
    .format({
        "Mean": "{:.3f}",
        "SD": "{:.3f}",
        "Min": "{:.3f}",
        "Max": "{:.3f}"
    })

styled.to_html(html_file)

# --- EXPORT LaTeX -

latex_table = summary.to_latex(
    index=False,
    column_format="lrrrrr",
    caption="Scoring Subsamples",
    label="tab:scoring_subsamples",
    float_format="%.3f",
    escape=False
)

latex_table = (
    latex_table.replace("\\toprule","\\hline\\hline")
               .replace("\\midrule","\\hline")
               .replace("\\bottomrule","\\hline\\hline")
)

latex_file = "Scoring_Subsamples.tex"
with open(latex_file, "w", encoding="utf-8") as f:
    f.write(latex_table)


summary

Unnamed: 0,Sample,N,Mean,SD,Min,Max
0,Entire Sample,10952,0.808,0.202,0.31,1.66
1,Gender Sample,4704,0.82,0.196,0.345,1.554
2,Position Sample,6273,0.793,0.208,0.31,1.66
3,Permanent Members of the Security Council Sample,388,0.838,0.211,0.398,1.484


### Table: Emotionality Scoring - Categorical Variables

In [269]:
perm_members = ["RUS", "FRA", "GBR", "USA", "CHN"]
perm_labels = ["Russia", "France", "United Kingdom", "United States", "China"]

for c in perm_members:
    un_corpus_scored[f"perm_{c}"] = (un_corpus_scored["country_code"] == c).astype(int)

# HTML indentation of sub-rows; replaced by \hspace in the LaTeX export further down.
_INDENT = "&nbsp;&nbsp;&nbsp;&nbsp;"


def _header_row(title):
    """One-row frame that carries only a section title; statistic cells are empty strings."""
    return pd.DataFrame({
        'Variable': [title],
        'N': [''], 'Mean': [''], 'SD': [''], 'Min': [''], 'Max': ['']
    })


def _score_row(label, subset):
    """One-row frame with N, mean, SD, min and max of the 'score' column of *subset*."""
    scores = subset['score']
    return pd.DataFrame({
        'Variable': [label],
        'N': [scores.count()],
        'Mean': [scores.mean()],
        'SD': [scores.std()],
        'Min': [scores.min()],
        'Max': [scores.max()]
    })


def _binary_group(title, column, label_zero, label_one):
    """Section title plus one indented row per observed value of a 0/1 column."""
    group = [_header_row(title)]
    for val in sorted(un_corpus_scored[column].dropna().unique()):
        label = label_one if val == 1 else label_zero
        group.append(_score_row(f"{_INDENT}{label}", un_corpus_scored[un_corpus_scored[column] == val]))
    return group


summary_list = [_score_row('Overall', un_corpus_scored)]

summary_list += _binary_group(
    'English as Official Language', 'english_official_language', 'No (=0)', 'Yes (=1)'
)
summary_list += _binary_group(
    'Permanent Membership of the Security Council', 'security_council_permanent', 'No (=0)', 'Yes (=1)'
)

summary_list.append(_header_row('Permanent Members of the Security Council'))
for label, c in zip(perm_labels, perm_members):
    summary_list.append(_score_row(f"{_INDENT}{label}", un_corpus_scored[un_corpus_scored[f'perm_{c}'] == 1]))

summary_list += _binary_group('Gender', 'gender_dummy', 'Male (=0)', 'Female (=1)')

summary_list.append(_header_row('Position'))
for pos in position_dummies.columns:
    summary_list.append(_score_row(f"{_INDENT}{pos}", un_corpus_scored[un_corpus_scored[pos] == 1]))

score_summary_table = pd.concat(summary_list, ignore_index=True)

numeric_cols = ['Mean', 'SD', 'Min', 'Max']
score_summary_table[numeric_cols] = score_summary_table[numeric_cols].round(3)
score_summary_table[numeric_cols] = score_summary_table[numeric_cols].replace("", pd.NA)

styled_score_table = (
    score_summary_table.style
    .hide(axis="index")
    .set_table_styles([
        {'selector': 'th', 'props': [
            ('border-bottom', '3px solid black'),
            ('color', 'black'),
            ('font-weight', 'bold'),
            ('text-align', 'center'),
            ('background-color', 'white')
        ]},
        {'selector': 'th.col0', 'props': [('text-align', 'left')]},
        {'selector': 'td.col0', 'props': [('text-align', 'left')]},
        {'selector': 'td', 'props': [('text-align', 'center')]}
    ])
    .set_properties(**{'text-align': 'center'}, subset=['N','Mean','SD','Min','Max'])
    .format({col: "{:.3f}" for col in numeric_cols})
)

# --- EXPORT HTML ---
styled_score_table.to_html("Scoring_categorial_variable.html")

# --- EXPORT LaTeX ---
latex_table = score_summary_table.copy()
latex_table['Variable'] = latex_table['Variable'].apply(
    lambda x: str(x).replace("&nbsp;&nbsp;&nbsp;&nbsp;", "\\hspace*{1em}") if isinstance(x, str) else x
)

latex_str = latex_table.to_latex(
    index=False,
    na_rep="",
    float_format="%.3f",
    column_format="p{7cm}rrrrr",
    caption="Emotionality Scoring for the categorial variables",
    label="tab:conditional_vars",
    escape=False
)

latex_str = latex_str.replace("\\toprule", "\\hline\\hline") \
                     .replace("\\midrule", "\\hline") \
                     .replace("\\bottomrule", "\\hline\\hline")

with open("Scoring_categorial_variable.tex", "w", encoding="utf-8") as f:
    f.write(latex_str)

styled_score_table

Variable,N,Mean,SD,Min,Max
Overall,10952.0,0.808,0.202,0.31,1.66
English as Official Language,,,,,
No (=0),8339.0,0.805,0.206,0.31,1.66
Yes (=1),2613.0,0.82,0.189,0.391,1.635
Permanent Membership of the Security Council,,,,,
No (=0),10564.0,0.807,0.202,0.31,1.66
Yes (=1),388.0,0.838,0.211,0.398,1.484
Permanent Members of the Security Council,,,,,
Russia,78.0,0.733,0.142,0.432,1.022
France,74.0,0.817,0.176,0.496,1.35


### Table: Emotionality Scoring - Position (From 1994)

In [243]:
# Restrict to 1994 and later (the period named in the table title).
un_corpus_scored_since_1994 = un_corpus_scored[un_corpus_scored['year'] >= 1994]

position_vars = list(position_dummies.columns)

summary_list = []

# First row: all speeches of the restricted period
overall_subset = un_corpus_scored_since_1994['score']
overall_row = pd.DataFrame({
    'Variable': ['Overall Sample since 1994'],
    'N': [overall_subset.count()],
    'Mean': [overall_subset.mean()],
    'SD': [overall_subset.std()],
    'Min': [overall_subset.min()],
    'Max': [overall_subset.max()]
})
summary_list.append(overall_row)

# Section title row; statistic cells stay empty
position_header = pd.DataFrame({
    'Variable': ['Position since 1994'],
    'N': [""],
    'Mean': [""],
    'SD': [""],
    'Min': [""],
    'Max': [""]
})
summary_list.append(position_header)

for var in position_vars:
    subset = un_corpus_scored_since_1994[un_corpus_scored_since_1994[var] == 1]
    summary_list.append(pd.DataFrame({
        # Indent the raw position name exactly once. Looking the name up in
        # var_labels returned a label that already carries the indentation,
        # which double-indented these rows and tied this cell to the
        # summary-statistics cell having been run first.
        'Variable': [f"&nbsp;&nbsp;&nbsp;&nbsp;{var}"],
        'N': [subset['score'].count()],
        'Mean': [subset['score'].mean()],
        'SD': [subset['score'].std()],
        'Min': [subset['score'].min()],
        'Max': [subset['score'].max()]
    }))

score_summary_table = pd.concat(summary_list, ignore_index=True)

numeric_cols = ['Mean', 'SD', 'Min', 'Max']
score_summary_table[numeric_cols] = score_summary_table[numeric_cols].round(3)
score_summary_table[numeric_cols] = score_summary_table[numeric_cols].replace("", pd.NA)

styled_score_table = (
    score_summary_table.style
    .hide(axis="index")
    .set_table_styles([
         {'selector': 'table', 'props': [('width', '100%')]}, 
        {'selector': 'th', 'props': [
            ('border-bottom', '3px solid black'),
            ('color', 'black'),
            ('font-weight', 'bold'),
            ('text-align', 'center'),
            ('background-color', 'white')
        ]},
        {'selector': 'th.col0', 'props': [('text-align', 'left')]},
        {'selector': 'td.col0', 'props': [('text-align', 'left')]},
        {'selector': 'td', 'props': [('text-align', 'center')]}
    ])
    .set_properties(**{'text-align': 'center'}, subset=['N','Mean','SD','Min','Max'])
    .format({col: "{:.3f}" for col in numeric_cols})
)


# --- EXPORT HTML ---
styled_score_table.to_html("Scoring_positions_from1994.html")

# --- EXPORT LaTeX ---
latex_table = score_summary_table.copy()
latex_table['Variable'] = latex_table['Variable'].apply(
    lambda x: str(x).replace("&nbsp;&nbsp;&nbsp;&nbsp;", "\\hspace*{1em}") if isinstance(x, str) else x
)

# Prepare tabular only
tabular_only = latex_table.to_latex(
    index=False,
    na_rep="",
    float_format="%.3f",
    column_format="lrrrrr",
    escape=False
)

# Replace default rules with hlines
tabular_only = tabular_only.replace("\\toprule", "\\hline\\hline") \
                           .replace("\\midrule", "\\hline") \
                           .replace("\\bottomrule", "\\hline\\hline")

# Wrap in standard table environment without resizing
latex_str_centered = (
    "\\begin{table}[ht]\n"
    "\\centering\n"
    "\\caption{Emotionality Scoring for positions from 1994 till 2024}\n"
    "\\label{tab:positions_1994}\n"
    + tabular_only +
    "\n\\end{table}"
)

with open("Scoring_positions_from1994.tex", "w", encoding="utf-8") as f:
    f.write(latex_str_centered)



styled_score_table

Variable,N,Mean,SD,Min,Max
Overall Sample since 1994,5862.0,0.788,0.209,0.31,1.66
Position since 1994,,,,,
(Deputy) Minister for Foreign Affairs,2371.0,0.751,0.193,0.31,1.66
(Deputy) Prime Minister,1108.0,0.794,0.214,0.316,1.498
(Vice-) President,1834.0,0.837,0.211,0.356,1.514
Diplomatic Representative,339.0,0.742,0.208,0.354,1.635
Others,191.0,0.843,0.226,0.472,1.515


## Other Tables 

### Table: T-Test Subsamples

In [178]:
# For each supplementary variable, compare the emotionality score of speeches
# where the variable is recorded against speeches where it is missing.
test_vars = ['gender_dummy', 'position']
table_labels = {'gender_dummy': 'Gender', 'position': 'Position'}

scores = un_corpus_scored['score']
summary_list = []

for var in test_vars:
    is_recorded = un_corpus_scored[var].notna()

    group_non_missing = scores[is_recorded]
    group_missing = scores[~is_recorded]

    mean_non_missing = round(group_non_missing.mean(), 3)
    mean_missing = round(group_missing.mean(), 3)

    # Two-sample t-test; speeches without a score are left out of the test
    t_stat, p_val = stats.ttest_ind(group_non_missing, group_missing, nan_policy='omit')
    t_stat = round(t_stat, 3)
    p_val = round(p_val, 3)  

    summary_list.append({
        'Variable': table_labels[var],
        # count() ignores missing scores, matching what the t-test actually used
        # (len() would also count speeches without a score).
        'N (Non-Missing)': int(group_non_missing.count()),
        'N (Missing)': int(group_missing.count()),
        'Mean (Non-Missing)': mean_non_missing,
        'Mean (Missing)': mean_missing,
        't-test': t_stat,
        'p-value': p_val
    })

summary_df = pd.DataFrame(summary_list)

styled_table = (
    summary_df.style
    .hide(axis="index")
    .set_table_styles([
        {'selector': 'th', 'props': [
            ('border-bottom', '3px solid black'), 
            ('color', 'black'),
            ('font-weight', 'bold'),
            ('text-align', 'center'),
            ('background-color', 'white')
        ]},
        {'selector': 'th.col0', 'props': [('text-align', 'left')]},
        {'selector': 'td.col0', 'props': [('text-align', 'left')]},
        {'selector': 'td', 'props': [('text-align', 'center')]},
    ])
    .set_properties(**{'text-align': 'center'})
    # Keys must equal the column names of summary_df; the former "Obs (...)"
    # keys matched no column and were therefore never applied.
    .format({
        "N (Non-Missing)": "{:.0f}",
        "N (Missing)": "{:.0f}",
        "Mean (Non-Missing)": "{:.3f}",
        "Mean (Missing)": "{:.3f}",
        "t-test": "{:.3f}",
        "p-value": "{:.3f}"
    }, na_rep="-")
)

# --- Export HTML ---
styled_table.to_html("TTest_Scoring_Gender_Position.html")

# --- Export LaTeX ---
latex_ready = summary_df.copy()

latex_table = latex_ready.to_latex(
    index=False,
    na_rep="",
    float_format="%.3f",
    column_format="lrrrrrr",
    caption="T-Test of Subsample for the Emotionality Scoring",
    label="tab:summary_stats_ttest",
    header=["Variable", "Obs (Non-Missing)", "Obs (Missing)", 
            "Mean (Non-Missing)", "Mean (Missing)", "t-test", "p-value"],
    bold_rows=False,
    escape=False
)

latex_table = (
    latex_table.replace("\\toprule", "\\hline\\hline")
               .replace("\\midrule", "\\hline")
               .replace("\\bottomrule", "\\hline\\hline")
)

with open("TTest_Scoring_Gender_Position.tex", "w", encoding="utf-8") as f:
    f.write(latex_table)

styled_table

Variable,N (Non-Missing),N (Missing),Mean (Non-Missing),Mean (Missing),t-test,p-value
Gender,4704,6248,0.82,0.8,5.136,0.0
Position,6273,4679,0.793,0.829,-9.42,0.0


### Table: Years with more than 5 female speakers

In [260]:
# Minimum number of female speakers (exclusive) for a year to be listed
female_threshold = 5

# Speeches per year and gender (0 = male, 1 = female). reindex guarantees that
# both columns exist even when one gender never occurs; the former
# counts.get(1, 0) fell back to the scalar 0, which cannot be used as a row
# mask and has no .values attribute.
counts = (
    un_corpus_scored.groupby('year')['gender_dummy']
    .value_counts()
    .unstack(fill_value=0)
    .reindex(columns=[0, 1], fill_value=0)
)

filtered_counts = counts[counts[1] > female_threshold].copy()

filtered_years = pd.DataFrame({
    'Year': filtered_counts.index,
    'Male': filtered_counts[0].values,
    'Female': filtered_counts[1].values
})

# --- HTML ---
html_table = filtered_years.to_html(index=False)
with open("Female_Speeches_Years.html", "w", encoding="utf-8") as f:
    f.write(html_table)

# --- LaTeX ---
tabular_only = filtered_years.to_latex(
    index=False,
    column_format="lrr",
    na_rep="0",
    escape=False
)

tabular_only = tabular_only.replace("\\toprule", "\\hline\\hline") \
                           .replace("\\midrule", "\\hline") \
                           .replace("\\bottomrule", "\\hline\\hline")

# The caption is derived from female_threshold so the two cannot drift apart.
latex_str_resized = (
    "\\begin{table}[htbp]\n"
    "\\centering\n"
    f"\\caption{{Years with More Than {female_threshold} Female Speakers}}\n"
    "\\label{tab:female_years}\n"
    + tabular_only +
    "\n\\end{table}"
)

with open("Female_Speeches_Years.tex", "w", encoding="utf-8") as f:
    f.write(latex_str_resized)

filtered_years

Unnamed: 0,Year,Male,Female
0,1993,152,8
1,1994,153,8
2,1995,157,9
3,2006,124,15
4,2015,134,17
5,2016,165,18
6,2017,163,19
7,2018,165,18
8,2019,166,15
9,2020,165,8


### Table: Number of (Unique) Tokens

In [223]:
# --- Tokenize 'speech' column --- (Temporarily)
def _split_on_whitespace(text):
    """Split a raw speech on whitespace; anything that is not a string yields no tokens."""
    if isinstance(text, str):
        return text.split()
    return []


un_corpus_scored["speech_tokenized"] = un_corpus_scored["speech"].apply(_split_on_whitespace)

print(un_corpus_scored["speech_tokenized"].head())

0    [At, the, resumption, of, the, first, session,...
1    [The, General, Assembly, of, the, United, Nati...
2    [The, principal, organs, of, the, United, Nati...
3    [As, more, than, a, year, has, elapsed, since,...
4    [Coming, to, this, platform, where, so, many, ...
Name: speech_tokenized, dtype: object


In [None]:
def to_list(val):
    """Coerce a cell value into a list of tokens.

    Parameters
    ----------
    val : object
        A list (returned unchanged), a string holding a Python literal such
        as "['a', 'b']" (parsed with ``ast.literal_eval``), or anything else.

    Returns
    -------
    list
        The token list. An empty list is returned when ``val`` is neither a
        list nor a string, when the string cannot be parsed as a literal, or
        when it parses to something that is not a list, tuple or set.
    """
    if isinstance(val, list):
        return val
    if not isinstance(val, str):
        return []

    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the errors literal_eval is documented to raise on malformed
        # input are treated as "no tokens"; anything else (for example
        # KeyboardInterrupt) must propagate.
        return []

    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, (tuple, set)):
        return list(parsed)
    # A scalar such as "123" is a valid literal but not a token collection;
    # returning it would break callers that extend a list with the result.
    return []

# Columns holding the speech at each cleaning stage, in processing order.
columns = ["speech_tokenized", "speech_preprocessed", "speech_final"]
summary = []

for col in columns:
    # Keep a running count and a vocabulary set instead of collecting every
    # token of the corpus in one list first; the resulting numbers are the
    # same but far less memory is needed.
    total_tokens = 0
    vocabulary = set()
    for val in un_corpus_scored[col].dropna():
        tokens = to_list(val)  # convert string to list if needed
        total_tokens += len(tokens)
        vocabulary.update(tokens)
    unique_tokens = len(vocabulary)
    summary.append([col, total_tokens, unique_tokens])

# Human-readable row labels, in the same order as `columns`.
table_names = [
    "Raw Speech",
    "Preprocessed Speech",
    "Final Speech (Frequency > 10)"
]

summary_df = pd.DataFrame(summary, columns=["Column", "Total Tokens", "Total Unique Tokens"])
summary_df.insert(0, "Speech Type", table_names)
summary_df = summary_df.drop(columns=["Column"])

# --- HTML ---
html_table = summary_df.to_html(index=False)
with open("Token_Summary.html", "w", encoding="utf-8") as f:
    f.write(html_table)

# --- LaTeX ---
# Only the last label differs from the HTML version: the "(Frequency > 10)"
# suffix is dropped (presumably because the table is written with
# escape=False and a raw ">" is not safe in LaTeX text - TODO confirm).
# The shortened label is also what the cell displays at the end.
summary_df.iloc[2, 0] = "Final Speech"

tabular_only = summary_df.to_latex(
    index=False,
    column_format="lrr",
    na_rep="0",
    escape=False
)

# Replace the booktabs rules emitted by pandas with plain \hline rules.
tabular_only = tabular_only.replace("\\toprule", "\\hline\\hline") \
                           .replace("\\midrule", "\\hline") \
                           .replace("\\bottomrule", "\\hline\\hline")

latex_str_resized = (
    "\\begin{table}[htbp]\n"
    "\\centering\n"
    "\\caption{Token Counts by Cleaning Steps}\n"
    "\\label{tab:token_summary}\n"
    + tabular_only +
    "\n\\end{table}"
)

with open("Token_Summary.tex", "w", encoding="utf-8") as f:
    f.write(latex_str_resized)

summary_df

### Speeches with the highest and lowest score

In [265]:
# TABLE

In [None]:
# Columns shown for every listed speech.
ranking_columns = ['country_name', 'year', 'score']

# The five speeches with the highest score
top5 = un_corpus_scored.nlargest(5, 'score')[ranking_columns]
print("Top 5 speeches by score:", top5, sep="\n")

# Visual separator between the two listings
print(f"\n{'=' * 50}\n")

# The five speeches with the lowest score
bottom5 = un_corpus_scored.nsmallest(5, 'score')[ranking_columns]
print("Bottom 5 speeches by score:", bottom5, sep="\n")

#### Speeches of permanent Security Council members with the highest and lowest scores

# TABLE

In [None]:
# Restrict the corpus to speeches flagged as permanent Security Council members
is_permanent = un_corpus_scored['security_council_permanent'].eq(1)
permanent_members = un_corpus_scored[is_permanent]

# Columns shown for every listed speech (including the full speech text)
member_columns = ['country_name', 'year', 'speech', 'score']

# Five highest-scoring speeches
top5 = permanent_members.nlargest(5, 'score')[member_columns]
print("Top 5 speeches by score (Security Council permanent members):", top5, sep="\n")

# Visual separator between the two listings
print(f"\n{'=' * 50}\n")

# Five lowest-scoring speeches
bottom5 = permanent_members.nsmallest(5, 'score')[member_columns]
print("Bottom 5 speeches by score (Security Council permanent members):", bottom5, sep="\n")

# TABLE

In [None]:
# NOTE(review): this cell performs the same computation as the preceding
# permanent-members cell; it looks like a placeholder for a formatted table.

# Speeches held by permanent members of the Security Council
permanent_members = un_corpus_scored.loc[un_corpus_scored['security_council_permanent'] == 1]

# Rank once in each direction; both listings show the same columns.
listed_columns = ['country_name', 'year', 'speech', 'score']
top5 = permanent_members.nlargest(5, 'score')[listed_columns]
bottom5 = permanent_members.nsmallest(5, 'score')[listed_columns]

separator = "\n" + "=" * 50 + "\n"

print("Top 5 speeches by score (Security Council permanent members):")
print(top5)
print(separator)
print("Bottom 5 speeches by score (Security Council permanent members):")
print(bottom5)

## Appendix

#### Yearly Emotionality Score

In [None]:
# Mean score and number of speeches per year.
scores_by_year = un_corpus_scored.groupby('year')['score']
score_table = scores_by_year.agg(['mean', 'count']).reset_index()
score_table = score_table.rename(columns={'mean': 'avg_score', 'count': 'n'})

# Report the yearly average with three decimals.
score_table = score_table.assign(avg_score=lambda tbl: tbl['avg_score'].round(3))

# Show every year instead of pandas' truncated default view.
with pd.option_context('display.max_rows', None):
    display(score_table)

# Rows holding the largest and the smallest (rounded) yearly average.
row_of_max = score_table['avg_score'].idxmax()
row_of_min = score_table['avg_score'].idxmin()
highest_year = score_table.loc[row_of_max]
lowest_year = score_table.loc[row_of_min]

print(f"Years with the highest average score: {highest_year['avg_score']} in {int(highest_year['year'])}")
print(f"Years with the lowest average score: {lowest_year['avg_score']} in {int(lowest_year['year'])}")

#### Years with a change of at least 0.08 in the emotionality score

In [None]:
# Size of the change (in score points) a year must reach to be listed.
change_threshold = 0.08

# Compute the difference to the previous row of `score_table`.
# `avg_score` holds values rounded to three decimals, but subtracting two such
# floats can leave binary noise, so a displayed change of exactly 0.080 could
# land on either side of the threshold. Rounding the difference to the same
# three decimals makes the comparison match the displayed numbers.
# NOTE(review): diff() compares consecutive rows; this is a year-over-year
# change only if no year is missing from the table - confirm.
score_table['diff'] = score_table['avg_score'].diff().round(3)

# Find years where increase >= threshold
increased_years = score_table[score_table['diff'] >= change_threshold]

# Find years where decrease <= -threshold
decreased_years = score_table[score_table['diff'] <= -change_threshold]

# Display increases
print("Years with an increase of >= 0.08:")
with pd.option_context('display.max_rows', None):
    display(increased_years[['year', 'avg_score', 'diff']])

# Display decreases
print("Years with a decrease of >= 0.08:")
with pd.option_context('display.max_rows', None):
    display(decreased_years[['year', 'avg_score', 'diff']])


#### Speeches with the highest and lowest score

In [None]:
# Closer look at the two most emotional speeches
def print_speech(country, year):
    """Print the full text of the first speech given by `country` in `year`.

    Looks the speech up in `un_corpus_scored` by exact match on
    `country_name` and `year`; prints a notice if no row matches.
    """
    same_country = un_corpus_scored['country_name'] == country
    same_year = un_corpus_scored['year'] == year
    matches = un_corpus_scored[same_country & same_year]

    # Guard clause: nothing to print for this country/year pair.
    if matches.empty:
        print(f"No speech found for {country} in {year}.\n")
        return

    first_match = matches.iloc[0]
    print(f"Speech from {country} in {year}:\n")
    print(first_match['speech'])
    print("\n" + "="*50 + "\n")


for speech_country, speech_year in [('Cameroon', 2001), ('Democratic Republic of Congo', 1999)]:
    print_speech(speech_country, speech_year)

# Topic Cameroon:
# - terrorist attacks, Taliban, Afghanistan
# - condolences to 9/11 victims
# - unite forces to fight terrorism
# - Nobel Peace Prize
# - mentions other conflicts/wars: Angola, Palestine, Dem. Rep. Congo
# - "demons", "profound", "sadness", "dismay", "terrible", "mourning", "urge", "brutal"


# Topic Dem. Rep. of Congo:
# - congratulates the newly elected President of the General Assembly
# - quotes the UN Charter and criticizes double standards
# - criticizes members that violate the UN Charter
# - Blitzkrieg invasion by Burundi, Rwanda and Uganda into their country
# - criticizes exploitation of diamonds, cobalt, copper, and gold
# - urges peaceful resolution and national reconstruction
# - "happy", "warmest", "love", "provocation", "violation", "torture", "attacked"

In [None]:
# Closer look at the two most rational speeches
def print_speech(country, year):
    """Print the full text of the first speech given by `country` in `year`.

    Looks the speech up in `un_corpus_scored` by exact match on
    `country_name` and `year`; prints a notice if no row matches.
    """
    same_country = un_corpus_scored['country_name'] == country
    same_year = un_corpus_scored['year'] == year
    matches = un_corpus_scored[same_country & same_year]

    # Guard clause: nothing to print for this country/year pair.
    if matches.empty:
        print(f"No speech found for {country} in {year}.\n")
        return

    first_match = matches.iloc[0]
    print(f"Speech from {country} in {year}:\n")
    print(first_match['speech'])
    print("\n" + "="*50 + "\n")


for speech_country, speech_year in [('Moldova', 2000), ('Turkmenistan', 2013)]:
    print_speech(speech_country, speech_year)

# Topic Moldova:

# Topic Turkmenistan: