In [None]:
# !pip install torch==2.7.1+cpu torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
# !pip install pandas==2.2.3 scikit-learn==1.6.1 evaluate==0.4.3 datasets==3.0.1 matplotlib
# !pip install transformers==4.54.1 accelerate==1.10.1
#!pip install pandas numpy matplotlib seaborn matplotlib plotly plotly_express scikit-learn plotly imbalanced-learn

: 

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoModel
import csv
import os
import re
import numpy as np
import torch.nn.functional as F
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA 
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from matplotlib.ticker import MaxNLocator
from scipy.stats import shapiro,  mannwhitneyu
from scipy.stats import levene

In [None]:
# Plot colours keyed by the values of the 'label' column (used as the hue palette).
palette = dict(Left="#7ac8ff", Right="#e74c4c")

In [None]:
def convertList(df):
    """Expand ``df['sentiment_probs']`` into numeric probability columns.

    Every entry of ``sentiment_probs`` is parsed into a length-3 float array,
    stored in ``df['probs_list']``, and its components are written to the
    ``neg_prob``, ``neu_prob`` and ``pos_prob`` columns (in that order).

    Accepted inputs per entry:
      * strings holding three numbers separated by whitespace and/or commas,
        with optional surrounding brackets, e.g. ``"[0.1 0.2 0.7]"`` or
        ``"[0.1, 0.2, 0.7]"``;
      * lists / NumPy arrays of three numbers.

    Anything else (missing values, unparsable text, entries that do not hold
    exactly three numbers) becomes three NaNs, so a single bad row cannot
    break or misalign the column expansion.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``sentiment_probs`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the four new columns added.
    """
    prob_columns = ['neg_prob', 'neu_prob', 'pos_prob']

    def missing():
        # Fresh array per row so rows never share one mutable NaN buffer.
        return np.full(len(prob_columns), np.nan)

    def parse_probs(x):
        if isinstance(x, str):
            # Tokenise on whitespace, commas and brackets at once. The previous
            # "replace whitespace with commas" step produced doubled commas for
            # list-style strings such as "[0.1, 0.2, 0.7]".
            tokens = re.findall(r'[^\s,\[\]]+', x)
            try:
                values = np.array(tokens, dtype=float)
            except ValueError:
                return missing()
        elif isinstance(x, (list, np.ndarray)):
            try:
                values = np.asarray(x, dtype=float).ravel()
            except (TypeError, ValueError):
                return missing()
        else:
            return missing()
        # Enforce the three-class shape the output columns rely on.
        return values if values.size == len(prob_columns) else missing()

    df['probs_list'] = df['sentiment_probs'].apply(parse_probs)

    parsed = df['probs_list'].tolist()
    # np.vstack rejects an empty list, so build the (0, 3) matrix explicitly.
    matrix = np.vstack(parsed) if parsed else np.empty((0, len(prob_columns)))
    for position, column in enumerate(prob_columns):
        df[column] = matrix[:, position]
    return df

In [None]:
def computeSentimentScore(row):
    """Collapse the three class probabilities of ``row`` into one signed score.

    ``row`` is indexed by the keys ``'neg_prob'``, ``'neu_prob'`` and
    ``'pos_prob'``. The result is the dot product of those values with the
    weights (-1, 0, +1), i.e. negative mass counts against the score, neutral
    mass contributes nothing and positive mass counts towards it.
    """
    class_columns = ('neg_prob', 'neu_prob', 'pos_prob')
    class_weights = np.array([-1, 0, 1])  # negative, neutral, positive
    class_probs = np.array([row[column] for column in class_columns])
    score = np.dot(class_weights, class_probs)
    return score

In [None]:
# Read the 2020 sentiment CSV, expand the probability vectors into columns,
# then tag every row with its year and a "<label> <year>" group name.
df_2020 = convertList(pd.read_csv("analysis/2020_sentiment_robertaBase.csv"))
df_2020 = df_2020.assign(
    year=2020,
    group=lambda frame: frame['label'] + " " + frame['year'].astype(str),
)
df_2020.info()

In [None]:
# Preview the first rows of the 2020 frame.
df_2020.head(n=5)

In [None]:
# Read the 2024 sentiment CSV, expand the probability vectors into columns,
# then tag every row with its year and a "<label> <year>" group name.
df_2024 = convertList(pd.read_csv("analysis/2024_sentiment_robertaBase.csv"))
df_2024 = df_2024.assign(
    year=2024,
    group=lambda frame: frame['label'] + " " + frame['year'].astype(str),
)
df_2024.info()

In [None]:
# Preview the first rows of the 2024 frame.
df_2024.head(n=5)

##### **Evaluate Distribution of Sentiment for Left and Right Groups by Election Period**

In [None]:
# One independent frame per (label, year) pair for analysis and plotting.
# Each is a copy, so columns added to a subset never touch df_2020 / df_2024.
df_left_2024 = df_2024[df_2024['label'].eq('Left')].copy()
df_left_2020 = df_2020[df_2020['label'].eq('Left')].copy()
df_right_2024 = df_2024[df_2024['label'].eq('Right')].copy()
df_right_2020 = df_2020[df_2020['label'].eq('Right')].copy()

In [None]:
# Display name -> subset frame, in reporting order. The dict stores references
# to the subset frames (not copies), so columns added to those frames later
# are visible through sentiData as well.
sentiData = dict([
    ("Left 2020 Election Period", df_left_2020),
    ("Left 2024 Election Period", df_left_2024),
    ("Right 2020 Election Period", df_right_2020),
    ("Right 2024 Election Period", df_right_2024),
])

In [None]:
# Score each subset with a single vectorised call instead of a Python-level,
# row-by-row DataFrame.apply(axis=1). computeSentimentScore only indexes its
# argument by column name and takes a dot product with the class weights, so
# passing the whole frame yields one score per row. This is much faster on
# large frames and, unlike apply(axis=1), still produces a 1-D result when a
# subset happens to be empty.
for subset in [df_left_2024, df_left_2020, df_right_2024, df_right_2020]:
    subset['sentiScore'] = computeSentimentScore(subset)


In [None]:
# 2x2 grid of sentiment-score densities: Left on the top row, Right on the
# bottom row, 2020 in the left column and 2024 in the right column. Axes are
# shared so the four panels can be compared directly.
fig, axes = plt.subplots(2, 2, figsize=(10, 8), sharex=True, sharey=True)

kde_panels = [
    (axes[0,0], df_left_2020, "#7ac8ff"),
    (axes[0,1], df_left_2024, "#4046B9"),
    (axes[1,0], df_right_2020, "#fc8282"),
    (axes[1,1], df_right_2024, "#E60F0F"),
]
for ax, frame, colour in kde_panels:
    sns.kdeplot(frame['sentiScore'], fill=True, ax=ax, color=colour)
    # Every row of a subset carries the same group name, so the first one is
    # used as the panel title.
    ax.set_title(frame['group'].iloc[0])
    ax.set_ylabel('Density')
    ax.set_xlabel('Sentiment Score')

fig.suptitle("Sentiment Score Distributions by Group and Year", y=1.02)
fig.tight_layout()
plt.show()

##### **Confirm Uneven (Non-Normal) Distribution with the Shapiro-Wilk Test**

In [None]:
# Function to test normality
def test_normality(df, group_name):
    # For large samples (>5000)
    sample = df['sentiScore']
    if len(sample) > 5000:
        sample = sample.sample(5000, random_state=42)
    stat, p = shapiro(sample)
    print(f"{group_name}: W={stat:.3f}, p={p:.10f}")
    if p > 0.05:
        print("• Normal Distribution\n")
    else:
        print("• Uneven Distribution\n")

In [None]:
# Shapiro-Wilk check for every group/period subset, in sentiData order.
print(f"Test Distribution by Group:\n{'-'*45}")
for name, df in sentiData.items():
    test_normality(df=df, group_name=name)

##### **Measure Central Tendency (Median) for Left and Right Groups by Election Period**

In [None]:
# Median sentiment score of every group/period subset.
print(f"Median Sentiment Values by Group:\n{'-'*45}")
for name, df in sentiData.items():
    group_median = df['sentiScore'].median()
    print(f"{name}:\n • Median sentiment = {group_median:.3f}")
print('-'*45)

In [None]:
# Stack the four subsets into one long frame for plotting. Each subset keeps
# the row labels of the CSV it came from, so without ignore_index the 2020 and
# 2024 rows would share index labels; a fresh RangeIndex keeps label-based
# selection and index alignment on sentiPlot unambiguous.
sentiPlot = pd.concat(
    [df_left_2020, df_left_2024, df_right_2020, df_right_2024],
    ignore_index=True,
)

In [None]:
# Box plot of sentiment score per election year, split by label (hue).
box_fig, box_ax = plt.subplots(figsize=(8,6))
sns.boxplot(
    data=sentiPlot,
    x='year',
    y='sentiScore',
    hue='label',
    palette=palette,
    width=0.6,
    fliersize=4,
    ax=box_ax,
)
box_ax.set_title("Distribution of Sentiment Scores by Group and Year")
box_ax.set_ylabel("Sentiment Score (-1 = Negative, +1 = Positive)")
box_ax.set_xlabel("Election Year")
# Dashed reference line at the neutral score.
box_ax.axhline(0, color='gray', linestyle='--')
box_ax.grid(alpha=0.3)
# Replace the automatic hue legend with an empty, frameless one (hides it).
box_ax.legend([], [], frameon=False)
box_fig.tight_layout()
plt.show()


In [None]:
def mannWhitneyTest(df1, df2, label1, label2, alpha=0.05):
    """Compare two groups' ``sentiScore`` values with a two-sided Mann-Whitney U test.

    Missing scores are dropped first; otherwise a single NaN makes the p-value
    NaN and the comparison would be reported as "No significant difference"
    without the test having said anything.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames with a numeric ``sentiScore`` column.
    label1, label2 : str
        Names printed for the two groups.
    alpha : float, default 0.05
        Significance level; ``p < alpha`` is reported as a significant
        difference.

    Returns
    -------
    None
        The U statistic, p-value and verdict are printed. If either group has
        no non-missing scores the test is skipped and a note is printed.
    """
    scores1 = df1['sentiScore'].dropna()
    scores2 = df2['sentiScore'].dropna()
    if scores1.empty or scores2.empty:
        print(f"{label1} vs {label2}: not enough data for a Mann-Whitney U test.\n")
        return
    # State the alternative explicitly so the result does not depend on the
    # installed SciPy version's default.
    stat, p = mannwhitneyu(scores1, scores2, alternative='two-sided')
    print(f"{label1} vs {label2}: U={stat:.2f}, p={p:.5f}")
    if p < alpha:
        print("• Significant difference in sentiment.\n")
    else:
        print("• No significant difference in sentiment.\n")

In [None]:
# Temporal shift within each group: compare 2020 against 2024 for Left, then Right.
temporal_comparisons = [
    (df_left_2020, df_left_2024, "Left 2020 Election Period", "Left 2024 Election Period"),
    (df_right_2020, df_right_2024, "Right 2020 Election Period", "Right 2024 Election Period"),
]
for comparison in temporal_comparisons:
    mannWhitneyTest(*comparison)

##### **Levene’s test for equality of variances**

In [None]:
# Variance of the sentiment score for every group/period subset
# (pandas' Series.var, which defaults to the sample variance).
print(f"Variance in Sentiment by Group:\n{'-'*45}")
for name, df in sentiData.items():
    group_variance = df['sentiScore'].var()
    print(f"{name}: \n • variance = {group_variance:.4f}")
print('-'*45)


In [None]:
# Levene's test for equality of variances within each group across time.
# Calculates whether the spread of sentiment changed between election periods:
# - input: sentiment scores for each group (Left and Right) and year (2020 and 2024)
# - output: test statistic and p-value per group
# - a significant change in variance (p < 0.05) is read as a polarization shift
#
# Missing scores are dropped first. A NaN in either sample would otherwise turn
# the p-value into NaN, and `nan < 0.05` is False, so the group would be
# reported as "remained consistent" without the test having decided anything.
stat_left, p_left = levene(
    df_left_2020['sentiScore'].dropna(),
    df_left_2024['sentiScore'].dropna(),
)
stat_right, p_right = levene(
    df_right_2020['sentiScore'].dropna(),
    df_right_2024['sentiScore'].dropna(),
)

print(f"Left 2020 vs 2024:\n• W={stat_left:.3f}, p={p_left:.5f}")
if p_left < 0.05:
    print("• Left group variance changed significantly (polarization shift).")
else:
    print("• Left group variance remained consistent (no polarization shift).")
print('-'*70)
print(f"Right 2020 vs 2024:\n• W={stat_right:.3f}, p={p_right:.5f}")
if p_right < 0.05:
    print("• Right group variance changed significantly (polarization shift).")
else:
    print("• Right group variance remained consistent (no polarization shift).")
