In [25]:
import pandas as pd
from datetime import datetime
import warnings
# from googletrans import Translator
# translator = Translator()
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
# import researchpy as rp
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import f_oneway
from scipy.stats import ttest_ind
from scipy.stats import levene
import ast

In [26]:
df = pd.read_csv("antiKremlin_dataset.csv")

In [27]:
df["post_datetime"] = pd.to_datetime(df['post_datetime'])
df['post_date'] = pd.to_datetime(df['post_datetime'].dt.date)

df['reactions'] = df['reactions'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)

In [28]:
def stat_test(*data):
    levene_f_statistic, levene_p_value = levene(*data)
    # print(f'Levene F-statistic: {levene_f_statistic}')
    print(f'Levene p-value: {levene_p_value}')

    if levene_p_value > 0.05:
        t, p = ttest_ind(*data)
        print(f'Equal variance t-test stat: {t}')
        print(f'Equal variance t-test p-value: {p}')

        # Calculate degrees of freedom for equal variances
        n1 = len(data[0])
        n2 = len(data[1])
        dof = n1 + n2 - 2
        print("Degrees of freedom (equal variance):", dof)

    else:
        t, p = ttest_ind(*data, equal_var=False)
        print(f'Unequal variance t-test stat: {t}')
        print(f'Unequal variance t-test p-value: {p}')

        group1 = np.array(data[0])
        group2 = np.array(data[1])

        n1 = len(group1)
        n2 = len(group2)

        var1 = np.var(group1, ddof=1)
        var2 = np.var(group2, ddof=1)

        # Welch-Satterthwaite equation for degrees of freedom
        numerator = (var1/n1 + var2/n2)**2
        denominator = ((var1/n1)**2 / (n1 - 1)) + ((var2/n2)**2 / (n2 - 1))
        dof = numerator / denominator
        
        print("degree of freedom (unequal variance):", dof)

    if p < 0.05:
        print('Reject the null hypothesis - significant difference between groups')
    else:
        print('Fail to reject the null hypothesis - no significant difference between groups')

### Topic 1: International Politics	

In [29]:
# net reactions for economy category in phase 1
int_df = df[(df['category'] == "International Politics") & (df['phase'] == "phase 6")]

int_df['positive_reactions'] = int_df['reactions'].apply(lambda x: x.get('👍', 0) + x.get('😁', 0) + x.get('🔥', 0) + x.get('❤', 0) + x.get('🎉', 0) + x.get('🤣', 0) 
                                                        + x.get('🙏', 0) if pd.notna(x) else 0)

int_df['negative_reactions'] = int_df['reactions'].apply(lambda x: x.get('👎', 0) + x.get('🤡', 0) + x.get('🤬', 0) + x.get('💩', 0) + x.get('😢', 0) + x.get('🤮', 0) 
                                                         if pd.notna(x) else 0)

print("positive %", round(int_df['positive_reactions'].sum() / (int_df['positive_reactions'].sum() + int_df['negative_reactions'].sum()) * 100,1))
print("negative %", round(int_df['negative_reactions'].sum() / (int_df['positive_reactions'].sum() + int_df['negative_reactions'].sum()) * 100,1))

positive % 68.6
negative % 31.4


In [30]:
# shortllist International Politics categories
int_df = df[df['category'] == "International Politics"]
int_df['week_year'] = int_df['post_datetime'].dt.strftime('%Y-%U')

int_df['positive_reactions'] = int_df['reactions'].apply(lambda x: x.get('👍', 0) + x.get('😁', 0) + x.get('🔥', 0) + x.get('❤', 0) + x.get('🎉', 0) + x.get('🤣', 0) 
                                                        + x.get('🙏', 0) if pd.notna(x) else 0)


int_df['negative_reactions'] = int_df['reactions'].apply(lambda x: x.get('👎', 0) + x.get('🤡', 0) + x.get('🤬', 0) + x.get('💩', 0) + x.get('😢', 0) + x.get('🤮', 0) 
                                                         if pd.notna(x) else 0)

int_df['net_reactions'] = int_df['positive_reactions'] - int_df['negative_reactions']

int_result = int_df.groupby(['phase', 'week_year']).agg({'net_reactions': 'sum'}).reset_index()

int_result['mean_net_reactions'] = int_result['net_reactions'] // 7

int_result = int_result[1:]
int_result = int_result.reset_index(drop=True)

int_grouped_data = [int_result['mean_net_reactions'][int_result['phase'] == phase] for phase in int_result['phase'].unique()]

In [31]:
int_f_statistic, int_p_value = f_oneway(*int_grouped_data)

# Print the results
print(f'F-statistic: {int_f_statistic}')
print(f'P-value: {int_p_value}\n')

model = ols('mean_net_reactions ~ C(phase)', int_result).fit()
# Perform ANOVA and print the results
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

F-statistic: 11.199596865196956
P-value: 2.3789566737215596e-08

                sum_sq    df          F        PR(>F)
C(phase)  6.863148e+10   6.0  11.199597  2.378957e-08
Residual  6.128031e+10  60.0        NaN           NaN


In [32]:
# contrast 1: 0 vs rest
print("Contrast 1: 0 vs rest\n")

stat_test(int_result['mean_net_reactions'][int_result['phase'].isin(["phase 0"])], 
     int_result['mean_net_reactions'][int_result['phase'].isin(["phase 1", "phase 2", "phase 3", "phase 4", "phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6
print("Contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6\n")

stat_test(int_result['mean_net_reactions'][int_result['phase'].isin(["phase 1", "phase 2", "phase 3"])], 
     int_result['mean_net_reactions'][int_result['phase'].isin(["phase 4", "phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 3: phase1, phase2 vs phase3
print("Contrast 3: phase1, phase2 vs phase3\n")

stat_test(int_result['mean_net_reactions'][int_result['phase'].isin(["phase 1", "phase 2"])],
        int_result['mean_net_reactions'][int_result['phase'].isin(["phase 3"])])

print("=====================================================================================================")

# contrast 4: phase4 vs phase5, phase6
print("Contrast 4: phase4 vs phase5, phase6\n")

stat_test(int_result['mean_net_reactions'][int_result['phase'].isin(["phase 4"])],
        int_result['mean_net_reactions'][int_result['phase'].isin(["phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 5: phase1 vs phase2
print("Contrast 5: phase1 vs phase2\n")

stat_test(int_result['mean_net_reactions'][int_result['phase'].isin(["phase 1"])],
        int_result['mean_net_reactions'][int_result['phase'].isin(["phase 2"])])

print("=====================================================================================================")

# contrast 6: phase5 vs phase6
print("Contrast 6: phase5 vs phase6\n")

stat_test(int_result['mean_net_reactions'][int_result['phase'].isin(["phase 5"])],
        int_result['mean_net_reactions'][int_result['phase'].isin(["phase 6"])])

print("=====================================================================================================")

Contrast 1: 0 vs rest

Levene p-value: 0.01308744670891666
Unequal variance t-test stat: -8.804726998738523
Unequal variance t-test p-value: 2.7061413496171147e-12
degree of freedom (unequal variance): 58.210249347270285
Reject the null hypothesis - significant difference between groups
Contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6

Levene p-value: 0.6573672705704408
Equal variance t-test stat: 1.7006033511285108
Equal variance t-test p-value: 0.09446797593918407
Degrees of freedom (equal variance): 57
Fail to reject the null hypothesis - no significant difference between groups
Contrast 3: phase1, phase2 vs phase3

Levene p-value: 0.085074644627184
Equal variance t-test stat: 2.4470687836149945
Equal variance t-test p-value: 0.023285398648978845
Degrees of freedom (equal variance): 21
Reject the null hypothesis - significant difference between groups
Contrast 4: phase4 vs phase5, phase6

Levene p-value: 0.6010975691942888
Equal variance t-test stat: 0.7892079114144707
E

In [33]:
# from statsmodels.stats.multicomp import pairwise_tukeyhsd
# from statsmodels.stats.multicomp import MultiComparison
# from statsmodels.stats.multicomp import tukeyhsd

# # Run Tukey's test
# tukey = pairwise_tukeyhsd(endog=int_result['mean_net_reactions'],  # Data
#                           groups=int_result['phase'],  # Groups
#                           alpha=0.05)  # Significance level

# # Test summary
# tukey.summary()              

In [34]:
# # For tableau visualization: Area chart of mean post count, net positive reactions, and net negative reactions

# post_int_df = int_df.groupby(['phase', 'week_year']).size().reset_index(name='posts_count')

# post_int_df['mean_post_count'] = post_int_df['posts_count'] // 7


# grouped = int_df.groupby(['phase', 'week_year']).agg({'positive_reactions': 'sum', 'negative_reactions': 'sum'})

# # Divide sum of reactions by 7 to get daily average
# grouped['net_positive_reactions'] = grouped['positive_reactions'] // 7
# grouped['net_negative_reactions'] = grouped['negative_reactions'] // 7

# # Reset index to make 'week_year' a column
# grouped.reset_index(inplace=True)

# grouped.shape

# # grouped[['phase', 'week_year', 'net_positive_reactions', 
# #          'net_negative_reactions']].to_csv('topic_russian_domestic_reactions.csv', index=False)

# pd.merge(post_int_df, grouped, on=['phase', 'week_year'])[['phase', 'week_year', 'mean_post_count', 
#                                                            'net_positive_reactions', 
#                                                            'net_negative_reactions']].to_csv('topic_international_reactions.csv', 
#                                                                                              index=False)

### Topic 2: Russian Domestic Affairs

In [35]:
# net reactions for economy category in phase 1
rus_df = df[(df['category'] == "Russian domestic politics") & (df['phase'] == "phase 4")]

rus_df['positive_reactions'] = rus_df['reactions'].apply(lambda x: x.get('👍', 0) + x.get('😁', 0) + x.get('🔥', 0) + x.get('❤', 0) + x.get('🎉', 0) + x.get('🤣', 0) 
                                                        + x.get('🙏', 0) if pd.notna(x) else 0)

rus_df['negative_reactions'] = rus_df['reactions'].apply(lambda x: x.get('👎', 0) + x.get('🤡', 0) + x.get('🤬', 0) + x.get('💩', 0) + x.get('😢', 0) + x.get('🤮', 0) 
                                                         if pd.notna(x) else 0)


print("positive %", round(rus_df['positive_reactions'].sum() / (rus_df['positive_reactions'].sum() + rus_df['negative_reactions'].sum()) * 100,1))
print("negative %", round(rus_df['negative_reactions'].sum() / (rus_df['positive_reactions'].sum() + rus_df['negative_reactions'].sum()) * 100,1))

positive % 66.7
negative % 33.3


In [36]:
rus_df = df[df['category'] == "Russian domestic politics"]
rus_df['week_year'] = rus_df['post_datetime'].dt.strftime('%Y-%U')

rus_df['positive_reactions'] = rus_df['reactions'].apply(lambda x: x.get('👍', 0) + x.get('😁', 0) + x.get('🔥', 0) + x.get('❤', 0) + x.get('🎉', 0) + x.get('🤣', 0) 
                                                        + x.get('🙏', 0) if pd.notna(x) else 0)


rus_df['negative_reactions'] = rus_df['reactions'].apply(lambda x: x.get('👎', 0) + x.get('🤡', 0) + x.get('🤬', 0) + x.get('💩', 0) + x.get('😢', 0) + x.get('🤮', 0) 
                                                         if pd.notna(x) else 0)

rus_df['net_reactions'] = rus_df['positive_reactions'] - rus_df['negative_reactions']

rus_result = rus_df.groupby(['phase', 'week_year']).agg({'net_reactions': 'sum'}).reset_index()

rus_result['mean_net_reactions'] = rus_result['net_reactions'] // 7

rus_result = rus_result[1:]
rus_result = rus_result.reset_index(drop=True)

rus_grouped_data = [rus_result['mean_net_reactions'][rus_result['phase'] == phase] for phase in rus_result['phase'].unique()]

In [37]:
# one way ANOVA
rus_f_statistic, rus_p_value = f_oneway(*rus_grouped_data)

# Prrus the results
print(f'F-statistic: {rus_f_statistic}')
print(f'P-value: {rus_p_value}\n')

model = ols('mean_net_reactions ~ C(phase)', rus_result).fit()
# Perform ANOVA and prrus the results
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

F-statistic: 13.789978595039475
P-value: 8.918591357244732e-10

                sum_sq    df          F        PR(>F)
C(phase)  4.212380e+10   6.0  13.789979  8.918591e-10
Residual  3.054667e+10  60.0        NaN           NaN


In [38]:
# contrast 1: 0 vs rest
print("Contrast 1: 0 vs rest\n")

stat_test(rus_result['mean_net_reactions'][rus_result['phase'].isin(["phase 0"])], 
     rus_result['mean_net_reactions'][rus_result['phase'].isin(["phase 1", "phase 2", "phase 3", "phase 4", "phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6
print("Contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6\n")

stat_test(rus_result['mean_net_reactions'][rus_result['phase'].isin(["phase 1", "phase 2", "phase 3"])], 
     rus_result['mean_net_reactions'][rus_result['phase'].isin(["phase 4", "phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 3: phase1, phase2 vs phase3
print("Contrast 3: phase1, phase2 vs phase3\n")

stat_test(rus_result['mean_net_reactions'][rus_result['phase'].isin(["phase 1", "phase 2"])],
        rus_result['mean_net_reactions'][rus_result['phase'].isin(["phase 3"])])

print("=====================================================================================================")

# contrast 4: phase4 vs phase5, phase6
print("Contrast 4: phase4 vs phase5, phase6\n")

stat_test(rus_result['mean_net_reactions'][rus_result['phase'].isin(["phase 4"])],
        rus_result['mean_net_reactions'][rus_result['phase'].isin(["phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 5: phase1 vs phase2
print("Contrast 5: phase1 vs phase2\n")

stat_test(rus_result['mean_net_reactions'][rus_result['phase'].isin(["phase 1"])],
        rus_result['mean_net_reactions'][rus_result['phase'].isin(["phase 2"])])

print("=====================================================================================================")

# contrast 6: phase5 vs phase6
print("Contrast 6: phase5 vs phase6\n")

stat_test(rus_result['mean_net_reactions'][rus_result['phase'].isin(["phase 5"])],
        rus_result['mean_net_reactions'][rus_result['phase'].isin(["phase 6"])])

print("=====================================================================================================")

Contrast 1: 0 vs rest

Levene p-value: 0.013037106497217297
Unequal variance t-test stat: -7.476178754412521
Unequal variance t-test p-value: 4.216931352065625e-10
degree of freedom (unequal variance): 59.0089674571505
Reject the null hypothesis - significant difference between groups
Contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6

Levene p-value: 0.6977872296351229
Equal variance t-test stat: 0.3259749937756639
Equal variance t-test p-value: 0.7456370352353872
Degrees of freedom (equal variance): 57
Fail to reject the null hypothesis - no significant difference between groups
Contrast 3: phase1, phase2 vs phase3

Levene p-value: 0.07101499194147903
Equal variance t-test stat: 3.273766067086226
Equal variance t-test p-value: 0.003625044283932286
Degrees of freedom (equal variance): 21
Reject the null hypothesis - significant difference between groups
Contrast 4: phase4 vs phase5, phase6

Levene p-value: 0.2488485763673823
Equal variance t-test stat: 5.1611698357554525
Equ

In [39]:
# For tableau visualization: Area chart of mean post count, net positive reactions, and net negative reactions

post_rus_df = rus_df.groupby(['phase', 'week_year']).size().reset_index(name='posts_count')

post_rus_df['mean_post_count'] = post_rus_df['posts_count'] // 7


grouped = rus_df.groupby(['phase', 'week_year']).agg({'positive_reactions': 'sum', 'negative_reactions': 'sum'})

# Divide sum of reactions by 7 to get daily average
grouped['net_positive_reactions'] = grouped['positive_reactions'] // 7
grouped['net_negative_reactions'] = grouped['negative_reactions'] // 7

# Reset index to make 'week_year' a column
grouped.reset_index(inplace=True)

grouped.shape

# grouped[['phase', 'week_year', 'net_positive_reactions', 
#          'net_negative_reactions']].to_csv('topic_russian_domestic_reactions.csv', index=False)

pd.merge(post_rus_df, grouped, on=['phase', 'week_year'])[['phase', 'week_year', 'mean_post_count', 
                                                           'net_positive_reactions', 
                                                           'net_negative_reactions']].to_csv('topic_russian_domestic_reactions.csv', 
                                                                                             index=False)

### Topic 3: Combat and Frontline Updates

In [40]:
# net reactions for economy category in phase 1
war_df = df[(df['category'] == "Combat and Frontline updates") & (df['phase'] == "phase 6")]

war_df['positive_reactions'] = war_df['reactions'].apply(lambda x: x.get('👍', 0) + x.get('😁', 0) + x.get('🔥', 0) + x.get('❤', 0) + x.get('🎉', 0) + x.get('🤣', 0) 
                                                        + x.get('🙏', 0) if pd.notna(x) else 0)

war_df['negative_reactions'] = war_df['reactions'].apply(lambda x: x.get('👎', 0) + x.get('🤡', 0) + x.get('🤬', 0) + x.get('💩', 0) + x.get('😢', 0) + x.get('🤮', 0) 
                                                         if pd.notna(x) else 0)

war_df['positive_reactions'].sum(), war_df['negative_reactions'].sum(), (war_df['positive_reactions'].sum() + war_df['negative_reactions'].sum())

print("positive %", round(war_df['positive_reactions'].sum() / (war_df['positive_reactions'].sum() + war_df['negative_reactions'].sum()) * 100,1))
print("negative %", round(war_df['negative_reactions'].sum() / (war_df['positive_reactions'].sum() + war_df['negative_reactions'].sum()) * 100,1))

positive % 67.4
negative % 32.6


In [41]:
war_df = df[df['category'] == "Combat and Frontline updates"]
war_df['week_year'] = war_df['post_datetime'].dt.strftime('%Y-%U')

war_df['positive_reactions'] = war_df['reactions'].apply(lambda x: x.get('👍', 0) + x.get('😁', 0) + x.get('🔥', 0) + x.get('❤', 0) + x.get('🎉', 0) + x.get('🤣', 0) 
                                                        + x.get('🙏', 0) if pd.notna(x) else 0)


war_df['negative_reactions'] = war_df['reactions'].apply(lambda x: x.get('👎', 0) + x.get('🤡', 0) + x.get('🤬', 0) + x.get('💩', 0) + x.get('😢', 0) + x.get('🤮', 0) 
                                                         if pd.notna(x) else 0)

war_df['net_reactions'] = war_df['positive_reactions'] - war_df['negative_reactions']

war_result = war_df.groupby(['phase', 'week_year']).agg({'net_reactions': 'sum'}).reset_index()

war_result['mean_net_reactions'] = war_result['net_reactions'] // 7

war_result = war_result[1:]
war_result = war_result.reset_index(drop=True)

war_grouped_data = [war_result['mean_net_reactions'][war_result['phase'] == phase] for phase in war_result['phase'].unique()]

In [42]:
# one way ANOVA
war_f_statistic, war_p_value = f_oneway(*war_grouped_data)

# print the results
print(f'F-statistic: {war_f_statistic}')
print(f'P-value: {war_p_value}\n')

model = ols('mean_net_reactions ~ C(phase)', war_result).fit()
# Perform ANOVA and print the results
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

F-statistic: 8.11208779207447
P-value: 1.026685335702324e-05

                sum_sq    df         F   PR(>F)
C(phase)  9.836179e+10   5.0  8.112088  0.00001
Residual  1.261035e+11  52.0       NaN      NaN


In [43]:
# contrast 1: 0 vs rest
print("Contrast 1: 0 vs rest\n")

stat_test(war_result['mean_net_reactions'][war_result['phase'].isin(["phase 0"])], 
     war_result['mean_net_reactions'][war_result['phase'].isin(["phase 1", "phase 2", "phase 3", "phase 4", "phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6
print("Contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6\n")

stat_test(war_result['mean_net_reactions'][war_result['phase'].isin(["phase 1", "phase 2", "phase 3"])], 
     war_result['mean_net_reactions'][war_result['phase'].isin(["phase 4", "phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 3: phase1, phase2 vs phase3
print("Contrast 3: phase1, phase2 vs phase3\n")

stat_test(war_result['mean_net_reactions'][war_result['phase'].isin(["phase 1", "phase 2"])],
        war_result['mean_net_reactions'][war_result['phase'].isin(["phase 3"])])

print("=====================================================================================================")

# contrast 4: phase4 vs phase5, phase6
print("Contrast 4: phase4 vs phase5, phase6\n")

stat_test(war_result['mean_net_reactions'][war_result['phase'].isin(["phase 4"])],
        war_result['mean_net_reactions'][war_result['phase'].isin(["phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 5: phase1 vs phase2
print("Contrast 5: phase1 vs phase2\n")

stat_test(war_result['mean_net_reactions'][war_result['phase'].isin(["phase 1"])],
        war_result['mean_net_reactions'][war_result['phase'].isin(["phase 2"])])

print("=====================================================================================================")

# contrast 6: phase5 vs phase6
print("Contrast 6: phase5 vs phase6\n")

stat_test(war_result['mean_net_reactions'][war_result['phase'].isin(["phase 5"])],
        war_result['mean_net_reactions'][war_result['phase'].isin(["phase 6"])])

print("=====================================================================================================")

Contrast 1: 0 vs rest

Levene p-value: nan
Unequal variance t-test stat: nan
Unequal variance t-test p-value: nan
degree of freedom (unequal variance): nan
Fail to reject the null hypothesis - no significant difference between groups
Contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6

Levene p-value: 0.013548488526069744
Unequal variance t-test stat: 1.403709287931254
Unequal variance t-test p-value: 0.17158387832499955
degree of freedom (unequal variance): 27.534502861658538
Fail to reject the null hypothesis - no significant difference between groups
Contrast 3: phase1, phase2 vs phase3

Levene p-value: 0.004628839522930628
Unequal variance t-test stat: 4.02634639411036
Unequal variance t-test p-value: 0.0009507815975587982
degree of freedom (unequal variance): 16.237320855404114
Reject the null hypothesis - significant difference between groups
Contrast 4: phase4 vs phase5, phase6

Levene p-value: 0.055436833086217566
Equal variance t-test stat: -4.142011656986755
Equal va

In [44]:
# # Run Tukey's test
# tukey = pairwise_tukeyhsd(endog=war_result['mean_net_reactions'],  # Data
#                           groups=war_result['phase'],  # Groups
#                           alpha=0.05)  # Significance level

# # Test summary
# tukey.summary()              

In [45]:
# # For tableau visualization: Area chart of mean post count, net positive reactions, and net negative reactions

# post_war_df = war_df.groupby(['phase', 'week_year']).size().reset_index(name='posts_count')

# post_war_df['mean_post_count'] = post_war_df['posts_count'] // 7


# grouped = war_df.groupby(['phase', 'week_year']).agg({'positive_reactions': 'sum', 'negative_reactions': 'sum'})

# # Divide sum of reactions by 7 to get daily average
# grouped['net_positive_reactions'] = grouped['positive_reactions'] // 7
# grouped['net_negative_reactions'] = grouped['negative_reactions'] // 7

# # Reset index to make 'week_year' a column
# grouped.reset_index(inplace=True)

# pd.merge(post_war_df, grouped, on=['phase', 'week_year'])[['phase', 'week_year', 'mean_post_count', 
#                                                            'net_positive_reactions', 
#                                                            'net_negative_reactions']].to_csv('topic_combat_reactions.csv', 
#                                                                                              index=False)

### Topic 4: Ukrainian Domestic Affairs

In [46]:
# net reactions for economy category in phase 1
ukr_df = df[(df['category'] == "Ukranian domestic affairs") & (df['phase'] == "phase 5")]

ukr_df['positive_reactions'] = ukr_df['reactions'].apply(lambda x: x.get('👍', 0) + x.get('😁', 0) + x.get('🔥', 0) + x.get('❤', 0) + x.get('🎉', 0) + x.get('🤣', 0) 
                                                        + x.get('🙏', 0) if pd.notna(x) else 0)

ukr_df['negative_reactions'] = ukr_df['reactions'].apply(lambda x: x.get('👎', 0) + x.get('🤡', 0) + x.get('🤬', 0) + x.get('💩', 0) + x.get('😢', 0) + x.get('🤮', 0) 
                                                         if pd.notna(x) else 0)


print("positive %", round(ukr_df['positive_reactions'].sum() / (ukr_df['positive_reactions'].sum() + ukr_df['negative_reactions'].sum()) * 100,1))
print("negative %", round(ukr_df['negative_reactions'].sum() / (ukr_df['positive_reactions'].sum() + ukr_df['negative_reactions'].sum()) * 100,1))

positive % 63.5
negative % 36.5


In [47]:
ukr_df = df[df['category'] == "Ukranian domestic affairs"]
ukr_df['week_year'] = ukr_df['post_datetime'].dt.strftime('%Y-%U')

ukr_df['positive_reactions'] = ukr_df['reactions'].apply(lambda x: x.get('👍', 0) + x.get('😁', 0) + x.get('🔥', 0) + x.get('❤', 0) + x.get('🎉', 0) + x.get('🤣', 0) 
                                                        + x.get('🙏', 0) if pd.notna(x) else 0)


ukr_df['negative_reactions'] = ukr_df['reactions'].apply(lambda x: x.get('👎', 0) + x.get('🤡', 0) + x.get('🤬', 0) + x.get('💩', 0) + x.get('😢', 0) + x.get('🤮', 0) 
                                                         if pd.notna(x) else 0)

ukr_df['net_reactions'] = ukr_df['positive_reactions'] - ukr_df['negative_reactions']

ukr_result = ukr_df.groupby(['phase', 'week_year']).agg({'net_reactions': 'sum'}).reset_index()

ukr_result['mean_net_reactions'] = ukr_result['net_reactions'] // 7

ukr_result = ukr_result[1:]
ukr_result = ukr_result.reset_index(drop=True)

ukr_grouped_data = [ukr_result['mean_net_reactions'][ukr_result['phase'] == phase] for phase in ukr_result['phase'].unique()]

In [48]:
# one way ANOVA
ukr_f_statistic, ukr_p_value = f_oneway(*ukr_grouped_data)

# print the results
print(f'F-statistic: {ukr_f_statistic}')
print(f'P-value: {ukr_p_value}\n')

model = ols('mean_net_reactions ~ C(phase)', ukr_result).fit()
# Perform ANOVA and print the results
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

F-statistic: 14.64474305999538
P-value: 3.241765765957123e-10

                sum_sq    df          F        PR(>F)
C(phase)  5.174006e+10   6.0  14.644743  3.241766e-10
Residual  3.533012e+10  60.0        NaN           NaN


In [49]:
# contrast 1: 0 vs rest
print("Contrast 1: 0 vs rest\n")

stat_test(ukr_result['mean_net_reactions'][ukr_result['phase'].isin(["phase 0"])], 
     ukr_result['mean_net_reactions'][ukr_result['phase'].isin(["phase 1", "phase 2", "phase 3", "phase 4", "phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6
print("Contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6\n")

stat_test(ukr_result['mean_net_reactions'][ukr_result['phase'].isin(["phase 1", "phase 2", "phase 3"])], 
     ukr_result['mean_net_reactions'][ukr_result['phase'].isin(["phase 4", "phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 3: phase1, phase2 vs phase3
print("Contrast 3: phase1, phase2 vs phase3\n")

stat_test(ukr_result['mean_net_reactions'][ukr_result['phase'].isin(["phase 1", "phase 2"])],
        ukr_result['mean_net_reactions'][ukr_result['phase'].isin(["phase 3"])])

print("=====================================================================================================")

# contrast 4: phase4 vs phase5, phase6
print("Contrast 4: phase4 vs phase5, phase6\n")

stat_test(ukr_result['mean_net_reactions'][ukr_result['phase'].isin(["phase 4"])],
        ukr_result['mean_net_reactions'][ukr_result['phase'].isin(["phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 5: phase1 vs phase2
print("Contrast 5: phase1 vs phase2\n")

stat_test(ukr_result['mean_net_reactions'][ukr_result['phase'].isin(["phase 1"])],
        ukr_result['mean_net_reactions'][ukr_result['phase'].isin(["phase 2"])])

print("=====================================================================================================")

# contrast 6: phase5 vs phase6
print("Contrast 6: phase5 vs phase6\n")

stat_test(ukr_result['mean_net_reactions'][ukr_result['phase'].isin(["phase 5"])],
        ukr_result['mean_net_reactions'][ukr_result['phase'].isin(["phase 6"])])

print("=====================================================================================================")

Contrast 1: 0 vs rest

Levene p-value: 0.01139871539130816
Unequal variance t-test stat: -8.208281336926365
Unequal variance t-test p-value: 2.7438543901293394e-11
degree of freedom (unequal variance): 58.00046531393476
Reject the null hypothesis - significant difference between groups
Contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6

Levene p-value: 0.009236491778450127
Unequal variance t-test stat: -3.5113586011255666
Unequal variance t-test p-value: 0.000935188226693393
degree of freedom (unequal variance): 51.620946895719115
Reject the null hypothesis - significant difference between groups
Contrast 3: phase1, phase2 vs phase3

Levene p-value: 0.6096675781342513
Equal variance t-test stat: -5.116944581390314
Equal variance t-test p-value: 4.549795187631351e-05
Degrees of freedom (equal variance): 21
Reject the null hypothesis - significant difference between groups
Contrast 4: phase4 vs phase5, phase6

Levene p-value: 0.4478945134979282
Equal variance t-test stat: 0.096

In [50]:
# # Run Tukey's test
# tukey = pairwise_tukeyhsd(endog=ukr_result['mean_net_reactions'],  # Data
#                           groups=ukr_result['phase'],  # Groups
#                           alpha=0.05)  # Significance level

# # Test summary
# tukey.summary()              

In [51]:
# # For tableau visualization: Area chart of mean post count, net positive reactions, and net negative reactions

# post_ukr_df = ukr_df.groupby(['phase', 'week_year']).size().reset_index(name='posts_count')

# post_ukr_df['mean_post_count'] = post_ukr_df['posts_count'] // 7


# grouped = ukr_df.groupby(['phase', 'week_year']).agg({'positive_reactions': 'sum', 'negative_reactions': 'sum'})

# # Divide sum of reactions by 7 to get daily average
# grouped['net_positive_reactions'] = grouped['positive_reactions'] // 7
# grouped['net_negative_reactions'] = grouped['negative_reactions'] // 7

# # Reset index to make 'week_year' a column
# grouped.reset_index(inplace=True)

# pd.merge(post_ukr_df, grouped, on=['phase', 'week_year'])[['phase', 'week_year', 'mean_post_count', 
#                                                            'net_positive_reactions', 
#                                                            'net_negative_reactions']].to_csv('topic_ukraine_reactions.csv', 
#                                                                                              index=False)

### Topic 5: Economy 

In [52]:
# net reactions for economy category in phase 1
eco_df = df[(df['category'] == "Economy") & (df['phase'] == "phase 3")]

eco_df['positive_reactions'] = eco_df['reactions'].apply(lambda x: x.get('👍', 0) + x.get('😁', 0) + x.get('🔥', 0) + x.get('❤', 0) + x.get('🎉', 0) + x.get('🤣', 0) 
                                                        + x.get('🙏', 0) if pd.notna(x) else 0)

eco_df['negative_reactions'] = eco_df['reactions'].apply(lambda x: x.get('👎', 0) + x.get('🤡', 0) + x.get('🤬', 0) + x.get('💩', 0) + x.get('😢', 0) + x.get('🤮', 0) 
                                                         if pd.notna(x) else 0)

eco_df['positive_reactions'].sum(), eco_df['negative_reactions'].sum(), (eco_df['positive_reactions'].sum() + eco_df['negative_reactions'].sum())

print("positive %", eco_df['positive_reactions'].sum() / (eco_df['positive_reactions'].sum() + eco_df['negative_reactions'].sum()) * 100)
print("negative %", eco_df['negative_reactions'].sum() / (eco_df['positive_reactions'].sum() + eco_df['negative_reactions'].sum()) * 100)

positive % 66.1767605811374
negative % 33.82323941886259


In [53]:
eco_df = df[df['category'] == "Economy"]
eco_df['week_year'] = eco_df['post_datetime'].dt.strftime('%Y-%U')

eco_df['positive_reactions'] = eco_df['reactions'].apply(lambda x: x.get('👍', 0) + x.get('😁', 0) + x.get('🔥', 0) + x.get('❤', 0) + x.get('🎉', 0) + x.get('🤣', 0) 
                                                        + x.get('🙏', 0) if pd.notna(x) else 0)


eco_df['negative_reactions'] = eco_df['reactions'].apply(lambda x: x.get('👎', 0) + x.get('🤡', 0) + x.get('🤬', 0) + x.get('💩', 0) + x.get('😢', 0) + x.get('🤮', 0) 
                                                         if pd.notna(x) else 0)

eco_df['net_reactions'] = eco_df['positive_reactions'] - eco_df['negative_reactions']

eco_result = eco_df.groupby(['phase', 'week_year']).agg({'net_reactions': 'sum'}).reset_index()

eco_result['mean_net_reactions'] = eco_result['net_reactions'] // 7

eco_result = eco_result[1:]
eco_result = eco_result.reset_index(drop=True)

eco_grouped_data = [eco_result['mean_net_reactions'][eco_result['phase'] == phase] for phase in eco_result['phase'].unique()]

In [54]:
# one way ANOVA
eco_f_statistic, eco_p_value = f_oneway(*eco_grouped_data)

# print the results
print(f'F-statistic: {eco_f_statistic}')
print(f'P-value: {eco_p_value}\n')

model = ols('mean_net_reactions ~ C(phase)', eco_result).fit()
# Perform ANOVA and print the results
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

F-statistic: 16.30051030980239
P-value: 4.991199495491504e-11

                sum_sq    df         F        PR(>F)
C(phase)  6.150627e+10   6.0  16.30051  4.991199e-11
Residual  3.773273e+10  60.0       NaN           NaN


In [55]:
# contrast 1: 0 vs rest
print("Contrast 1: 0 vs rest\n")

stat_test(eco_result['mean_net_reactions'][eco_result['phase'].isin(["phase 0"])], 
     eco_result['mean_net_reactions'][eco_result['phase'].isin(["phase 1", "phase 2", "phase 3", "phase 4", "phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6
print("Contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6\n")

stat_test(eco_result['mean_net_reactions'][eco_result['phase'].isin(["phase 1", "phase 2", "phase 3"])], 
     eco_result['mean_net_reactions'][eco_result['phase'].isin(["phase 4", "phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 3: phase1, phase2 vs phase3
print("Contrast 3: phase1, phase2 vs phase3\n")

stat_test(eco_result['mean_net_reactions'][eco_result['phase'].isin(["phase 1", "phase 2"])],
        eco_result['mean_net_reactions'][eco_result['phase'].isin(["phase 3"])])

print("=====================================================================================================")

# contrast 4: phase4 vs phase5, phase6
print("Contrast 4: phase4 vs phase5, phase6\n")

stat_test(eco_result['mean_net_reactions'][eco_result['phase'].isin(["phase 4"])],
        eco_result['mean_net_reactions'][eco_result['phase'].isin(["phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 5: phase1 vs phase2
print("Contrast 5: phase1 vs phase2\n")

stat_test(eco_result['mean_net_reactions'][eco_result['phase'].isin(["phase 1"])],
        eco_result['mean_net_reactions'][eco_result['phase'].isin(["phase 2"])])

print("=====================================================================================================")

# contrast 6: phase5 vs phase6
print("Contrast 6: phase5 vs phase6\n")

stat_test(eco_result['mean_net_reactions'][eco_result['phase'].isin(["phase 5"])],
        eco_result['mean_net_reactions'][eco_result['phase'].isin(["phase 6"])])

print("=====================================================================================================")

Contrast 1: 0 vs rest

Levene p-value: 0.1906054498696895
Equal variance t-test stat: -1.7952365266445665
Equal variance t-test p-value: 0.07726484838956252
Degrees of freedom (equal variance): 65
Fail to reject the null hypothesis - no significant difference between groups
Contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6

Levene p-value: 0.009491332103066689
Unequal variance t-test stat: 2.889173293387674
Unequal variance t-test p-value: 0.008354136960176119
degree of freedom (unequal variance): 22.664393736961657
Reject the null hypothesis - significant difference between groups
Contrast 3: phase1, phase2 vs phase3

Levene p-value: 0.1165308720756376
Equal variance t-test stat: 2.0712426787075686
Equal variance t-test p-value: 0.05084771554176285
Degrees of freedom (equal variance): 21
Fail to reject the null hypothesis - no significant difference between groups
Contrast 4: phase4 vs phase5, phase6

Levene p-value: 0.6763241084111056
Equal variance t-test stat: 1.52409586

In [56]:
# # Run Tukey's test
# tukey = pairwise_tukeyhsd(endog=eco_result['mean_net_reactions'],  # Data
#                           groups=eco_result['phase'],  # Groups
#                           alpha=0.05)  # Significance level

# # Test summary
# tukey.summary()              

In [57]:
# grouped = eco_df.groupby(['phase', 'week_year']).agg({'positive_reactions': 'sum', 'negative_reactions': 'sum'})

# # Divide sum of reactions by 7 to get daily average
# grouped['net_positive_reactions'] = grouped['positive_reactions'] // 7
# grouped['net_negative_reactions'] = grouped['negative_reactions'] // 7

# # Reset index to make 'week_year' a column
# grouped.reset_index(inplace=True)

# grouped[['phase', 'week_year', 'net_positive_reactions', 'net_negative_reactions']]

In [58]:
# # For tableau visualization: Area chart of mean post count, net positive reactions, and net negative reactions

# post_eco_df = eco_df.groupby(['phase', 'week_year']).size().reset_index(name='posts_count')

# post_eco_df['mean_post_count'] = post_eco_df['posts_count'] // 7

# grouped = eco_df.groupby(['phase', 'week_year']).agg({'positive_reactions': 'sum', 'negative_reactions': 'sum'})

# # Divide sum of reactions by 7 to get daily average
# grouped['net_positive_reactions'] = grouped['positive_reactions'] // 7
# grouped['net_negative_reactions'] = grouped['negative_reactions'] // 7

# # Reset index to make 'week_year' a column
# grouped.reset_index(inplace=True)

# pd.merge(post_eco_df, grouped, on=['phase', 'week_year'])[['phase', 'week_year', 'mean_post_count', 
#                                                            'net_positive_reactions', 
#                                                            'net_negative_reactions']].to_csv('topic_economy_reactions.csv', 
#                                                                                              index=False)