### ANOVA Analysis on the avg. post volumes and net user reactions across the phases for top 5 categories

In [1]:
import pandas as pd
from datetime import datetime
import warnings
# from googletrans import Translator
# translator = Translator()
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
# import researchpay as rp
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import f_oneway
from scipy.stats import ttest_ind
from scipy.stats import levene
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [2]:
df = pd.read_csv("antiKremlin_dataset.csv")

In [4]:
df["post_datetime"] = pd.to_datetime(df['post_datetime'])
df['post_date'] = pd.to_datetime(df['post_datetime'].dt.date)

In [5]:
def stat_test(*data):
    levene_f_statistic, levene_p_value = levene(*data)
    # print(f'Levene F-statistic: {levene_f_statistic}')
    print(f'Levene p-value: {levene_p_value}')

    if levene_p_value > 0.05:
        t, p = ttest_ind(*data)
        print(f'Equal variance t-test stat: {round(t, 3)} ----', t)
        print(f'Equal variance t-test p-value: {round(p, 3)}')

        # Calculate degrees of freedom for equal variances
        n1 = len(data[0])
        n2 = len(data[1])
        dof = n1 + n2 - 2
        print(f"n1: {n1}, n2: {n2}, Degrees of freedom (equal variance): {dof}")

    else:
        t, p = ttest_ind(*data, equal_var=False)
        print(f'Unequal variance t-test stat: {round(t, 3)} ----', t)
        print(f'Unequal variance t-test p-value: {p}')

        group1 = np.array(data[0])
        group2 = np.array(data[1])

        n1 = len(group1)
        n2 = len(group2)

        var1 = np.var(group1, ddof=1)
        var2 = np.var(group2, ddof=1)

        # Welch-Satterthwaite equation for degrees of freedom
        numerator = (var1/n1 + var2/n2)**2
        denominator = ((var1/n1)**2 / (n1 - 1)) + ((var2/n2)**2 / (n2 - 1))
        dof = numerator / denominator
        
        print(f"n1: {n1}, n2: {n2}, degree of freedom (unequal variance): {dof}")

    if p < 0.05:
        print('Reject the null hypothesis - SIGNIFICANT difference between groups')
    else:
        print('Fail to reject the null hypothesis - NO-SIGNIFICANT difference between groups')

### Topic 1: International Politics	

In [6]:
# International Politics categories
int_df = df[df['category'] == "International Politics"]
int_df['week_year'] = int_df['post_datetime'].dt.strftime('%Y-%W')
int_df['weekday_number'] = int_df['post_datetime'].dt.weekday

int_result = int_df.groupby(['phase', 'week_year']).agg(post_count=('post_id', 'count'),
                                                    days_captured=('weekday_number', 
                                                                   lambda x: x.max() - x.min() + 1)).reset_index()

int_result['mean_posts'] = round(int_result['post_count'] / int_result['days_captured']).astype(int)
int_result = int_result[1:]
int_result = int_result[~((int_result['phase'] == "phase 6") & (int_result['week_year'] == '2023-00'))]

int_result = int_result.reset_index(drop=True)

int_grouped_data = [int_result['mean_posts'][int_result['phase'] == phase] 
                    for phase in int_result['phase'].unique()]

In [7]:
# int_f_statistic, int_p_value = f_oneway(*int_grouped_data)

#### ANOVA results

In [8]:
model = ols('mean_posts ~ C(phase)', int_result).fit()

# Perform ANOVA and print the results
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

                 sum_sq    df        F    PR(>F)
C(phase)   54099.093362   6.0  1.74326  0.126874
Residual  305160.679365  59.0      NaN       NaN


#### T-test results

In [10]:
# contrast 1: 0 vs rest
print("Contrast 1: 0 vs rest\n")

stat_test(int_result['mean_posts'][int_result['phase'].isin(["phase 0"])], 
     int_result['mean_posts'][int_result['phase'].isin(["phase 1", "phase 2", "phase 3", "phase 4", "phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6
print("Contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6\n")

stat_test(int_result['mean_posts'][int_result['phase'].isin(["phase 1", "phase 2", "phase 3"])], 
     int_result['mean_posts'][int_result['phase'].isin(["phase 4", "phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 3: phase1, phase2 vs phase3
print("Contrast 3: phase1, phase2 vs phase3\n")

stat_test(int_result['mean_posts'][int_result['phase'].isin(["phase 1", "phase 2"])],
        int_result['mean_posts'][int_result['phase'].isin(["phase 3"])])

print("=====================================================================================================")

# contrast 4: phase4 vs phase5, phase6
print("Contrast 4: phase4 vs phase5, phase6\n")

stat_test(int_result['mean_posts'][int_result['phase'].isin(["phase 4"])],
        int_result['mean_posts'][int_result['phase'].isin(["phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 5: phase1 vs phase2
print("Contrast 5: phase1 vs phase2\n")

stat_test(int_result['mean_posts'][int_result['phase'].isin(["phase 1"])],
        int_result['mean_posts'][int_result['phase'].isin(["phase 2"])])

print("=====================================================================================================")

# contrast 6: phase5 vs phase6
print("Contrast 6: phase5 vs phase6\n")

stat_test(int_result['mean_posts'][int_result['phase'].isin(["phase 5"])],
        int_result['mean_posts'][int_result['phase'].isin(["phase 6"])])

print("=====================================================================================================")

Contrast 1: 0 vs rest

Levene p-value: 0.128332966793341
Equal variance t-test stat: 0.247 ---- 0.24723333946026232
Equal variance t-test p-value: 0.806
n1: 8, n2: 58, Degrees of freedom (equal variance): 64
Fail to reject the null hypothesis - NO-SIGNIFICANT difference between groups
Contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6

Levene p-value: 0.43802143071223487
Equal variance t-test stat: -0.448 ---- -0.4480159511772867
Equal variance t-test p-value: 0.656
n1: 22, n2: 36, Degrees of freedom (equal variance): 56
Fail to reject the null hypothesis - NO-SIGNIFICANT difference between groups
Contrast 3: phase1, phase2 vs phase3

Levene p-value: 0.14302820157757826
Equal variance t-test stat: 1.516 ---- 1.5159847775748998
Equal variance t-test p-value: 0.145
n1: 13, n2: 9, Degrees of freedom (equal variance): 20
Fail to reject the null hypothesis - NO-SIGNIFICANT difference between groups
Contrast 4: phase4 vs phase5, phase6

Levene p-value: 0.6466554498004444
Equal vari

### Topic 2: Russian Domestic Affairs

In [11]:
rus_df = df[df['category'] == "Russian domestic politics"]
rus_df['week_year'] = rus_df['post_datetime'].dt.strftime('%Y-%W')
rus_df['weekday_number'] = rus_df['post_datetime'].dt.weekday

rus_result = rus_df.groupby(['phase', 'week_year']).agg(post_count=('post_id', 'count'),
                                                    days_captured=('weekday_number', 
                                                                   lambda x: x.max() - x.min() + 1)).reset_index()

rus_result['mean_posts'] = round(rus_result['post_count'] / rus_result['days_captured']).astype(int)
rus_result = rus_result[1:]
rus_result = rus_result[~((rus_result['phase'] == "phase 6") & (rus_result['week_year'] == '2023-00'))]

rus_result = rus_result.reset_index(drop=True)

res_grouped_data = [rus_result['mean_posts'][rus_result['phase'] == phase] 
                    for phase in rus_result['phase'].unique()]

In [12]:
# res_p_value = f_oneway(*res_grouped_data)

#### ANOVA results

In [13]:
model = ols('mean_posts ~ C(phase)', rus_result).fit()

# Perform ANOVA and print the results
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

                 sum_sq    df         F    PR(>F)
C(phase)  185051.756066   6.0  5.954099  0.000065
Residual  305617.274237  59.0       NaN       NaN


#### t-test results

In [14]:
# contrast 1: 0 vs rest
print("Contrast 1: 0 vs rest\n")

stat_test(rus_result['mean_posts'][rus_result['phase'].isin(["phase 0"])], 
     rus_result['mean_posts'][rus_result['phase'].isin(["phase 1", "phase 2", "phase 3", "phase 4", "phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6
print("Contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6\n")

stat_test(rus_result['mean_posts'][rus_result['phase'].isin(["phase 1", "phase 2", "phase 3"])], 
     rus_result['mean_posts'][rus_result['phase'].isin(["phase 4", "phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 3: phase1, phase2 vs phase3
print("Contrast 3: phase1, phase2 vs phase3\n")

stat_test(rus_result['mean_posts'][rus_result['phase'].isin(["phase 1", "phase 2"])],
        rus_result['mean_posts'][rus_result['phase'].isin(["phase 3"])])

print("=====================================================================================================")

# contrast 4: phase4 vs phase5, phase6
print("Contrast 4: phase4 vs phase5, phase6\n")

stat_test(rus_result['mean_posts'][rus_result['phase'].isin(["phase 4"])],
        rus_result['mean_posts'][rus_result['phase'].isin(["phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 5: phase1 vs phase2
print("Contrast 5: phase1 vs phase2\n")

stat_test(rus_result['mean_posts'][rus_result['phase'].isin(["phase 1"])],
        rus_result['mean_posts'][rus_result['phase'].isin(["phase 2"])])

print("=====================================================================================================")

# contrast 6: phase5 vs phase6
print("Contrast 6: phase5 vs phase6\n")

stat_test(rus_result['mean_posts'][rus_result['phase'].isin(["phase 5"])],
        rus_result['mean_posts'][rus_result['phase'].isin(["phase 6"])])

print("=====================================================================================================")

Contrast 1: 0 vs rest

Levene p-value: 0.3202943542231444
Equal variance t-test stat: -2.012 ---- -2.012128433034128
Equal variance t-test p-value: 0.048
n1: 8, n2: 58, Degrees of freedom (equal variance): 64
Reject the null hypothesis - SIGNIFICANT difference between groups
Contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6

Levene p-value: 0.2781811180031395
Equal variance t-test stat: 1.104 ---- 1.1043453536549581
Equal variance t-test p-value: 0.274
n1: 22, n2: 36, Degrees of freedom (equal variance): 56
Fail to reject the null hypothesis - NO-SIGNIFICANT difference between groups
Contrast 3: phase1, phase2 vs phase3

Levene p-value: 0.13359836139533843
Equal variance t-test stat: 1.486 ---- 1.485803233403784
Equal variance t-test p-value: 0.153
n1: 13, n2: 9, Degrees of freedom (equal variance): 20
Fail to reject the null hypothesis - NO-SIGNIFICANT difference between groups
Contrast 4: phase4 vs phase5, phase6

Levene p-value: 0.10386415911966984
Equal variance t-test s

### Topic 3: Combat and Frontline Updates

In [15]:
war_df = df[df['category'] == "Combat and Frontline updates"]
war_df['week_year'] = war_df['post_datetime'].dt.strftime('%Y-%W')
war_df['weekday_number'] = war_df['post_datetime'].dt.weekday

war_result = war_df.groupby(['phase', 'week_year']).agg(post_count=('post_id', 'count'),
                                                    days_captured=('weekday_number', 
                                                                   lambda x: x.max() - x.min() + 1)).reset_index()

war_result['mean_posts'] = round(war_result['post_count'] / war_result['days_captured']).astype(int)
war_result = war_result[1:]
war_result = war_result[~((war_result['phase'] == "phase 6") & (war_result['week_year'] == '2023-00'))]

war_result = war_result.reset_index(drop=True)

war_grouped_data = [war_result['mean_posts'][war_result['phase'] == phase] 
                    for phase in war_result['phase'].unique()]

In [16]:
# war_f_statistic, war_p_value = f_oneway(*war_grouped_data)

#### ANOVA results

In [17]:
model = ols('mean_posts ~ C(phase)', war_result).fit()

# Perform ANOVA and print the results
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

                 sum_sq    df          F        PR(>F)
C(phase)  308127.706600   5.0  66.007934  4.526769e-21
Residual   47614.012698  51.0        NaN           NaN


#### t-test for contrast weights

In [18]:
# contrast 1: 0 vs rest
print("Contrast 1: 0 vs rest\n")

stat_test(war_result['mean_posts'][war_result['phase'].isin(["phase 0"])], 
     war_result['mean_posts'][war_result['phase'].isin(["phase 1", "phase 2", "phase 3", "phase 4", "phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6
print("Contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6\n")

stat_test(war_result['mean_posts'][war_result['phase'].isin(["phase 1", "phase 2", "phase 3"])], 
     war_result['mean_posts'][war_result['phase'].isin(["phase 4", "phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 3: phase1, phase2 vs phase3
print("Contrast 3: phase1, phase2 vs phase3\n")

stat_test(war_result['mean_posts'][war_result['phase'].isin(["phase 1", "phase 2"])],
        war_result['mean_posts'][war_result['phase'].isin(["phase 3"])])

print("=====================================================================================================")

# contrast 4: phase4 vs phase5, phase6
print("Contrast 4: phase4 vs phase5, phase6\n")

stat_test(war_result['mean_posts'][war_result['phase'].isin(["phase 4"])],
        war_result['mean_posts'][war_result['phase'].isin(["phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 5: phase1 vs phase2
print("Contrast 5: phase1 vs phase2\n")

stat_test(war_result['mean_posts'][war_result['phase'].isin(["phase 1"])],
        war_result['mean_posts'][war_result['phase'].isin(["phase 2"])])

print("=====================================================================================================")

# contrast 6: phase5 vs phase6
print("Contrast 6: phase5 vs phase6\n")

stat_test(war_result['mean_posts'][war_result['phase'].isin(["phase 5"])],
        war_result['mean_posts'][war_result['phase'].isin(["phase 6"])])

print("=====================================================================================================")

Contrast 1: 0 vs rest

Levene p-value: nan
Unequal variance t-test stat: nan ---- nan
Unequal variance t-test p-value: nan
n1: 0, n2: 57, degree of freedom (unequal variance): nan
Fail to reject the null hypothesis - NO-SIGNIFICANT difference between groups
Contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6

Levene p-value: 0.0078282126264377
Unequal variance t-test stat: 8.002 ---- 8.001844700810185
Unequal variance t-test p-value: 1.0260983162288306e-08
n1: 21, n2: 36, degree of freedom (unequal variance): 28.0104090927572
Reject the null hypothesis - SIGNIFICANT difference between groups
Contrast 3: phase1, phase2 vs phase3

Levene p-value: 0.055043334142563224
Equal variance t-test stat: 3.16 ---- 3.1604260324130147
Equal variance t-test p-value: 0.005
n1: 12, n2: 9, Degrees of freedom (equal variance): 19
Reject the null hypothesis - SIGNIFICANT difference between groups
Contrast 4: phase4 vs phase5, phase6

Levene p-value: 0.00013510854892727377
Unequal variance t-test 

### Topic 4: Ukrainian Domestic Affairs

In [20]:
ukr_df = df[df['category'] == "Ukranian domestic affairs"]
ukr_df['week_year'] = ukr_df['post_datetime'].dt.strftime('%Y-%W')
ukr_df['weekday_number'] = ukr_df['post_datetime'].dt.weekday

ukr_result = ukr_df.groupby(['phase', 'week_year']).agg(post_count=('post_id', 'count'),
                                                    days_captured=('weekday_number', 
                                                                   lambda x: x.max() - x.min() + 1)).reset_index()

ukr_result['mean_posts'] = round(ukr_result['post_count'] / ukr_result['days_captured']).astype(int)
ukr_result = ukr_result[1:]
ukr_result = ukr_result[~((ukr_result['phase'] == "phase 6") & (ukr_result['week_year'] == '2023-00'))]

ukr_result = ukr_result.reset_index(drop=True)

ukr_grouped_data = [ukr_result['mean_posts'][ukr_result['phase'] == phase] 
                    for phase in ukr_result['phase'].unique()]

In [22]:
# ukr_f_statistic, ukr_p_value = f_oneway(*ukr_grouped_data)
# # Print the results
# print(f'F-statistic: {ukr_f_statistic}')
# print(f'P-value: {ukr_p_value}')

In [23]:
model = ols('mean_posts ~ C(phase)', ukr_result).fit()

# Perform ANOVA and print the results
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

                 sum_sq    df          F        PR(>F)
C(phase)  447742.278211   6.0  24.902783  1.719406e-14
Residual  176799.479365  59.0        NaN           NaN


### Topic 5: Economy 

In [24]:
eco_df = df[df['category'] == "Economy"]
eco_df['week_year'] = eco_df['post_datetime'].dt.strftime('%Y-%W')
eco_df['weekday_number'] = eco_df['post_datetime'].dt.weekday

eco_result = eco_df.groupby(['phase', 'week_year']).agg(post_count=('post_id', 'count'),
                                                    days_captured=('weekday_number', 
                                                                   lambda x: x.max() - x.min() + 1)).reset_index()

eco_result['mean_posts'] = round(eco_result['post_count'] / eco_result['days_captured']).astype(int)
eco_result = eco_result[1:]
eco_result = eco_result[~((eco_result['phase'] == "phase 6") & (eco_result['week_year'] == '2023-00'))]

eco_result = eco_result.reset_index(drop=True)

eco_grouped_data = [eco_result['mean_posts'][eco_result['phase'] == phase] 
                    for phase in eco_result['phase'].unique()]

#### ANOVA results

In [25]:
model = ols('mean_posts ~ C(phase)', eco_result).fit()

# Perform ANOVA and print the results
aov_table = sm.stats.anova_lm(model, typ=2)
print(aov_table)

                 sum_sq    df          F        PR(>F)
C(phase)  140352.970596   6.0  30.668319  2.057637e-16
Residual   45002.059707  59.0        NaN           NaN


#### t-test results

In [26]:
# contrast 1: 0 vs rest
print("Contrast 1: 0 vs rest\n")

stat_test(eco_result['mean_posts'][eco_result['phase'].isin(["phase 0"])], 
     eco_result['mean_posts'][eco_result['phase'].isin(["phase 1", "phase 2", "phase 3", "phase 4", "phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6
print("Contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6\n")

stat_test(eco_result['mean_posts'][eco_result['phase'].isin(["phase 1", "phase 2", "phase 3"])], 
     eco_result['mean_posts'][eco_result['phase'].isin(["phase 4", "phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 3: phase1, phase2 vs phase3
print("Contrast 3: phase1, phase2 vs phase3\n")

stat_test(eco_result['mean_posts'][eco_result['phase'].isin(["phase 1", "phase 2"])],
        eco_result['mean_posts'][eco_result['phase'].isin(["phase 3"])])

print("=====================================================================================================")

# contrast 4: phase4 vs phase5, phase6
print("Contrast 4: phase4 vs phase5, phase6\n")

stat_test(eco_result['mean_posts'][eco_result['phase'].isin(["phase 4"])],
        eco_result['mean_posts'][eco_result['phase'].isin(["phase 5", "phase 6"])])

print("=====================================================================================================")

# contrast 5: phase1 vs phase2
print("Contrast 5: phase1 vs phase2\n")

stat_test(eco_result['mean_posts'][eco_result['phase'].isin(["phase 1"])],
        eco_result['mean_posts'][eco_result['phase'].isin(["phase 2"])])

print("=====================================================================================================")

# contrast 6: phase5 vs phase6
print("Contrast 6: phase5 vs phase6\n")

stat_test(eco_result['mean_posts'][eco_result['phase'].isin(["phase 5"])],
        eco_result['mean_posts'][eco_result['phase'].isin(["phase 6"])])

print("=====================================================================================================")

Contrast 1: 0 vs rest

Levene p-value: 0.2104789983120505
Equal variance t-test stat: -1.996 ---- -1.995703723306359
Equal variance t-test p-value: 0.05
n1: 8, n2: 58, Degrees of freedom (equal variance): 64
Fail to reject the null hypothesis - NO-SIGNIFICANT difference between groups
Contrast 2: phase1, phase2, phase3 vs phase4, phase5, phase6

Levene p-value: 0.0016011093310993052
Unequal variance t-test stat: 4.935 ---- 4.934972686518643
Unequal variance t-test p-value: 5.695977394513173e-05
n1: 22, n2: 36, degree of freedom (unequal variance): 22.667528980704525
Reject the null hypothesis - SIGNIFICANT difference between groups
Contrast 3: phase1, phase2 vs phase3

Levene p-value: 0.054921361918412945
Equal variance t-test stat: 0.502 ---- 0.5020032320709616
Equal variance t-test p-value: 0.621
n1: 13, n2: 9, Degrees of freedom (equal variance): 20
Fail to reject the null hypothesis - NO-SIGNIFICANT difference between groups
Contrast 4: phase4 vs phase5, phase6

Levene p-value: 0.7

##### Tukey's test

In [45]:
# Run Tukey's test
tukey = pairwise_tukeyhsd(endog=eco_result['mean_posts'],  # Data
                          groups=eco_result['phase'],  # Groups
                          alpha=0.05)  # Significance level

# Order of results: meandiff, p-adj, lower, upper, reject
print("Tukey result comparision between phase 3 and phase 4:", tukey._results_table.data[15])

print("Tukey result comparision between phase 4 and phase 5:", tukey._results_table.data[-3])

Tukey result comparision between phase 3 and phase 4: ['phase 2', 'phase 6', -28.7033, 0.3024, -68.2231, 10.8166, False]
Tukey result comparision between phase 4 and phase 5: ['phase 4', 'phase 5', -2.8154, 1.0, -38.2733, 32.6425, False]
