# Statistical Tests and Aggregation
### Initial Setup

In [62]:
# Import Depencies
import pandas as pd 

In [66]:
# Load saved and analyzed tweets
df = pd.read_csv('csv/tweets.csv')
df.tail()

Unnamed: 0,created,id_str,text,user_id,screen_name,location,followers_count,user_favourites_count,time_zone,geo_enabled,verified,status_count,geo,coords,retweet_count,tweet_favourite_count,Subjectivity,Polarity,Sentiment
232053,2021-03-15 00:03:00,1371250540449988610,f0lake: No i dont think u understand i need to...,1078018972975476736,jenflowerr,"California, USA",20,2299,,False,False,1808,,,142,0,0.0,0.0,neutral
232054,2021-03-15 00:02:59,1371250536633208834,PeterHotez: It’s why we might eventually move ...,828324944522141696,karenh7463,"Indiana, USA",641,188950,,True,False,53018,,,495,0,0.5,0.25,positive
232055,2021-03-15 00:02:58,1371250532568956935,web_rant: mikeallen axios kadiagoba Gosh Mike ...,831767249723731981,randypilsr,"Arkansas, USA",456,22502,,True,False,12150,,,2,0,0.1,0.0,neutral
232056,2021-03-15 00:02:57,1371250531516223488,Reuters: AstraZeneca finds no evidence of incr...,290224215,cadenjames1,Netherlands,141,172,,True,False,11705,,,124,0,0.0,0.0,neutral
232057,2021-03-15 00:02:57,1371250530123718660,Found my old WHO vaccine cert booklet from whe...,92157908,Juliagoolia1982,Anywhere but here,761,16382,,True,False,13107,,,0,0,0.2,0.1,positive


In [67]:
# Function to highlight which drug company was mentioned in the tweet (if any)
def GetManufacturer(txt):
    txt = txt.lower()
    if 'moderna' in txt:
        return 'mo'
    elif 'pfizer' in txt: 
        return 'pf'
    elif 'astra' in txt:
        return 'az'
    else:
        return '0'

In [68]:
# Applying the function above
df['manufacturer'] = df['text'].apply(GetManufacturer)
df.tail()

Unnamed: 0,created,id_str,text,user_id,screen_name,location,followers_count,user_favourites_count,time_zone,geo_enabled,verified,status_count,geo,coords,retweet_count,tweet_favourite_count,Subjectivity,Polarity,Sentiment,manufacturer
232053,2021-03-15 00:03:00,1371250540449988610,f0lake: No i dont think u understand i need to...,1078018972975476736,jenflowerr,"California, USA",20,2299,,False,False,1808,,,142,0,0.0,0.0,neutral,pf
232054,2021-03-15 00:02:59,1371250536633208834,PeterHotez: It’s why we might eventually move ...,828324944522141696,karenh7463,"Indiana, USA",641,188950,,True,False,53018,,,495,0,0.5,0.25,positive,mo
232055,2021-03-15 00:02:58,1371250532568956935,web_rant: mikeallen axios kadiagoba Gosh Mike ...,831767249723731981,randypilsr,"Arkansas, USA",456,22502,,True,False,12150,,,2,0,0.1,0.0,neutral,0
232056,2021-03-15 00:02:57,1371250531516223488,Reuters: AstraZeneca finds no evidence of incr...,290224215,cadenjames1,Netherlands,141,172,,True,False,11705,,,124,0,0.0,0.0,neutral,az
232057,2021-03-15 00:02:57,1371250530123718660,Found my old WHO vaccine cert booklet from whe...,92157908,Juliagoolia1982,Anywhere but here,761,16382,,True,False,13107,,,0,0,0.2,0.1,positive,0


In [69]:
# Select only the columns being used in this workbook
df = df[['id_str','retweet_count','tweet_favourite_count','Subjectivity','Polarity','manufacturer']]

### Making the Polarity of tweets easier to visualize 

In [71]:
# Create a new dataframe for the percentage polarity analysis
polarity_df = df
polarity_df['Subjectivity'] = polarity_df['Subjectivity'].round(decimals=1)
polarity_df['Polarity'] = polarity_df['Polarity'].round(decimals=1)
polarity_df = polarity_df.drop(columns=['id_str'])
polarity_df.tail(3)

Unnamed: 0,retweet_count,tweet_favourite_count,Subjectivity,Polarity,manufacturer
232055,2,0,0.1,0.0,0
232056,124,0,0.0,0.0,az
232057,0,0,0.2,0.1,0


In [72]:
# exploring the dataset
df2 = polarity_df.groupby(by =['manufacturer','Polarity']).sum()
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,retweet_count,tweet_favourite_count,Subjectivity
manufacturer,Polarity,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,-1.0,100855,159,182.4
0,-0.9,616,24,13.8
0,-0.8,3698,477,142.6
0,-0.7,880,140,67.6
0,-0.6,44104,416,277.8
...,...,...,...,...
pf,0.6,89742,2916,1036.3
pf,0.7,158,99,36.2
pf,0.8,5492,473,174.9
pf,0.9,352,73,19.8


In [73]:
# Can remove even more columns
polarity_df = polarity_df[['manufacturer','Polarity']]

In [74]:
# Create groupby object by manufacturer
by_man = polarity_df.groupby(by=['manufacturer'])

In [75]:
# Perform value counts by polarity
# Normalised because of the large gaps between polarity points
pf = by_man['Polarity'].value_counts(normalize=True).loc['pf']
az = by_man['Polarity'].value_counts(normalize=True).loc['az']
mo = by_man['Polarity'].value_counts(normalize=True).loc['mo']
un = by_man['Polarity'].value_counts(normalize=True).loc['0']

In [76]:
# Combine polarity counts into datafram and visualize
polarity = pd.concat(
    [pf,az,mo,un], 
    axis=1, 
    keys=['Pfizer-BioNTech','AstraZeneca','Moderna','Unknown']
)
polarity = polarity.reset_index()
polarity

Unnamed: 0,Polarity,Pfizer-BioNTech,AstraZeneca,Moderna,Unknown
0,-1.0,0.000715,0.00041,0.000517,0.001169
1,-0.9,0.000204,7.2e-05,0.000222,0.000102
2,-0.8,0.000817,0.000868,0.002661,0.000978
3,-0.7,0.000664,0.000554,0.001922,0.000559
4,-0.6,0.001123,0.004121,0.00643,0.002013
5,-0.5,0.005463,0.002988,0.003769,0.00658
6,-0.4,0.009394,0.009254,0.006652,0.007774
7,-0.3,0.009854,0.011182,0.009534,0.009222
8,-0.2,0.032217,0.028436,0.025868,0.024281
9,-0.1,0.032727,0.097646,0.039394,0.038147


In [None]:
# save a copy
# polarity.to_csv('csv/polarity.csv',index=False, encoding='UTF-8')

In [49]:

polarity

Unnamed: 0,Polarity,Pfizer-BioNTech,AstraZeneca,Moderna,Unknown
0,-1.0,0.000715,0.00041,0.000517,0.001169
1,-0.9,0.000204,7.2e-05,0.000222,0.000102
2,-0.8,0.000817,0.000868,0.002661,0.000978
3,-0.7,0.000664,0.000554,0.001922,0.000559
4,-0.6,0.001123,0.004121,0.00643,0.002013
5,-0.5,0.005463,0.002988,0.003769,0.00658
6,-0.4,0.009394,0.009254,0.006652,0.007774
7,-0.3,0.009854,0.011182,0.009534,0.009222
8,-0.2,0.032217,0.028436,0.025868,0.024281
9,-0.1,0.032727,0.097646,0.039394,0.038147


### Comparing the polarity scores across Manufacturers

In [77]:
# For stats 
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html
from scipy import stats

In [79]:
df2 = df[['retweet_count','tweet_favourite_count','Polarity','manufacturer']]
df2

Unnamed: 0,retweet_count,tweet_favourite_count,Polarity,manufacturer
0,0,1,0.0,pf
1,92,0,0.2,0
2,0,1,0.5,0
3,102,0,0.2,pf
4,1,0,0.0,0
...,...,...,...,...
232053,142,0,0.0,pf
232054,495,0,0.2,mo
232055,2,0,0.0,0
232056,124,0,0.0,az


In [80]:
# create a new groupby ojbect by manufacturer 
df2_man = df.groupby(by=['manufacturer'])

In [82]:
df2_man.mean()

Unnamed: 0_level_0,id_str,retweet_count,tweet_favourite_count,Subjectivity,Polarity
manufacturer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1.370442e+18,3663.978164,4.151971,0.339735,0.172315
az,1.37056e+18,359.724414,2.170494,0.216722,0.043027
mo,1.37056e+18,1486.531633,4.978271,0.29184,0.09252
pf,1.370516e+18,167.42704,5.882569,0.346579,0.130522


In [84]:
small_df = df[['Polarity','manufacturer']]

In [85]:
az_df = small_df.loc[small_df.manufacturer =='az']
mo_df = small_df.loc[small_df.manufacturer =='mo']
pf_df = small_df.loc[small_df.manufacturer =='pf']
un_df = small_df.loc[small_df.manufacturer =='0']

In [87]:
# Perform welch's ttest (assumes unequal variances)
az_mo = stats.ttest_ind(az_df.Polarity, mo_df.Polarity, equal_var = False)
az_mo

Ttest_indResult(statistic=-23.059598150244078, pvalue=2.988406103494073e-116)

In [88]:
az_pf = stats.ttest_ind(az_df.Polarity, pf_df.Polarity, equal_var = False)
az_pf

Ttest_indResult(statistic=-44.17353918862547, pvalue=0.0)

In [89]:
az_un = stats.ttest_ind(az_df.Polarity, un_df.Polarity, equal_var = False)
az_un

Ttest_indResult(statistic=-102.87225840343122, pvalue=0.0)

In [90]:
mo_pf = stats.ttest_ind(mo_df.Polarity, pf_df.Polarity, equal_var = False)
mo_pf

Ttest_indResult(statistic=-14.884349261786252, pvalue=6.238421682411701e-50)

In [91]:
mo_un = stats.ttest_ind(mo_df.Polarity, un_df.Polarity, equal_var = False)
mo_un

Ttest_indResult(statistic=-39.053674950792164, pvalue=0.0)

In [92]:
pf_un = stats.ttest_ind(pf_df.Polarity, un_df.Polarity, equal_var = False)
pf_un

Ttest_indResult(statistic=-22.36760411450008, pvalue=7.527063949692831e-110)