In [20]:
import numpy as np
from tqdm import tqdm
from scipy.stats import pearsonr, spearmanr
import scipy.sparse as sparse
from scipy.stats import bernoulli, poisson
import analysis_utils_mine as utils

import json
import pandas as pd
import ast
from datetime import datetime
import torch
import pandas as pd
from datetime import datetime, timedelta
import pickle

import matplotlib.pyplot as plt
import xlsxwriter
from sklearn.metrics import cohen_kappa_score

In [2]:
import os

In [3]:
def get_dataframe_from_annotated_xlsx_file_path(path, required_col='Topic Name'):
    """Load the annotated rows of an annotation workbook.

    Reads the worksheet named 'Sheet1' from the .xlsx file at ``path`` and
    keeps only the rows whose ``required_col`` cell is filled in.

    Parameters
    ----------
    path : str or path-like
        Location of the annotated .xlsx workbook.
    required_col : str, default 'Topic Name'
        Column that must be non-null for a row to be kept.

    Returns
    -------
    pandas.DataFrame
        The kept rows, with their original row labels. The frame is an
        independent copy, so columns can be assigned on it later without
        pandas' chained-assignment warning.
    """
    # Only 'Sheet1' is ever used, so ask for it directly instead of parsing
    # every sheet of the workbook.
    sheet = pd.read_excel(path,
                          sheet_name='Sheet1',
                          engine='openpyxl')
    return sheet[sheet[required_col].notna()].copy()

In [7]:
annotator1_pre_tbip_speeches = get_dataframe_from_annotated_xlsx_file_path('venue_diff_polsci/pre_tbip_annotation_results/annotator_1/topics_for_annotation.xlsx_frazier_speech.xlsx')
annotator1_pre_tbip_speeches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 1 to 1569
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Topic           50 non-null     object 
 1   Unnamed: 1      0 non-null      float64
 2   Coherence       50 non-null     float64
 3   Polarization    48 non-null     float64
 4   Topic Name      50 non-null     object 
 5   Description     48 non-null     object 
 6   Notes/Comments  10 non-null     object 
dtypes: float64(3), object(4)
memory usage: 3.1+ KB


In [37]:
annotator1_pre_tbip_tweets = get_dataframe_from_annotated_xlsx_file_path('venue_diff_polsci/pre_tbip_annotation_results/annotator_1/topics_for_annotation.xlsx_Frazier_tweet.xlsx')
annotator1_pre_tbip_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 1 to 1569
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Topic           50 non-null     object 
 1   Unnamed: 1      0 non-null      float64
 2   Coherence       50 non-null     float64
 3   Polarization    50 non-null     float64
 4   Topic Name      50 non-null     object 
 5   Description     50 non-null     object 
 6   Notes/Comments  50 non-null     object 
dtypes: float64(3), object(4)
memory usage: 3.1+ KB


In [38]:
annotator2_pre_tbip_speeches = get_dataframe_from_annotated_xlsx_file_path('venue_diff_polsci/pre_tbip_annotation_results/annotator_2/topics_for_annotation_Hightower_speech.xlsx')
annotator2_pre_tbip_speeches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 1 to 1569
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Topic           50 non-null     object 
 1   Unnamed: 1      0 non-null      float64
 2   Coherence       50 non-null     float64
 3   Polarization    42 non-null     float64
 4   Topic Name      50 non-null     object 
 5   Description     43 non-null     object 
 6   Notes/Comments  16 non-null     object 
dtypes: float64(3), object(4)
memory usage: 3.1+ KB


In [39]:
annotator2_pre_tbip_tweets = get_dataframe_from_annotated_xlsx_file_path('venue_diff_polsci/pre_tbip_annotation_results/annotator_2/topics_for_annotation_Hightower_tweets.xlsx')
annotator2_pre_tbip_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 1 to 1569
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Topic           50 non-null     object 
 1   Unnamed: 1      0 non-null      float64
 2   Coherence       50 non-null     float64
 3   Polarization    39 non-null     float64
 4   Topic Name      50 non-null     object 
 5   Description     38 non-null     object 
 6   Notes/Comments  22 non-null     object 
dtypes: float64(3), object(4)
memory usage: 3.1+ KB


In [11]:
# Maps each spelling variant of the discard marker (including the
# misspelling 'Disgard') to the single canonical label 'DISCARD'.
discard_labels_to_one_discard_label_map = dict.fromkeys(
    ('DISCARD', 'Discard', 'Disgard', 'discard'),
    'DISCARD',
)
                                           

![image.png](attachment:4141b89e-16d5-492d-a639-a79545d856b6.png)![image.png](attachment:a57cf57f-c0da-4381-9312-368689384f69.png)

## Pre-TBIP results

_Coherence:_ Rate the topic, on a 1-3 scale, on its coherence (1 being not coherent, 3 being very coherent). Does that topic represent an easily identifiable category or a meaningful concept? The top words for the topic, along with the top documents associated with that topic help make this judgment. Broadly, a set of items can be said to be coherent if they enable human recognition of an identifiable category when viewed together. 


_Polarization_: If the topic is rated above 1 for coherence, rate the topic, on a 1-3 scale (1 being not polarized, 3 being polarized), on the expected polarization of this topic: do you expect meaningful ideological differences in the way liberals and conservatives would talk about the category or concept or issue or the stance they would hold on that issue? The top words for the topic, along with the top documents associated with that topic, as well as your personal knowledge of American politics will help make this judgment. 

Speeches

In [12]:
set(annotator1_pre_tbip_speeches['Coherence'])

{1.0, 2.0, 3.0}

In [26]:
set(annotator2_pre_tbip_speeches['Coherence'])

{1.0, 2.0, 3.0}

In [13]:
set(annotator1_pre_tbip_speeches['Polarization'])

{nan, 1.0, 3.0, nan}

In [27]:
set(annotator2_pre_tbip_speeches['Polarization'])

{nan, 1.0, 2.0, 3.0, nan, nan, nan, nan}

In [40]:
def get_stats_col(df, colname):
    """Print the share of rows rated 1, 2 and 3 in column ``colname``.

    Percentages are taken over all rows of ``df``: rows holding a missing
    value (or any rating other than 1, 2 or 3) still count towards the
    denominator, so the three figures need not add up to 100.
    """
    total = len(df)
    ratings = list(df[colname])
    for score in (1, 2, 3):
        share = round(100*(ratings.count(float(score))/total), 2)
        print('% of Topics rated ' + str(score) + ' on ' + colname + ' = ' + str(share) + '%')

In [41]:
print('Annotator 1 - Speeches')
get_stats_col(annotator1_pre_tbip_speeches, 'Coherence')

Annotator 1 - Speeches
% of Topics rated 1 on Coherence = 4.0%
% of Topics rated 2 on Coherence = 2.0%
% of Topics rated 3 on Coherence = 94.0%


In [42]:
print('Annotator 2 - Speeches')
get_stats_col(annotator2_pre_tbip_speeches, 'Coherence')

Annotator 2 - Speeches
% of Topics rated 1 on Coherence = 16.0%
% of Topics rated 2 on Coherence = 6.0%
% of Topics rated 3 on Coherence = 78.0%


In [47]:
cohen_kappa_score(list(annotator1_pre_tbip_speeches['Coherence']),
                  list(annotator2_pre_tbip_speeches['Coherence']))

0.15123456790123457

In [44]:
print('Annotator 1 - Tweets')
get_stats_col(annotator1_pre_tbip_tweets, 'Coherence')

Annotator 1 - Tweets
% of Topics rated 1 on Coherence = 0.0%
% of Topics rated 2 on Coherence = 2.0%
% of Topics rated 3 on Coherence = 98.0%


In [45]:
print('Annotator 2 - Tweets')
get_stats_col(annotator2_pre_tbip_tweets, 'Coherence')

Annotator 2 - Tweets
% of Topics rated 1 on Coherence = 22.0%
% of Topics rated 2 on Coherence = 8.0%
% of Topics rated 3 on Coherence = 70.0%


In [46]:
cohen_kappa_score(list(annotator1_pre_tbip_tweets['Coherence']),
                  list(annotator2_pre_tbip_tweets['Coherence']))

0.03969270166453276

In [48]:
print('Annotator 1 - Speeches')
get_stats_col(annotator1_pre_tbip_speeches, 'Polarization')
print('Annotator 2 - Speeches')
get_stats_col(annotator2_pre_tbip_speeches, 'Polarization')
print('Annotator 1 - Tweets')
get_stats_col(annotator1_pre_tbip_tweets, 'Polarization')
print('Annotator 2 - Tweets')
get_stats_col(annotator2_pre_tbip_tweets, 'Polarization')

Annotator 1 - Speeches
% of Topics rated 1 on Polarization = 44.0%
% of Topics rated 2 on Polarization = 0.0%
% of Topics rated 3 on Polarization = 52.0%
Annotator 2 - Speeches
% of Topics rated 1 on Polarization = 46.0%
% of Topics rated 2 on Polarization = 8.0%
% of Topics rated 3 on Polarization = 30.0%
Annotator 1 - Tweets
% of Topics rated 1 on Polarization = 48.0%
% of Topics rated 2 on Polarization = 0.0%
% of Topics rated 3 on Polarization = 52.0%
Annotator 2 - Tweets
% of Topics rated 1 on Polarization = 52.0%
% of Topics rated 2 on Polarization = 10.0%
% of Topics rated 3 on Polarization = 16.0%


In [49]:
# Replace missing Polarization ratings with 0.0 so that "not rated" becomes its
# own category and the columns are NaN-free for the agreement scores below.
for ratings_frame in (annotator1_pre_tbip_speeches,
                      annotator1_pre_tbip_tweets,
                      annotator2_pre_tbip_speeches,
                      annotator2_pre_tbip_tweets):
    ratings_frame['Polarization'] = ratings_frame['Polarization'].fillna(0.0)
    # DataFrame.info() prints its summary itself and returns None, so wrapping
    # it in print() only appended a stray "None" line after each summary.
    ratings_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 1 to 1569
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Topic           50 non-null     object 
 1   Unnamed: 1      0 non-null      float64
 2   Coherence       50 non-null     float64
 3   Polarization    50 non-null     float64
 4   Topic Name      50 non-null     object 
 5   Description     48 non-null     object 
 6   Notes/Comments  10 non-null     object 
dtypes: float64(3), object(4)
memory usage: 3.1+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 1 to 1569
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Topic           50 non-null     object 
 1   Unnamed: 1      0 non-null      float64
 2   Coherence       50 non-null     float64
 3   Polarization    50 non-null     float64
 4   Topic Name      50 non-null     object 
 5   Descriptio

In [53]:
print('Speeches - Polarization')
cohen_kappa_score(list(annotator1_pre_tbip_speeches['Polarization']),
                  list(annotator2_pre_tbip_speeches['Polarization']))

Speeches - Polarization


0.37027707808564225

In [56]:
print('Tweets - Polarization')
cohen_kappa_score(list(annotator1_pre_tbip_tweets['Polarization']),
                  list(annotator2_pre_tbip_tweets['Polarization']))

Tweets - Polarization


0.22062350119904073

In [57]:
def get_topic_to_annotator_to_ratings(df1, df2):
    """Pair up both annotators' ratings for every topic.

    ``df1`` and ``df2`` must list exactly the same values, in the same order,
    in their 'Topic' column; otherwise an AssertionError is raised.

    Returns a dict keyed by topic whose values have the form
    ``{'annotator1': {'Coherence': ..., 'Polarization': ...},
       'annotator2': {'Coherence': ..., 'Polarization': ...}}``
    with annotator1 taken from ``df1`` and annotator2 from ``df2``.
    """
    topics = list(df1['Topic'])
    assert topics == list(df2['Topic'])

    rows = zip(topics,
               list(df1['Coherence']), list(df2['Coherence']),
               list(df1['Polarization']), list(df2['Polarization']))
    ratings_by_topic = {}
    for topic, coherence1, coherence2, polarization1, polarization2 in rows:
        first = {'Coherence': coherence1, 'Polarization': polarization1}
        second = {'Coherence': coherence2, 'Polarization': polarization2}
        ratings_by_topic[topic] = {'annotator1': first, 'annotator2': second}
    return ratings_by_topic

In [58]:
speech_topic_to_annotator_to_ratings = get_topic_to_annotator_to_ratings(annotator1_pre_tbip_speeches,
                                                                         annotator2_pre_tbip_speeches)

tweet_topic_to_annotator_to_ratings = get_topic_to_annotator_to_ratings(annotator1_pre_tbip_tweets,
                                                                         annotator2_pre_tbip_tweets)

In [67]:
def get_dataframe_from_annotated_xlsx_file_path(path, required_col='Issue'):
    """Load the annotated rows of a post-TBIP annotation workbook.

    Reads the worksheet named 'Sheet1' from the .xlsx file at ``path`` and
    keeps only the rows whose ``required_col`` cell is filled in.

    NOTE: this definition replaces the same-named loader defined earlier in
    the notebook, which filtered on 'Topic Name'. Once this cell has run,
    pre-TBIP workbooks can still be loaded by passing
    ``required_col='Topic Name'`` explicitly.

    Parameters
    ----------
    path : str or path-like
        Location of the annotated .xlsx workbook.
    required_col : str, default 'Issue'
        Column that must be non-null for a row to be kept.

    Returns
    -------
    pandas.DataFrame
        The kept rows, with their original row labels, as an independent
        copy of the sheet data.
    """
    # Only 'Sheet1' is ever used, so ask for it directly instead of parsing
    # every sheet of the workbook.
    sheet = pd.read_excel(path,
                          sheet_name='Sheet1',
                          engine='openpyxl')
    return sheet[sheet[required_col].notna()].copy()

In [68]:
annotator1_post_tbip_speeches = get_dataframe_from_annotated_xlsx_file_path('venue_diff_polsci/post_tbip_annotation_results/annotation_file_speeches_fg.xlsx')
annotator1_post_tbip_speeches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42 entries, 1 to 944
Data columns (total 12 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Unnamed: 0                            0 non-null      object 
 1   Unnamed: 1                            0 non-null      float64
 2   Issue                                 42 non-null     object 
 3   Unnamed: 3                            0 non-null      object 
 4   Unnamed: 4                            0 non-null      float64
 5   Unnamed: 5                            0 non-null      float64
 6   Label Applicability for a)            42 non-null     object 
 7   Label Applicability for b)            42 non-null     object 
 8   Ideological Polarization              42 non-null     object 
 9   Ideological Position expressed in a)  32 non-null     object 
 10  Ideological Position expressed in b)  32 non-null     object 
 11  Notes/Comments      

In [69]:
annotator2_post_tbip_speeches = get_dataframe_from_annotated_xlsx_file_path('venue_diff_polsci/post_tbip_annotation_results/annotation_file_speeches_completed.xlsx')
annotator2_post_tbip_speeches.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42 entries, 1 to 944
Data columns (total 12 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Unnamed: 0                            0 non-null      object 
 1   Unnamed: 1                            0 non-null      float64
 2   Issue                                 42 non-null     object 
 3   Unnamed: 3                            0 non-null      object 
 4   Unnamed: 4                            0 non-null      float64
 5   Unnamed: 5                            0 non-null      float64
 6   Label Applicability for a)            42 non-null     object 
 7   Label Applicability for b)            42 non-null     object 
 8   Ideological Polarization              42 non-null     object 
 9   Ideological Position expressed in a)  34 non-null     object 
 10  Ideological Position expressed in b)  34 non-null     object 
 11  Notes/Comments      

In [70]:
annotator1_post_tbip_tweets = get_dataframe_from_annotated_xlsx_file_path('venue_diff_polsci/post_tbip_annotation_results/annotation_file_tweets_fg.xlsx')
annotator1_post_tbip_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40 entries, 1 to 898
Data columns (total 12 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Unnamed: 0                            0 non-null      object 
 1   Unnamed: 1                            0 non-null      float64
 2   Issue                                 40 non-null     object 
 3   Unnamed: 3                            0 non-null      object 
 4   Unnamed: 4                            0 non-null      float64
 5   Unnamed: 5                            0 non-null      float64
 6   Label Applicability for a)            40 non-null     object 
 7   Label Applicability for b)            40 non-null     object 
 8   Ideological Polarization              40 non-null     object 
 9   Ideological Position expressed in a)  30 non-null     object 
 10  Ideological Position expressed in b)  30 non-null     object 
 11  Notes/Comments      

In [71]:
annotator2_post_tbip_tweets = get_dataframe_from_annotated_xlsx_file_path('venue_diff_polsci/post_tbip_annotation_results/annotation_file_tweets_completed.xlsx')
annotator2_post_tbip_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40 entries, 1 to 898
Data columns (total 12 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Unnamed: 0                            0 non-null      object 
 1   Unnamed: 1                            0 non-null      float64
 2   Issue                                 40 non-null     object 
 3   Unnamed: 3                            0 non-null      object 
 4   Unnamed: 4                            0 non-null      float64
 5   Unnamed: 5                            0 non-null      float64
 6   Label Applicability for a)            40 non-null     object 
 7   Label Applicability for b)            40 non-null     object 
 8   Ideological Polarization              40 non-null     object 
 9   Ideological Position expressed in a)  33 non-null     object 
 10  Ideological Position expressed in b)  33 non-null     object 
 11  Notes/Comments      

In [74]:
def get_label_app_stats(df, colname):
    """Print how often the label in ``colname`` is, might be, or is not applicable.

    Each cell of ``colname`` is matched by substring, first hit wins:
    'IS about', then 'MIGHT', then 'IS NOT'. Cells matching none of the three
    are not tallied but still count towards the denominator, which is the
    number of rows in ``df``.
    """
    total = len(df)
    # Ordered like an if/elif chain: the first marker found decides the bucket.
    markers = ('IS about', 'MIGHT', 'IS NOT')
    tally = dict.fromkeys(markers, 0)
    for answer in list(df[colname]):
        for marker in markers:
            if marker in answer:
                tally[marker] += 1
                break

    phrases = {'IS about': '% of Topics that are about the issue wrt ',
               'MIGHT': '% of Topics that might be about the issue wrt ',
               'IS NOT': '% of Topics that are NOT about the issue wrt '}
    for marker in markers:
        share = round(100*(tally[marker]/total), 2)
        print(phrases[marker] + colname + ' = ' + str(share) + '%')

In [75]:
get_label_app_stats(annotator1_post_tbip_speeches, 
                    'Label Applicability for a)')

% of Topics that are about the issue wrt Label Applicability for a) = 90.48%
% of Topics that might be about the issue wrt Label Applicability for a) = 7.14%
% of Topics that are NOT about the issue wrt Label Applicability for a) = 2.38%


In [76]:
get_label_app_stats(annotator2_post_tbip_speeches, 
                    'Label Applicability for a)')

% of Topics that are about the issue wrt Label Applicability for a) = 83.33%
% of Topics that might be about the issue wrt Label Applicability for a) = 14.29%
% of Topics that are NOT about the issue wrt Label Applicability for a) = 2.38%


In [77]:
print('Speeches - Label applicability for a)')
cohen_kappa_score(list(annotator1_post_tbip_speeches['Label Applicability for a)']),
                  list(annotator2_post_tbip_speeches['Label Applicability for a)']))

Speeches - Label applicability for a)


0.8057803468208092

In [79]:
get_label_app_stats(annotator1_post_tbip_tweets, 
                    'Label Applicability for a)')

% of Topics that are about the issue wrt Label Applicability for a) = 92.5%
% of Topics that might be about the issue wrt Label Applicability for a) = 7.5%
% of Topics that are NOT about the issue wrt Label Applicability for a) = 0.0%


In [80]:
get_label_app_stats(annotator2_post_tbip_tweets, 
                    'Label Applicability for a)')

% of Topics that are about the issue wrt Label Applicability for a) = 57.5%
% of Topics that might be about the issue wrt Label Applicability for a) = 40.0%
% of Topics that are NOT about the issue wrt Label Applicability for a) = 2.5%


In [81]:
print('Tweets - Label applicability for a)')
cohen_kappa_score(list(annotator1_post_tbip_tweets['Label Applicability for a)']),
                  list(annotator2_post_tbip_tweets['Label Applicability for a)']))

Tweets - Label applicability for a)


0.5939086294416244

In [82]:
get_label_app_stats(annotator1_post_tbip_speeches, 
                    'Label Applicability for b)')

% of Topics that are about the issue wrt Label Applicability for b) = 90.48%
% of Topics that might be about the issue wrt Label Applicability for b) = 9.52%
% of Topics that are NOT about the issue wrt Label Applicability for b) = 0.0%


In [83]:
get_label_app_stats(annotator2_post_tbip_speeches, 
                    'Label Applicability for b)')

% of Topics that are about the issue wrt Label Applicability for b) = 80.95%
% of Topics that might be about the issue wrt Label Applicability for b) = 14.29%
% of Topics that are NOT about the issue wrt Label Applicability for b) = 4.76%


In [84]:
print('Speeches - Label applicability for b)')
cohen_kappa_score(list(annotator1_post_tbip_speeches['Label Applicability for b)']),
                  list(annotator2_post_tbip_speeches['Label Applicability for b)']))

Speeches - Label applicability for b)


0.8299595141700404

In [85]:
get_label_app_stats(annotator1_post_tbip_tweets, 
                    'Label Applicability for b)')

% of Topics that are about the issue wrt Label Applicability for b) = 75.0%
% of Topics that might be about the issue wrt Label Applicability for b) = 22.5%
% of Topics that are NOT about the issue wrt Label Applicability for b) = 2.5%


In [86]:
get_label_app_stats(annotator2_post_tbip_tweets, 
                    'Label Applicability for b)')

% of Topics that are about the issue wrt Label Applicability for b) = 65.0%
% of Topics that might be about the issue wrt Label Applicability for b) = 30.0%
% of Topics that are NOT about the issue wrt Label Applicability for b) = 5.0%


In [88]:
print('Tweets - Label applicability for b)')
cohen_kappa_score(list(annotator1_post_tbip_tweets['Label Applicability for b)']),
                  list(annotator2_post_tbip_tweets['Label Applicability for b)']))

Tweets - Label applicability for b)


0.6190476190476191

In [90]:
def get_post_tbip_id_pol_stats(df):
    """Print the distribution of 'Ideological Polarization' judgments.

    Rows where either applicability column contains 'IS NOT' are left out
    entirely, i.e. only cases where the label at least might be applicable
    for both a) and b) are considered. Each remaining judgment is bucketed by
    substring, first hit wins: 'DO NOT', then 'SOMEWHAT', then 'Unsure';
    anything else is counted as "do represent polarized perspectives".
    Percentages are relative to the number of rows considered.
    """
    tally = {'do': 0, 'somewhat': 0, 'do_not': 0, 'unsure': 0}
    considered = 0.0
    rows = zip(list(df['Label Applicability for a)']),
               list(df['Label Applicability for b)']),
               list(df['Ideological Polarization']))
    for applicable_a, applicable_b, verdict in rows:
        if 'IS NOT' in applicable_a or 'IS NOT' in applicable_b:
            continue
        considered += 1.0
        if 'DO NOT' in verdict:
            bucket = 'do_not'
        elif 'SOMEWHAT' in verdict:
            bucket = 'somewhat'
        elif 'Unsure' in verdict:
            bucket = 'unsure'
        else:
            bucket = 'do'
        tally[bucket] += 1

    report = (('% of Topics for which a and b do represent polarized perspectives = ', 'do'),
              ('% of Topics for which a and b represent SOMEWHAT polarized perspectives = ', 'somewhat'),
              ('% of Topics for which a and b DO NOT represent polarized perspectives = ', 'do_not'),
              ('% of Topics UNSURE = ', 'unsure'))
    for prefix, bucket in report:
        print(prefix + str(round(100*(tally[bucket]/considered), 2)) + '%')

In [91]:
get_post_tbip_id_pol_stats(annotator1_post_tbip_speeches)

% of Topics for which a and b do represent polarized perspectives = 31.71%
% of Topics for which a and b represent SOMEWHAT polarized perspectives = 41.46%
% of Topics for which a and b DO NOT represent polarized perspectives = 24.39%
% of Topics UNSURE = 2.44%


In [92]:
get_post_tbip_id_pol_stats(annotator2_post_tbip_speeches)

% of Topics for which a and b do represent polarized perspectives = 35.9%
% of Topics for which a and b represent SOMEWHAT polarized perspectives = 30.77%
% of Topics for which a and b DO NOT represent polarized perspectives = 17.95%
% of Topics UNSURE = 15.38%


In [93]:
print('Speeches - Ideological Polarization')
cohen_kappa_score(list(annotator1_post_tbip_speeches['Ideological Polarization']),
                  list(annotator2_post_tbip_speeches['Ideological Polarization']))

Speeches - Ideological Polarization


0.33009708737864063

In [94]:
get_post_tbip_id_pol_stats(annotator1_post_tbip_tweets)

% of Topics for which a and b do represent polarized perspectives = 28.21%
% of Topics for which a and b represent SOMEWHAT polarized perspectives = 28.21%
% of Topics for which a and b DO NOT represent polarized perspectives = 43.59%
% of Topics UNSURE = 0.0%


In [95]:
get_post_tbip_id_pol_stats(annotator2_post_tbip_tweets)

% of Topics for which a and b do represent polarized perspectives = 42.11%
% of Topics for which a and b represent SOMEWHAT polarized perspectives = 31.58%
% of Topics for which a and b DO NOT represent polarized perspectives = 15.79%
% of Topics UNSURE = 10.53%


In [96]:
print('Tweets - Ideological Polarization')
cohen_kappa_score(list(annotator1_post_tbip_tweets['Ideological Polarization']),
                  list(annotator2_post_tbip_tweets['Ideological Polarization']))

Tweets - Ideological Polarization


0.43494423791821557

For the topics that either annotator expected (pre-TBIP) to be politically polarized, what percentage were deemed ideologically polarized post-TBIP by either annotator?

In [103]:
def get_topic_nums_expected_to_be_polarized_by_either_annotator(df1, df2):
    """List the topics that at least one annotator rated 2 or 3 on Polarization.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        One frame per annotator, each with a 'Topic' and a 'Polarization'
        column.

    Returns
    -------
    list of str
        The second whitespace-separated token of each qualifying 'Topic'
        value (presumably the topic number -- TODO confirm the label
        format). Topics appear in ``df1`` order, followed by qualifying
        topics that occur only in ``df2``.

    Notes
    -----
    A topic qualifies if either annotator rated it 2 or 3. Previously only the
    non-null rows of ``df1`` were examined, so a topic that annotator 1 left
    blank (or did not have at all) was dropped even when annotator 2 rated it
    as polarized. For ``df2``, the first non-null rating of a topic is the one
    that counts, as before.
    """
    def is_polarized(rating):
        return rating == 2.0 or rating == 3.0

    # First non-null rating per topic from the second annotator; a dict lookup
    # replaces the repeated linear list.index() search.
    rating2_by_topic = {}
    for topic, rating in zip(list(df2['Topic']), list(df2['Polarization'])):
        if not pd.isnull(rating):
            rating2_by_topic.setdefault(topic, rating)

    out = []
    topics_in_df1 = set()
    for topic, rating in zip(list(df1['Topic']), list(df1['Polarization'])):
        topics_in_df1.add(topic)
        if is_polarized(rating) or is_polarized(rating2_by_topic.get(topic)):
            out.append(topic.split()[1])

    # Topics that only the second annotator's frame contains.
    for topic, rating in rating2_by_topic.items():
        if topic not in topics_in_df1 and is_polarized(rating):
            out.append(topic.split()[1])
    return out

In [104]:
speeches_topic_nums_expected_to_be_polarized_by_either_annotator = get_topic_nums_expected_to_be_polarized_by_either_annotator(annotator1_pre_tbip_speeches,
                                                                                                                               annotator2_pre_tbip_speeches)
print(len(speeches_topic_nums_expected_to_be_polarized_by_either_annotator))

28


In [105]:
tweets_topic_nums_expected_to_be_polarized_by_either_annotator = get_topic_nums_expected_to_be_polarized_by_either_annotator(annotator1_pre_tbip_tweets,
                                                                                                                             annotator2_pre_tbip_tweets)
print(len(tweets_topic_nums_expected_to_be_polarized_by_either_annotator))

28


In [109]:
def _post_tbip_polarized_topic_nums(df):
    """Return the set of issue identifiers one annotator judged polarized.

    The identifier is the second whitespace-separated token of the 'Issue'
    cell. A row counts as polarized unless either applicability column
    contains 'IS NOT', or the 'Ideological Polarization' cell contains
    'DO NOT', or it contains 'Unsure' without 'SOMEWHAT'.
    """
    polarized = set()
    rows = zip(list(df['Issue']),
               list(df['Label Applicability for a)']),
               list(df['Label Applicability for b)']),
               list(df['Ideological Polarization']))
    for issue, applicable_a, applicable_b, verdict in rows:
        # consider only cases where label at least might be applicable for both a) and b)
        if 'IS NOT' in applicable_a or 'IS NOT' in applicable_b:
            continue
        if 'DO NOT' in verdict:
            continue
        # 'SOMEWHAT' takes precedence over 'Unsure' when both occur.
        if 'SOMEWHAT' in verdict or 'Unsure' not in verdict:
            polarized.add(issue.split()[1])
    return polarized


def get_post_tbip_polarized_topic_nums_by_either_annotator(df1, df2):
    """List the issues that at least one annotator judged polarized post-TBIP.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        One frame per annotator, each with the columns 'Issue',
        'Label Applicability for a)', 'Label Applicability for b)' and
        'Ideological Polarization'.

    Returns
    -------
    list of str
        Unique issue identifiers (second token of the 'Issue' cell), sorted
        as strings so the order is the same on every run; previously the
        order was that of an unordered set.
    """
    return sorted(_post_tbip_polarized_topic_nums(df1)
                  | _post_tbip_polarized_topic_nums(df2))

In [111]:
speeches_topic_nums_polarized_per_either_annotator_post_tbip = get_post_tbip_polarized_topic_nums_by_either_annotator(annotator1_post_tbip_speeches,
                                                                                                                      annotator2_post_tbip_speeches)
print(len(speeches_topic_nums_polarized_per_either_annotator_post_tbip))

34


In [112]:
tweets_topic_nums_polarized_per_either_annotator_post_tbip = get_post_tbip_polarized_topic_nums_by_either_annotator(annotator1_post_tbip_tweets,
                                                                                                                    annotator2_post_tbip_tweets)
print(len(tweets_topic_nums_polarized_per_either_annotator_post_tbip))

30


In [158]:
# Reduce every post-TBIP 'Issue' label to its second whitespace-separated token,
# taking annotator 1's sheets as the list of issues that were rated.
issues_rated_post_tbip_for_speeches = [
    issue_label.split()[1]
    for issue_label in list(annotator1_post_tbip_speeches['Issue'])
]
issues_rated_post_tbip_for_tweets = [
    issue_label.split()[1]
    for issue_label in list(annotator1_post_tbip_tweets['Issue'])
]

In [159]:
print('For Floor Speeches - ')
# Restrict the pre-TBIP "expected polarized" topics to those also rated post-TBIP.
speeches_topic_nums_expected_to_be_polarized_by_either_annotator = list(
    set(speeches_topic_nums_expected_to_be_polarized_by_either_annotator).intersection(
        issues_rated_post_tbip_for_speeches))
n = len(speeches_topic_nums_expected_to_be_polarized_by_either_annotator)
# c counts how many of those topics were also judged polarized post-TBIP.
c = 0.0
for tn in speeches_topic_nums_expected_to_be_polarized_by_either_annotator:
    c += float(tn in speeches_topic_nums_polarized_per_either_annotator_post_tbip)
print(f'{round(100*(c/n), 2)}% of pre-tbip expected to have political polarization by either annotator were deemed ideologically polarized post-tbip by either annotator')

For Floor Speeches - 
81.82% of pre-tbip expected to have political polarization by either annotator were deemed ideologically polarized post-tbip by either annotator


In [160]:
print('For Twitter - ')
# Restrict the pre-TBIP "expected polarized" topics to those also rated post-TBIP.
tweets_topic_nums_expected_to_be_polarized_by_either_annotator = list(
    set(tweets_topic_nums_expected_to_be_polarized_by_either_annotator).intersection(
        issues_rated_post_tbip_for_tweets))
n = len(tweets_topic_nums_expected_to_be_polarized_by_either_annotator)
# c counts how many of those topics were also judged polarized post-TBIP.
c = 0.0
for tn in tweets_topic_nums_expected_to_be_polarized_by_either_annotator:
    c += float(tn in tweets_topic_nums_polarized_per_either_annotator_post_tbip)
print(f'{round(100*(c/n), 2)}% of pre-tbip expected to have political polarization by either annotator were deemed ideologically polarized post-tbip by either annotator')

For Twitter - 
72.73% of pre-tbip expected to have political polarization by either annotator were deemed ideologically polarized post-tbip by either annotator


In [115]:
# Load the floor-speech a)/b) info that the breakdown cells further down index by topic.
# The with-block closes the file handle, which the bare open() call left dangling.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
with open('venue_diff_polsci/floor_speeches/topic_ind_to_a_b_info_post_tbip.pkl',
          'rb') as pkl_file:
    topic_ind_to_a_b_info_speech = pickle.load(pkl_file)
            

In [116]:
# Load the Twitter counterpart of the per-topic a)/b) info.
# The with-block closes the file handle, which the bare open() call left dangling.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you trust.
with open('venue_diff_polsci/twitter/topic_ind_to_a_b_info_post_tbip.pkl',
          'rb') as pkl_file:
    topic_ind_to_a_b_info_tweet = pickle.load(pkl_file)
            

In [118]:
def get_post_tbip_topic_ind_to_id_rating(df):
    """Map zero-based topic indices to the ideological positions rated for a) and b).

    Rows whose 'Ideological Position expressed in a)' cell is empty are
    skipped. The key is the integer found as the second whitespace-separated
    token of the 'Issue' cell, minus one; the value is
    ``{'a': <position in a)>, 'b': <position in b)>}``.
    """
    rated = df[df['Ideological Position expressed in a)'].notna()]
    rows = zip(list(rated['Issue']),
               list(rated['Ideological Position expressed in a)']),
               list(rated['Ideological Position expressed in b)']))
    return {
        int(issue.split()[1]) - 1: {'a': position_a, 'b': position_b}
        for issue, position_a, position_b in rows
    }

In [120]:
post_tbip_topic_ind_to_a_b_rating_speech_ann1 = get_post_tbip_topic_ind_to_id_rating(annotator1_post_tbip_speeches)
print(len(post_tbip_topic_ind_to_a_b_rating_speech_ann1))

32


In [121]:
post_tbip_topic_ind_to_a_b_rating_speech_ann2 = get_post_tbip_topic_ind_to_id_rating(annotator2_post_tbip_speeches)
print(len(post_tbip_topic_ind_to_a_b_rating_speech_ann2))

34


In [122]:
post_tbip_topic_ind_to_a_b_rating_tweet_ann1 = get_post_tbip_topic_ind_to_id_rating(annotator1_post_tbip_tweets)
print(len(post_tbip_topic_ind_to_a_b_rating_tweet_ann1))

30


In [123]:
post_tbip_topic_ind_to_a_b_rating_tweet_ann2 = get_post_tbip_topic_ind_to_id_rating(annotator2_post_tbip_tweets)
print(len(post_tbip_topic_ind_to_a_b_rating_tweet_ann2))

33


In [142]:
# Tally, for annotator 1 on speeches, which ideological position was assigned to
# the side this cell treats as liberal and to the side it treats as conservative.
liberal_tbip_speeches_breakdown1 = dict.fromkeys(('Conservative', 'Liberal', 'Unsure'), 0)
conservative_tbip_speeches_breakdown1 = dict.fromkeys(('Conservative', 'Liberal', 'Unsure'), 0)

# NOTE(review): ratings are looked up by the running position `on_issue`, not by
# the key `ti`; this is only right if topic_ind_to_a_b_info_speech iterates in
# topic-index order -- confirm.
on_issue = 0
for ti in topic_ind_to_a_b_info_speech:
    rating = post_tbip_topic_ind_to_a_b_rating_speech_ann1.get(on_issue)
    if rating is not None:
        # Side a) goes to the liberal tally when its value is -1; otherwise side b) does.
        if topic_ind_to_a_b_info_speech[ti]['a'] == -1:
            liberal_side, conservative_side = 'a', 'b'
        else:
            liberal_side, conservative_side = 'b', 'a'
        liberal_tbip_speeches_breakdown1[rating[liberal_side]] += 1
        conservative_tbip_speeches_breakdown1[rating[conservative_side]] += 1
    on_issue += 1

print('Annotator 1, Speeches')
for heading, breakdown in (('Liberal per TBIP - ', liberal_tbip_speeches_breakdown1),
                           ('Conservative per TBIP - ', conservative_tbip_speeches_breakdown1)):
    print('')
    print(heading)
    print(breakdown)

Annotator 1, Speeches

Liberal per TBIP - 
{'Conservative': 0, 'Liberal': 29, 'Unsure': 3}

Conservative per TBIP - 
{'Conservative': 25, 'Liberal': 0, 'Unsure': 7}


In [144]:
# Tally annotator 2's labels for the speech topics, split by which perspective
# TBIP placed on the liberal side and which on the conservative side.
# A label outside the three keys below raises KeyError.
liberal_tbip_speeches_breakdown2 = {'Conservative': 0,
                                    'Liberal': 0,
                                    'Unsure': 0}
conservative_tbip_speeches_breakdown2 = {'Conservative': 0,
                                         'Liberal': 0,
                                         'Unsure': 0}

# The n-th key of `topic_ind_to_a_b_info_speech` is paired with rating entry n
# (the rating dict is keyed by issue number minus one), so the position comes
# from enumerate rather than a hand-maintained counter.
for on_issue, ti in enumerate(topic_ind_to_a_b_info_speech):
    if on_issue not in post_tbip_topic_ind_to_a_b_rating_speech_ann2:
        continue  # no entry for this issue in annotator 2's ratings
    issue_ratings = post_tbip_topic_ind_to_a_b_rating_speech_ann2[on_issue]

    # For speeches, 'a' == -1 marks perspective a as the liberal side;
    # any other value makes perspective b the liberal side.
    if topic_ind_to_a_b_info_speech[ti]['a'] == -1:
        liberal_side, conservative_side = 'a', 'b'
    else:
        liberal_side, conservative_side = 'b', 'a'

    liberal_tbip_speeches_breakdown2[issue_ratings[liberal_side]] += 1
    conservative_tbip_speeches_breakdown2[issue_ratings[conservative_side]] += 1

print('Annotator 2, Speeches')
print('')
print('Liberal per TBIP - ')
print(liberal_tbip_speeches_breakdown2)
print('')
print('Conservative per TBIP - ')
print(conservative_tbip_speeches_breakdown2)

Annotator 2, Speeches

Liberal per TBIP - 
{'Conservative': 3, 'Liberal': 23, 'Unsure': 8}

Conservative per TBIP - 
{'Conservative': 23, 'Liberal': 3, 'Unsure': 8}


In [147]:
# Tally annotator 1's labels for the tweet topics, split by which perspective
# TBIP placed on the liberal side and which on the conservative side.
# A label outside the three keys below raises KeyError.
liberal_tbip_tweets_breakdown1 = {'Conservative': 0,
                                  'Liberal': 0,
                                  'Unsure': 0}
conservative_tbip_tweets_breakdown1 = {'Conservative': 0,
                                       'Liberal': 0,
                                       'Unsure': 0}

# The n-th key of `topic_ind_to_a_b_info_tweet` is paired with rating entry n
# (the rating dict is keyed by issue number minus one), so the position comes
# from enumerate rather than a hand-maintained counter.
for on_issue, ti in enumerate(topic_ind_to_a_b_info_tweet):
    if on_issue not in post_tbip_topic_ind_to_a_b_rating_tweet_ann1:
        continue  # no entry for this issue in annotator 1's ratings
    issue_ratings = post_tbip_topic_ind_to_a_b_rating_tweet_ann1[on_issue]

    # For tweets, 'a' == 1 marks perspective a as the liberal side.
    # NOTE(review): the speech cells test for -1 instead -- presumably the sign
    # convention differs between the two sources; confirm this is intended.
    if topic_ind_to_a_b_info_tweet[ti]['a'] == 1:
        liberal_side, conservative_side = 'a', 'b'
    else:
        liberal_side, conservative_side = 'b', 'a'

    liberal_tbip_tweets_breakdown1[issue_ratings[liberal_side]] += 1
    conservative_tbip_tweets_breakdown1[issue_ratings[conservative_side]] += 1

print('Annotator 1, Tweets')
print('')
print('Liberal per TBIP - ')
print(liberal_tbip_tweets_breakdown1)
print('')
print('Conservative per TBIP - ')
print(conservative_tbip_tweets_breakdown1)

Annotator 1, Tweets

Liberal per TBIP - 
{'Conservative': 0, 'Liberal': 29, 'Unsure': 1}

Conservative per TBIP - 
{'Conservative': 23, 'Liberal': 2, 'Unsure': 5}


In [148]:
# Tally annotator 2's labels for the tweet topics, split by which perspective
# TBIP placed on the liberal side and which on the conservative side.
# A label outside the three keys below raises KeyError.
liberal_tbip_tweets_breakdown2 = {'Conservative': 0,
                                  'Liberal': 0,
                                  'Unsure': 0}
conservative_tbip_tweets_breakdown2 = {'Conservative': 0,
                                       'Liberal': 0,
                                       'Unsure': 0}

# The n-th key of `topic_ind_to_a_b_info_tweet` is paired with rating entry n
# (the rating dict is keyed by issue number minus one), so the position comes
# from enumerate rather than a hand-maintained counter.
for on_issue, ti in enumerate(topic_ind_to_a_b_info_tweet):
    if on_issue not in post_tbip_topic_ind_to_a_b_rating_tweet_ann2:
        continue  # no entry for this issue in annotator 2's ratings
    issue_ratings = post_tbip_topic_ind_to_a_b_rating_tweet_ann2[on_issue]

    # For tweets, 'a' == 1 marks perspective a as the liberal side.
    # NOTE(review): the speech cells test for -1 instead -- presumably the sign
    # convention differs between the two sources; confirm this is intended.
    if topic_ind_to_a_b_info_tweet[ti]['a'] == 1:
        liberal_side, conservative_side = 'a', 'b'
    else:
        liberal_side, conservative_side = 'b', 'a'

    liberal_tbip_tweets_breakdown2[issue_ratings[liberal_side]] += 1
    conservative_tbip_tweets_breakdown2[issue_ratings[conservative_side]] += 1

print('Annotator 2, Tweets')
print('')
print('Liberal per TBIP - ')
print(liberal_tbip_tweets_breakdown2)
print('')
print('Conservative per TBIP - ')
print(conservative_tbip_tweets_breakdown2)

Annotator 2, Tweets

Liberal per TBIP - 
{'Conservative': 0, 'Liberal': 28, 'Unsure': 5}

Conservative per TBIP - 
{'Conservative': 28, 'Liberal': 0, 'Unsure': 5}


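For the issues each annotator rated (as Liberal / Conservative / Unsure), what fraction of their labels match TBIP's liberal/conservative assignment? The ratios below are taken from the breakdowns printed above, for both annotators.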

In [149]:
# Annotator 1, speeches: fraction of TBIP-liberal sides that the annotator labelled 'Liberal'.
# Computed from the breakdown dict instead of retyping its printed counts (previously 29/32).
liberal_tbip_speeches_breakdown1['Liberal'] / sum(liberal_tbip_speeches_breakdown1.values())

0.90625

In [150]:
# Annotator 1, speeches: fraction of TBIP-conservative sides that the annotator labelled 'Conservative'.
# Computed from the breakdown dict instead of retyping its printed counts (previously 25/32).
conservative_tbip_speeches_breakdown1['Conservative'] / sum(conservative_tbip_speeches_breakdown1.values())

0.78125

In [151]:
# Annotator 2, speeches: 23 of the 34 rated issues, typed by hand from the printed breakdown.
# NOTE(review): 23 is both the 'Liberal' count under "Liberal per TBIP" and the 'Conservative'
# count under "Conservative per TBIP" -- confirm which one this ratio is meant to report.
23/34

0.6764705882352942

In [152]:
# Annotator 2, speeches: 3 of the 34 rated issues, typed by hand from the printed breakdown.
# NOTE(review): 3 is both the 'Conservative' count under "Liberal per TBIP" and the 'Liberal'
# count under "Conservative per TBIP" -- confirm which mismatch this ratio is meant to report.
3/34

0.08823529411764706

In [153]:
# Annotator 1, tweets: fraction of TBIP-liberal sides that the annotator labelled 'Liberal'.
# Computed from the breakdown dict instead of retyping its printed counts (previously 29/30).
liberal_tbip_tweets_breakdown1['Liberal'] / sum(liberal_tbip_tweets_breakdown1.values())

0.9666666666666667

In [154]:
# Annotator 1, tweets: fraction of TBIP-conservative sides that the annotator labelled 'Conservative'.
# Computed from the breakdown dict instead of retyping its printed counts (previously 23/30).
conservative_tbip_tweets_breakdown1['Conservative'] / sum(conservative_tbip_tweets_breakdown1.values())

0.7666666666666667

In [155]:
# Annotator 1, tweets: fraction of TBIP-conservative sides that the annotator labelled 'Liberal'.
# Computed from the breakdown dict instead of retyping its printed counts (previously 2/30).
conservative_tbip_tweets_breakdown1['Liberal'] / sum(conservative_tbip_tweets_breakdown1.values())

0.06666666666666667

In [156]:
# Annotator 2, tweets: 28 of the 33 rated issues, typed by hand from the printed breakdown.
# NOTE(review): 28 is both the 'Liberal' count under "Liberal per TBIP" and the 'Conservative'
# count under "Conservative per TBIP" -- confirm which one this ratio is meant to report.
28/33

0.8484848484848485

In [128]:
# Inter-annotator agreement (Cohen's kappa) on the ideological-position labels
# for the speech topics, reported separately for perspectives a) and b).
POSITION_A_COL = 'Ideological Position expressed in a)'
POSITION_B_COL = 'Ideological Position expressed in b)'

# Keep only the rows where the annotator filled in a perspective-a label.
speeches1 = annotator1_post_tbip_speeches[~pd.isnull(annotator1_post_tbip_speeches[POSITION_A_COL])]
speeches2 = annotator2_post_tbip_speeches[~pd.isnull(annotator2_post_tbip_speeches[POSITION_A_COL])]

# Restrict both frames to the issues labelled by both annotators.
issues_ap_to_both = list(set(speeches1['Issue']).intersection(speeches2['Issue']))
print('Issues applicable = ' + str(len(issues_ap_to_both)))
speeches1 = speeches1[speeches1['Issue'].isin(issues_ap_to_both)]
speeches2 = speeches2[speeches2['Issue'].isin(issues_ap_to_both)]

# cohen_kappa_score pairs its two inputs by position, so the labels must be
# aligned issue-by-issue. Joining on 'Issue' guarantees that even if the two
# spreadsheets list issues in different orders; the inner join keeps annotator
# 1's row order. validate='one_to_one' raises if an issue appears more than
# once in either sheet, which would otherwise silently duplicate pairs.
speech_pairs = speeches1[['Issue', POSITION_A_COL, POSITION_B_COL]].merge(
    speeches2[['Issue', POSITION_A_COL, POSITION_B_COL]],
    on='Issue',
    suffixes=('_ann1', '_ann2'),
    validate='one_to_one',
)

print('Speeches, perspective a')
print(cohen_kappa_score(list(speech_pairs[POSITION_A_COL + '_ann1']),
                        list(speech_pairs[POSITION_A_COL + '_ann2'])))
print('')
print('Speeches, perspective b')
print(cohen_kappa_score(list(speech_pairs[POSITION_B_COL + '_ann1']),
                        list(speech_pairs[POSITION_B_COL + '_ann2'])))

Issues applicable = 27
Speeches, perspective a
0.5324675324675325

Speeches, perspective b
0.6917808219178082


In [129]:
# Inter-annotator agreement (Cohen's kappa) on the ideological-position labels
# for the tweet topics, reported separately for perspectives a) and b).
POSITION_A_COL = 'Ideological Position expressed in a)'
POSITION_B_COL = 'Ideological Position expressed in b)'

# Keep only the rows where the annotator filled in a perspective-a label.
tweets1 = annotator1_post_tbip_tweets[~pd.isnull(annotator1_post_tbip_tweets[POSITION_A_COL])]
tweets2 = annotator2_post_tbip_tweets[~pd.isnull(annotator2_post_tbip_tweets[POSITION_A_COL])]

# Restrict both frames to the issues labelled by both annotators.
issues_ap_to_both = list(set(tweets1['Issue']).intersection(tweets2['Issue']))
print('Issues applicable = ' + str(len(issues_ap_to_both)))
tweets1 = tweets1[tweets1['Issue'].isin(issues_ap_to_both)]
tweets2 = tweets2[tweets2['Issue'].isin(issues_ap_to_both)]

# cohen_kappa_score pairs its two inputs by position, so the labels must be
# aligned issue-by-issue. Joining on 'Issue' guarantees that even if the two
# spreadsheets list issues in different orders; the inner join keeps annotator
# 1's row order. validate='one_to_one' raises if an issue appears more than
# once in either sheet, which would otherwise silently duplicate pairs.
tweet_pairs = tweets1[['Issue', POSITION_A_COL, POSITION_B_COL]].merge(
    tweets2[['Issue', POSITION_A_COL, POSITION_B_COL]],
    on='Issue',
    suffixes=('_ann1', '_ann2'),
    validate='one_to_one',
)

print('Tweets, perspective a')
print(cohen_kappa_score(list(tweet_pairs[POSITION_A_COL + '_ann1']),
                        list(tweet_pairs[POSITION_A_COL + '_ann2'])))
print('')
print('Tweets, perspective b')
print(cohen_kappa_score(list(tweet_pairs[POSITION_B_COL + '_ann1']),
                        list(tweet_pairs[POSITION_B_COL + '_ann2'])))

Issues applicable = 28
Tweets, perspective a
0.7370892018779343

Tweets, perspective b
0.6207674943566591
