In [10]:
# Import the libraries
import pandas as pd
from sklearn.metrics import cohen_kappa_score
import os
from pathlib import Path
from itertools import combinations

In [11]:
# Directories holding the change.org-topic and twitter-topic datasets.
# Kept as plain strings because later cells build file paths with `+ '/' +`.
path1, path2 = './change.org_topic', './twitter_topic'

In [12]:
# Generating three files for "lockdowns", "masking and distancing", and "vaccination" for each file in the
# Change.org Topic Dataset
for file in os.listdir(path1):
    if file.endswith(".csv") and (("lockdowns" not in file) and ("masking_and_distancing" not in file) and ("vaccination" not in file))  :
        full_path = path1 + '/' + file
        %run expand_topic_csv_dataset.py --infile $full_path       

In [13]:
# Generating three files for "lockdowns", "masking and distancing", and "vaccination" for each file in the
# Twitter Topic Dataset
for file in os.listdir(path2):
    if file.endswith(".csv") and (("lockdowns" not in file) and ("masking_and_distancing" not in file) and ("vaccination" not in file))  :
        full_path = path2 + '/' + file
        %run expand_topic_csv_dataset.py --infile $full_path        

In [14]:
# Load one expanded Change.org "lockdowns" file to explore the scoring steps by hand.
sample_csv = './change.org_topic/change.org_topic_0_lockdowns.csv'
df = pd.read_csv(sample_csv)

In [15]:
# Treat every missing cell as the explicit label "Unknown".
df = df.fillna(value="Unknown")

In [16]:
# Every column after the first is one annotator's labels.
annotators = df.columns[1:].tolist()
no_of_annotators = len(annotators)

# All ordered pairs of distinct annotators, so (a, b) and (b, a) both appear.
# NOTE: this list deliberately keeps the name `combinations` because later cells
# read it, but it shadows `itertools.combinations` imported at the top.
combinations = [
    (first, second)
    for first in annotators
    for second in annotators
    if first != second
]
    

In [17]:
# Display the ordered annotator pairs built in the previous cell.
combinations

[('annotation_96', 'annotation_53'),
 ('annotation_96', 'annotation_93'),
 ('annotation_96', 'annotation_94'),
 ('annotation_96', 'annotation_95'),
 ('annotation_53', 'annotation_96'),
 ('annotation_53', 'annotation_93'),
 ('annotation_53', 'annotation_94'),
 ('annotation_53', 'annotation_95'),
 ('annotation_93', 'annotation_96'),
 ('annotation_93', 'annotation_53'),
 ('annotation_93', 'annotation_94'),
 ('annotation_93', 'annotation_95'),
 ('annotation_94', 'annotation_96'),
 ('annotation_94', 'annotation_53'),
 ('annotation_94', 'annotation_93'),
 ('annotation_94', 'annotation_95'),
 ('annotation_95', 'annotation_96'),
 ('annotation_95', 'annotation_53'),
 ('annotation_95', 'annotation_93'),
 ('annotation_95', 'annotation_94')]

In [18]:
# Calculate average Kappa Score for each annotator
scores = dict()
for annotator in annotators:
    # Peers of this annotator, in the order the pairs were generated.
    peers = [second for first, second in combinations if first == annotator]
    # Reset the running total before use so a re-run can never inherit a stale value.
    score = 0
    for peer in peers:
        kappa = cohen_kappa_score(df[annotator], df[peer])
        print("Annotator: {}, Score: {}".format(annotator,kappa))
        score = score + kappa
    # With a single annotator there are no peers; report NaN instead of dividing by zero.
    scores[annotator] = score / len(peers) if peers else float("nan")

Annotator: annotation_96, Score: 0.07163323782234965
Annotator: annotation_96, Score: 0.4598199399799934
Annotator: annotation_96, Score: 0.48709002093510123
Annotator: annotation_96, Score: 0.4885703215807826
Annotator: annotation_53, Score: 0.07163323782234965
Annotator: annotation_53, Score: -0.03214124038026256
Annotator: annotation_53, Score: 0.02791625124626107
Annotator: annotation_53, Score: 0.08338637810311889
Annotator: annotation_93, Score: 0.4598199399799934
Annotator: annotation_93, Score: -0.03214124038026256
Annotator: annotation_93, Score: 0.5129249968165033
Annotator: annotation_93, Score: 0.44218717719165346
Annotator: annotation_94, Score: 0.48709002093510123
Annotator: annotation_94, Score: 0.02791625124626107
Annotator: annotation_94, Score: 0.5129249968165033
Annotator: annotation_94, Score: 0.6851020703633994
Annotator: annotation_95, Score: 0.4885703215807826
Annotator: annotation_95, Score: 0.08338637810311889
Annotator: annotation_95, Score: 0.4421871771916534

In [5]:
def Part1(df, min_kappa=0.2):
    """Score annotators by mean pairwise Cohen's kappa and drop the unreliable ones.

    The first column of ``df`` is treated as the item column; every remaining
    column is treated as one annotator's labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw annotation table. It is not modified; a copy is processed.
    min_kappa : float, default 0.2
        Annotators whose mean kappa is strictly below this value are dropped.

    Returns
    -------
    tuple
        ``(df, scores, annotators_final)`` where ``df`` is the cleaned frame
        without the dropped annotators, ``scores`` maps every original
        annotator to their mean kappa, and ``annotators_final`` lists the
        annotator columns that remain.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()
    annotators = list(df.columns[1:])

    # Missing labels must be detected BEFORE the string cast: astype(str) turns
    # NaN into the literal "nan", which fillna can no longer see.
    for col in annotators:
        missing = df[col].isna()
        df[col] = df[col].astype(str).mask(missing, "Unknown")

    # Replace any remaining missing values (e.g. in the first column) with Unknown.
    df = df.fillna("Unknown")

    # Cohen's kappa is symmetric, so each unordered pair is scored once and
    # credited to both annotators.
    totals = {name: 0.0 for name in annotators}
    for position, first in enumerate(annotators):
        for second in annotators[position + 1:]:
            kappa = cohen_kappa_score(df[first], df[second])
            totals[first] += kappa
            totals[second] += kappa

    # Average over the number of peers. A lone annotator has no peers, so the
    # score is NaN; NaN is never "< min_kappa", so that annotator is kept.
    n_peers = len(annotators) - 1
    if n_peers > 0:
        scores = {name: totals[name] / n_peers for name in annotators}
    else:
        scores = {name: float("nan") for name in annotators}

    # Drop the annotators whose average kappa is below the threshold.
    rejected = [name for name, value in scores.items() if value < min_kappa]
    df = df.drop(columns=rejected)

    # Names of the remaining annotators.
    annotators_final = list(df.columns[1:])

    return df, scores, annotators_final

In [6]:
def _pick_label(counts, kappa_sums, label_order):
    '''Return the winning label for one row from its vote counts and kappa sums.'''
    top = max(counts.values())
    if top == 0:
        # Nobody cast a recognised vote for this row.
        return "Unknown"

    best, best_kappa = None, None
    for name in label_order:
        # Only labels sharing the highest count can win; this keeps a tie
        # between two minority labels from overriding the majority.
        if counts[name] != top:
            continue
        mean_kappa = kappa_sums[name] / top
        # An earlier label is kept only when its mean kappa is strictly greater,
        # so on equal kappa the later label in `label_order` wins.
        if best is None or not (best_kappa > mean_kappa):
            best, best_kappa = name, mean_kappa
    return best


def Part2(df, scores, annotators_final):
    '''
       Collapse the annotator columns of each row into a single "label" column.

       For every row the votes for "True", "False", and "Unknown" are counted
       over `annotators_final`, together with the average Kappa score (taken
       from `scores`) of the annotators behind each label. Cell values other
       than these three strings are ignored.

       If one label has the highest count, that label is chosen.
       If several labels share the highest count, the one with the highest
       average Kappa score is chosen; on equal Kappa the later of
       "True" < "False" < "Unknown" wins.
       If a row has no recognised vote at all, the label is "Unknown".

       Returns a new DataFrame without the annotator columns and with the
       "label" column appended; the input DataFrame is not modified.
    '''
    label_order = ("True", "False", "Unknown")
    labels = []
    for _, row in df.iterrows():
        # Fresh tallies per row, so nothing leaks from the previous row.
        counts = dict.fromkeys(label_order, 0)
        kappa_sums = dict.fromkeys(label_order, 0.0)
        for annotator in annotators_final:
            vote = row[annotator]
            if vote in counts:
                counts[vote] += 1
                kappa_sums[vote] += scores[annotator]
        labels.append(_pick_label(counts, kappa_sums, label_order))

    # Drop the per-annotator columns, then attach the labels positionally so a
    # non-unique index cannot smear one row's label over several rows.
    result = df.drop(annotators_final, axis=1)
    result["label"] = labels
    return result

In [7]:
def execution(file_name, out_file=None):
    """Run the two-stage labelling pipeline on one CSV file.

    Reads ``file_name``, filters annotators with ``Part1``, resolves one label
    per row with ``Part2``, and writes the result as CSV without the index.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read.
    out_file : str, optional
        Where to write the result. Defaults to ``file_name``, i.e. the input
        is overwritten in place.
    """
    raw = pd.read_csv(file_name)
    filtered, scores, annotators_final = Part1(raw)
    labelled = Part2(filtered, scores, annotators_final)
    destination = file_name if out_file is None else out_file
    labelled.to_csv(destination, index=False)

In [8]:
# Performing execution on every file in change.org dataset and writing the dataset back with same name.
for file in os.listdir(path1):
    if file.endswith(".csv") and (("lockdowns" in file) or ("masking_and_distancing" in file) or ("vaccination" in file)):
        full_path = path1 + '/' + file
        execution(full_path)

In [9]:
# Performing execution on every file in twitter dataset and writing the dataset back with same name.
for file in os.listdir(path2):
    if file.endswith(".csv") and (("lockdowns" in file) or ("masking_and_distancing" in file) or ("vaccination" in file)):
        full_path = path2 + '/' + file
        execution(full_path)

In [10]:
#############################  Output file 1 ###################################
# Merging all the files corresponding to lockdowns for change.org
output_name = "change.org_topic_lockdowns.csv"
frames = []
# os.listdir returns names in arbitrary order; sorting keeps the merged row order reproducible.
for file in sorted(os.listdir(path1)):
    # Skip the merged output itself, otherwise a re-run would append it to its own inputs.
    if file.endswith(".csv") and ("lockdowns" in file) and file != output_name:
        full_path = path1 + '/' + file
        frames.append(pd.read_csv(full_path))

result = pd.concat(frames)
file_name = path1 + '/' + output_name
result.to_csv(file_name, index=False)

In [11]:
#############################  Output file 2 ###################################
# Merging all the files corresponding to masking_and_distancing for change.org
output_name = "change.org_topic_masking_and_distancing.csv"
frames = []
# os.listdir returns names in arbitrary order; sorting keeps the merged row order reproducible.
for file in sorted(os.listdir(path1)):
    # Skip the merged output itself, otherwise a re-run would append it to its own inputs.
    if file.endswith(".csv") and ("masking_and_distancing" in file) and file != output_name:
        full_path = path1 + '/' + file
        frames.append(pd.read_csv(full_path))

result = pd.concat(frames)
file_name = path1 + '/' + output_name
result.to_csv(file_name, index=False)

In [12]:
#############################  Output file 3 ###################################
# Merging all the files corresponding to vaccination for change.org
output_name = "change.org_topic_vaccination.csv"
frames = []
# os.listdir returns names in arbitrary order; sorting keeps the merged row order reproducible.
for file in sorted(os.listdir(path1)):
    # Skip the merged output itself, otherwise a re-run would append it to its own inputs.
    if file.endswith(".csv") and ("vaccination" in file) and file != output_name:
        full_path = path1 + '/' + file
        frames.append(pd.read_csv(full_path))

result = pd.concat(frames)
file_name = path1 + '/' + output_name
result.to_csv(file_name, index=False)

In [13]:
#############################  Output file 4 ###################################
# Merging all the files corresponding to lockdowns for Twitter
output_name = "twitter_topic_lockdowns.csv"
frames = []
# os.listdir returns names in arbitrary order; sorting keeps the merged row order reproducible.
for file in sorted(os.listdir(path2)):
    # Skip the merged output itself, otherwise a re-run would append it to its own inputs.
    if file.endswith(".csv") and ("lockdowns" in file) and file != output_name:
        full_path = path2 + '/' + file
        frames.append(pd.read_csv(full_path))

result = pd.concat(frames)
file_name = path2 + '/' + output_name
result.to_csv(file_name, index=False)

In [14]:
#############################  Output file 5 ###################################
# Merging all the files corresponding to masking_and_distancing for Twitter
output_name = "twitter_topic_masking_and_distancing.csv"
frames = []
# os.listdir returns names in arbitrary order; sorting keeps the merged row order reproducible.
for file in sorted(os.listdir(path2)):
    # Skip the merged output itself, otherwise a re-run would append it to its own inputs.
    if file.endswith(".csv") and ("masking_and_distancing" in file) and file != output_name:
        full_path = path2 + '/' + file
        frames.append(pd.read_csv(full_path))

result = pd.concat(frames)
file_name = path2 + '/' + output_name
result.to_csv(file_name, index=False)

In [15]:
#############################  Output file 6 ###################################
# Merging all the files corresponding to vaccination for Twitter
output_name = "twitter_topic_vaccination.csv"
frames = []
# os.listdir returns names in arbitrary order; sorting keeps the merged row order reproducible.
for file in sorted(os.listdir(path2)):
    # Skip the merged output itself, otherwise a re-run would append it to its own inputs.
    if file.endswith(".csv") and ("vaccination" in file) and file != output_name:
        full_path = path2 + '/' + file
        frames.append(pd.read_csv(full_path))

result = pd.concat(frames)
file_name = path2 + '/' + output_name
result.to_csv(file_name, index=False)