In [3]:
import numpy as np
import pandas as pd
from fleiss import fleissKappa
from statsmodels.stats.inter_rater import fleiss_kappa

### Preprocess

In [143]:
# import datasets of all three annotators
tweets_deepak = pd.read_csv('tweets_deepak.csv')
tweets_kishen = pd.read_csv('tweets_kishen.csv')
tweets_tiffany = pd.read_csv('tweets_tiffany.csv')

In [144]:
# check dimensions
print(tweets_deepak.shape)
print(tweets_kishen.shape)
print(tweets_tiffany.shape)

(5000, 4)
(5000, 4)
(5000, 4)


In [145]:
# check duplicated records
tweets_kishen[tweets_kishen.duplicated('tweet_id', keep=False)]
tweets_deepak[tweets_deepak.duplicated('tweet_id', keep=False)]
tweets_tiffany[tweets_tiffany.duplicated('tweet_id', keep=False)]

Unnamed: 0,tweet_id,tweet_text,year,isRelevant


In [146]:
# concat the three datasets
tweets_tiff_kishen = pd.merge(tweets_tiffany, tweets_kishen, on="tweet_id", how='inner')
tweets_all = pd.merge(tweets_tiff_kishen, tweets_deepak, on="tweet_id", how='inner')
tweets_all.rename(columns={'isRelevant':'tiffany', 'relevant_x':'kishen', 'relevant':'deepak'}, inplace=True)

# create rating dataframe
rating_df = tweets_all[['tiffany', 'kishen', 'deepak']].copy()
rating_df['relevant'] = rating_df[['tiffany', 'kishen', 'deepak']].sum(axis=1) # sum along all columns for all rows
rating_df['irrelevant'] = (3 - rating_df['relevant'])


In [147]:
# create rating matrix of only the two count columns for relevant & irrelevant
rating_mat = rating_df.iloc[:, -2:].to_numpy(dtype=np.int32)

### Calculate Fleiss Kappa score

In [148]:
# statsmodels
fleiss_kappa(table=rating_mat, method='fleiss')

0.8497744837204804

In [149]:
# fleiss code by skarumbaiah - https://github.com/Shamya/FleissKappa/tree/master
fleissKappa(rate=rating_mat, n=3)

#raters =  3 , #subjects =  5000 , #categories =  2
PA =  0.9257333333333371
PE = 0.5056321422222223
Fleiss' Kappa = 0.85


0.85

### Export Unmatched Tweets

In [190]:
unmatched = tweets_all[~((rating_df['tiffany'] == rating_df['kishen']) & (rating_df['kishen'] == rating_df['deepak']))]
unmatched = unmatched[['tweet_id', 'tweet_text', 'year', 'tiffany', 'kishen', 'deepak']]
unmatched['majority_rating'] = (unmatched[['tiffany', 'kishen', 'deepak']].sum(axis=1) > 1).astype(int)
unmatched['final_rating'] = ""

matched = tweets_all[((rating_df['tiffany'] == rating_df['kishen']) & (rating_df['kishen'] == rating_df['deepak']))]
matched = matched[['tweet_id', 'tweet_text', 'year', 'tiffany']].rename(columns={'tiffany':'final_rating'})

In [191]:
midpoint = len(unmatched) // 2
kishen_df = unmatched.iloc[:midpoint]
tiffany_df = unmatched.iloc[midpoint:]

In [192]:
matched.to_csv('matched_tweets_4443.csv', index=False)
unmatched.to_csv('unmatched_tweets_557.csv', index=False)
tiffany_df.to_csv('unmatched_tweets_tiffany.csv', index=False)
kishen_df.to_csv('unmatched_tweets_kishen.csv', index=False)

### Summary Statistics

1. Find out the number of tweets that are marked the same by all of us (how many of them are marked relevant and irrelevant)
2. How many are marked relevant by majority?
3. How many are marked irrelevant by the majority?

In [55]:
# number of tweets marked the same by all annotators
all_agreed = rating_df[(rating_df['tiffany'] == rating_df['kishen']) & (rating_df['kishen'] == rating_df['deepak'])]
print(f"All agreed: {len(all_agreed)}/5000 ({len(all_agreed)/5000*100} %)")
print(all_agreed.value_counts())

All agreed: 4443/5000 (88.86 %)
tiffany  kishen  deepak  relevant  irrelevant
1        1       1       3         0             2442
0        0       0       0         3             2001
dtype: int64


In [75]:
# number of tweets disagreed
not_all_agreed = rating_df[~((rating_df['tiffany'] == rating_df['kishen']) & (rating_df['kishen'] == rating_df['deepak']))].reset_index(drop=True)
print(f"Tweets with Discrepancy: {len(not_all_agreed)}/5000 ({len(not_all_agreed)/5000*100} %)")
not_all_agreed.value_counts()

Tweets with Discrepancy: 557/5000 (11.14 %)


tiffany  kishen  deepak  relevant  irrelevant
1        1       0       2         1             305
         0       1       2         1              89
                 0       1         2              60
0        0       1       1         2              58
         1       0       1         2              26
                 1       2         1              19
dtype: int64

In [85]:
not_all_agreed['rel_by_majority'] = (not_all_agreed['relevant'] > not_all_agreed['irrelevant']).astype(int)
not_all_agreed['irrel_by_majority'] = (not_all_agreed['irrelevant'] > not_all_agreed['relevant']).astype(int)
print(f"Number of tweets marked as 'relevant' by majority: {sum(not_all_agreed['rel_by_majority'])}")
print(f"Number of tweets marked as 'irrelevant' by majority: {sum(not_all_agreed['irrel_by_majority'])}")

Number of tweets marked as 'relevant' by majority: 413
Number of tweets marked as 'irrelevant' by majority: 144


In [59]:
not_all_agreed[(not_all_agreed['tiffany'] == not_all_agreed['kishen'])]

Unnamed: 0,tiffany,kishen,deepak,relevant,irrelevant
8,0,0,1,1,2
16,0,0,1,1,2
59,0,0,1,1,2
66,0,0,1,1,2
73,0,0,1,1,2
...,...,...,...,...,...
4981,1,1,0,2,1
4987,1,1,0,2,1
4988,1,1,0,2,1
4991,1,1,0,2,1


In [61]:
not_all_agreed[(not_all_agreed['tiffany'] == not_all_agreed['deepak'])]

Unnamed: 0,tiffany,kishen,deepak,relevant,irrelevant
67,0,1,0,1,2
100,0,1,0,1,2
789,1,0,1,2,1
797,1,0,1,2,1
933,0,1,0,1,2
...,...,...,...,...,...
4865,1,0,1,2,1
4942,1,0,1,2,1
4962,1,0,1,2,1
4975,1,0,1,2,1


In [62]:
not_all_agreed[(not_all_agreed['kishen'] == not_all_agreed['deepak'])]

Unnamed: 0,tiffany,kishen,deepak,relevant,irrelevant
13,0,1,1,2,1
191,0,1,1,2,1
215,0,1,1,2,1
1175,0,1,1,2,1
1182,0,1,1,2,1
...,...,...,...,...,...
4963,1,0,0,1,2
4983,1,0,0,1,2
4984,1,0,0,1,2
4989,1,0,0,1,2


#### After Final Checks on 557 Unagreed tweets by Deepak

In [4]:
unmatched_tweets_reannotated = pd.read_csv('../../Kishen_Tiffany/unmatched_tweets_557.csv')
unmatched_tweets = unmatched_tweets_reannotated[["tweet_id", "tweet_text", "year", "majority_rating"]]
unmatched_tweets.rename(columns={'majority_rating':'final_rating'}, inplace=True)

matched_tweets = pd.read_csv('matched_tweets_4443.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unmatched_tweets.rename(columns={'majority_rating':'final_rating'}, inplace=True)


In [5]:
unmatched_tweets_reannotated.majority_rating.sum()

294

In [6]:
# unmatched_tweets.to_csv('unmatched_tweets_557_final.csv', index=False)

In [7]:
rel_unmatched = unmatched_tweets[unmatched_tweets.final_rating == 1].iloc[:, :-1]
rel_matched = matched_tweets[matched_tweets.final_rating == 1].iloc[:, :-1]

In [8]:
len(rel_unmatched) + len(rel_matched)

2736

In [9]:
rel_unmatched

Unnamed: 0,tweet_id,tweet_text,year
15,678244265659604992,thickness dependence and percolation scaling o...,2015
16,1299536748108804096,the common booster cores liquid hydrogen loadi...,2020
17,1333678694318018561,talk about taking one for the team there is pr...,2020
18,1341750180736065536,why hydrogen should hook up with nuclear,2020
19,1341990994015825922,plug stock why hydrogen play plug power is cli...,2020
...,...,...,...
543,1064418282067771392,high purity hydrogen gas generator stable v w ...,2018
545,699903784525254656,new technique for turning sunlight into hydrog...,2016
553,1580906325403312129,📣 will invest € billion in france learn more a...,2022
554,978094796588961793,our industry model is completely backwards ups...,2018


In [10]:
rel_matched

Unnamed: 0,tweet_id,tweet_text,year
1249,688636812869369856,solar to hydrogen homes no more lpg be sure to...,2016
1291,1316724254357041152,when hydrogen made from water electrolysis is ...,2020
1877,657892234281537536,동영상 space engineers update hydrogen thrusters ...,2015
1878,570260992715841537,greet the flying bum aircrafts aerospace hydro...,2015
1879,617365883695038464,bmw reveals hydrogen stealth car can go miles ...,2015
...,...,...,...
4436,1525901933332111363,kbr named technical advisor for sk hydrogen de...,2022
4437,976878917792526338,watch ecofriendly road sweeper added to aberde...,2018
4439,973143055992803328,icymi hydrogen semi maker nikola to build asse...,2018
4441,1506976879634489347,what is going on with these hydrogen pushes ov...,2022


In [11]:
final_relevant_tweets = pd.concat([rel_unmatched, rel_matched], ignore_index=True, axis=0)

In [12]:
final_relevant_tweets.to_csv("../final_relevant_tweets.csv", index=False)