In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import glob
from nltk.translate.bleu_score import sentence_bleu
from tqdm import tqdm
from pathlib import Path
# Show full cell contents and every column when displaying DataFrames.
# Fully qualified option names are required: the short pattern 'max_columns'
# also matches 'styler.render.max_columns' on pandas >= 1.4 and raises OptionError.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

In [2]:
# Project root, taken as two directory levels above the current working directory.
# NOTE(review): this depends on where the kernel was started — presumably the
# notebook lives two folders below the repository root; confirm.
PATH_ROOT = Path.cwd().parents[1].resolve()

In [3]:
df = pd.read_csv(PATH_ROOT / "data" / "datasets" / "combined_dataset.csv", index_col=0)

In [4]:
df.head(3)

Unnamed: 0,annotator,text1,text2,label,dataset,random,duration,total_seconds,pair_id,reduced_label,WMD,glove_cosine,fasttext_cosine,POS Dist score,L2_score,bleu,bleu1,chrfScore,1-gram_overlap,ROUGE-1,ROUGE-2,ROUGE-l,BertScore
0,A3BCGN11HDM8QR,and he sent eliakim who was over the household and shebna the scribe and the elders of the priests covered with sackcloth unto isaiah the prophet the son of amoz,and he sent eliakim who was over the house and shebna the scribe and the chief priests dressed in haircloth to isaiah the prophet the son of amoz,2,bible_human,0,0 days 00:00:12.000000000,12,pair_0,-1,0.423538,0.019526,0.036428,1.358436,5.054252,0.574595,0.731549,0.705283,0.571429,0.758621,0.642857,0.727273,0.969994
1,A3SQ00HYQN7FYB,and he sent eliakim who was over the household and shebna the scribe and the elders of the priests covered with sackcloth unto isaiah the prophet the son of amoz,and he sent eliakim who was over the house and shebna the scribe and the chief priests dressed in haircloth to isaiah the prophet the son of amoz,3,bible_human,0,0 days 00:00:12.000000000,12,pair_0,0,0.423538,0.019526,0.036428,1.358436,5.054252,0.574595,0.731549,0.705283,0.571429,0.758621,0.642857,0.727273,0.969994
2,A5WAWW70PYRP,and he sent eliakim who was over the household and shebna the scribe and the elders of the priests covered with sackcloth unto isaiah the prophet the son of amoz,and he sent eliakim who was over the house and shebna the scribe and the chief priests dressed in haircloth to isaiah the prophet the son of amoz,4,bible_human,0,0 days 00:07:19.000000000,439,pair_0,1,0.423538,0.019526,0.036428,1.358436,5.054252,0.574595,0.731549,0.705283,0.571429,0.758621,0.642857,0.727273,0.969994


In [8]:
# Base data directory and the glob patterns for the CSV files under
# `raw_data` and `datasets` that are summarised in the metadata table.
DATA_PATH = PATH_ROOT / 'data'
data_paths = [DATA_PATH / 'raw_data' / '*.csv', DATA_PATH / 'datasets' / '*.csv']

In [23]:
def _summarize_csv(csv_path, label_column, pair_columns=None):
    """Build one metadata record for a CSV file.

    Parameters
    ----------
    csv_path : str or Path
        CSV file to read (its first column is used as the index).
    label_column : str
        Column whose distinct values are listed under ``'values'``.
    pair_columns : list of str, optional
        Columns identifying a text pair. When given, the mean number of rows
        per pair is reported; otherwise ``'mean_annotations_per_pair'`` is None.

    Returns
    -------
    dict
        Keys: ``columns``, ``row_count``, ``values``, ``mean_annotations_per_pair``.
    """
    # Deliberately NOT named `df`: the notebook-level `df` (combined dataset)
    # is used by later cells and must not be overwritten here.
    frame = pd.read_csv(csv_path, index_col=0)
    distinct_labels = sorted(frame[label_column].unique().tolist())
    mean_per_pair = frame.groupby(pair_columns).size().mean() if pair_columns else None
    return {'columns': ', '.join(sorted(frame.columns.tolist())),
            'row_count': frame.shape[0],
            'values': ', '.join(map(str, distinct_labels)),
            'mean_annotations_per_pair': mean_per_pair}


dct = {}
for pattern in data_paths:
    # Decide raw vs. processed from the directory name only, so an unrelated
    # 'raw' somewhere in the absolute project path cannot flip the branch.
    is_raw = 'raw' in pattern.parent.name
    # Sorted so the resulting table has a reproducible row order.
    for csv_file in sorted(glob.glob(str(pattern.resolve()))):
        if is_raw:
            dct[Path(csv_file).stem] = _summarize_csv(
                csv_file,
                label_column='Answer.semantic-similarity.label',
                pair_columns=['Input.text1', 'Input.text2'],
            )
        else:
            dct[Path(csv_file).stem] = _summarize_csv(csv_file, label_column='label')

In [24]:
# One row per CSV file (keyed by file stem), columns taken from the metadata records.
datasets_df = pd.DataFrame.from_dict(dct, orient='index')

# Make sure the output folder exists; `to_csv` does not create missing directories.
metadata_dir = PATH_ROOT / 'data' / 'other'
metadata_dir.mkdir(parents=True, exist_ok=True)
datasets_df.to_csv(metadata_dir / 'datasets_metadata.csv')

In [22]:
x = pd.read_csv(DATA_PATH / 'datasets' / 'sts.csv', index_col = 0)
x.head()

Unnamed: 0,genres,text_1,text_2,label,dataset,dataset-categ,pair_id,WMD,glove_cosine,fasttext_cosine,POS Dist score,BertScore,L2_score,bleu,bleu1,chrfScore,1-gram_overlap,ROUGE-1,ROUGE-2,ROUGE-l
0,main-captions,a girl is styling her hair,a girl is brushing her hair,2.5,sts,sts-test,0,1.101243,0.020379,0.072218,0.0,0.980371,10.471545,7.262123e-78,0.833333,0.641468,0.714286,0.833333,0.6,0.833333
1,main-captions,a group of men play soccer on the beach,a group of boys are playing soccer on the beach,3.6,sts,sts-test,1,0.813075,0.019756,0.035165,1.193633,0.981469,6.791739,0.3799178,0.7,0.716334,0.583333,0.736842,0.588235,0.736842
2,main-captions,one woman is measuring another woman s ankle,a woman measures another woman s ankle,5.0,sts,sts-test,2,0.845978,0.044791,0.05084,0.0,0.984674,8.400115,0.3768499,0.619198,0.635485,0.444444,0.666667,0.461538,0.615385
3,main-captions,a man is cutting up a cucumber,a man is slicing a cucumber,4.2,sts,sts-test,3,1.116673,0.028424,0.048875,0.0,0.976782,6.943588,6.147255e-78,0.705401,0.50981,0.571429,0.769231,0.545455,0.727273
4,main-captions,a man is playing a harp,a man is playing a keyboard,1.5,sts,sts-test,4,1.726695,0.051076,0.108125,3.129321,0.96217,7.255223,0.7598357,0.833333,0.747707,0.666667,0.833333,0.8,0.8


## Time Outliers

We assume that any annotator whose annotation time exceeds the 95th percentile is a time outlier (a potential bad actor).

In [None]:
# ba = bad actor
time_percentiles = [.25, .5, .75, .9, .95]

# Distribution of the time spent on individual annotations.
print(df.total_seconds.describe(percentiles = time_percentiles))

# Mean annotation time of each annotator, broadcast back onto every row.
df['mean_annotation_time'] = df.groupby('annotator').total_seconds.transform('mean')
print(df.mean_annotation_time.describe(percentiles = time_percentiles))

# NOTE(review): 405 and 336 are hard-coded; presumably read off the
# describe() output printed by this cell — confirm against the current data.
slow_on_average = df.mean_annotation_time > 405
slow_single_annotation = df.total_seconds > 336

ba_time = df[slow_on_average].annotator.unique().tolist()
print(len(df[slow_single_annotation].annotator.unique().tolist()))
print(len(ba_time))

## Unvarianced Annotations
Labelers whose label standard deviation is too low and whose mean label on non-random pairs is lower than on random pairs (i.e. the non-random − random gap is negative).

In [None]:
def _label_stats(rows):
    """Per-annotator size/mean/std/min/max of `label`, keeping only annotators with more than one row."""
    stats = rows.groupby(['annotator'])['label'].agg(['size','mean','std','min','max'])
    return stats[stats['size']>1]


# Statistics on rows flagged random == 1; joined below with a '_rand' suffix.
labelers_rand = _label_stats(df[df.random==1])

# Statistics on rows flagged random == 0, left-joined with the random ones.
labelers = _label_stats(df[df.random==0]).join(labelers_rand, rsuffix = '_rand')
labelers['mean_random_gap'] = labelers['mean']-labelers['mean_rand']
labelers['std_ratio'] = labelers['std']/labelers['std_rand']

# Standard deviation of each annotator's labels over all of their rows.
total_std = df.groupby('annotator')['label'].std().rename('total_std')
labelers = labelers.join(total_std)

In [None]:
# Flag annotators whose overall label std is below 1 AND whose mean label on
# non-random rows is lower than on random rows (negative gap).
low_variance = labelers.total_std<1
negative_gap = labelers.mean_random_gap < 0
ba_unvar_annotations = labelers[low_variance & negative_gap].index.tolist()
len(ba_unvar_annotations)

## Unpopular Annotators
Annotators who disagree with the other annotators (on the reduced label) more than 50% of the time.

In [None]:
# Number of distinct reduced labels given to each pair.
df_uniquelabels = df.groupby("pair_id")["reduced_label"].nunique()

# Pairs that received exactly two distinct reduced labels.
pairs_twoagree = df_uniquelabels[df_uniquelabels == 2].index.tolist()

# Explicit copy: a column is added below, and writing into a filtered slice of
# `df` triggers SettingWithCopyWarning (and must never leak back into `df`).
df_twoagree = df[df["pair_id"].isin(pairs_twoagree)].copy()

# Median reduced label per pair. With an odd number of annotations and two
# distinct labels this is the majority label; with an even split it is the
# midpoint of the two labels and matches no annotator.
df_twoagree['generally_accepted_label'] = df_twoagree.groupby("pair_id")['reduced_label'].transform('median')

In [None]:
df_twoagree.head()

In [None]:
# Rows where the annotator's reduced label differs from the pair's median label,
# counted per annotator.
is_unpopular = df_twoagree.reduced_label != df_twoagree.generally_accepted_label
df_unpopularopinion = (
    df_twoagree[is_unpopular]
    .groupby('annotator')
    .size()
    .reset_index(name='unpopular_opinion')
)

# Total number of rows (over the whole of `df`) for those same annotators.
has_unpopular_rows = df['annotator'].isin(list(df_unpopularopinion.annotator))
df_allopinions = (
    df[has_unpopular_rows]
    .groupby('annotator')
    .size()
    .reset_index(name='all_opinion')
)

df_opinion_all_unpop = df_allopinions.merge(df_unpopularopinion,on="annotator")

In [None]:
df_opinion_all_unpop.head()

In [None]:
# Annotators with more than 4 annotations in total, of which over half were unpopular.
unpopular_share = df_opinion_all_unpop.unpopular_opinion / df_opinion_all_unpop.all_opinion
enough_annotations = df_opinion_all_unpop.all_opinion > 4
ba_unpopular = df_opinion_all_unpop[(unpopular_share > 0.5) & enough_annotations].annotator.tolist()

## Sentiment // Semantic Understanding

In [None]:
# Sentiment analysis pipeline
# NOTE(review): `pipeline` is not imported in any visible cell, so this raises
# NameError on a fresh kernel. Presumably `from transformers import pipeline`
# (Hugging Face) — confirm and add it to the imports cell.
sentiment_pipe = pipeline("sentiment-analysis")

In [None]:
from tqdm import tqdm

# Number of sentences sent to the sentiment pipeline per call.
SENTIMENT_BATCH_SIZE = 100

# Materialise the text columns once instead of rebuilding the lists on every batch.
text1_all = df.text1.tolist()
text2_all = df.text2.tolist()

text1_sent,text2_sent =[], []

# Stepping by batch size never yields an empty trailing batch, so the pipeline
# is not called with [] when len(df) is an exact multiple of the batch size.
# Slicing past the end of a list is safe, so no explicit upper bound is needed.
for start in tqdm(range(0, len(df), SENTIMENT_BATCH_SIZE), position = 0, leave = True):
    text1_sent += sentiment_pipe(text1_all[start:start + SENTIMENT_BATCH_SIZE])
    text2_sent += sentiment_pipe(text2_all[start:start + SENTIMENT_BATCH_SIZE])

# Number of predictions collected for text1.
len(text1_sent)

In [None]:
# Sign applied to the pipeline's score depending on its predicted label.
sent = {'POSITIVE':1,'NEGATIVE':-1}


def _signed_scores(predictions):
    """Turn pipeline predictions (dicts with 'label' and 'score') into signed scores."""
    return np.array([pred['score']*sent[pred['label']] for pred in predictions])


df['sentiment_1'] = _signed_scores(text1_sent)
df['sentiment_2'] = _signed_scores(text2_sent)
# Absolute gap between the two signed sentiment scores of a pair.
df['dif_sent'] =  np.abs(df['sentiment_1']-df['sentiment_2'])

In [None]:
# Whitespace-tokenised (text1, text2) pairs, one tuple per row of `df`, in row order.
pairs = [
    (first_text.strip().split(), second_text.strip().split())
    for first_text, second_text in zip(df['text1'], df['text2'])
]

In [None]:
# Unigram-only BLEU (weights (1, 0, 0, 0)) with text1 as the single reference
# and text2 as the hypothesis.
scores_bleu1 = [
    sentence_bleu([reference_tokens], hypothesis_tokens, weights=(1, 0, 0, 0))
    for reference_tokens, hypothesis_tokens in pairs
]

print(np.mean(scores_bleu1))
print(np.std(scores_bleu1))


In [None]:
df['bleu_score_1'] = scores_bleu1

In [None]:
# Pairs with high unigram overlap (BLEU-1 > 0.8) but a large sentiment gap (> 1.9).
high_overlap = df['bleu_score_1'] > 0.8
large_sentiment_gap = df['dif_sent'] > 1.9
contradicting_rows = df[high_overlap & large_sentiment_gap]

# Per-annotator label std on those pairs; annotators with a single such row
# have an undefined std and are dropped.
annot_std_semantic = contradicting_rows.groupby('annotator')['label'].std().dropna()
ba_semantics = list(annot_std_semantic[annot_std_semantic > 1.0].index)

# Combining all the results

In [None]:
import itertools

In [None]:
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.

    Parameters
    ----------
    list1, list2 : iterable of hashable
        Input collections; duplicates are ignored because both are converted to sets.

    Returns
    -------
    float
        Value in [0, 1]. Two empty inputs are treated as identical and give 1.0
        (instead of raising ZeroDivisionError).
    """
    s1 = set(list1)
    s2 = set(list2)
    union = s1 | s2
    if not union:
        # Both inputs are empty: the ratio is 0/0, defined here as 1.0.
        return 1.0
    return len(s1 & s2) / len(union)

In [None]:
# Bad-annotator lists keyed by the name of the variable that holds them.
ba_by_name = {
    'ba_semantics': ba_semantics,
    'ba_time': ba_time,
    'ba_unpopular': ba_unpopular,
    'ba_unvar_annotations': ba_unvar_annotations,
}
all_ba = list(ba_by_name)

# Overlap between every unordered pair of detection methods.
for a, b in itertools.combinations(all_ba, 2):
    print(f"Jaccard Similarity of {a} and {b} is :{jaccard_similarity(ba_by_name[a],ba_by_name[b])}")

While the overlap isn't very consistent, it is interesting to note that the two most-overlapping pairs of groups are time with unpopularity, and unpopularity with unvaried annotations.

In [None]:
# Union of all bad-annotator lists (each annotator counted once).
# Note: this rebinds `all_ba`, which an earlier cell defines as a list of
# variable NAMES; from here on it holds annotator IDs.
all_ba = list(set(ba_unvar_annotations + ba_unpopular + ba_time + ba_semantics))
print(f"Total number of bad annotators are: {len(all_ba)}")
# Formatted as a percentage so the printed value matches its label.
print(f"Percentage of total annotators are: {len(all_ba)/df.annotator.nunique():.2%}")

### Save the annotators so we can filter them out quicker later

In [None]:
# Explicit name -> list mapping. Iterating `all_ba` and calling eval() on each
# entry breaks here, because the previous cell rebinds `all_ba` to annotator IDs.
ba_lists_to_save = {
    'ba_semantics': ba_semantics,
    'ba_time': ba_time,
    'ba_unpopular': ba_unpopular,
    'ba_unvar_annotations': ba_unvar_annotations,
}

# Anchor on PATH_ROOT (as the metadata CSV does) rather than the current
# working directory, and make sure the folder exists.
ba_output_dir = PATH_ROOT / 'data' / 'other'
ba_output_dir.mkdir(parents=True, exist_ok=True)

# One text file per detection method, one annotator ID per line.
for ba, annotators in ba_lists_to_save.items():
    with open(ba_output_dir / f'{ba}.txt', 'w') as f:
        for item in annotators:
            f.write("%s\n" % item)

In [None]:
# Anchor on PATH_ROOT (as the metadata CSV does) rather than the current
# working directory, and make sure the folder exists.
ba_all_dir = PATH_ROOT / 'data' / 'other'
ba_all_dir.mkdir(parents=True, exist_ok=True)

# Union of all bad-annotator lists, one ID per line. Sorted (by string form)
# so the file content is reproducible; set iteration order is not.
with open(ba_all_dir / 'ba_all.txt','w') as f:
    for item in sorted(set(ba_unvar_annotations + ba_unpopular + ba_time + ba_semantics), key=str):
        f.write("%s\n" % item)