### Read the three annotated files

In [65]:
import ast
import nltk
from nltk.tokenize import word_tokenize

import json
import pandas as pd

In [66]:
def read_data(filename):
    data = []
    with open(filename) as f:
        for line in f:
            data.append(json.loads(line))
    
    return data

In [67]:
# finaldata1 = pd.read_csv('player1_data_1103.csv')
# finaldata2 = pd.read_csv('player2_data_1103.csv')
# finaldata3 = pd.read_csv('player3_data_1103.csv')

finaldata1 = pd.read_csv('player1_data_2204.csv')
finaldata2 = pd.read_csv('player2_data_2204.csv')
finaldata3 = pd.read_csv('player3_data_2204.csv')

In [68]:
finaldata1['label'] = finaldata1['label'].apply(lambda x: ast.literal_eval(x))
finaldata2['label'] = finaldata2['label'].apply(lambda x: ast.literal_eval(x))
finaldata3['label'] = finaldata3['label'].apply(lambda x: ast.literal_eval(x))

### Compute the M&A

In [69]:
def merger_or_acquisition(dataframe):
    dataframe['labels'] = dataframe['label'].apply(lambda x: [i[2] for i in x])
    dataframe['merger_or_acq'] = dataframe['labels'].apply(lambda x: 'Acq' if 'Target' in x else ('Merger' if 'Bidder' in x else 'None'))
    
    dataframe.drop('labels', axis = 1, inplace = True)

In [70]:
merger_or_acquisition(finaldata1)
merger_or_acquisition(finaldata2)
merger_or_acquisition(finaldata3)

### Concatenate all dataframes together

In [71]:
overall_df = pd.concat([finaldata1, finaldata2, finaldata3])

In [72]:
overall_df.reset_index(drop = True, inplace = True)

In [73]:
overall_df

Unnamed: 0,id,data,label,merger_or_acq
0,50250,"Last November, T-Mobile and Sprint confirmed t...","[[15, 23, Bidder], [28, 34, Bidder], [35, 100,...",Merger
1,50410,A New York regulator revoked Friday its approv...,"[[21, 48, terminated], [52, 79, Bidder], [99, ...",Merger
2,50448,So sad and unfair that the FCC wouldn’t approv...,"[[27, 30, Org], [31, 47, terminated], [52, 70,...",Merger
3,50459,"With the HCF tie-up now scrapped, HBF can be e...","[[34, 37, Bidder], [13, 32, terminated], [9, 1...",Merger
4,50477,Mattel has rejected Hasbros acquisition offers...,"[[0, 6, Target], [11, 19, terminated], [20, 27...",Acq
...,...,...,...,...
7941,125506,IDFC Bank on Saturday announced it would merge...,"[[0, 9, Bidder], [143, 157, Org], [35, 46, exp...",Merger
7942,125508,PetIQ (Pending: PETQ ) entered into a definiti...,"[[0, 5, Bidder], [23, 30, success], [49, 69, s...",Acq
7943,125509,"Anticipating growing demand, Microsoft acquire...","[[29, 38, Bidder], [39, 47, success], [65, 80,...",Acq
7944,125510,"Noida-based RailYatri, India's leading long-di...","[[69, 77, success], [12, 21, Bidder], [130, 13...",Acq


In [74]:
overall_df = overall_df[overall_df['merger_or_acq'] != 'None']
overall_df['merger_or_acq'].value_counts()

Acq       4199
Merger    3742
Name: merger_or_acq, dtype: int64

In [75]:
overall_df.duplicated(['data']).value_counts()

False    7527
True      414
dtype: int64

In [76]:
overall_df[~overall_df.duplicated(['data'])].to_csv('final_annotation_2204.csv', index = False)

### Token Tagging

In [77]:
finaldata1.rename(columns={'id': 'id_1', 'label': 'label_1', 'merger_or_acq': 'merger_or_acq_1'}, inplace = True)
finaldata2.rename(columns={'id': 'id_2', 'label': 'label_2', 'merger_or_acq': 'merger_or_acq_2'}, inplace = True)
finaldata3.rename(columns={'id': 'id_3', 'label': 'label_3', 'merger_or_acq': 'merger_or_acq_3'}, inplace = True)

In [78]:
def create_token_tags(sentence, labels):
    split = []
    j = 0
    
#     sorted_labels = sorted(ast.literal_eval(labels), key=lambda x: x[0])
    sorted_labels = sorted(labels, key=lambda x: x[0])
    
    for i in sorted_labels:
        if i[0] !=0:
            # string 1
            split.append([(x, 'O') for x in word_tokenize(sentence[j:i[0]])])
        # string 2
        split.append([(x, i[2]) for x in word_tokenize(sentence[i[0]:i[1]])])
        j = i[1]
    # string 3
    split.append([(x, 'O') for x in word_tokenize(sentence[j:])])
    
    # flatten list of lists before returning
    return [item for sublist in split for item in sublist]

In [79]:
def compute_token_tags_for_all_sentences(all_sentences, all_labels):
    return [create_token_tags(x[0], x[1]) for x in zip(all_sentences, all_labels)]

In [80]:
annotated_set1 = compute_token_tags_for_all_sentences(finaldata1['data'], finaldata1['label_1'])
annotated_set2 = compute_token_tags_for_all_sentences(finaldata2['data'], finaldata2['label_2'])
annotated_set3 = compute_token_tags_for_all_sentences(finaldata3['data'], finaldata3['label_3'])

In [81]:
finaldata1['token_label_1'] = annotated_set1
finaldata2['token_label_2'] = annotated_set2
finaldata3['token_label_3'] = annotated_set3

In [82]:
all_players_df = finaldata1.merge(finaldata2, on='data', how='outer').merge(finaldata3, on='data', how='outer')

In [83]:
player_1_2_df = all_players_df[~all_players_df['token_label_1'].isna() & ~all_players_df['token_label_2'].isna() & all_players_df['token_label_3'].isna()]
player_1_3_df = all_players_df[~all_players_df['token_label_1'].isna() & all_players_df['token_label_2'].isna() & ~all_players_df['token_label_3'].isna()]
player_2_3_df = all_players_df[all_players_df['token_label_1'].isna() & ~all_players_df['token_label_2'].isna() & ~all_players_df['token_label_3'].isna()]
player_1_2_3_df = all_players_df[~all_players_df['token_label_1'].isna() & ~all_players_df['token_label_2'].isna() & ~all_players_df['token_label_3'].isna()]

In [84]:
player_1_2_df.reset_index(drop = True, inplace = True)
player_1_3_df.reset_index(drop = True, inplace = True)
player_2_3_df.reset_index(drop = True, inplace = True)
player_1_2_3_df.reset_index(drop = True, inplace = True)

### Compute the gold labels for three annotations and identify the strongest annotator (IAA for three players)

Compute IAA for:
- player 1 and 2
- player 1 and 3
- player 2 and 3
- player 1,2, and 3 (the see who has the highest accuracy -- comparing to gold standard)

In [85]:
from collections import Counter

In [86]:
def compute_gold_standards_for_three_annotations(label_1, label_2, label_3):
    gold_labels = []
    voting = []
    for i in range(len(label_1)):
        labels = Counter([label_1[i][1], label_2[i][1], label_3[i][1]]).most_common()
        gold_labels.append((label_1[i][0], labels[0][0]))
        voting.append(labels)
    
    return gold_labels
#     return (gold_labels, voting)

In [87]:
def check_n_compute_gold_standards_for_three_annotations(label_1, label_2, label_3):
    if not len(label_1) == len(label_2) == len(label_3):
        return ('Not same length', False)
    
    for i in range(len(label_1)):
        if not label_1[i][0] == label_2[i][0] == label_3[i][0]:
            return ('Token index %s doesn\'t match' % str(i), False)
    
    return compute_gold_standards_for_three_annotations(label_1, label_2, label_3)

In [88]:
res = []
for i in range(len(player_1_2_3_df)):
    res.append(check_n_compute_gold_standards_for_three_annotations(player_1_2_3_df['token_label_1'][i], player_1_2_3_df['token_label_2'][i], player_1_2_3_df['token_label_3'][i]))

In [89]:
player_1_2_3_df['gold_token_label'] = res

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_1_2_3_df['gold_token_label'] = res


In [90]:
from statistics import mean
from sklearn.metrics import cohen_kappa_score

In [91]:
def compute_cohen_kappa(label1, label2):
    try:
        return cohen_kappa_score([i[1] for i in label1], [i[1] for i in label2])
    except:
        return 0

In [92]:
kappa_annota1_n_gold = [compute_cohen_kappa(i[0], i[1]) for i in zip(player_1_2_3_df['token_label_1'], player_1_2_3_df['gold_token_label'])]
kappa_annota2_n_gold = [compute_cohen_kappa(i[0], i[1]) for i in zip(player_1_2_3_df['token_label_2'], player_1_2_3_df['gold_token_label'])]
kappa_annota3_n_gold = [compute_cohen_kappa(i[0], i[1]) for i in zip(player_1_2_3_df['token_label_3'], player_1_2_3_df['gold_token_label'])]

In [60]:
print(mean(kappa_annota1_n_gold))
print(mean(kappa_annota2_n_gold))
print(mean(kappa_annota3_n_gold))

0.8036023313321872
0.8395923805803496
0.796243294446743


In [93]:
print(mean(kappa_annota1_n_gold))
print(mean(kappa_annota2_n_gold))
print(mean(kappa_annota3_n_gold))

0.796958032757296
0.8419886479180644
0.8408496980972122


### Compute IAA for two players WITH O tags

In [94]:
player_all_1_2_df = all_players_df[~all_players_df['token_label_1'].isna() & ~all_players_df['token_label_2'].isna()]
player_all_1_3_df = all_players_df[~all_players_df['token_label_1'].isna() & ~all_players_df['token_label_3'].isna()]
player_all_2_3_df = all_players_df[~all_players_df['token_label_2'].isna() & ~all_players_df['token_label_3'].isna()]

In [95]:
player_all_1_2_df.reset_index(drop = True, inplace = True)
player_all_1_3_df.reset_index(drop = True, inplace = True)
player_all_2_3_df.reset_index(drop = True, inplace = True)

In [96]:
kappa_annota1_n_2 = [compute_cohen_kappa(i[0], i[1]) for i in zip(player_all_1_2_df['token_label_1'], player_all_1_2_df['token_label_2'])]
kappa_annota1_n_3 = [compute_cohen_kappa(i[0], i[1]) for i in zip(player_all_1_3_df['token_label_1'], player_all_1_3_df['token_label_3'])]
kappa_annota2_n_3 = [compute_cohen_kappa(i[0], i[1]) for i in zip(player_all_2_3_df['token_label_2'], player_all_2_3_df['token_label_3'])]

In [64]:
print(mean(kappa_annota1_n_2))
print(mean(kappa_annota1_n_3))
print(mean(kappa_annota2_n_3))

0.7002298727264971
0.6683351564961786
0.7383305715756152


In [97]:
print(mean(kappa_annota1_n_2))
print(mean(kappa_annota1_n_3))
print(mean(kappa_annota2_n_3))

0.74771539834382
0.7487014720864019
0.8013310168406973


### Compute IAA gold standard WITHOUT the O tags (just entity tags)

In [98]:
def only_have_entity_tags(label_1, label_2):    
    just_entity_tags_label1 = []
    just_entity_tags_label2 = []
    for i in range(len(label_1)):    
        counter = Counter([label_1[i][1], label_2[i][1]])
        counter_keys = counter.keys()
        if 'O' in counter_keys and len(counter_keys) == 1: # if it's only O tag, ignore
            continue
        else:
            just_entity_tags_label1.append(label_1[i])
            just_entity_tags_label2.append(label_2[i])
    
    return just_entity_tags_label1, just_entity_tags_label2

In [99]:
def compute_entity_tags_only(label_1, label_2):
    if not len(label_1) == len(label_2):
        return -1
    
    for i in range(len(label_1)):
        if not label_1[i][0] == label_2[i][0]:
            return -1
    
    return only_have_entity_tags(label_1, label_2)

In [100]:
def compute_IAA_with_entity_tags_only(all_label_1, all_label_2):
    entity_tags_only = [compute_entity_tags_only(i[0], i[1]) for i in zip(all_label_1, all_label_2)]
    filtered_list = list(filter(lambda a: a != -1, entity_tags_only)) # remove noise mismatch labels
    
    return [compute_cohen_kappa(i[0], i[1]) for i in filtered_list]

In [101]:
kappa_annota1_n_gold_entity_tags_only = compute_IAA_with_entity_tags_only(player_1_2_3_df['token_label_1'], player_1_2_3_df['gold_token_label'])
kappa_annota2_n_gold_entity_tags_only = compute_IAA_with_entity_tags_only(player_1_2_3_df['token_label_2'], player_1_2_3_df['gold_token_label'])
kappa_annota3_n_gold_entity_tags_only = compute_IAA_with_entity_tags_only(player_1_2_3_df['token_label_3'], player_1_2_3_df['gold_token_label'])

In [653]:
print(mean(kappa_annota1_n_gold_entity_tags_only))
print(mean(kappa_annota2_n_gold_entity_tags_only))
print(mean(kappa_annota3_n_gold_entity_tags_only))

0.7228285193512535
0.7970602315930029
0.7022182467164059


In [102]:
print(mean(kappa_annota1_n_gold_entity_tags_only))
print(mean(kappa_annota2_n_gold_entity_tags_only))
print(mean(kappa_annota3_n_gold_entity_tags_only))

0.7475451364881694
0.8327699425016619
0.8184519033535972


### Compute IAA for two players WITHOUT the O tags (just entity tags)

In [103]:
kappa_annota1_n_2_entity_tags_only = compute_IAA_with_entity_tags_only(player_all_1_2_df['token_label_1'], player_all_1_2_df['token_label_2'])
kappa_annota1_n_3_entity_tags_only = compute_IAA_with_entity_tags_only(player_all_1_3_df['token_label_1'], player_all_1_3_df['token_label_3'])
kappa_annota2_n_3_entity_tags_only = compute_IAA_with_entity_tags_only(player_all_2_3_df['token_label_2'], player_all_2_3_df['token_label_3'])

In [649]:
print(mean(kappa_annota1_n_2_entity_tags_only))
print(mean(kappa_annota1_n_3_entity_tags_only))
print(mean(kappa_annota2_n_3_entity_tags_only))

0.5619722870734613
0.49181806141698353
0.5790717885822338


In [104]:
print(mean(kappa_annota1_n_2_entity_tags_only))
print(mean(kappa_annota1_n_3_entity_tags_only))
print(mean(kappa_annota2_n_3_entity_tags_only))

0.6274045578947653
0.6059654838261579
0.6891430747010684


### Compute the gold labels for two annotations

When in dispute between two annotators, bias towards entities (than O tags) and if two entities different, bias towards the annotator with the higher IAA

In [105]:
def compute_gold_standards_for_two_annotations(label_1, label_2, dominant_player):
    gold_labels = []
    voting = []
    for i in range(len(label_1)):
        counter = Counter([label_1[i][1], label_2[i][1]])
        labels = counter.most_common()
        tag, count = labels[0][0], labels[0][1]
        
        if count == 1:
            if not 'O' in counter.keys():
                gold_labels.append((label_1[i][0], labels[dominant_player - 1][0]))
            else:
                if tag == 'O':
                    tag2, count2 = labels[1][0], labels[1][1]
                    gold_labels.append((label_1[i][0], tag2))
                else:
                    gold_labels.append((label_1[i][0], tag))
        elif count == 2:
            gold_labels.append((label_1[i][0], tag))
        voting.append(labels)
    
    return gold_labels
#     return (gold_labels, voting)

In [106]:
def check_n_compute_gold_standards_for_two_annotations(label_1, label_2, dominant_player):
    if not len(label_1) == len(label_2):
        return ('Not same length', False)
    
    for i in range(len(label_1)):
        if not label_1[i][0] == label_2[i][0]:
            return ('Token index %s doesn\'t match' % str(i), False)
    
    return compute_gold_standards_for_two_annotations(label_1, label_2, dominant_player)

In [107]:
res = []
dominant_player = 2
for i in range(len(player_1_2_df)):
    res.append(check_n_compute_gold_standards_for_two_annotations(player_1_2_df['token_label_1'][i], player_1_2_df['token_label_2'][i], dominant_player))
player_1_2_df['gold_token_label'] = res

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_1_2_df['gold_token_label'] = res


In [108]:
res = []
dominant_player = 2
for i in range(len(player_1_3_df)):
    res.append(check_n_compute_gold_standards_for_two_annotations(player_1_3_df['token_label_1'][i], player_1_3_df['token_label_3'][i], dominant_player))
player_1_3_df['gold_token_label'] = res

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_1_3_df['gold_token_label'] = res


In [109]:
res = []
dominant_player = 1 # first player between player 2 and player 3 (therefore player 2)
for i in range(len(player_2_3_df)):
    res.append(check_n_compute_gold_standards_for_two_annotations(player_2_3_df['token_label_2'][i], player_2_3_df['token_label_3'][i], dominant_player))
player_2_3_df['gold_token_label'] = res

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_2_3_df['gold_token_label'] = res


### Reconstruct the all_players_df and create final gold token labels

In [110]:
player_1_df = all_players_df[~all_players_df['token_label_1'].isna() & all_players_df['token_label_2'].isna() & all_players_df['token_label_3'].isna()]
player_2_df = all_players_df[all_players_df['token_label_1'].isna() & ~all_players_df['token_label_2'].isna() & all_players_df['token_label_3'].isna()]
player_3_df = all_players_df[all_players_df['token_label_1'].isna() & all_players_df['token_label_2'].isna() & ~all_players_df['token_label_3'].isna()]

In [111]:
player_1_df['gold_token_label'] = player_1_df['token_label_1']
player_2_df['gold_token_label'] = player_2_df['token_label_2']
player_3_df['gold_token_label'] = player_3_df['token_label_3']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_1_df['gold_token_label'] = player_1_df['token_label_1']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_2_df['gold_token_label'] = player_2_df['token_label_2']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_3_df['gold_token_label'] = player_3_df['token_label_3']


In [112]:
new_df = pd.concat([player_1_df, player_2_df, player_3_df, player_1_2_df, player_1_3_df, player_2_3_df, player_1_2_3_df])
new_df.reset_index(drop = True, inplace = True)

In [113]:
new_df['data'] = new_df['data'].apply(lambda x: x.replace('\r', ''))

In [114]:
new_df

Unnamed: 0,id_1,data,label_1,merger_or_acq_1,token_label_1,id_2,label_2,merger_or_acq_2,token_label_2,id_3,label_3,merger_or_acq_3,token_label_3,gold_token_label
0,50250.0,"Last November, T-Mobile and Sprint confirmed t...","[[15, 23, Bidder], [28, 34, Bidder], [35, 100,...",Merger,"[(Last, O), (November, O), (,, O), (T-Mobile, ...",,,,,,,,,"[(Last, O), (November, O), (,, O), (T-Mobile, ..."
1,50410.0,A New York regulator revoked Friday its approv...,"[[21, 48, terminated], [52, 79, Bidder], [99, ...",Merger,"[(A, O), (New, O), (York, O), (regulator, O), ...",,,,,,,,,"[(A, O), (New, O), (York, O), (regulator, O), ..."
2,50459.0,"With the HCF tie-up now scrapped, HBF can be e...","[[34, 37, Bidder], [13, 32, terminated], [9, 1...",Merger,"[(With, O), (the, O), (HCF, Bidder), (tie-up, ...",,,,,,,,,"[(With, O), (the, O), (HCF, Bidder), (tie-up, ..."
3,50477.0,Mattel has rejected Hasbros acquisition offers...,"[[0, 6, Target], [11, 19, terminated], [20, 27...",Acq,"[(Mattel, Target), (has, O), (rejected, termin...",,,,,,,,,"[(Mattel, Target), (has, O), (rejected, termin..."
4,50503.0,"In terminating its merger agreement, Tribune o...","[[37, 44, Bidder], [70, 78, Bidder], [0, 35, t...",Merger,"[(In, terminated), (terminating, terminated), ...",,,,,,,,,"[(In, terminated), (terminating, terminated), ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7527,121955.0,Google acquired Cambridge-based startup Redux ...,"[[0, 6, Bidder], [7, 15, success], [40, 48, Ta...",Acq,"[(Google, Bidder), (acquired, success), (Cambr...",123366.0,"[[0, 6, Bidder], [7, 15, success], [40, 45, Ta...",Acq,"[(Google, Bidder), (acquired, success), (Cambr...",125430.0,"[[0, 6, Bidder], [7, 15, success], [40, 48, Ta...",Acq,"[(Google, Bidder), (acquired, success), (Cambr...","[(Google, Bidder), (acquired, success), (Cambr..."
7528,121956.0,Facebook is reportedly acquiring London-based ...,"[[0, 8, Bidder], [54, 67, Target], [23, 32, ex...",Acq,"[(Facebook, Bidder), (is, O), (reportedly, O),...",123404.0,"[[0, 8, Bidder], [54, 67, Target], [12, 32, ex...",Acq,"[(Facebook, Bidder), (is, O), (reportedly, exp...",125202.0,"[[0, 8, Bidder], [23, 32, expecting], [54, 67,...",Acq,"[(Facebook, Bidder), (is, O), (reportedly, O),...","[(Facebook, Bidder), (is, O), (reportedly, O),..."
7529,121961.0,Shutterstock A Vancouver company called Buddyb...,"[[79, 92, success], [96, 101, Bidder], [0, 12,...",Acq,"[(Shutterstock, Org), (A, O), (Vancouver, O), ...",123351.0,"[[40, 50, Target], [84, 92, success], [96, 101...",Acq,"[(Shutterstock, O), (A, O), (Vancouver, O), (c...",125280.0,"[[84, 95, success], [96, 101, Bidder], [0, 12,...",Acq,"[(Shutterstock, Org), (A, O), (Vancouver, O), ...","[(Shutterstock, Org), (A, O), (Vancouver, O), ..."
7530,121970.0,"Today (Jan. 17), Pernod Ricard announced it ha...","[[44, 56, success], [57, 74, Target], [17, 30,...",Acq,"[(Today, O), ((, O), (Jan., O), (17, O), (), O...",123378.0,"[[17, 30, Bidder], [48, 56, success], [57, 70,...",Acq,"[(Today, O), ((, O), (Jan., O), (17, O), (), O...",125333.0,"[[17, 30, Bidder], [48, 56, success], [57, 74,...",Acq,"[(Today, O), ((, O), (Jan., O), (17, O), (), O...","[(Today, O), ((, O), (Jan., O), (17, O), (), O..."


In [115]:
new_df.to_csv('gold_label_annotated_data_2204.csv', index = False)