In [2]:
import pandas as pd
import numpy as np
import os
import math

In [3]:
# Load the training split together with its per-example score columns.
scores_dir = os.path.expanduser('~/Influence_Scores/score_results')
train_csv_path = os.path.join(scores_dir, 'sexist_data_final_train.csv')
df = pd.read_csv(train_csv_path)

We will follow these steps:
1. Sort the data based on PVI, EL2N and VOG.
2. Sample based on the type of prune.
3. Select a random sample.
4. Run the experiments and store the final model.
5. Test the accuracy on the test data.
6. Test the F1 score on the HateCheck Eval Sexism subset.

In [4]:
# Preview the first four rows to check which columns were loaded.
df.iloc[:4]

Unnamed: 0,id,text,numeric_labels,sexist,predicted_label_1,correct_yx,predicted_label_2,predicted_label_3,predicted_label_4,predicted_label_5,average_pvi,average_el2n,average_vog,misclassification_number
0,2651,SIIIIGH http://t.co/BpJX5JwhlH,0,False,LABEL_0,True,LABEL_0,LABEL_0,LABEL_0,LABEL_0,0.148039,0.006603,-0.83291,0
1,6089,I really dislike working with undergrads in la...,0,False,LABEL_0,True,LABEL_0,LABEL_0,LABEL_0,LABEL_0,0.132709,0.021481,-0.832839,0
2,9786,"So calling qualified hires ""tokens"" b/c it's n...",0,False,LABEL_0,True,LABEL_0,LABEL_0,LABEL_0,LABEL_0,0.142481,0.012016,-0.831354,0
3,10635,Noooooo #mkr whyyyyy,0,False,LABEL_0,True,LABEL_0,LABEL_0,LABEL_0,LABEL_0,0.145224,0.009347,-0.834254,0


In [5]:
# Class balance of the training data, counted per value of `numeric_labels`.
df.loc[:, 'numeric_labels'].value_counts()

numeric_labels
0    8272
1    1269
Name: count, dtype: int64

## Informed Undersampling
In this experimental setup, we systematically prune the majority (non-sexist) class using the influence score values.

In [6]:
def get_informed_undersampled_prune(data, inf_score, type_prune, prune_rate, data_name = 'cmsb'):
    """Undersample the majority class by row order and save the result as CSV.

    The function relies purely on the row order of ``data``; the caller is
    expected to have sorted it by the chosen influence score beforehand.
    Every row with ``numeric_labels == 1`` is kept. Of the ``n`` rows with
    ``numeric_labels == 0``, ``ceil((1 - prune_rate / 100) * n)`` are kept:

    * ``type_prune == 'hard'`` keeps the *last* rows (prunes from the top),
    * ``type_prune == 'easy'`` keeps the *first* rows (prunes from the bottom).

    The combined frame (label-1 rows first, then the surviving label-0 rows)
    is written, including its index column, to
    ``{data_name}_{inf_score}_{type_prune}/sexist_data_undersample_{prune_rate}_train.csv``
    relative to the current working directory. The directory is created if it
    does not exist yet.

    Parameters
    ----------
    data : pandas.DataFrame
        Pre-sorted frame containing at least a ``numeric_labels`` column.
    inf_score : str
        Name of the influence score; only used to build the output directory.
    type_prune : {'hard', 'easy'}
        Which end of the sorted majority-class rows is removed.
    prune_rate : int or float
        Percentage (0-100) of majority-class rows to remove.
    data_name : str, default 'cmsb'
        Dataset tag; only used to build the output directory.

    Raises
    ------
    ValueError
        If ``type_prune`` is not ``'hard'`` or ``'easy'``, or if ``prune_rate``
        lies outside ``[0, 100]``.
    """
    # Fail loudly instead of silently writing nothing for an unknown mode.
    if type_prune not in ('hard', 'easy'):
        raise ValueError(f"type_prune must be 'hard' or 'easy', got {type_prune!r}")
    if not 0 <= prune_rate <= 100:
        raise ValueError(f'prune_rate must be within [0, 100], got {prune_rate!r}')

    path_to_df = f'{data_name}_{inf_score}_{type_prune}'
    data_sexist = data[data['numeric_labels'] == 1].reset_index(drop = True)
    data_non_sexist = data[data['numeric_labels'] == 0].reset_index(drop = True)

    n_non_sexist = data_non_sexist.shape[0]
    n_keep = math.ceil((1 - (prune_rate / 100)) * n_non_sexist)

    if type_prune == 'hard':
        # Slice from an explicit start offset: the previous `iloc[-n_keep:-1]`
        # always lost the final row and, for n_keep == 0, kept almost everything.
        data_non_sexist_pruned = data_non_sexist.iloc[n_non_sexist - n_keep:]
    else:
        data_non_sexist_pruned = data_non_sexist.iloc[:n_keep]
    data_non_sexist_pruned = data_non_sexist_pruned.reset_index(drop = True)

    data_final = pd.concat([data_sexist, data_non_sexist_pruned], axis = 0)

    # `to_csv` does not create missing parent directories.
    os.makedirs(path_to_df, exist_ok = True)
    data_final.to_csv(os.path.join(path_to_df, f'sexist_data_undersample_{prune_rate}_train.csv'))

In [7]:
# Build one frame per influence score, each sorted by that score.
# PVI is sorted ascending and EL2N descending -- presumably so that the
# "hardest" examples come first in both frames (TODO confirm).
df_pvi = (
    df.loc[:, ['text', 'numeric_labels', 'average_pvi']]
    .sort_values(by = 'average_pvi', ascending = True)
    .reset_index(drop = True)
)
df_el2n = (
    df.loc[:, ['text', 'numeric_labels', 'average_el2n']]
    .sort_values(by = 'average_el2n', ascending = False)
    .reset_index(drop = True)
)
# VOG frame is currently disabled:
#df_vog = df[['text', 'numeric_labels', 'average_vog']].sort_values(by = 'average_vog',
#                                                                   ascending = False).reset_index(drop = True)

In [11]:
# Write one undersampled training file per (prune rate, score, prune type).
scored_frames = (
    (df_pvi, 'pvi'),
    (df_el2n, 'el2n'),
    #(df_vog, 'vog'),  # VOG pruning is currently disabled
)
for prune_rate in (5, 10, 15, 20, 25, 30, 35, 40, 50, 60):
    for sorted_frame, score_name in scored_frames:
        for prune_kind in ('hard', 'easy'):
            get_informed_undersampled_prune(
                sorted_frame,
                inf_score = score_name,
                type_prune = prune_kind,
                prune_rate = prune_rate,
            )

In [9]:
# Inspect the label-0 rows of the EL2N-sorted frame (the rows that get pruned).
df_el2n.loc[df_el2n['numeric_labels'] == 0]

Unnamed: 0,text,numeric_labels,average_el2n
74,I approve of a woman taking the first step to ...,0,1.149410
75,I approve of a woman taking the aggressive rol...,0,1.146732
77,I think a woman could do most things as well a...,0,1.142843
79,yeah im a female but I think like a man,0,1.139142
80,Women can handle pressure just as well as men ...,0,1.138915
...,...,...,...
9536,“MENTION3853: MENTION4213 MENTION583 MENTION22...,0,0.005389
9537,"MENTION4272 Oh hey, statistics http://t.co/Fla...",0,0.005386
9538,MENTION4683 MENTION2978 I think the hashtag go...,0,0.005281
9539,This is really Drasco ... #mkr http://t.co/zy5...,0,0.005240
