In [1]:
import pandas as pd
import numpy as np
import os
import torch
import pickle
import math
from submodlib import FacilityLocationFunction

In [4]:
with open('cluster_analysis/sexist_data_train.pkl', 'rb') as train_file:
    train_embeds = pickle.load(train_file)

In [7]:
df_train = pd.read_csv(os.path.join(os.path.expanduser('~/Influence_Scores/score_results'), 'sexist_data_final_train.csv'))
#df_train = df_train.drop(columns = ['Unnamed: 0']).reset_index(drop = True)
df_train.head()

Unnamed: 0,id,text,numeric_labels,sexist,predicted_label_1,correct_yx,predicted_label_2,predicted_label_3,predicted_label_4,predicted_label_5,average_pvi,average_el2n,average_vog,misclassification_number
0,2651,SIIIIGH http://t.co/BpJX5JwhlH,0,False,LABEL_0,True,LABEL_0,LABEL_0,LABEL_0,LABEL_0,0.148039,0.006603,-0.83291,0
1,6089,I really dislike working with undergrads in la...,0,False,LABEL_0,True,LABEL_0,LABEL_0,LABEL_0,LABEL_0,0.132709,0.021481,-0.832839,0
2,9786,"So calling qualified hires ""tokens"" b/c it's n...",0,False,LABEL_0,True,LABEL_0,LABEL_0,LABEL_0,LABEL_0,0.142481,0.012016,-0.831354,0
3,10635,Noooooo #mkr whyyyyy,0,False,LABEL_0,True,LABEL_0,LABEL_0,LABEL_0,LABEL_0,0.145224,0.009347,-0.834254,0
4,9749,I don't know ANY genuine female nerds... who w...,0,False,LABEL_1,False,LABEL_1,LABEL_1,LABEL_1,LABEL_1,-1.236664,0.86859,-0.832807,5


In [8]:
df_train['sexist'].value_counts()

sexist
False    8272
True     1269
Name: count, dtype: int64

In [9]:
df_train[df_train['sexist'] == True].shape[0] / df_train[df_train['sexist'] == False].shape[0]

0.1534090909090909

In [10]:
df_train_sexist = df_train[df_train['sexist'] == True].reset_index(drop  = True)
df_train_non_sexist = df_train[df_train['sexist'] == False].reset_index(drop  = True)

In [11]:
def get_embeds(identities, ids_embeds):
    tensor = ids_embeds[identities[0]].unsqueeze(0)
    for identity in identities[1:]:
        tensor = torch.cat((tensor, ids_embeds[identity].unsqueeze(0)), dim  = 0)
    return tensor

In [12]:
identities_sexist = df_train_sexist['id'].to_list()
identities_non_sexist = df_train_non_sexist['id'].to_list()
embed_tensor_sexist = get_embeds(identities_sexist, train_embeds)
embed_tensor_non_sexist = get_embeds(identities_non_sexist, train_embeds)

In [13]:
objFL_sexist = FacilityLocationFunction(n = embed_tensor_sexist.shape[0], data = embed_tensor_sexist, mode = 'dense', metric = 'cosine')
objFL_non_sexist = FacilityLocationFunction(n = embed_tensor_non_sexist.shape[0], data = embed_tensor_non_sexist, mode = 'dense', metric = 'cosine')

In [14]:
def get_subsets(data, budget, objFL):
    #objFL = FacilityLocationFunction(n = data.shape[0], data = data, mode = 'dense', metric = 'cosine')
    greedyList = objFL.maximize(budget = budget, optimizer = 'LazierThanLazyGreedy')
    return greedyList

In [15]:
def get_dataframes_gains(subset_list, train_df):
    dict_id_gains = {'idx': [idx for idx, _ in subset_list], 'gain' : [gain for _, gain in subset_list]}
    train_df_subset = df_train[df_train.index.isin(dict_id_gains['idx'])].reset_index(drop = True)
    train_df_subset['gain'] = dict_id_gains['gain']
    return train_df_subset

In [16]:
for prune_rate in (5, 10, 15, 20, 25, 30, 35, 40, 50, 60):
    prune_rate_sexist = prune_rate / 2
    prune_rate_non_sexist = prune_rate / 2
    budget_sexist = math.ceil((1 - (prune_rate/100)) * embed_tensor_sexist.shape[0])
    budget_non_sexist = math.ceil((1 - (prune_rate/100)) * embed_tensor_non_sexist.shape[0])
    subset_sexist = get_subsets(data = embed_tensor_sexist, budget = budget_sexist, objFL = objFL_sexist)
    #print(len(subset_sexist))
    subset_non_sexist = get_subsets(data = embed_tensor_non_sexist, budget = budget_non_sexist, objFL = objFL_non_sexist)
    #print(len(subset_non_sexist))
    df_sexist_subset = get_dataframes_gains(subset_sexist, df_train_sexist)
    df_non_sexist_subset = get_dataframes_gains(subset_non_sexist, df_train_non_sexist)
    df_final_subset = pd.concat([df_sexist_subset, df_non_sexist_subset], axis = 0)
    df_final_subset.to_csv(os.path.join('submod_data', f'sexist_data_submodular_{prune_rate}_train.csv'), index = False)
    print('Done')

[||||                ]21% [Iteration 241 of 1143]9]

Done


[                    ]1% [Iteration 11 of 1079]445]

Done


[                    ]1% [Iteration 11 of 1016]032]

Done


[                    ]1% [Iteration 10 of 952]6618]

Done


[||||                ]22% [Iteration 187 of 889]04]

Done


[                    ]1% [Iteration 9 of 825] 5791]

Done


[||||||||||||||||||||]100% [Iteration 5377 of 5377]

Done


[                    ]1% [Iteration 7 of 4 of 4964]

Done


[                    ]1% [Iteration 6 of 508] 4136]

Done
Done


[||||||||||||||||||||]100% [Iteration 3309 of 3309]

In [19]:
df = pd.read_csv(os.path.join('submod_data', 'sexist_data_submodular_10_train.csv'))
df['numeric_labels'].value_counts()

numeric_labels
0    7442
1    1146
Name: count, dtype: int64