In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, BertForSequenceClassification, TextClassificationPipeline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import os
from tqdm import tqdm
from scipy.stats import ttest_ind

In [2]:
# Dataset directory and the split to evaluate.
path = './soc/data/majority_gab_dataset_25k/'
mode = 'dev'

# The split is stored as JSON Lines (one record per line); keep the raw
# posts as a plain Python list for the classification pipeline.
split_file = os.path.join(path, f'{mode}.jsonl')
data = pd.read_json(split_file, lines=True)
text = data['Text'].tolist()

In [8]:
# Evaluate the fine-tuned checkpoint of every seed on the loaded split and
# write one prediction CSV per checkpoint under `<model_dir>/prediction/`.

# Binary gold label: 1 when `cv + hd` is positive for the post, otherwise 0.
true_label = ((data['cv']+data['hd'])>0).astype(int).to_list()

# Use the first GPU when one is available, otherwise fall back to CPU (-1).
device = 0 if torch.cuda.is_available() else -1

# The tokenizer is the same for every seed, so load it once outside the loop.
# Truncation/padding options are NOT given here: they only take effect when
# passed at encoding time (see the pipeline call below).
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

for seed in tqdm(range(10), total=10):
    model_dir = f'./ear_bert/entropybert-gab25k-{seed}-0.01/'
    model = BertForSequenceClassification.from_pretrained(model_dir, num_labels=2)
    pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, top_k=None, device=device)

    # Request truncation at call time so posts longer than 512 tokens are cut
    # instead of overflowing the model's position embeddings.
    results = pipe(text, truncation=True, max_length=512)

    # With top_k=None every input yields one {'label', 'score'} entry per class.
    prob_non_hate = [x['score'] for y in results for x in y if x['label']=='LABEL_0']
    prob_hate = [x['score'] for y in results for x in y if x['label']=='LABEL_1']
    # Fail loudly if the checkpoint uses other label names; zip() below would
    # otherwise silently produce too few predictions.
    assert len(prob_non_hate) == len(prob_hate) == len(text), \
        'expected one LABEL_0 and one LABEL_1 score per input text'

    # Predict from THIS model's scores (the loaded dataset has no
    # 'prob_non_hate' / 'prob_hate' columns); ties go to the hate class (1).
    pred_label = [0 if a > b else 1 for (a, b) in zip(prob_non_hate, prob_hate)]

    out_dict = {}
    out_dict['Text'] = text
    out_dict['pred_label'] = pred_label
    out_dict['true_label'] = true_label

    results_df = pd.DataFrame.from_dict(out_dict)
    os.makedirs(os.path.join(model_dir, 'prediction'), exist_ok=True)
    results_df.to_csv(os.path.join(model_dir, f'prediction/gab_{mode}.csv'), index=False)

In [5]:
# Preview the first five rows of the loaded split to inspect its columns.
data.head(n=5)

Unnamed: 0,text_id,Text,purity,harm,im,cv,ex,degradation,fairness,hd,...,cheating,subversion,rel,sxo,rae,nat,pol,authority,vo,idl
0,134678,Prayer and fasting and staying in a state of g...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,17143934,This is why lamb fries were invented? Mountai...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22712962,I see a life of heroin for Hogg.,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,22253875,With your limited understanding of basic conce...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,37252246,iTS ok to make money off digital currency in t...,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
