In [1]:
# Reload imported modules before each cell runs, so edits to the project's
# helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Mount Google Drive into the Colab VM; the project files live on Drive.
from google.colab import drive
drive.mount('/content/drive')

# Make the project's `baselines` directory the working directory. The relative
# paths used in later cells (data, predictions, model checkpoints) are resolved
# against it.
%cd /content/drive/My\ Drive/Georgia_Tech/Spring_2021/sbic_stereotypes/baselines

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1M873oJOlIb1Bd5vliq6d04b61XsHLT5h/sbic_stereotypes/baselines


In [7]:
!pip install transformers
!pip install datasets
!pip install rouge
!pip install bert_score
!pip install tqdm

import torch
import pandas as pd
import numpy as np

from data_preprocessing import *
from datasets import Dataset
from transformers import AutoModelForCausalLM

Collecting bert_score
[?25l  Downloading https://files.pythonhosted.org/packages/14/27/ccf86d5dfc19f89bee4449e96ac6e0f7c312f1614de86609c5f6da5c40af/bert_score-0.3.8-py3-none-any.whl (58kB)
[K     |████████████████████████████████| 61kB 2.8MB/s 
Installing collected packages: bert-score
Successfully installed bert-score-0.3.8


In [3]:
# Location of the SBIC csv files, relative to the working directory.
DATA_DIR = '../data/'
# Forwarded to predict_samples as its last argument
# (presumably the maximum sequence length -- confirm in testing_utils).
MAX_LENGTH = 128

# ------------------------- pick model and dataset -------------------------
from_file = DATA_DIR + 'SBIC.v2.dev.csv'

# Columns selected from the dataset for evaluation; the BLEU/Rouge cell also
# uses this list to fix the column order.
pred_col = [
    'HITId',
    'post',
    'sexYN',
    'offensiveYN',
    'intentYN',
    'whoTarget',
    'targetMinority',
    'targetStereotype',
    'speakerMinorityYN',
]

# One dict per evaluation run:
#   'TO ACTUAL' / 'TO PRED' : csv files read back by the scoring cells
#   'TRAINED MODEL'         : checkpoint directory loaded with from_pretrained
#   'BASE MODEL'            : name handed to setup_tokenizer
#   'SAMPLE SIZE'           : number of rows sampled for evaluation
test_gpt_5epoch = {
    'TO ACTUAL': 'pred/test/gpt_5epoch_dev_actual_sub.csv',
    'TO PRED': 'pred/test/gpt_5epoch_dev_pred_sub.csv',
    'TRAINED MODEL': 'model/gpt_5epoch/checkpoint-44734/',
    'BASE MODEL': 'openai-gpt',
    'SAMPLE SIZE': 2500,
}

test_gpt2_5epoch = {
    'TO ACTUAL': 'pred/test/gpt2_5epoch_dev_actual.csv',
    'TO PRED': 'pred/test/gpt2_5epoch_dev_pred.csv',
    'TRAINED MODEL': 'model/gpt2_5epoch/checkpoint-48150/',
    'BASE MODEL': 'gpt2',
    'SAMPLE SIZE': 2500,
}

# Switch runs by pointing this at one of the dicts above.
active_test = test_gpt_5epoch
# ---------------------------------------------------------------------------

df = pd.read_csv(from_file)

In [None]:
from testing_utils import *

# The return value is not used, so clean_post presumably rewrites `df` in
# place -- confirm in testing_utils.
clean_post(df)

# The tokenizer is built from the base model name, while the weights come from
# the fine-tuned checkpoint directory of the active run.
tokenizer = setup_tokenizer(active_test['BASE MODEL'])
model = AutoModelForCausalLM.from_pretrained(
    active_test['TRAINED MODEL'],
    pad_token_id=tokenizer.eos_token_id,
)

# Put the model in evaluation mode before generating predictions.
model.eval()

def get_samples_from_actual(df, pred_col, active_test, random_state=None):
  """Draw a random subset of rows to use as the evaluation ground truth.

  Args:
    df: DataFrame that contains at least the columns named in ``pred_col``.
    pred_col: List of column names to keep, in output order.
    active_test: Run configuration dict; ``active_test['SAMPLE SIZE']`` is the
      number of rows to draw.
    random_state: Optional seed forwarded to ``DataFrame.sample``. Pass a fixed
      value to obtain the same subset on every run; the default ``None`` keeps
      the previous behaviour of drawing a different sample each time.

  Returns:
    A new DataFrame holding the sampled rows restricted to ``pred_col``, after
    ``categorize_var`` has been applied to it. ``df`` itself is not modified
    by this function.

  Raises:
    ValueError: Raised by pandas if ``SAMPLE SIZE`` exceeds the number of rows
      in ``df`` (sampling is done without replacement).
  """
  sample_size = active_test['SAMPLE SIZE']
  # Take an explicit copy so the helper below works on an independent frame
  # rather than on something pandas may treat as a view of `df`.
  actual = df[pred_col].sample(n=sample_size, random_state=random_state).copy()
  # The return value is not used, so categorize_var presumably recodes the
  # columns of `actual` in place -- confirm in the helper module.
  categorize_var(actual)
  return actual

# Kept for reference only: a fixed list of HITIds, presumably for evaluating a
# hand-picked set of posts instead of a random sample. Nothing below uses it.
#post_ids = ['3W0XM68YZPPSXA20A826L4NZQHXK11','3IYI9285WSUH9T6G8KRE1L6DHMOCJG',
#            '3ZXV7Q5FJBI14RKKPU0TMNELOFTCFZ','3X55NP42EOAPI4DVA4LX5EOVK7XP39',
#            '33IXYHIZB5CW0VSMXQRHSSKZYQFE2S']
# Draw the evaluation sample (SAMPLE SIZE rows of the pred_col columns).
actual = get_samples_from_actual(df, pred_col, active_test)
# NOTE(review): the return value is discarded; presumably predict_samples
# writes its results to the 'TO ACTUAL' / 'TO PRED' csv paths of active_test,
# which the scoring cells read back -- confirm in testing_utils.
predict_samples(model, tokenizer, actual, pred_col, active_test, MAX_LENGTH)

In [4]:
## F1 and Precision/Recall Scores ##
from data_preprocessing import *
from testing_utils import *

# Load the ground-truth sample and the model predictions stored for the
# active run.
actual = pd.read_csv(active_test['TO ACTUAL'])
pred = pd.read_csv(active_test['TO PRED'])

print("Category: (Precision, Recall, F1)")

# One entry per categorical variable:
# (printed label, column name, the two constants handed to f1_score).
binary_categories = [
    ('Offensive: ', 'offensiveYN', OFFY, OFFN),
    ('Intent: ', 'intentYN', INTY, INTN),
    ('Lewd: ', 'sexYN', LEWDY, LEWDN),
    ('Group Targeted: ', 'whoTarget', GRPY, GRPN),
    ('In Group: ', 'speakerMinorityYN', INGY, INGN),
]

for label, column, label_y, label_n in binary_categories:
  print(label, f1_score(actual, pred, column, label_y, label_n))

Category: (Precision, Recall, F1)
Offensive:  (0.8414403032217309, 0.882703777335984, 0.8615782664941785)
Intent:  (0.7807959570435882, 0.8956521739130435, 0.8342895713803579)
Lewd:  (0.696551724137931, 0.5024875621890548, 0.5838150289017342)
Group Targeted:  (0.7228412256267409, 0.9243098842386465, 0.8112543962485346)
In Group:  (0.6666666666666666, 0.0547945205479452, 0.10126582278481013)


In [5]:
## BLEU/Rouge-L Scores ##
from testing_utils import *
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu
from rouge import Rouge

# Columns of `actual` that are kept as-is; the two free-text columns are
# re-attached below from the output of aggregate_and_format.
keep_col = [
    'HITId', 'post', 'sexYN', 'offensiveYN', 'intentYN', 'whoTarget',
    'speakerMinorityYN',
]

sub_df = aggregate_and_format(df[['HITId', 'targetMinority', 'targetStereotype']])
# NOTE(review): join(on='HITId') matches actual['HITId'] against the *index*
# of sub_df, so aggregate_and_format is presumably expected to return a frame
# indexed by HITId -- confirm in testing_utils.
actual = (
    actual[keep_col]
    .join(sub_df, on='HITId')
    .reindex(columns=pred_col)
)


def _generation_scores(column):
  """Build references/hypotheses for `column` and score them with BLEU and Rouge.

  Reads the notebook-level `actual` and `pred` frames. Returns the tuple
  (references, hypotheses, bleu_max, bleu_avg, rouge_max, rouge_avg).
  """
  refs, hyps = get_references_and_hypotheses(column, actual, pred)
  bleu_max, bleu_avg = get_bleu_score(refs, hyps)
  rouge_max, rouge_avg = get_rouge_scores(refs, hyps)
  return refs, hyps, bleu_max, bleu_avg, rouge_max, rouge_avg


def _print_generation_scores(heading, bleu_avg, bleu_max, rouge_avg, rouge_max):
  """Print one block of BLEU/Rouge results under `heading`."""
  print(heading)
  print("Bleu Score (Avg): ", bleu_avg)
  print("Bleu Score (Max): ", bleu_max)
  print("Rouge Score (Avg) (Precision, Recall, F1): ", rouge_avg)
  print("Rouge Score (Max) (Precision, Recall, F1): ", rouge_max)


# The references/hypotheses are kept as notebook-level names because the
# BERTScore cell reuses them.
(references_tm, hypotheses_tm,
 bleu_score_tm_max, bleu_score_tm_avg,
 rouge_scores_tm_max, rouge_scores_tm_avg) = _generation_scores('targetMinority')

(references_ts, hypotheses_ts,
 bleu_score_ts_max, bleu_score_ts_avg,
 rouge_scores_ts_max, rouge_scores_ts_avg) = _generation_scores('targetStereotype')

_print_generation_scores("Target Minority Scores: ",
                         bleu_score_tm_avg, bleu_score_tm_max,
                         rouge_scores_tm_avg, rouge_scores_tm_max)

_print_generation_scores("Implied Stereotype Scores: ",
                         bleu_score_ts_avg, bleu_score_ts_max,
                         rouge_scores_ts_avg, rouge_scores_ts_max)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


Target Minority Scores: 
Bleu Score (Avg):  0.6884384989652276
Bleu Score (Max):  0.73034729388037
Rouge Score (Avg) (Precision, Recall, F1):  [0.66363921 0.66359524 0.66267057]
Rouge Score (Max) (Precision, Recall, F1):  [0.71356667 0.71516667 0.7135219 ]
Implied Stereotype Scores: 
Bleu Score (Avg):  0.5321667997170639
Bleu Score (Max):  0.6219778300600611
Rouge Score (Avg) (Precision, Recall, F1):  [0.49906263 0.48796187 0.48967972]
Rouge Score (Max) (Precision, Recall, F1):  [0.60585528 0.60547436 0.60154562]


In [None]:
from datasets import load_metric
from testing_utils import *

# Score the generated text with the 'bertscore' metric, reusing the
# references/hypotheses built in the BLEU/Rouge cell.
metric = load_metric('bertscore')

bert_scores_ts = metric.compute(
    predictions=hypotheses_ts,
    references=references_ts,
    lang='en',
)
bert_scores_tm = metric.compute(
    predictions=hypotheses_tm,
    references=references_tm,
    lang='en',
)

# Reduce the raw metric output to the (Precision, Recall, F1) values that are
# printed below.
bert_score_ts = get_bert_score(bert_scores_ts, hypotheses_ts, references_ts)
bert_score_tm = get_bert_score(bert_scores_tm, hypotheses_tm, references_tm)

bert_report = (
    ('Target Minority Scores', bert_score_tm),
    ('Implied Stereotype Scores', bert_score_ts),
)
for heading, scores in bert_report:
  print(heading)
  print('BERT Score (Max) (Precision, Recall, F1): ', scores)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=482.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1425941629.0, style=ProgressStyle(descr…


