In [2]:
%load_ext autoreload
%autoreload 2

# mount drive
from google.colab import drive
drive.mount('/content/drive')

# cd into project directory
%cd /content/drive/My\ Drive/Georgia_Tech/Spring_2021/sbic_stereotypes

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1M873oJOlIb1Bd5vliq6d04b61XsHLT5h/sbic_stereotypes


In [3]:
!pip install rouge
!pip install transformers
!pip install datasets

import torch
import pandas as pd
import numpy as np

from data_preprocessing import *
from datasets import Dataset
from transformers import AutoModelForCausalLM

Collecting rouge
  Downloading https://files.pythonhosted.org/packages/43/cc/e18e33be20971ff73a056ebdb023476b5a545e744e3fc22acd8c758f1e0d/rouge-1.0.0-py3-none-any.whl
Installing collected packages: rouge
Successfully installed rouge-1.0.0
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 7.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 37.8MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████|

In [18]:
#################### PICK MODEL AND DATASET ####################
from_file = 'data/SBIC.v2.dev.csv'

test_gpt_5epoch = {
                    'TO ACTUAL': 'data/test/sample_gpt_5epoch_dev_actual.csv',
                    'TO PRED': 'data/test/sample_gpt_5epoch_dev_pred.csv',
                    'TRAINED MODEL': 'model/gpt_5epoch/checkpoint-42500/',
                    'BASE MODEL': 'openai-gpt',
                    'SAMPLE SIZE': 5000
                  }

test_gpt2_5epoch = {
                    'TO ACTUAL': 'data/test/sample_gpt2_5epoch_dev_actual.csv',
                    'TO PRED': 'data/test/sample_gpt2_5epoch_dev_pred.csv',
                    'TRAINED MODEL': 'model/gpt2_5epoch/checkpoint-47334/',
                    'BASE MODEL': 'gpt2',
                    'SAMPLE SIZE': 5000
                  }

test_lb_gpt2_5epoch = {
                    'TO ACTUAL': 'data/test/sample_lb_gpt2_5epoch_dev_actual.csv',
                    'TO PRED': 'data/test/sample_lb_gpt2_5epoch_dev_pred.csv',
                    'TRAINED MODEL': 'model/gpt2_5epoch/checkpoint-47334/',
                    'BASE MODEL': 'gpt2',
                    'SAMPLE SIZE': 1000
                  }

active_test = test_gpt2_5epoch
################################################################

tokenizer = setup_tokenizer(active_test['BASE MODEL'])
model = AutoModelForCausalLM.from_pretrained(active_test['TRAINED MODEL'], \
                                             pad_token_id=tokenizer.eos_token_id)
model.eval()
df = pd.read_csv(from_file)

In [None]:
from testing_utils import *

pred_col = ['post', 'sexYN', 'offensiveYN', 'intentYN', 'whoTarget', 'targetMinority','targetStereotype', 'speakerMinorityYN']
clean_post(df)

def get_samples_from_actual(df, pred_col, active_test):
  actual = df[pred_col].sample(n=active_test['SAMPLE SIZE'])
  categorize_var(actual)
  return actual

def get_lewd_balanced_samples_from_actual(df, pred_col, active_test):
  actual = df[pred_col].copy()
  categorize_var(actual)
  half_sample = int(active_test['SAMPLE SIZE'] / 2)

  actual_1 = actual.loc[actual['sexYN'] == LEWDY]
  actual_2 = actual.loc[actual['sexYN'] == LEWDN]

  actual = pd.concat([actual_1.sample(n=half_sample), actual_2.sample(n=half_sample)], axis=0)
  return actual

actual = get_lewd_balanced_samples_from_actual(df, pred_col, active_test)
predict_samples(model, tokenizer, actual, pred_col, active_test)

In [19]:
## F1 and Precision/Recall Scores ##

from data_preprocessing import *
from testing_utils import *

actual = pd.read_csv(active_test['TO ACTUAL'])
pred = pd.read_csv(active_test['TO PRED'])

print("Category: (F1, Precision, Recall)")
print('Offensive: ', f1_score(actual, pred, 'offensiveYN', OFFY, OFFN))
print('Intent: ', f1_score(actual, pred, 'intentYN', INTY, INTN))
print('Lewd: ', f1_score(actual, pred, 'sexYN', LEWDY, LEWDN))
print('Group Targeted: ', f1_score(actual, pred, 'whoTarget', GRPY, GRPN))
print('In Group: ', f1_score(actual, pred, 'speakerMinorityYN', INGY, INGN))

Category: (F1, Precision, Recall)
Offensive:  (0.8667888787045523, 0.8154642138545559, 0.9250081512879035)
Intent:  (0.8396584440227703, 0.7631503305547571, 0.9332161687170475)
Lewd:  (0.4379310344827586, 0.675531914893617, 0.3239795918367347)
Group Targeted:  (0.7930611529700367, 0.6709160984286985, 0.9695801199657241)
In Group:  (0, 0, 0.0)


In [14]:
## BLEU/Rouge-L Scores ##
from nltk.translate.bleu_score import corpus_bleu
from rouge import Rouge

def print_scores(col_name):
  cmp_grp_target = pd.concat([actual[col_name].rename('actual'), \
                              pred[col_name].rename('pred')], \
                              axis=1)
  cmp_grp_target = cmp_grp_target.replace(np.nan, '', regex=True)
  cmp_grp_target = cmp_grp_target[cmp_grp_target['actual'] != '']

  cmp_grp_target['actual'] = cmp_grp_target['actual'].str.lower()
  cmp_grp_target['pred'] = cmp_grp_target['pred'].str.lower()

  references = cmp_grp_target.actual.tolist()
  hypotheses = cmp_grp_target.pred.tolist()

  rouge = Rouge()
  scores = rouge.get_scores(hypotheses, references, avg=True, ignore_empty=True)
  print('Rouge-L: ', scores['rouge-l'])

  references = [[reference] for reference in references]
  print('Bleu: ', corpus_bleu(references, hypotheses, weights=[0.5,0.5,0,0]))

print('Scores for Target Minority')
print_scores('targetMinority')

print('Scores for Target Stereotype')
print_scores('targetStereotype')

Scores for Target Minority
Rouge-L:  {'f': 0.4633071314541108, 'p': 0.47273462783171527, 'r': 0.46058901454532514}
Bleu:  0.39285895624340844
Scores for Target Stereotype
Rouge-L:  {'f': 0.3215983915249947, 'p': 0.34539919450367207, 'r': 0.3168531116983972}
Bleu:  0.37323984063358867
