In [None]:
# Enable the autoreload extension in mode 2, so that imported modules are
# reloaded before each cell executes and edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Mount Google Drive at /content/drive (this import only exists on Colab).
from google.colab import drive
drive.mount('/content/drive')

# Change the working directory to the project's classification folder.
# The relative paths used later in this notebook ('../../data/', 'data/...',
# 'model/...') are resolved against this directory.
%cd /content/drive/My\ Drive/Georgia_Tech/Spring_2021/sbic_stereotypes/src/classification

In [None]:
!pip install transformers
!pip install datasets

import torch
import pandas as pd
import numpy as np

# Notebook-wide configuration.
DATA_DIR = '../../data/'          # directory the raw SBIC CSV is read from
MODEL_NAME = 'bert-base-uncased'  # checkpoint used for both tokenizer and model
CLASSIFY_COL = 'whoTarget'        # dataframe column used as the label


def _make_params(lr, num_epochs):
    """Build one hyper-parameter preset for the current CLASSIFY_COL.

    The presets below only differ in learning rate and epoch count; the
    cleaned-data path, model output directory, sequence length and batch
    size are shared and derived from CLASSIFY_COL.

    Args:
        lr: learning rate stored under 'lr'.
        num_epochs: number of training epochs stored under 'num_epochs'.

    Returns:
        A new dict with the keys 'clean_data_file', 'model_output',
        'max_length', 'lr', 'batch_size' and 'num_epochs'.
    """
    return {
        'clean_data_file': 'data/train_{}.csv'.format(CLASSIFY_COL),
        'model_output': 'model/{}'.format(CLASSIFY_COL),
        'max_length': 128,
        'lr': lr,
        'batch_size': 32,
        'num_epochs': num_epochs,
    }


# One preset per classification task. Note that every preset points at the
# same files, because the paths depend on CLASSIFY_COL and not on the preset.
OFFENSE_PARAMS = _make_params(lr=5e-6, num_epochs=2.0)
LEWD_PARAMS = _make_params(lr=5e-6, num_epochs=1.0)
INTENT_PARAMS = _make_params(lr=5e-7, num_epochs=1.0)
GROUP_PARAMS = _make_params(lr=5e-6, num_epochs=2.0)

# Preset consumed by the rest of the notebook.
PARAMS = GROUP_PARAMS

In [None]:
from classifier_utils import *

# Read the raw SBIC training CSV, then pass it to the project helper together
# with the cleaned-data path and the label column.
# NOTE(review): prep_df_for_classification comes from classifier_utils; it
# presumably cleans the frame and stores it at PARAMS['clean_data_file'] -
# confirm against that module.
raw_csv_path = DATA_DIR + 'SBIC.v2.trn.csv'
df = pd.read_csv(raw_csv_path)
df = prep_df_for_classification(df, PARAMS['clean_data_file'], CLASSIFY_COL)

# Sanity check: prints True when at least one label in CLASSIFY_COL is missing.
labels_have_nan = df[CLASSIFY_COL].isna().any()
print(labels_have_nan)

In [None]:
from transformers import BertTokenizer
from datasets import Dataset
import statistics as stats

# Fixed seed so the shuffled 80/20 train/test split is identical on every run
# (without it, each kernel restart trains and evaluates on different rows).
SPLIT_SEED = 42

dataset = Dataset.from_pandas(df)
datasets = dataset.train_test_split(test_size=0.2, shuffle=True, seed=SPLIT_SEED)

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)


def tokenize_posts(batch):
  """Tokenize the "post" field of a batch of rows.

  Called by `map(..., batched=True)`, so `batch["post"]` holds several posts
  at once. Each post is truncated and padded to PARAMS['max_length'] tokens.

  Returns:
    The tokenizer's output for the batch.
  """
  return tokenizer(batch["post"],
                   truncation=True,
                   padding='max_length',
                   max_length=PARAMS['max_length'])


# This takes a while to run, so the result is kept across re-executions of
# this cell. It is NOT refreshed when df, the split or PARAMS['max_length']
# change: uncomment the `del` below (or restart the kernel) to force
# re-tokenization.
# del tokenized_datasets
if 'tokenized_datasets' not in globals():
  tokenized_datasets = datasets.map(tokenize_posts,
                                    batched=True, num_proc=4,
                                    remove_columns=["post"])

print(tokenized_datasets)

## Compute Statistics in case we are interested.
# NOTE(review): compute_statistics is not defined in this notebook; it is
# presumably provided by the classifier_utils wildcard import - confirm.
compute_statistics(tokenized_datasets['train']['input_ids'])

In [None]:
# Build the splits handed to the trainer: one entry per split in
# tokenized_datasets, with the CLASSIFY_COL column renamed to 'labels'.
lm_datasets = {
  split_name: split.rename_column(CLASSIFY_COL, 'labels')
  for split_name, split in tokenized_datasets.items()
}

print(lm_datasets)

In [None]:
from transformers import BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import math

# Load the pretrained checkpoint with a sequence-classification head and put
# the model in training mode.
# NOTE(review): num_labels is not passed, so the head size falls back to the
# configuration default - confirm it matches the number of classes in
# CLASSIFY_COL.
model = BertForSequenceClassification.from_pretrained(MODEL_NAME)
model.train()


def compute_step_schedule(num_rows, batch_size, num_epochs):
  """Derive the warm-up, checkpoint and evaluation intervals in optimizer steps.

  With exactly one epoch: warm up for half the epoch, checkpoint at twice the
  warm-up length, and evaluate every 5% of that checkpoint interval.
  Otherwise: warm up for one full epoch, checkpoint halfway through the run,
  and evaluate every 5% of the total steps.

  All three values are returned as ints (num_epochs is stored as a float, so
  the raw arithmetic yields floats such as 87.0), and the save/eval intervals
  are clamped to at least 1 so that a small dataset cannot produce a zero
  interval.

  NOTE(review): steps per epoch is ceil(num_rows / batch_size), which assumes
  a single training device because batch_size is passed to the trainer as a
  per-device value - confirm when training on several devices.

  Args:
    num_rows: number of training examples.
    batch_size: examples per optimizer step.
    num_epochs: number of training epochs (int or float).

  Returns:
    Tuple (warmup_steps, save_steps, eval_steps) of ints.
  """
  steps_per_epoch = math.ceil(num_rows / batch_size)
  if num_epochs == 1:
    warmup = steps_per_epoch // 2
    save = warmup * 2
    evaluate = (save * 5) // 100
  else:
    warmup = steps_per_epoch
    total_steps = steps_per_epoch * num_epochs
    save = total_steps // 2
    evaluate = (total_steps * 5) // 100
  return int(warmup), max(1, int(save)), max(1, int(evaluate))


num_rows = lm_datasets['train'].num_rows
warmup_steps, save_steps, eval_steps = compute_step_schedule(
    num_rows, PARAMS['batch_size'], PARAMS['num_epochs'])

print("\n")
print("Linear Warm Up: ", warmup_steps)
print("Save Steps: ", save_steps)
print("Eval Steps: ", eval_steps)

# Step-based schedule: evaluate and log every `eval_steps`, checkpoint every
# `save_steps` (save_total_limit=1 caps the number of checkpoints kept), and
# warm the learning rate up over `warmup_steps`.
# NOTE(review): `pip install transformers` above is unpinned; confirm that the
# installed release still accepts the `evaluation_strategy` argument name.
training_config = dict(
    output_dir=PARAMS['model_output'],
    evaluation_strategy='steps',
    eval_steps=eval_steps,
    logging_steps=eval_steps,
    save_steps=save_steps,
    save_total_limit=1,
    warmup_steps=warmup_steps,
    learning_rate=PARAMS['lr'],
    per_device_train_batch_size=PARAMS['batch_size'],
    num_train_epochs=PARAMS['num_epochs'],
)
training_args = TrainingArguments(**training_config)

# Train on the "train" split and evaluate on the held-out "test" split.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=lm_datasets["test"],
    train_dataset=lm_datasets["train"],
)

trainer.train()