In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to imported .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# mount drive
# (google.colab is only available inside a Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

# cd into project directory
# Later cells import local modules (seq2seq_utils, seq2seq) and use relative
# paths such as '../data/', so they depend on this working directory.
%cd /content/drive/My\ Drive/Georgia_Tech/Spring_2021/sbic_stereotypes/src

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1M873oJOlIb1Bd5vliq6d04b61XsHLT5h/sbic_stereotypes/src
Python 3.7.10


In [None]:
!pip install transformers
!pip install datasets

# Useful constants
CLASSIFIER_MODEL_NAME = 'bert-base-uncased'
CLASSIFIERS = ['./classification/model/whoTarget/checkpoint-1280/']
SEQ2SEQ_MODEL_NAME = 'facebook/bart-base'
DATA_DIR = '../data/'
JOIN_DROPOUT = 0.0
SEED = 154
WARMUP_DIV = 9.793

In [None]:
from seq2seq_utils import *
# `pd` is used below; import it explicitly instead of relying on it leaking
# out of the star import above.
import pandas as pd
from datasets import Dataset

# Load the raw SBIC training split and run the project's cleaning helpers.
df = pd.read_csv(DATA_DIR + 'SBIC.v2.trn.csv')
# Return value is not used; presumably clean_post mutates df in place — TODO confirm.
clean_post(df)
df = clean_target(df)
# NOTE(review): this path is relative to the working directory (src/), not
# DATA_DIR ('../data/') — confirm that src/data/ is the intended location.
df.to_csv('data/clean_train_df.csv')

dataset = Dataset.from_pandas(df)
# 80/20 train/test split. Seeded with SEED so the same rows land in each split
# on every run; without a seed the shuffle differs after each kernel restart.
datasets = dataset.train_test_split(test_size=0.2, shuffle=True, seed=SEED)
seq2seq_tok, classifier_tok, tokenized = get_tokenized_data(
    datasets,
    SEQ2SEQ_MODEL_NAME,
    CLASSIFIER_MODEL_NAME,
    labels=True,
)

print(tokenized)

In [None]:
# Import the torch package directly; `from torch import torch` only works
# because the package happens to expose itself as an attribute.
import torch
from torch import nn
from seq2seq import BartForConditionalGenerationJoinModel

# Build the join model from the seq2seq checkpoint name, wiring in the
# classifier checkpoints and the join dropout rate from the constants cell.
model = BartForConditionalGenerationJoinModel.from_pretrained(
    SEQ2SEQ_MODEL_NAME,
    join_dropout=JOIN_DROPOUT,
    classifiers=CLASSIFIERS,
)

# Move the model to the GPU when one is available; otherwise it stays on CPU.
if torch.cuda.is_available():
  model = model.cuda()

# Switch to training mode; the trailing ';' suppresses the cell's output repr.
model.train();

In [None]:
from transformers import Trainer, TrainingArguments
import math

# Hyperparameters for this run.
num_rows = tokenized['train'].num_rows
num_epochs = 3.0
learning_rate = 5e-6
batch_size = 8

# Derive warm-up / checkpoint / evaluation intervals from the run length.
# num_epochs and WARMUP_DIV are floats, so the floor divisions below produce
# floats; every step count is cast to int before it is used.
if num_epochs == 1:
  # Single-epoch run: the base unit is half of the optimizer steps in one epoch.
  one_epoch_steps = math.ceil(num_rows / batch_size) // 2
  warmup_steps = int((one_epoch_steps * num_epochs) // WARMUP_DIV)
  save_steps = int(one_epoch_steps * 2)
  # Evaluate every 10% of the save interval.
  eval_steps = int((save_steps * 10.0) // 100)
else:
  one_epoch_steps = math.ceil(num_rows / batch_size)
  total_steps = one_epoch_steps * num_epochs
  # Warm up over total_steps / WARMUP_DIV steps, same rule as the branch above.
  warmup_steps = int(total_steps // WARMUP_DIV)
  # Checkpoint at the halfway point of training.
  save_steps = int(total_steps // 2)
  # Evaluate (and log) every 5% of training.
  eval_steps = int((total_steps * 5.0) // 100)

# On a very small dataset the intervals can floor to 0; clamp to at least one
# step so they stay valid step intervals.
save_steps = max(1, save_steps)
eval_steps = max(1, eval_steps)

print("Linear Warm Up: ", warmup_steps)
print("Save Steps: ", save_steps)
print("Eval Steps: ", eval_steps)

# Trainer configuration, grouped by concern. Evaluation and logging share the
# same interval (eval_steps); only the most recent checkpoint is kept on disk.
training_args = TrainingArguments(
    # where checkpoints are written
    output_dir='model',
    save_steps=save_steps,
    save_total_limit=1,
    # evaluation / logging cadence
    evaluation_strategy='steps',
    eval_steps=eval_steps,
    logging_steps=eval_steps,
    # optimisation schedule
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    learning_rate=learning_rate,
    warmup_steps=warmup_steps,
)

# Train on the "train" split and evaluate on the held-out "test" split of the
# tokenized dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
)

# Run the training loop.
trainer.train()