In [None]:
# Reload imported modules automatically before each cell runs, so edits to
# local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# mount drive
# (google.colab is only importable inside a Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

# cd into project directory
# The relative paths used in later cells are resolved from this directory.
%cd /content/drive/My\ Drive/Georgia_Tech/Spring_2021/sbic_stereotypes/baselines

# Useful Constants
# Directory holding the raw SBIC CSV files, relative to the directory entered
# by the %cd above. The trailing slash matters: later cells build paths by
# plain string concatenation.
DATA_DIR = '../data/'

In [None]:
!pip install transformers
!pip install datasets

import torch
import pandas as pd
import numpy as np

In [None]:
## Set all parameters here ##
# Each dict bundles the settings for one fine-tuning run. The rest of the
# notebook only reads `active_dict`, so switching runs is a one-line change.

# Run configuration for the 'openai-gpt' model.
gpt_dict_5epoch = {
    'MODEL_NAME': 'openai-gpt',
    'OUTPUT_DIR': 'model/gpt_5epoch',
    'LEARNING_RATE': 5e-6,
    'SAVE_STEPS': 22367,
    'EPOCHS': 5.0,
}

# Run configuration for the 'gpt2' model.
gpt2_dict_5epoch = {
    'MODEL_NAME': 'gpt2',
    'OUTPUT_DIR': 'model/gpt2_5epoch',
    'LEARNING_RATE': 1e-5,
    'SAVE_STEPS': 24075,
    'EPOCHS': 5.0,
}

# Point this at the configuration to run.
active_dict = gpt_dict_5epoch

In [None]:
from data_preprocessing import *

# Read and Clean Data
# Source CSVs live under DATA_DIR; the cleaned copies are written to paths
# relative to the current working directory.
from_train_file, to_train_file = f"{DATA_DIR}SBIC.v2.trn.csv", 'data/baseline_train_text.csv'
from_dev_file, to_dev_file = f"{DATA_DIR}SBIC.v2.dev.csv", 'data/baseline_dev_text.csv'

# clean_df is provided by data_preprocessing (not shown here); presumably it
# reads the first path and writes the cleaned result to the second — confirm.
clean_df(from_train_file, to_train_file)
clean_df(from_dev_file, to_dev_file)

In [None]:
from datasets import load_dataset

# Load the two cleaned CSV files as the "train" and "validation" splits.
datasets = load_dataset(
    "csv",
    data_files={
        "train": to_train_file,
        "validation": to_dev_file,
    },
)
print(datasets)

In [None]:
# Create the tokenizer for the active model (the model itself is loaded in a
# later cell). setup_tokenizer is provided by data_preprocessing.
tokenizer = setup_tokenizer(active_dict['MODEL_NAME'])

# Tokenize every split in batches across 4 worker processes. The raw "text"
# column is removed so that only the tokenizer's output columns remain.
tokenized_datasets = datasets.map(
    lambda batch: tokenizer(batch["text"]),
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

print(tokenized_datasets)

In [None]:
# Normalize the text length as Blocks
# Every example produced by group_texts is a window of exactly this many tokens.
block_size = 128

def group_texts(examples):
    """Concatenate a batch of tokenized texts and cut it into fixed-size blocks.

    All sequences of each column are joined end to end. The joined stream is
    padded at the end up to the next multiple of the module-level
    ``block_size`` (``input_ids`` with ``tokenizer.pad_token_id``,
    ``attention_mask`` with 0) and then sliced into consecutive blocks.

    Args:
        examples: Batch mapping of column name -> list of per-text lists, as
            passed by ``Dataset.map(..., batched=True)``. Must contain
            ``'input_ids'`` and ``'attention_mask'``.

    Returns:
        dict with the same columns, each a list of blocks, plus ``'labels'``:
        a copy of ``input_ids`` in which every position whose attention mask
        is 0 is replaced by -100, so that padding does not contribute to the
        language-modelling loss.

    Raises:
        ValueError: if padding is required but the tokenizer has no pad token.

    Note:
        Columns other than ``input_ids``/``attention_mask`` are sliced on the
        same boundaries but are not padded, so their last block may be shorter.
    """
    # Flatten each column in one pass; `sum(lists, [])` re-copies the growing
    # list on every addition and is quadratic in the batch size.
    concatenated = {
        key: [item for seq in examples[key] for item in seq]
        for key in examples.keys()
    }
    total_length = len(concatenated['input_ids'])

    # Number of pad tokens needed to reach the next multiple of block_size
    # (0 when the stream already divides evenly).
    remainder = -total_length % block_size
    if remainder:
        pad_id = tokenizer.pad_token_id
        if pad_id is None:
            raise ValueError(
                "tokenizer.pad_token_id is None; a pad token is required to "
                "pad the final block"
            )
        concatenated['input_ids'].extend([pad_id] * remainder)
        concatenated['attention_mask'].extend([0] * remainder)
        total_length += remainder

    # Split by chunks of block_size.
    starts = range(0, total_length, block_size)
    result = {
        key: [column[start:start + block_size] for start in starts]
        for key, column in concatenated.items()
    }

    # -100 is the label value the loss ignores; the attention mask alone does
    # not stop padded positions from being trained on. Fresh lists are built
    # so labels never alias the input_ids blocks.
    result["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(result["input_ids"], result["attention_mask"])
    ]
    return result

# Re-chunk every split into fixed-length blocks: group_texts is called on
# batches of 1000 tokenized rows, spread over 4 worker processes.
lm_datasets = tokenized_datasets.map(group_texts, batched=True, batch_size=1000, num_proc=4)

print(lm_datasets)

In [None]:
# Load PreTrained Model and train it
from transformers import AutoModelForCausalLM
from transformers import Trainer, TrainingArguments

# Load the pretrained weights for the active configuration.
model = AutoModelForCausalLM.from_pretrained(active_dict['MODEL_NAME'])
# Resize the embedding matrix to the tokenizer's current vocabulary size.
model.resize_token_embeddings(len(tokenizer))
model.train()

training_args = TrainingArguments(
    # Where checkpoints are written, and how many are kept.
    output_dir=active_dict['OUTPUT_DIR'],
    save_steps=active_dict['SAVE_STEPS'],
    save_total_limit=1,
    # Optimisation schedule.
    num_train_epochs=active_dict['EPOCHS'],
    learning_rate=active_dict['LEARNING_RATE'],
    warmup_steps=5000,
    per_device_train_batch_size=4,
    # Evaluate on the validation split every 2000 steps.
    # NOTE(review): newer transformers releases appear to rename this argument
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy='steps',
    eval_steps=2000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

trainer.train()