# 02 - Train GPT2 Model

This notebook contains the steps to fine-tune the open-source GPT-2 model hosted on Hugging Face.

Author:
- Santosh Yadaw
- santoshyadawprl@gmail.com

## a. Setup

In [1]:
import os
import re
import logging
from tqdm.auto import tqdm

tqdm.pandas()

import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

import torch
from transformers import DataCollatorForLanguageModeling, DataCollatorWithPadding, GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, AutoConfig
from datasets import Dataset

In [2]:
# Setup logging
logging.basicConfig(level=logging.INFO)
# Bind a real Logger object (the root logger) instead of the `logging` module itself.
# Messages are still emitted as "INFO:root:...", so existing output is unchanged.
logger = logging.getLogger()

In [3]:
# Check device
# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
logger.info("device: %s", device)

INFO:root:device: cuda


In [4]:
# Constants

# --- Paths -------------------------------------------------------------------
# Project root is the parent of the current working directory.
HOME_PATH = os.path.dirname(os.getcwd())
DATA_PATH = os.path.join(HOME_PATH, "data", "raw", "task2.csv")
INTERIM_DATA_PATH = os.path.join(HOME_PATH, "data", "interim", "interim_data.csv")
SPLIT_DATA_PATH = os.path.join(HOME_PATH, "data", "processed", "split_data.csv")
# Folder where the gpt2 model is saved
MODEL_PATH = os.path.join(HOME_PATH, "models")

logger.info("HOME_PATH: %s", HOME_PATH)
logger.info("DATA_PATH: %s", DATA_PATH)
logger.info("INTERIM_DATA_PATH: %s", INTERIM_DATA_PATH)
logger.info("SPLIT_DATA_PATH: %s", SPLIT_DATA_PATH)
logger.info("model_path: %s", MODEL_PATH)

# --- Train / validation split ------------------------------------------------
TRAIN_SIZE = 0.9   # fraction of rows passed to train_test_split as train_size
SEED = 2023        # random_state for the split

# --- Special tokens for gpt2 -------------------------------------------------
bos = '<|endoftext|>'
eos = '<|EOS|>'
pad = '<|pad|>'
special_tokens_dict = dict(eos_token=eos, bos_token=bos, pad_token=pad)

# --- Trainer hyper-parameters ------------------------------------------------
NUM_EPOCHS_TRAIN = 6        # total # of training epochs
BATCH_SIZE_TRAIN = 1        # batch size per device during training
BATCH_SIZE_EVAL = 1         # batch size for evaluation
WARMUP_STEPS = 200          # number of warmup steps for learning rate scheduler
WEIGHT_DECAY = 0.01         # strength of weight decay
LOGGING_DIR = MODEL_PATH    # directory for storing logs
PREDICTION_LOSS = True      # passed to TrainingArguments(prediction_loss_only=...)
SAVE_STEPS = 10000          # passed to TrainingArguments(save_steps=...)

INFO:root:HOME_PATH: /home/jupyter/text-gen
INFO:root:DATA_PATH: /home/jupyter/text-gen/data/raw/task2.csv
INFO:root:INTERIM_DATA_PATH: /home/jupyter/text-gen/data/interim/interim_data.csv
INFO:root:SPLIT_DATA_PATH: /home/jupyter/text-gen/data/processed/split_data.csv
INFO:root:model_path: /home/jupyter/text-gen/models


### Data loading and processing

The data processing in this case consists of four steps:

1. Remove duplicates, since the data will be split randomly
2. Clean up the dataset
3. Train/test split
4. Add the start and end tokens to the headlines

In [5]:
# Helper function

_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading the corpus only once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def clean(review, stop_words=None):
    """
    Normalise a single piece of text.

    Steps:
      1. lower-case the text;
      2. drop every character that is not a letter, digit, space or hyphen
         (digits are kept);
      3. drop stopwords and collapse runs of whitespace to single spaces.

    Args:
        review: the text to clean (a ``str``).
        stop_words: optional collection of words to remove. When ``None``
            (the default) the NLTK English stopword list is used.

    Returns:
        The cleaned text as a single space-separated string.
    """
    if stop_words is None:
        # Cached set: avoids re-reading the stopword corpus for every word.
        stop_words = _english_stopwords()

    # Convert to lower case
    review = review.lower()
    # Remove punctuation / special characters (letters, digits, spaces and '-' survive)
    review = re.sub('[^a-z A-Z 0-9-]+', '', review)
    # Remove stopwords
    return " ".join(word for word in review.split() if word not in stop_words)

In [6]:
# Load raw data
raw_frame = pd.read_csv(DATA_PATH, encoding="ISO-8859-1")

# The transpose -> reset_index -> transpose round-trip turns the column label
# into the first data row (presumably the CSV has no header line, so pandas
# consumed the first record as the column name - confirm against the raw file).
header_as_first_row = raw_frame.T.reset_index().T

# Renumber the rows from 0 and call the single column "text".
data = header_as_first_row.reset_index(drop=True).rename(columns={0: "text"})

#### 1. Remove duplicates

In [7]:
# Keep the first occurrence of every text. The explicit copy makes `data` an
# independent frame, so later column assignments do not raise SettingWithCopyWarning.
data = data[~data['text'].duplicated()].copy()

#### 2. Clean up the text data and save

In [8]:
# 2. Clean up the text data (duplicates were already removed in the previous step)
data = data.assign(text=data['text'].progress_apply(clean))

# Create the target folder if it is missing so the save cannot fail on a fresh checkout
os.makedirs(os.path.dirname(INTERIM_DATA_PATH), exist_ok=True)
data.to_csv(INTERIM_DATA_PATH, index=False)

  0%|          | 0/46909 [00:00<?, ?it/s]

#### 3. Train Test Split

In [9]:
# Split randomly
# TRAIN_SIZE is the fraction kept for training; SEED fixes the shuffle.
data_train, data_val = train_test_split(
    data,
    train_size=TRAIN_SIZE,
    random_state=SEED,
)
n_train, n_val = len(data_train), len(data_val)
print(f'There are {n_train} samples for training and {n_val} for validation')

There are 42218 samples for training and 4691 for validation


In [10]:
# Save
# Tag each row with the split it belongs to. `assign` returns new frames, which
# avoids SettingWithCopyWarning on the frames returned by train_test_split.
data_train = data_train.assign(split="train")
data_val = data_val.assign(split="val")

combined_data = pd.concat([data_train, data_val])

# Create the target folder if it is missing so the save cannot fail on a fresh checkout
os.makedirs(os.path.dirname(SPLIT_DATA_PATH), exist_ok=True)
combined_data.to_csv(SPLIT_DATA_PATH, index=False)

#### 4. Add start and end tokens to the headlines

In [11]:
# Add special tokens
def _wrap_with_special_tokens(text_column):
    """
    Return `text_column` with every entry wrapped as "<bos> text <eos>".

    Entries that already start with the BOS token are left untouched, so
    re-running this cell does not wrap the text twice.
    """
    already_wrapped = text_column.str.startswith(bos, na=False)
    return text_column.where(already_wrapped, bos + ' ' + text_column + ' ' + eos)


# The split was made before this step, and `data_train` / `data_val` are the
# frames turned into datasets below - they must carry the tokens, otherwise the
# model never sees BOS/EOS during training.
data_train = data_train.assign(text=_wrap_with_special_tokens(data_train['text']))
data_val = data_val.assign(text=_wrap_with_special_tokens(data_val['text']))

# Keep the full frame consistent with the two splits.
data = data.assign(text=_wrap_with_special_tokens(data['text']))

## Training GPT2 Model

### Load GPT2 model and tokenizer

In [12]:
# Initialise gpt2 tokenizer
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Checking what is the vocab size for gpt2 (before any special tokens are added)
base_vocab_size = gpt2_tokenizer.vocab_size
logger.info("Words in vocabulary: %s", base_vocab_size)

INFO:root:Words in vocabulary: 50257


In [13]:
# Register the BOS / EOS / PAD strings with the tokenizer.
# The return value is the number of tokens that were added.
num_added_toks = gpt2_tokenizer.add_special_tokens(
    special_tokens_dict=special_tokens_dict,
)

In [14]:
# Initialise the model configs and add special tokens
# Token ids are read from the tokenizer so model and tokenizer agree on them.
special_token_ids = {
    'bos_token_id': gpt2_tokenizer.bos_token_id,
    'eos_token_id': gpt2_tokenizer.eos_token_id,
    'pad_token_id': gpt2_tokenizer.pad_token_id,
}
config = AutoConfig.from_pretrained(
    'gpt2',
    output_hidden_states=False,
    **special_token_ids,
)

In [15]:
# Load the pretrained GPT2 weights, using the config that carries the special token ids
gpt2_model = GPT2LMHeadModel.from_pretrained(
    'gpt2',
    config=config,
)

In [16]:
# Resize embedding to tokenizer dimensions
# len(tokenizer) counts the added special tokens as well.
vocab_with_special_tokens = len(gpt2_tokenizer)
gpt2_model.resize_token_embeddings(vocab_with_special_tokens)

Embedding(50259, 768)

### Prepare data into format accepted by GPT2

In [17]:
# Helper function
def tokenize_function(text: dict):
    """
    Tokenize a batch of examples.

    The function is used with ``Dataset.map(..., batched=True)``, so ``text``
    is a dict-like batch whose ``'text'`` entry holds a list of strings, not a
    single string.

    Sequences are truncated to 1024 tokens. No padding is applied here: the
    data collator given to the Trainer pads each training batch on the fly,
    so padding every example to the longest one in the map batch would only
    add pad tokens that cost memory and compute.

    Returns:
        The tokenizer output for the batch (``input_ids`` and ``attention_mask``).
    """
    return gpt2_tokenizer(text['text'], truncation=True, max_length=1024)

In [18]:
# load the data using hugging face dataset
# Only the text column is needed. `preserve_index=False` stops the shuffled
# pandas index from being carried along as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(data_train[['text']], preserve_index=False)
val_dataset = Dataset.from_pandas(data_val[['text']], preserve_index=False)

In [19]:
# Tokenize dataset
# Both splits are tokenized with the same settings: batched calls, 5 worker
# processes, and the raw text column dropped from the result.
map_options = {
    'batched': True,
    'num_proc': 5,
    'remove_columns': ['text'],
}
tokenized_train_dataset = train_dataset.map(tokenize_function, **map_options)
tokenized_val_dataset = val_dataset.map(tokenize_function, **map_options)



#0:   0%|          | 0/9 [00:00<?, ?ba/s]



#1:   0%|          | 0/9 [00:00<?, ?ba/s]



#2:   0%|          | 0/9 [00:00<?, ?ba/s]

#3:   0%|          | 0/9 [00:00<?, ?ba/s]

#4:   0%|          | 0/9 [00:00<?, ?ba/s]



#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

### Train GPT2 Model

In [20]:
# Set the training arguments for gpt2. Every value comes from the constants
# cell; anything not listed here keeps the library default.
training_args = TrainingArguments(
    output_dir=MODEL_PATH,                          # output directory
    num_train_epochs=NUM_EPOCHS_TRAIN,              # total # of training epochs
    per_device_train_batch_size=BATCH_SIZE_TRAIN,   # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE_EVAL,     # batch size for evaluation
    warmup_steps=WARMUP_STEPS,                      # number of warmup steps for learning rate scheduler
    weight_decay=WEIGHT_DECAY,                      # strength of weight decay
    logging_dir=LOGGING_DIR,                        # directory for storing logs (LOGGING_DIR is set to MODEL_PATH)
    prediction_loss_only=PREDICTION_LOSS,
    save_steps=SAVE_STEPS,
)

In [21]:
# Initialise the data collator
# mlm=False turns off masked-language-model masking for the collated batches.
collator_options = {'tokenizer': gpt2_tokenizer, 'mlm': False}
data_collator = DataCollatorForLanguageModeling(**collator_options)

In [22]:
# Initialise huggingface trainer
trainer_options = {
    'model': gpt2_model,                       # instantiated gpt2 model to be trained
    'args': training_args,                     # training arguments, defined above
    'data_collator': data_collator,            # builds the batches fed to the model
    'train_dataset': tokenized_train_dataset,  # training dataset
    'eval_dataset': tokenized_val_dataset,     # evaluation dataset
}
trainer = Trainer(**trainer_options)

# Train model
trainer.train()

Step,Training Loss



KeyboardInterrupt



In [None]:
# Save trained model
# The trainer was configured with output_dir=MODEL_PATH, so the model weights
# and the tokenizer files are written to the same folder.
trainer.save_model(MODEL_PATH)
gpt2_tokenizer.save_pretrained(MODEL_PATH)

## END