In [None]:
# Install dependencies (restart environment after installation)
%pip install transformers
%pip install matplotlib
%pip install torch --index-url https://download.pytorch.org/whl/cu124
%pip install accelerate
%pip install pandas
%pip install datasets
%pip install numpy
%pip install evaluate

In [3]:
# import dependencies
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
import torch
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset

In [21]:
# check for cuda gpu
print(f"[!] GPU Available: {torch.cuda.is_available()}")

[!] GPU Available: True


In [22]:
# load gpt-2 model and tokenizer
def load_model_and_tokenizer(local_path="./model", model_name="gpt2"):
    if os.path.exists(local_path):
        print("[+] loading model from local directory.")
        model = GPT2LMHeadModel.from_pretrained(local_path).to("cuda")
        tokenizer = GPT2Tokenizer.from_pretrained(local_path)
    else:
        print("[+] loading model from Hugging-Face hub")
        model = GPT2LMHeadModel.from_pretrained(model_name).to("cuda")
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        
    tokenizer.pad_token = tokenizer.eos_token
    
    return model, tokenizer

In [23]:
# Pre-process "All-seasons.csv" (south park dialog dataset)
def preprocess_south_park_dialog_data(tokenizer, file_path="./data/All-seasons.csv"):
    dialog_data = []
    
    df = pd.read_csv(file_path, encoding="utf-8")
    lines = df["Line"].dropna().values
    
    # Structure training data format
    for i in range(0, len(lines) - 1, 2):
        user = lines[i]
        bot = lines[i+1]
        dialog_data.append(f"<USER>: {user} {tokenizer.eos_token}\n<BOT>: {bot}{tokenizer.eos_token}\n")
        
    # Visualize data - printing the last 2 elements
    print("\n[+] Visualizing pre-processed south park dataset.")
    print(dialog_data[:2])
    
    return dialog_data

In [24]:
# Tokenize dialog data
def tokenize_data(data, tokenizer, max_length=158):
    return tokenizer(data, return_tensors="pt", padding=True, truncation=True, max_length=max_length)

In [25]:
def data_collator(data):
    input_ids = torch.stack([f[0] for f in data])
    attention_mask = torch.stack([f[1] for f in data])

    labels = input_ids.clone()
    labels[input_ids == tokenizer.pad_token_id] = -100

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [26]:
import re

# Function to check for english text
def is_english(text):
  pattern = r'^[A-Za-z0-9\s.,!?\'":;()&*%#$@^_+-=<>{}[\]\\/|]*$'
  return bool(re.match(pattern, text))

In [47]:
def get_train_eval_data(tokenized_data, test_size=0.2, random_state=42):
    # Tokenize the data
    input_ids = tokenized_data["input_ids"]
    attention_mask = tokenized_data["attention_mask"]
    
    # Split the data
    train_input_ids, eval_input_ids, train_attention_mask, eval_attention_mask = train_test_split(
        input_ids, attention_mask, test_size=test_size, random_state=random_state
    )
    
    # Create the TensorDataset objects
    train_dataset = TensorDataset(train_input_ids, train_attention_mask)
    eval_dataset = TensorDataset(eval_input_ids, eval_attention_mask)
    
    return train_dataset, eval_dataset

In [28]:
def save_model_and_tokenizer(model, tokenizer, local_path="./trained-model"):
    print(f"\n[+] saving model & tokenizer to: {local_path}")
    model.save_pretrained(local_path)
    tokenizer.save_pretrained(local_path)

In [29]:
def get_training_arguments(learning_rate=5e-5, epochs=3):
    # Define parameters for training
    training_config = TrainingArguments(
        output_dir="./fine-tuned-gpt2",
        overwrite_output_dir=True,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        eval_strategy="steps",
        save_steps=500,
        num_train_epochs=epochs,
        save_total_limit=epochs,
        logging_dir="./logs",
        logging_steps=10,
        warmup_steps=500,
        weight_decay=0.01,
        use_cpu=False,
        eval_steps=100,
        learning_rate=learning_rate,
    )
    
    return training_config

In [51]:
def train_model(model, train_data, eval_data, tokenizer, training_args):
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=eval_data,
        data_collator=data_collator,
    )
    
    # Start training
    trainer.train()
    
    # Save model after training
    save_model_and_tokenizer(model, tokenizer)

In [31]:
model, tokenizer = load_model_and_tokenizer()
dialog_data = preprocess_south_park_dialog_data(tokenizer)

tokenized_data = tokenize_data(dialog_data, tokenizer)

train_data, eval_data = get_train_eval_data(tokenized_data)

train_model(model, train_data, eval_data, tokenizer, get_training_arguments())


[+] loading model from Hugging-Face hub

[+] Visualizing pre-processed south park dataset.
['<USER>: You guys, you guys! Chef is going away. \n <|endoftext|>\n<BOT>: Going away? For how long?\n<|endoftext|>\n', "<USER>: Forever.\n <|endoftext|>\n<BOT>: I'm sorry boys.\n<|endoftext|>\n"]


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

# Training on New Dataset
Fine-tuning on new dialog dataset - "casual_data_windows.csv" from reddit conversations.

In [32]:
# Load model and tokenizer
model, tokenizer = load_model_and_tokenizer(local_path="./models/trained-model-2.6-loss")

[+] loading model from local directory.


In [33]:
# Pre-process new dataset: "casual_data_windows.csv"
df = pd.read_csv("./data/casual_data_windows.csv")

# Drop unused column
df = df.drop(columns=["2"])

# Remove rows where C2 and C3 share data
df = df[df["0"] != df["1"]]

# Remove duplicates in columns
df = df.drop_duplicates(subset=["0", "1"])

# Drop rows with any empty cells
df = df.dropna()
df = df[df.apply(lambda row: row.str.strip().all(), axis=1)]


In [34]:
#  Visualize dataset
df.head()

Unnamed: 0.1,Unnamed: 0,0,1
0,0,What kind of phone(s) do you guys have?,I have a pixel. It's pretty great. Much better...
1,1,I have a pixel. It's pretty great. Much better...,Does it really charge all the way in 15 min?
2,2,Does it really charge all the way in 15 min?,"Pretty fast. I've never timed it, but it's und..."
3,3,What kind of phone(s) do you guys have?,Samsung Galaxy J1. It's my first cell phone an...
4,4,Samsung Galaxy J1. It's my first cell phone an...,What do you think of it? Anything you don't like?


In [39]:
# Format data for training
dialog_data = []
illegal_line_count = 0
for _, row in df.iterrows():
    user = row["0"]
    bot = row["1"]

    if is_english(user) and is_english(bot):
        dialog_data.append(f"<USER>: {user} {tokenizer.eos_token}. <BOT>: {bot}{tokenizer.eos_token}.")
    else:
        illegal_line_count += 1
        
print(f"[-] Removed {illegal_line_count} lines.")

[-] Removed 7896 lines.


In [43]:
print(dialog_data[:3])
print(f"\n[+] total data samples: {len(dialog_data)}")

["<USER>: What kind of phone(s) do you guys have? <|endoftext|>. <BOT>: I have a pixel. It's pretty great. Much better than what I had before. <|endoftext|>.", "<USER>: I have a pixel. It's pretty great. Much better than what I had before.  <|endoftext|>. <BOT>: Does it really charge all the way in 15 min?<|endoftext|>.", "<USER>: Does it really charge all the way in 15 min? <|endoftext|>. <BOT>: Pretty fast. I've never timed it, but it's under half an hour. <|endoftext|>."]

[+] total data samples: 41682


In [46]:
# Tokenize the data
tokenized_data = tokenize_data(dialog_data, tokenizer)
print(tokenized_data)

{'input_ids': tensor([[   27, 29904, 31175,  ..., 50256, 50256, 50256],
        [   27, 29904, 31175,  ..., 50256, 50256, 50256],
        [   27, 29904, 31175,  ..., 50256, 50256, 50256],
        ...,
        [   27, 29904, 31175,  ..., 50256, 50256, 50256],
        [   27, 29904, 31175,  ..., 50256, 50256, 50256],
        [   27, 29904, 31175,  ..., 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [49]:
train_data, eval_data = get_train_eval_data(tokenized_data)

In [52]:
train_model(model, train_data, eval_data, tokenizer, get_training_arguments(epochs=5))

Step,Training Loss,Validation Loss


KeyboardInterrupt: 