<a href="https://colab.research.google.com/github/Prisze/AAI-520-Final-Project/blob/Priscilla's-Model-Architecture%2FTraining/Final_LLM_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00

In [None]:
import numpy as np
import pandas as pd
import torch
from datasets import Dataset

# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Dataset

In [None]:
# Load one shard (dialogueText_196.csv) of the Ubuntu dialogue corpus from Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive; no mount call is
# visible in this notebook, so confirm before a fresh "Run all".
dataset_df_original = pd.read_csv('/content/drive/MyDrive/Datasets/UbuntuDataset/Ubuntu-dialogue-corpus/dialogueText_196.csv')

In [None]:
# Work with the first 500,000 rows only, i.e. a subset of the training dataset.
dataset_df = dataset_df_original.head(500000)

In [None]:
import pandas as pd

def preprocess_conversations(df):
    """Turn raw chat rows into one training string per two-person dialogue.

    Steps:
      1. Drop rows whose ``text`` is missing.
      2. Keep only dialogues (``dialogueID``) with exactly two distinct senders (``from``).
      3. Order every dialogue by ``date``.
      4. Merge consecutive messages from the same sender into one turn (space-joined).
      5. Join the turns of a dialogue with the ``'<|endoftext|>'`` separator.

    Args:
        df: DataFrame with the columns ``dialogueID``, ``date``, ``from`` and ``text``.

    Returns:
        list[str]: one merged string per kept dialogue, ordered by ``dialogueID``.
    """
    separator = '<|endoftext|>'

    # Only a missing message body disqualifies a row; other columns are left alone.
    df_clean = df.dropna(subset=['text'])

    # Count distinct senders once per dialogue instead of calling a Python lambda per group.
    sender_counts = df_clean.groupby('dialogueID')['from'].nunique()
    two_party_ids = sender_counts[sender_counts == 2].index
    valid_dialogues = df_clean[df_clean['dialogueID'].isin(two_party_ids)]

    # Sort by 'dialogueID' and 'date' to ensure conversation order
    valid_dialogues = valid_dialogues.sort_values(by=['dialogueID', 'date'])

    merged_dialogues = []
    for _, group in valid_dialogues.groupby('dialogueID'):
        # Plain Python lists avoid a label-based DataFrame lookup for every message.
        senders = group['from'].tolist()
        # A message body may have been parsed as a number (e.g. the message "42");
        # cast to str so concatenation and joining cannot raise TypeError.
        texts = [str(text) for text in group['text'].tolist()]

        turns = []
        current_speaker = senders[0]
        current_parts = [texts[0]]

        for sender, text in zip(senders[1:], texts[1:]):
            if sender == current_speaker:
                # Same speaker keeps talking: extend the current turn.
                current_parts.append(text)
            else:
                # Speaker changed: close the current turn and start a new one.
                turns.append(" ".join(current_parts))
                current_speaker = sender
                current_parts = [text]

        # Close the final turn of the dialogue.
        turns.append(" ".join(current_parts))

        merged_dialogues.append(separator.join(turns))

    return merged_dialogues

# Build the list of merged two-participant conversations from the row subset.
conversations = preprocess_conversations(dataset_df)

In [None]:
# Wrap the merged conversations in a Dataset with a single 'text' column.
dataset = Dataset.from_dict({'text': conversations})

# Tokenization

In [None]:
from transformers import GPT2Tokenizer

# Load the tokenizer published with the DialoGPT-medium checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-medium')
# Reuse the end-of-text token as the padding token; padding to a fixed length is
# requested when the dataset is encoded.
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [None]:
# Encode the dataset
def encode(examples, max_length=128):
    """Tokenize a batch of conversations and build language-modelling labels.

    Labels are a copy of ``input_ids`` in which padding positions are replaced by
    -100 so they are ignored by the loss. Padding is identified through the
    ``attention_mask`` rather than by token id: the pad token is the same token
    as ``<|endoftext|>``, so masking by id would also hide the real turn
    separators and the model would never be trained to end a turn.

    Args:
        examples: Batch mapping with a ``'text'`` list of conversation strings.
        max_length: Fixed sequence length to truncate/pad to (default 128).

    Returns:
        The tokenizer output with an added ``'labels'`` entry.
    """
    encoded = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=max_length)
    # Keep the label wherever the attention mask marks a real token; ignore padding.
    encoded['labels'] = [
        [token_id if is_real else -100 for token_id, is_real in zip(input_ids, mask)]
        for input_ids, mask in zip(encoded['input_ids'], encoded['attention_mask'])
    ]
    return encoded

# Apply encoding to the dataset; batched=True hands `encode` a batch of examples per call.
encoded_dataset = dataset.map(encode, batched=True)

Map:   0%|          | 0/7769 [00:00<?, ? examples/s]

# Training

In [None]:
from transformers import TrainingArguments, Trainer, GPT2LMHeadModel

# Start from the pretrained DialoGPT-medium weights.
model = GPT2LMHeadModel.from_pretrained('microsoft/DialoGPT-medium')

# Define training arguments
training_args = TrainingArguments(
    output_dir='results2',           # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=None,                # directory for storing logs
    learning_rate=2e-5,              # small learning_rate for finetune
    # Mixed precision needs a CUDA device; enabling it only when one is present
    # keeps this cell runnable on a CPU-only runtime.
    fp16=torch.cuda.is_available()
)

# Create Trainer (no evaluation dataset is supplied, so only training runs)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset
)

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/863M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# Fine-tune the model on the encoded conversations.
# NOTE(review): no save_model/save_pretrained call is visible after training, yet the
# chatbot section loads weights from a Drive folder; confirm how that folder is written.
trainer.train()

Step,Training Loss
500,4.6365
1000,3.8137


TrainOutput(global_step=1458, training_loss=4.056382251210991, metrics={'train_runtime': 1067.3971, 'train_samples_per_second': 21.835, 'train_steps_per_second': 1.366, 'total_flos': 5411306771841024.0, 'train_loss': 4.056382251210991, 'epoch': 3.0})

# Inference

In [None]:
# trainer.train() switches the model to training mode; switch back to eval mode so
# dropout is disabled while generating.
model.eval()

# Single-turn prompt: the user utterance followed by the end-of-text separator.
inputs = tokenizer('Say hello!<|endoftext|>', return_tensors="pt")
outputs = model.generate(
    # Move the inputs to wherever the model lives instead of assuming a GPU.
    inputs.input_ids.to(model.device),
    attention_mask=inputs.attention_mask.to(model.device),
    max_new_tokens=50,
    no_repeat_ngram_size=5,
    early_stopping=True,
    num_beams=2,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    repetition_penalty = 10.0,
    temperature=0.7,
    top_p=0.8,
    do_sample=True,
    length_penalty=0.1
    )
# Keep special tokens so the prompt/response boundary stays visible in the output.
tokenizer.batch_decode(outputs, skip_special_tokens=False)[0]

"Say hello!<|endoftext|>Hello! I'm trying to install a new kernel on my laptop, but it doesn't seem to be working. Can anyone help me? Is there an easy way to do this? How would I go about doing that? It's not showing up"

# Chatbot Interface

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

class Chat:
  """One conversation: stores the message history and asks a chatbot for replies."""

  def __init__(self, chatbot, max_response_len = 20):
    """Start an empty conversation.

    Args:
      chatbot: Object exposing ``generate(prompt, max_length=...)`` that returns a reply.
      max_response_len: Value forwarded as ``max_length`` on every ``generate`` call.
    """
    self.chatbot = chatbot
    self.max_response_len = max_response_len
    self.messages = []

  def send(self, text: str):
    """Record the user's message, request a reply, record it and return it."""
    self.messages.append(text)
    # Every turn so far, the new one included, is terminated by the separator.
    prompt = ''.join(turn + '<|endoftext|>' for turn in self.messages)
    reply = self.chatbot.generate(prompt, max_length = self.max_response_len)
    self.messages.append(reply)
    return reply

class ChatBot:
  """Wraps a causal language model and its tokenizer to generate chat replies."""

  def __init__(self, model_path: str, device = None):
    """Load the model weights and the DialoGPT tokenizer.

    Args:
      model_path: Location passed to ``GPT2LMHeadModel.from_pretrained``.
      device: Device to run on; when falsy, "cuda:0" is used if available, else "cpu".
    """
    if not device:
      device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.device = device
    self.model = GPT2LMHeadModel.from_pretrained(model_path).to(self.device)
    # This class only does inference, so make sure dropout is disabled.
    self.model.eval()
    self.tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-medium')
    self.tokenizer.pad_token = self.tokenizer.eos_token

  def generate(self, text: str, max_length: int = 20) -> str:
    """Generate a reply to ``text``.

    Args:
      text: Full prompt, i.e. the conversation so far ending with ``<|endoftext|>``.
      max_length: Maximum number of newly generated tokens in the reply.

    Returns:
      The decoded reply only (prompt removed, special tokens stripped).
    """
    with torch.no_grad():
      inputs = self.tokenizer(text, return_tensors="pt")
      input_ids = inputs['input_ids']
      attention_mask = inputs['attention_mask']

      # A long chat history can outgrow the model's context window. Keep only the
      # most recent prompt tokens, leaving room for the reply.
      context_size = getattr(self.model.config, 'n_positions', None)
      if context_size is not None:
        prompt_budget = max(context_size - max_length, 1)
        input_ids = input_ids[:, -prompt_budget:]
        attention_mask = attention_mask[:, -prompt_budget:]

      outputs = self.model.generate(
        input_ids.to(self.device),
        attention_mask=attention_mask.to(self.device),
        # Honour the caller's limit; it was previously ignored in favour of a fixed 50.
        max_new_tokens=max_length,
        no_repeat_ngram_size=5,
        early_stopping=True,
        num_beams=2,
        pad_token_id=self.tokenizer.eos_token_id,
        eos_token_id=self.tokenizer.eos_token_id,
        repetition_penalty = 10.0,
        temperature=0.7,
        top_p=0.8,
        do_sample=True,
        length_penalty=0.1
      )
      # The output starts with the prompt tokens; slice them off to keep the reply.
      response_outputs = outputs[:, input_ids.shape[1]:]
      response = self.tokenizer.batch_decode(response_outputs, skip_special_tokens=True)[0]
      return response

  def create_chat(self) -> "Chat":
    """Open a new, empty conversation backed by this chatbot."""
    return Chat(self)

## Usage Example

In [None]:
# Load a ChatBot model from the given Drive folder.
# NOTE(review): presumably this folder holds the fine-tuned weights; no cell in this
# notebook visibly writes them there, so confirm.
chatBot = ChatBot('/content/drive/MyDrive/Datasets')


In [None]:
# Open a Chat: a fresh conversation with an empty message history.
conversation = chatBot.create_chat()

In [None]:
# Converse with bot

# Send two user turns through the same conversation, so the second prompt
# includes the first exchange as history.
for message in ('How do I install Ubuntu?', 'Thank you, it is working now!'):
    print(f'User: {message}')

    response = conversation.send(message)
    print(f'Chatbot: {response}')


User: How do I install Ubuntu?
Chatbot: sudo apt-get install ubuntu, then you can use the synaptic package manager to get it installed. sudo apt-get install libgtk2-dev and so on... if that doesn't work try 'apt-get install xserver-
User: Thank you, it is working now!
Chatbot: you're welcome :) i'm sure there's a better way ;) but this one works well for me :D  what version of gnome are you using? 6.10 or 7.04? have you tried installing 9.06 yet? did


In [None]:
# Converse with bot
# A new chat starts with an empty history, so nothing from the previous exchange carries over.
conversation = chatBot.create_chat()

message = 'what color is the sky?'
print(f'User: {message}')

# send() stores both the user message and the reply in the conversation's history.
response = conversation.send(message)
print(f'Chatbot: {response}')



User: what color is the sky?
Chatbot: it's a light blueish grey with some white highlights and black stripes on top of it. I don't know if that helps or not, but i'm using xchat right now to get my friends in touch with me so they can help me
