## Chatbot Design using the Ubuntu Dialogue Corpus Dataset
## Group 6: Priscilla Marquez, Johnathan Long, Greg Moore
## Applied Artificial Intelligence (AAI), University of San Diego
## AAI-520: Natural Language Processing and GenAI
## Professor Kahila Mokhtari, PhD
## October 21, 2024

In [1]:
####################################################################

# Project Overview:
#
# Goal: Build a chatbot that can carry out multi-turn conversations, adapt
# to context, and handle a variety of topics.
# Output: A web or app interface where users can converse with the chatbot.
#
# Pre-requisites:
# Basic understanding of deep learning and neural networks.
# Familiarity with a deep learning framework (e.g., TensorFlow, PyTorch).
# Basic knowledge of web development (for the interface).
#
# Phases:
#   
# Research and Study Phase:
# Study generative-based chatbot architectures like Seq2Seq, Transformers,
# and GPT and deep learning. Understand the challenges of chatbot design:
# context management, coherency, handling ambiguous queries, etc.
#
# Data Collection and Preprocessing:
# We chose to use the Ubuntu Dialogue Corpus Dataset
# https://www.kaggle.com/datasets/rtatman/ubuntu-dialogue-corpus
#
# Model Design and Training:
# Choose an architecture (e.g., Transformer-based models or deep learning models).
# Implement or leverage existing implementations to train the model with the dataset.
#
# Evaluation:
# Implement evaluation metrics.

##################################################

# Load the libraries
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup
# stop the future warning
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# We will use the text column in the Ubuntu Dialogue Corpus
# Read in entire Ubuntu dialogueText_196.csv dataset
ubuntu = pd.read_csv('C:/Users/gregm/.spyder-py3/AAI_520/FINAL/dialogueText_196.csv')
# Output Ubuntu Dataset
print('\nDisplay Ubuntu Dataset:\n', ubuntu)

##################################################


Display Ubuntu Dataset:
          folder dialogueID                      date        from       to  \
0           301      1.tsv  2004-11-23T11:49:00.000Z     stuNNed      NaN   
1           301      1.tsv  2004-11-23T11:49:00.000Z     crimsun  stuNNed   
2           301      1.tsv  2004-11-23T11:49:00.000Z     stuNNed  crimsun   
3           301      1.tsv  2004-11-23T11:49:00.000Z     crimsun  stuNNed   
4           301      1.tsv  2004-11-23T11:50:00.000Z     stuNNed  crimsun   
...         ...        ...                       ...         ...      ...   
9212872      13   3676.tsv  2012-07-07T20:17:00.000Z  MonkeyDust  legolas   
9212873      13   3676.tsv  2012-07-07T20:18:00.000Z  MonkeyDust  legolas   
9212874      13  16586.tsv  2008-07-25T01:53:00.000Z    linuxfce      NaN   
9212875      13  16586.tsv  2008-07-25T01:53:00.000Z    linuxfce      NaN   
9212876      13  16586.tsv  2008-07-25T01:54:00.000Z    linuxfce      NaN   

                                                 

In [3]:
##################################################

# Text Preprocessing
# Extract subset of Ubuntu
text = ubuntu.iloc[:500000]
# Extract subset of 100 conversations for test and evaluation
eval_dataset_df = ubuntu.iloc[500000:500100]

# Output the first 5 texts
print('\nHead of First 5 texts:\n', text.head(5))

##################################################


Head of First 5 texts:
    folder dialogueID                      date     from       to  \
0     301      1.tsv  2004-11-23T11:49:00.000Z  stuNNed      NaN   
1     301      1.tsv  2004-11-23T11:49:00.000Z  crimsun  stuNNed   
2     301      1.tsv  2004-11-23T11:49:00.000Z  stuNNed  crimsun   
3     301      1.tsv  2004-11-23T11:49:00.000Z  crimsun  stuNNed   
4     301      1.tsv  2004-11-23T11:50:00.000Z  stuNNed  crimsun   

                                                text  
0   any ideas why java plugin takes so long to load?  
1                                          java 1.4?  
2                                                yes  
3                       java 1.5 loads _much_ faster  
4  noneus: how can i get 1.5 is there a .deb some...  


In [4]:
##################################################

# Data Preparation and Pre-processing
class DialogueDataset(Dataset):
    def __init__(self, dialogues, tokenizer, max_length=512):
        self.dialogues = dialogues
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dialogues)

    def __getitem__(self, idx):
        dialogue = self.dialogues[idx]
        encoded = self.tokenizer.encode_plus(
            dialogue,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoded['input_ids'].squeeze(),
            'attention_mask': encoded['attention_mask'].squeeze()
        }


# Pre-process data
dialogues = text
# Set tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-medium')
tokenizer.pad_token = tokenizer.eos_token
# Pre-process text data
dataset = DialogueDataset(dialogues, tokenizer)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)
print('\n',dialogues)

##################################################


         folder dialogueID                      date      from        to  \
0          301      1.tsv  2004-11-23T11:49:00.000Z   stuNNed       NaN   
1          301      1.tsv  2004-11-23T11:49:00.000Z   crimsun   stuNNed   
2          301      1.tsv  2004-11-23T11:49:00.000Z   stuNNed   crimsun   
3          301      1.tsv  2004-11-23T11:49:00.000Z   crimsun   stuNNed   
4          301      1.tsv  2004-11-23T11:50:00.000Z   stuNNed   crimsun   
...        ...        ...                       ...       ...       ...   
499995      25   4002.tsv  2007-07-28T11:13:00.000Z    reanjr  xtknight   
499996      25   4002.tsv  2007-07-28T11:23:00.000Z    reanjr  xtknight   
499997      25   4002.tsv  2007-07-28T11:23:00.000Z  xtknight    reanjr   
499998      25   4002.tsv  2007-07-28T11:26:00.000Z    reanjr  xtknight   
499999      25   4002.tsv  2007-07-28T12:10:00.000Z    reanjr  xtknight   

                                                     text  
0        any ideas why java plugin ta

## From Priscilla: added last line to process eval_conversations for eval metrics

In [5]:
##################################################

import pandas as pd

def preprocess_conversations(df):
    # Remove rows with any NaN values
    df_clean = df.dropna(subset=['text'])

    # Filter dialogues with exactly 2 participants
    valid_dialogues = df_clean.groupby('dialogueID').filter(lambda x: x['from'].nunique() == 2)

    # Sort the dataframe by 'dialogueID' and 'date' to ensure conversation order
    valid_dialogues = valid_dialogues.sort_values(by=['dialogueID', 'date'])

    # Merge consecutive messages from the same sender in each dialogue
    merged_dialogues = []

    for dialogue_id, group in valid_dialogues.groupby('dialogueID'):
        group = group.reset_index(drop=True)  # Reset index to avoid issues when iterating

        # Initialize a list to collect messages for the current conversation
        conversation = []

        # Loop through the messages in this dialogue
        current_speaker = group.loc[0, 'from']
        current_message = group.loc[0, 'text']

        for i in range(1, len(group)):
            if group.loc[i, 'from'] == current_speaker:
                # If the next message is from the same speaker, merge it
                current_message += " " + group.loc[i, 'text']
            else:
                # If the next message is from a different speaker, append the current speaker's message
                conversation.append(current_message)
                # Switch to the new speaker
                current_speaker = group.loc[i, 'from']
                current_message = group.loc[i, 'text']

        # Append the last message
        conversation.append(current_message)

        # Concatenate messages in the dialogue using the '<|endoftext|>' separator
        merged_dialogue = '<|endoftext|>'.join(conversation)
        merged_dialogues.append(merged_dialogue)

    return merged_dialogues

# Preprocess the evaluation metrics conversations
eval_conversations = preprocess_conversations(eval_dataset_df)

##################################################

## Calculate Perplexity Score for ChatBot Model using Ubuntu Eval Data

In [6]:
##################################################

# Calculate Perplexity Score for ChatBot Model using Ubuntu Eval Data
# Implement a sliding window approach to calculate perplexity for a LLM
# ChatBot Model on long sequences that exceed the model's max context length
# Import GPT2 model with language modeling head and tokenizer and PyTorch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer 
import torch

# Define a function to calculate the perplexity of conversation text
def calc_perplexity_score(model, tokenizer, text):
    # Tokenize the input text. Convert the text to token IDs and pad or truncate 
    # to max length of 1000 tokens, return result as PyTorch tensors
    encodings = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=1000)
    # Max sequence length set to 1000
    max_length = 1000
    # Set the step size for the sliding window approach used in perplexity
    stride = 512
    # Initialize list for negative log-likelihoods
    nlls = []
    # Loop through input sequence of step size stride
    for i in range(0, encodings.input_ids.size(1), stride):
        # Calculate start and end positions for window
        strt_pos = max(i + stride - max_length, 0)
        end_pos = min(i + stride, encodings.input_ids.size(1))
        # Calculate target length
        trg_len = end_pos - i
        # Extract input IDs for window, make a copy for targets
        input_ids = encodings.input_ids[:, strt_pos:end_pos].to(device)
        target_ids = input_ids.clone()
        # Set all but last tokens to -100 (ignore in loss computation)
        target_ids[:, :-trg_len] = -100
        # Run model on input, compute loss, calculate negative log-likelihood
        # Prevent gradient computation to save memory
        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            neg_log_like = outputs.loss * trg_len
        # Add computed negative log-likelihood to the list
        nlls.append(neg_log_like)
    # Compute perplexity by summing negative log-likelihoods, divide by total
    # length, take the exponential and return perplexity score as a number
    ppl = torch.exp(torch.stack(nlls).sum() / end_pos)
    return ppl.item()

# Set tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-medium')
tokenizer.pad_token = tokenizer.eos_token
# Read in ChatBot Model
model = GPT2LMHeadModel.from_pretrained('C:/Users/gregm/.spyder-py3/AAI_520/FINAL')
# Use evaluation conversations to calculate perplexity score
ubuntu_text = eval_conversations
perplexity = calc_perplexity_score(model, tokenizer, ubuntu_text)
print(f"ChatBot Model Perplexity Score: {perplexity}")

##################################################

ChatBot Model Perplexity Score: 199.4624786376953
