In [17]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, AdamW
from torch.utils.data import Dataset, DataLoader
import torch
import pandas as pd

In [18]:
# Load CSV data
df = pd.read_csv('chatdata.csv')

# Tokenizer and Model Configuration
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
config = GPT2Config.from_pretrained('gpt2')
model = GPT2LMHeadModel(config)

# Define a simple dataset
class ChatDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=1024):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        user_input = self.data.iloc[idx]['question']
        answer = self.data.iloc[idx]['answer']

        # Combine user input and answer
        conversation = f"User: {user_input} Bot: {answer}"

        # Tokenize the conversation
        input_ids = self.tokenizer.encode(conversation, max_length=self.max_length, return_tensors="pt", truncation=True).squeeze()

        return {
            'input_ids': input_ids,
        }

# Prepare the dataset and dataloader
dataset = ChatDataset(df, tokenizer)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)


In [19]:
# Training configuration
num_epochs = 5
learning_rate = 1e-4
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop (fine-tuning)
for epoch in range(num_epochs):
    total_loss = 0.0
    num_batches = len(dataloader)

    for batch_num, batch in enumerate(dataloader, 1):
        input_ids = batch['input_ids']
        labels = input_ids.clone()

        optimizer.zero_grad()
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Print training progress
        if batch_num % 100 == 0:
            avg_loss = total_loss / batch_num
            print(f"Epoch [{epoch + 1}/{num_epochs}], Batch [{batch_num}/{num_batches}], Loss: {avg_loss:.4f}", flush=True)

    # Average loss for the epoch
    avg_epoch_loss = total_loss / num_batches
    print(f"Epoch [{epoch + 1}/{num_epochs}], Average Loss: {avg_epoch_loss:.4f}", flush=True)

# Save the fine-tuned model
model.save_pretrained('fine_tuned_model')

Epoch [1/5], Batch [100/228], Loss: 7.5707
Epoch [1/5], Batch [200/228], Loss: 6.7739
Epoch [1/5], Average Loss: 6.6468
Epoch [2/5], Batch [100/228], Loss: 5.1584
Epoch [2/5], Batch [200/228], Loss: 5.0258
Epoch [2/5], Average Loss: 4.9782
Epoch [3/5], Batch [100/228], Loss: 4.1895
Epoch [3/5], Batch [200/228], Loss: 4.1715
Epoch [3/5], Average Loss: 4.1514
Epoch [4/5], Batch [100/228], Loss: 3.4828
Epoch [4/5], Batch [200/228], Loss: 3.3306
Epoch [4/5], Average Loss: 3.2638
Epoch [5/5], Batch [100/228], Loss: 2.7568
Epoch [5/5], Batch [200/228], Loss: 2.5100
Epoch [5/5], Average Loss: 2.4848


In [20]:
# Load the fine-tuned model
model = GPT2LMHeadModel.from_pretrained('fine_tuned_model')

In [21]:
def test_model(model, tokenizer, user_input, df):
    # Tokenize the user input
    input_ids = tokenizer.encode(f"User: {user_input} Bot:", return_tensors="pt", truncation=True)

    # Generate a response using the fine-tuned GPT-2 model
    output = model.generate(input_ids, max_length=100, num_beams=5, no_repeat_ngram_size=2, top_k=50, top_p=0.95, temperature=0.7)

    # Decode the generated response
    generated_response = tokenizer.decode(output[0], skip_special_tokens=True)

    # Check if the generated response contains known keywords from the dataset
    if any(keyword.lower() in generated_response.lower() for keyword in df['answer'].values):
        return generated_response
    else:
        return "I don't understand this. Please provide more information or contact the head of the department for assistance."



In [22]:
# Example usage
user_input = "how to pay online?"
response = test_model(model, tokenizer, user_input, df)
print("User:", user_input)
print("Bot:", response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


User: how to pay online?
Bot: I don't understand this. Please provide more information or contact the head of the department for assistance.


In [23]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model
model = GPT2LMHeadModel.from_pretrained('fine_tuned_model')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def test_model(model, tokenizer, user_input):
    # Tokenize the user input
    input_ids = tokenizer.encode(f"User: {user_input} Bot:", return_tensors="pt", truncation=True)

    # Generate a response using the fine-tuned GPT-2 model
    output = model.generate(input_ids, max_length=100, num_beams=5, no_repeat_ngram_size=2, top_k=50, top_p=0.95, temperature=0.7)

    # Decode the generated response
    generated_response = tokenizer.decode(output[0], skip_special_tokens=True)

    return generated_response

# Example usage
user_input = "how to pay online?"
response = test_model(model, tokenizer, user_input)
print("User:", user_input)
print("Bot:", response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


User: how to pay online?
Bot: User: how to pay online? Bot: No, students and faculty with UMKC webmail via https://outlook.office.com/mail16-235-1354. Forumkc.edu/: cashiers@edu. Center or cash payments. Contact payments in-286-Friday. The Cashiers Office at 816/235/out/@um: Access in person at 5 pm/C Wireless can be: Contact: Pay from your web pm65.
