In [1]:
# load libs
import warnings
warnings.simplefilter("ignore")
import os
import torch
from transformers import AutoModelForCausalLM,AutoTokenizer
from cust_supp_data import ChatData
from torch.utils.data import DataLoader
from torch.optim import Adam
import tqdm
from nltk.translate.bleu_score import sentence_bleu
import nltk
import json
import numpy as np
import pandas as pd
import random

In [2]:
llm_model_name = "openai-community/gpt2-medium"
cache_dir="D:/data_science_projects/llm/llm_models/"
sess_num=2
if sess_num==1:
    df = pd.DataFrame()
    df[["sess"]] = np.nan
    df[["score"]] = np.nan
    df.to_csv("mdl_perf.csv")
    if "finetuned_gpt.pt" in os.listdir():
        os.remove("finetuned_gpt.pt")
torch.cuda.is_available()

True

In [3]:
def check_if_model_on_cuda(model):
    device = next(model.parameters()).device
    if device.type == 'cuda':
        print("Model is already on CUDA (GPU).")
    else:
        print("Model is not on CUDA (GPU). You may need to move it.")    
        
def get_response_from_llm(text,model,tokenizer):
    
    encoded_input = tokenizer(text, return_tensors='pt')  # Allow for longer input
    encoded_input = encoded_input.to("cuda")
    generated_ids = model.generate(**encoded_input, max_length=50,
                                                        no_repeat_ngram_size=4,
                                                        pad_token_id=tokenizer.eos_token_id)  # Generate a longer output
    decoded_text = tokenizer.decode(generated_ids[0])
    return decoded_text        

In [4]:
# load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(llm_model_name,cache_dir = cache_dir)
# load gpt2 model
gpt_model = AutoModelForCausalLM.from_pretrained(llm_model_name,cache_dir=cache_dir)

check_if_model_on_cuda(gpt_model)
# push the model to cuda
gpt_model = gpt_model.to("cuda")

Model is not on CUDA (GPU). You may need to move it.


In [5]:
# response from model without finetuning
resp = get_response_from_llm("My order has not been delivered.",
                                      model=gpt_model,tokenizer=tokenizer)

resp.split("\n")[0]

'My order has not been delivered. Please try again later.'

In [6]:
# tokenizer - additional config
# gpt tokenizer doesn't know what tokens in data are for start / end of string
special_tokens_dict = {
    "pad_token": "<pad>",
    "bos_token": "<startofstring>",
    "eos_token": "<endofstring>"
}

tokenizer.add_special_tokens(special_tokens_dict)
tokenizer.add_tokens(["<bot>:"])

1

In [7]:
# resize the model tokens based on changes introduced above
# <bot> token won't be available, needs to be introduced
len(tokenizer)

50261

In [8]:
gpt_model.resize_token_embeddings(len(tokenizer))

Embedding(50261, 1024)

In [9]:
if  "finetuned_gpt.pt" in os.listdir():
    state_dict = torch.load("finetuned_gpt.pt")
    gpt_model.load_state_dict(state_dict)
    print("found_ckpt")

found_ckpt


In [10]:
chat_data_loader = ChatData("./customer_support_chat.json", tokenizer)
chat_data =  DataLoader(chat_data_loader, batch_size=8)

In [11]:
def train(chatData, model, optim,device="cuda",epochs=1):
    
    print("training .... ")
    for i in range(epochs):
        print(f"epoch : {i+1}")
        for X, a in tqdm.tqdm(chatData):
            X = X.to(device)
            a = a.to(device)
            optim.zero_grad()
            loss = model(X, attention_mask=a, labels=X).loss
            loss.backward()
            optim.step()
        torch.save(model.state_dict(), "finetuned_gpt.pt")
    print("training complete")    

In [12]:
optim = Adam(gpt_model.parameters(), lr=1e-3)
train(chat_data, gpt_model, optim,epochs=40)

training .... 
epoch : 1


100%|██████████| 10/10 [00:44<00:00,  4.42s/it]


epoch : 2


100%|██████████| 10/10 [00:46<00:00,  4.63s/it]


epoch : 3


100%|██████████| 10/10 [00:46<00:00,  4.63s/it]


epoch : 4


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 5


100%|██████████| 10/10 [00:46<00:00,  4.63s/it]


epoch : 6


100%|██████████| 10/10 [00:46<00:00,  4.63s/it]


epoch : 7


100%|██████████| 10/10 [00:46<00:00,  4.63s/it]


epoch : 8


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 9


100%|██████████| 10/10 [00:46<00:00,  4.63s/it]


epoch : 10


100%|██████████| 10/10 [00:46<00:00,  4.63s/it]


epoch : 11


100%|██████████| 10/10 [00:46<00:00,  4.63s/it]


epoch : 12


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 13


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 14


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 15


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 16


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 17


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 18


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 19


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 20


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 21


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 22


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 23


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 24


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 25


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 26


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 27


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 28


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 29


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 30


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 31


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 32


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 33


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 34


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 35


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 36


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 37


100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


epoch : 38


100%|██████████| 10/10 [00:46<00:00,  4.65s/it]


epoch : 39


100%|██████████| 10/10 [00:46<00:00,  4.65s/it]


epoch : 40


100%|██████████| 10/10 [00:46<00:00,  4.66s/it]


training complete


In [13]:
inp = "i have a problem with your product."
resp = get_response_from_llm(inp,
                                      model=gpt_model,tokenizer=tokenizer)

resp.split("\n")[0]

"i have a problem with your product. <bot>: We apologize for the inconvenience. Can you please provide your order number and the new shipping address you'd like to use? <endofstring><pad><pad><pad><pad><endofstring><pad><pad><bot>:- place of the most instructions"

In [14]:
ref_data = json.load(open("./customer_support_chat.json",  "r"))
ref_data = random.sample(ref_data, 25)

mdl_gen_resp = []

for i in tqdm.tqdm(ref_data):
    
    mdl_inp = i.split("<bot>:")[0].replace("<startofstring>","")
    resp = get_response_from_llm(mdl_inp,model=gpt_model,tokenizer=tokenizer)
    final_resp = resp.split("\n")[0]
    gen_resp = "<startofstring>"+mdl_inp+"<bot>:"+final_resp.split(inp)[-1]+"endofstring"
    mdl_gen_resp.append(gen_resp)

100%|██████████| 25/25 [00:31<00:00,  1.26s/it]


In [15]:
    
bleu_score_ls = []

for i,j in zip(ref_data,mdl_gen_resp):
    ref_tokens = i.split()  # Tokenize reference sentence
    gen_tokens = j.split()  # Tokenize generated sentence
    bs = sentence_bleu([ref_tokens], gen_tokens)
    
    bleu_score_ls.append(bs)    
    
np.mean(bleu_score_ls)    

0.5079907222917704

In [16]:
dfi = pd.read_csv("mdl_perf.csv",index_col=0)
dfj = pd.DataFrame([{"sess":sess_num,"score":np.mean(bleu_score_ls)   }])
dfij = pd.concat([dfi,dfj])
dfij.to_csv("mdl_perf.csv")

In [17]:
dfij

Unnamed: 0,sess,score
0,1,0.247265
0,2,0.507991


In [28]:
# looking at the finetuned response
print("user : ", resp.split("<endofstring>")[0].split("<bot>")[0])
print("bot : ", resp.split("<endofstring>")[0].split("<bot>:")[1])

user :   I haven't received a response to my email inquiry. 
bot :   We apologize for the delay. Can you please provide your ticket number or account email address so we can follow up email? 
