In [None]:
from torch.utils.data import Dataset
import json

class ChatData(Dataset):
    def __init__(self, path:str, tokenizer):
        self.data = json.load(open(path, "r"))

        self.X = []
        for i in self.data:
            for j in i['dialog']:
                self.X.append(j['text'])

        for idx, i in enumerate(self.X):
            try:
                self.X[idx] = "<startofstring> "+i+" <bot>: "+self.X[idx+1]+" <endofstring>"
            except:
                break

        self.X = self.X[:5000]

        print(self.X[0])

        self.X_encoded = tokenizer(self.X,max_length=40, truncation=True, padding="max_length", return_tensors="pt")
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.attention_mask[idx])

In [None]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
import tqdm
import torch

class ChatData(Dataset):
    def __init__(self, json_file, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Load data from JSON file
        with open(json_file, 'r') as file:
            self.data = json.load(file)

        # Print some info to debug
        print(f"Loaded {len(self.data)} dialogues from {json_file}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        dialogue = self.data[idx]["dialog"]

        # Extract input and target texts from the dialogue
        input_texts = []
        target_texts = []
        for message in dialogue:
            if message["sender"] == "participant1":
                input_texts.append(message["text"])
            elif message["sender"] == "participant2":
                target_texts.append(message["text"])

        # Combine input and target texts into conversations
        conversations = []
        for input_text, target_text in zip(input_texts, target_texts):
            input_text = "<startofstring> " + input_text + " <bot>: "
            target_text = target_text + " <endofstring>"
            conversations.append((input_text, target_text))

        # Tokenize inputs
        tokenized_conversations = []
        for input_text, target_text in conversations:
            input_tokens = self.tokenizer.encode(input_text, max_length=self.max_length, truncation=True, padding='max_length')
            target_tokens = self.tokenizer.encode(target_text, max_length=self.max_length, truncation=True, padding='max_length')
            tokenized_conversations.append({
                "input_ids": input_tokens,
                "attention_mask": [1 if token != self.tokenizer.pad_token_id else 0 for token in input_tokens],
                "target_ids": target_tokens
            })

        return tokenized_conversations

def train(chatData, model, optim):
    epochs = 10

    for epoch in tqdm.tqdm(range(epochs)):
        print(f"Epoch {epoch+1}/{epochs}")
        model.train()
        for batch in chatData:
            for conv in batch:
                X = torch.tensor(conv["input_ids"]).unsqueeze(0).to(device)
                a = torch.tensor(conv["attention_mask"]).unsqueeze(0).to(device)
                optim.zero_grad()
                loss = model(X, attention_mask=a, labels=X)[0]
                loss.backward()
                optim.step()
        torch.save(model.state_dict(), "model_state.pt")
        print(infer("hello how are you"))

def infer(inp):
    inp = "<startofstring> "+inp+" <bot>: "
    inp = tokenizer(inp, return_tensors="pt")
    X = inp["input_ids"].to(device)
    a = inp["attention_mask"].to(device)
    output = model.generate(X, attention_mask=a)
    output = tokenizer.decode(output[0], skip_special_tokens=True)
    return output

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "<pad>",
                                "bos_token": "<startofstring>",
                                "eos_token": "<endofstring>"})
tokenizer.add_tokens(["<bot>:"])

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

model = model.to(device)

chatData = ChatData("/content/chat__Data.json", tokenizer)
chatData = DataLoader(chatData, batch_size=1)

optim = Adam(model.parameters(), lr=1e-3)

print("Training....")
train(chatData, model, optim)

print("Inference from the model:")
while True:
    inp = input("User: ")
    if inp.lower() == 'exit':
        break
    response = infer(inp)
    print("Bot:", response)


Loaded 1 dialogues from /content/chat__Data.json
Training....


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1/10


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 10%|█         | 1/10 [00:30<04:34, 30.48s/it]

 hello how are you  <bot>:   <bot>: 

Epoch 2/10


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 20%|██        | 2/10 [00:42<02:38, 19.85s/it]

 hello how are you  <bot>: , and, and, and, and,, and,
Epoch 3/10


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 30%|███       | 3/10 [00:50<01:38, 14.07s/it]

 hello how are you  <bot>:  
Epoch 4/10


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 40%|████      | 4/10 [00:56<01:05, 10.93s/it]

 hello how are you  <bot>:  
Epoch 5/10


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 5/10 [01:02<00:47,  9.41s/it]

 hello how are you  <bot>:  
Epoch 6/10


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 60%|██████    | 6/10 [01:08<00:33,  8.26s/it]

 hello how are you  <bot>:  
Epoch 7/10


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 70%|███████   | 7/10 [01:16<00:23,  7.91s/it]

 hello how are you  <bot>:  
Epoch 8/10


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 80%|████████  | 8/10 [01:29<00:19,  9.75s/it]

 hello how are you  <bot>:  
Epoch 9/10


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 90%|█████████ | 9/10 [01:36<00:08,  8.71s/it]

 hello how are you  <bot>:  
Epoch 10/10


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████| 10/10 [01:43<00:00, 10.34s/it]


 hello how are you  <bot>: ,
Inference from the model:


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Bot:  hi  <bot>: ,,,,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Bot:  how are you  <bot>: ,,,,,,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Bot:  name  <bot>: ,,,,,,,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Bot:  go  <bot>: ,,,,,,,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Bot:  I love iphone! i just bought new iphone!  <bot>:  


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Bot:  hi  <bot>:  


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Bot:  your name  <bot>: ,,
