In [19]:
from torch.utils.data import Dataset
import json

class ChatData(Dataset):
    def __init__(self, path:str, tokenizer):
        self.data = json.load(open(path, "r"))

        self.X = []
        for i in self.data:
            for j in i['dialog']:
                self.X.append(j['text'])

        for idx, i in enumerate(self.X):
            try:
                self.X[idx] = "<startofstring> "+i+" <bot>: "+self.X[idx+1]+" <endofstring>"
            except:
                break

        self.X = self.X[:5000]

        print(self.X[0])

        self.X_encoded = tokenizer(self.X,max_length=40, truncation=True, padding="max_length", return_tensors="pt")
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.attention_mask[idx])

In [20]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.optim import Adam
from torch.utils.data import DataLoader
import tqdm
import torch

def train(chatData, model, optim):
    epochs = 12
    for i in tqdm.tqdm(range(epochs)):
        for X, a in chatData:
            X = X.to(device)
            a = a.to(device)
            optim.zero_grad()
            loss = model(X, attention_mask=a, labels=X).loss
            loss.backward()
            optim.step()
        torch.save(model.state_dict(), "model_state.pt")
        print(infer("hello how are you"))

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "<pad>",
                                "bos_token": "<startofstring>",
                                "eos_token": "<endofstring>"})
tokenizer.add_tokens(["<bot>:"])

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

model = model.to(device)

# print(tokenizer.decode(model.generate(**tokenizer("hey i was good at basketball but ",
#                          return_tensors="pt"))[0]))

chatData = ChatData("./chat_data.json", tokenizer)
chatData =  DataLoader(chatData, batch_size=64)

model.train()

optim = Adam(model.parameters(), lr=1e-3)



<startofstring> I love iphone! i just bought new iphone! <bot>: Thats good for you, i'm not very into new tech <endofstring>


In [21]:
print("training .... ")
train(chatData, model, optim)

training .... 


  0%|          | 0/12 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  8%|▊         | 1/12 [00:53<09:50, 53.67s/it]

hello how are you  <bot>:


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 17%|█▋        | 2/12 [01:36<07:53, 47.38s/it]

hello how are you  <bot>:   i am a huge i am a huge, i am a huge i am a huge i am a huge huge gamer, i am a huge, i


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 25%|██▌       | 3/12 [02:19<06:48, 45.33s/it]

hello how are you  <bot>:   i am not a huge i am not sure i am not a huge i am a huge i am a


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 33%|███▎      | 4/12 [03:07<06:12, 46.51s/it]

hello how are you  <bot>:   I do you are doing it is a little?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 42%|████▏     | 5/12 [03:58<05:35, 47.98s/it]

hello how are you  <bot>:   i am not a big fan of the nighing, i am not a huge fan of a huge fan of a huge fan of a fan of a fan of all the truth  i am a huge fan of the


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 6/12 [04:51<04:58, 49.81s/it]

hello how are you  <bot>:   i am a huge gamer  i am a gamer  i am a gamer  i am a huge gamer  ok


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 58%|█████▊    | 7/12 [05:40<04:07, 49.44s/it]

hello how are you  <bot>:   i am not sure what you mean  i am not sure you are right man, i am not sure you are you a little man in a man


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 67%|██████▋   | 8/12 [06:23<03:09, 47.29s/it]

hello how are you  <bot>:   I am a huge fan of it's not fan of it's fan of all the news. i am a fan of all about that fan, fan. i am a fan of it. i love it. fan of fan. i love it's fan. fan fan.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 75%|███████▌  | 9/12 [07:17<02:28, 49.37s/it]

hello how are you  <bot>:   Hi, how are doing?  Hi, how are doing great, how are doing you doing?  Hi, how are doing?  I just finished doing?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 83%|████████▎ | 10/12 [08:00<01:35, 47.64s/it]

hello how are you  <bot>:   I am a huge fan of all animals. are you a lot of animals?  I have a lot of them. are you?  I


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 92%|█████████▏| 11/12 [08:48<00:47, 47.60s/it]

hello how are you  <bot>:   I am a huge fan of all animals. I love animals. are you a 2pac, or are you?  I am a 2pac, i am a golden?  I am a golden retriever  I am a


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████| 12/12 [09:37<00:00, 48.12s/it]

hello how are you  <bot>:   I am a student, i am a student       I am a real student





In [22]:
def infer(inp):
    inp = "<startofstring> " + inp + " <bot>: "
    inp = tokenizer(inp, return_tensors="pt")
    X = inp["input_ids"].to(device)
    a = inp["attention_mask"].to(device)
    output = model.generate(X, attention_mask=a, max_length=100, num_return_sequences=1)
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return output_text.split("<endofstring>")[0].strip()

In [23]:
print("infer from model : ")
while True:
  inp = input()
  print(infer(inp))

infer from model : 
hello


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


hello  <bot>:   Hi, how are doing?  Hi, how are doing?  Hi, how are doing?  Hi, how are doing?  Hi
what are you doing


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


what are you doing  <bot>:   I am doing well. I am just hanging out with my dog.  I am a dog.  I am a dog.   I am a dog.
do you love music


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


do you love music  <bot>:   I do, i do you speak a french???  I speak french?  Yes, but
do you like coding


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


do you like coding  <bot>:   i do you like music?  i like to listen music  i like to music  i like to play the best  i like to play the piano?


KeyboardInterrupt: Interrupted by user