In [1]:
import json

# Load the JSON dataset

with open('/content/drive/MyDrive/train.json') as file:
    data = json.load(file)

In [2]:
# Initialize an empty list to store the messages
messages_list = []
sentiments_list = []

# Iterate through the data structure to extract messages
for conversation_id, conversation_data in data.items():
    for message_data in conversation_data['content']:
        message = message_data['message']
        sentiment = message_data['sentiment']
        messages_list.append(message)
        sentiments_list.append(sentiment)

In [3]:
print(len(messages_list))
print(len(sentiments_list))

188378
188378


In [4]:
print(set(sentiments_list))

{'Angry', 'Surprised', 'Curious to dive deeper', 'Fearful', 'Sad', 'Happy', 'Disgusted', 'Neutral'}


In [5]:
messages_list[:10]

['Are you a fan of Google or Microsoft?',
 'Both are excellent technology they are helpful in many ways. For the security purpose both are super.',
 "I'm not  a huge fan of Google, but I use it a lot because I have to. I think they are a monopoly in some sense. ",
 'Google provides online related services and products, which includes online ads, search engine and cloud computing.',
 "Yeah, their services are good. I'm just not a fan of intrusive they can be on our personal lives. ",
 'Google is leading the alphabet subsidiary and will continue to be the Umbrella company for Alphabet internet interest.',
 'Did you know Google had hundreds of live goats to cut the grass in the past? \n',
 'It is very interesting. Google provide "Chrome OS" which is a light weight OS. Google provided a lot of hardware mainly in 2010 to 2015. ',
 'I like Google Chrome. Do you use it as well for your browser? ',
 'Yes.Google is the biggest search engine and Google service figure out top 100 website, includi

In [6]:
messages_list=messages_list[:5000]

In [10]:
from torch.utils.data import Dataset
# define class for custom dataset
class ChatData(Dataset):
  def __init__(self,messages_list,tokenizer):
    self.messages_list=messages_list


    for idx,i in enumerate(self.messages_list):
       try:
          self.messages_list[idx]="<startofstring> "+i+" <bot>: "+self.messages_list[idx+1]+" <endofstring>"
       except:
        break


    print(self.messages_list[0])


    self.text_encoded= tokenizer(self.messages_list,max_length=128, truncation=True, padding="max_length", return_tensors="pt")
    self.input_ids=self.text_encoded['input_ids']
    self.attention_mask=self.text_encoded['attention_mask']


  def __len__(self):
    return len(self.messages_list)
  def __getitem__(self, idx):
    return (self.input_ids[idx],self.attention_mask[idx])



In [None]:
pip install transformers

In [11]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.optim import Adam
from torch.utils.data import DataLoader
import tqdm
import torch

# Define a function to train the model
def train(ChatData, model, optim):
    epochs = 10
    for i in tqdm.tqdm(range(epochs)):
        for x, a in ChatData:
            x = x.to(device)
            a = a.to(device)
            optim.zero_grad()
            loss = model(x, attention_mask=a, labels=x).loss
            loss.backward()
            optim.step()
        torch.save(model.state_dict(), 'model_state.pt')
        print(infer(" I'm not  a huge fan of Google"))

# Define a function for inference
def infer(inp):
    inp = "<startofstring> " + inp + " <bot>: "
    inp = tokenizer(inp, return_tensors="pt")
    X = inp["input_ids"].to(device)
    a = inp["attention_mask"].to(device)
    output = model.generate(X, attention_mask=a)
    output = tokenizer.decode(output[0])
    return output

# Choose the appropriate device
device = "cuda" if torch.cuda.is_available() else "gpu"

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({"pad_token": "<pad>",
                              "bos_token": "<startofstring>",
                              "eos_token": "<endofstring>"})
tokenizer.add_tokens(["<bot>:"])

# Initialize the model
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Resize the token embeddings
model.resize_token_embeddings(len(tokenizer))
model.to(device)

# Assuming you have a ChatData class for your data loading

# Create DataLoader for ChatData
ChatData = ChatData(messages_list, tokenizer)
ChatData = DataLoader(ChatData, batch_size=32)

# Set the model in training mode
model.train()

# Initialize the optimizer
optim = Adam(model.parameters(), lr=1e-3)

print("Training....")
train(ChatData, model, optim)

print("Infer from model:")
# while True:
#     inp = input()
#     print(infer(inp))


You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 50261. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


<startofstring> <startofstring> Are you a fan of Google or Microsoft? <bot>: Both are excellent technology they are helpful in many ways. For the security purpose both are super. <endofstring> <bot>: <startofstring> Both are excellent technology they are helpful in many ways. For the security purpose both are super. <bot>: I'm not  a huge fan of Google, but I use it a lot because I have to. I think they are a monopoly in some sense.  <endofstring> <endofstring>
Training....


  0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 10%|█         | 1/10 [02:47<25:11, 167.94s/it]

<startofstring> I'm not  a huge fan of Google <bot>: I'm not sure. I'm not sure


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 20%|██        | 2/10 [05:34<22:19, 167.41s/it]

<startofstring> I'm not  a huge fan of Google <bot>: I'm not  a big fan of Google


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 30%|███       | 3/10 [08:21<19:28, 166.95s/it]

<startofstring> I'm not  a huge fan of Google <bot>: I'm not  a big fan of Google


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 40%|████      | 4/10 [11:07<16:39, 166.65s/it]

<startofstring> I'm not  a huge fan of Google <bot>: I'm not  a big fan of Google


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 5/10 [13:54<13:53, 166.65s/it]

<startofstring> I'm not  a huge fan of Google <bot>: I'm not  a huge fan of Google


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 60%|██████    | 6/10 [16:41<11:07, 166.79s/it]

<startofstring> I'm not  a huge fan of Google <bot>: I'm not a huge fan of Google.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 70%|███████   | 7/10 [19:30<08:23, 167.68s/it]

<startofstring> I'm not  a huge fan of Google <bot>: I'm  a little more into the internet


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 80%|████████  | 8/10 [22:18<05:35, 167.64s/it]

<startofstring> I'm not  a huge fan of Google <bot>: I just buy an Apple tshirt for not


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 90%|█████████ | 9/10 [25:05<02:47, 167.53s/it]

<startofstring> I'm not  a huge fan of Google <bot>: That's pretty cool. I can see why


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
100%|██████████| 10/10 [27:52<00:00, 167.29s/it]

<startofstring> I'm not  a huge fan of Google <bot>: I don't know why they are good for
Infer from model:





In [12]:
while True:
    inp = input()
    if inp=='quit':
      break
    else:
         print(infer(inp))

hello


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> hello <bot>: Hi, how are you? <endofstring> <bot>: <startofstring> Hi, how are you? <bot>: hello
hello


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> hello <bot>: Hi, how are you? <endofstring> <bot>: <startofstring> Hi, how are you? <bot>: Good
do you know about google


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> do you know about google <bot>: I love Google. Google even prefers dogs to cats. <endofstring> <bot>:
I'm not  a huge fan of Google, but I use it a lot because I have to


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> I'm not  a huge fan of Google, but I use it a lot because I have to <bot>: Google
but I use it a lot because I have to


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> but I use it a lot because I have to <bot>: I think it helps people to feel more
what about microsoft


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> what about microsoft <bot>: Yes, i did not know that. i wonder if it was in
quit
