<a href="https://colab.research.google.com/github/ReethamG/Simple-Chatbot/blob/main/Transformers_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#fast

In [58]:
!pip install transformers==4.28.0



In [59]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [60]:
def read_txt(file_path):
  with open(file_path, 'r') as file:
    text = file.read().strip()
  return text

In [61]:
def train_chatbot(directory, model_output_path, train_fraction = 0.8):
  input_data = read_txt(directory)

  split_index = int(train_fraction * len(input_data))
  train_text = input_data[:split_index]
  val_text = input_data[split_index: ]

  with open('train.txt', 'w') as f:
    f.write(train_text)
  with open('val.txt', 'w') as f:
    f.write(val_text)

  tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
  model = GPT2LMHeadModel.from_pretrained('gpt2')

  train_dataset = TextDataset(tokenizer = tokenizer, file_path = 'train.txt', block_size = 128)
  val_dataset = TextDataset(tokenizer = tokenizer, file_path = 'val.txt', block_size = 128)
  data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm=False)
  training_args = TrainingArguments(
      output_dir = model_output_path,
      overwrite_output_dir = True,
      per_device_train_batch_size = 4,
      per_device_eval_batch_size = 4,
      num_train_epochs = 5,
      save_steps = 10_000,
      save_total_limit = 2
  )

  trainer = Trainer(
      model = model,
      args = training_args,
      data_collator = data_collator,
      train_dataset = train_dataset,
      eval_dataset = val_dataset
  )

  trainer.train()
  trainer.save_model(model_output_path)

  tokenizer.save_pretrained(model_output_path)

In [65]:
def generate_response(model, tokenizer, prompt, max_length = 15):
  input_ids = tokenizer.encode(prompt, return_tensors='pt')

  attention_mask = torch.ones_like(input_ids)
  pad_token_id = tokenizer.eos_token_id

  output = model.generate(
      input_ids,
      max_length = max_length,
      num_return_sequences = 1,
      attention_mask = attention_mask,
      pad_token_id = pad_token_id
  )

  return tokenizer.decode(output[0], skip_special_tokens = True)

In [63]:
directory = '/content/dialogs.txt'
model_output_path = '/content/model/'

train_chatbot(directory, model_output_path)



Step,Training Loss
500,1.4026


In [70]:
model_final = GPT2LMHeadModel.from_pretrained(model_output_path)
tokenizer_final = GPT2Tokenizer.from_pretrained(model_output_path)

prompt = input('Please say something: ')
while prompt.lower() not in ['stop']:
  response = generate_response(model_final, tokenizer_final, prompt)
  print('Generated Response: ', response)
  prompt = input('Please say something: ')

Please say something: chill out
Generated Response:  chill out loud.	i'll just put my phone on the radio


KeyboardInterrupt: ignored