In [1]:
import json

# Source dataset. It is parsed in one go with json.load (a single JSON
# document, not line-delimited JSONL) and its records are addressed by position.
input_file = "muiltiarith_training.json"
# Plain-text training corpus written by this cell.
output_file = "output.txt"

with open(input_file, "r") as f:
    data = json.load(f)

# Only the leading 70% of the records are turned into training text; the
# evaluation cells further down read the remaining tail of the same file.
train_count = int(len(data)*0.7)

with open(output_file, "w") as f:
    for record in data[:train_count]:
        prompt = record["prompt"]
        completion = record["reasoning"]
        answer = record["answer"]
        # One example = a "[Q]" line, an "[A] <reasoning>--> <answer> END" line,
        # and a blank separator line.
        f.writelines([
            f"[Q] {prompt}\n",
            f"[A] {completion}--> {answer} END\n",
            "\n",
        ])

In [2]:
# data

In [3]:
import pandas as pd
import numpy as np
import re
import os

In [4]:
def read_txt(file_path):
    """Return the full contents of the text file at ``file_path`` as one string."""
    with open(file_path, "r") as handle:
        return handle.read()

def read_documents_from_directory(file_path='output.txt'):
    """Return the text of the training corpus file.

    Despite the name, a single file is read (no directory is scanned).

    Args:
        file_path: Path of the text file to read. Defaults to 'output.txt',
            the path that was previously hard-coded, so existing
            zero-argument calls behave exactly as before.

    Returns:
        The file contents as a single string.
    """
    return read_txt(file_path)

In [None]:
# Load the corpus written earlier and normalise its line breaks.
raw_text = read_documents_from_directory()
# Collapse every run of newlines into one and trim the ends.
collapsed_text = re.sub(r'\n+', '\n', raw_text).strip()
# Put a newline after each END marker again, so consecutive examples are
# separated by a blank line.
text_data = re.sub(r"END", "END\n", collapsed_text)
text_data

In [6]:
# Overwrite the corpus file with the newline-normalised text.
# The handle gets its own name so that `output_file` keeps holding the path
# string; reusing the name would leave a closed file object behind and make
# this cell fail when run a second time. The `with` block closes the file,
# so no explicit close() is needed.
with open(output_file, 'w') as out_handle:
    out_handle.write(text_data)

In [7]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [8]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a ``TextDataset`` over the text file at ``file_path``.

    Args:
        file_path: Path of the plain-text training file.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Forwarded to ``TextDataset`` unchanged (default 128);
            presumably the token length of each example — confirm against
            the TextDataset documentation.

    Returns:
        The constructed ``TextDataset``.
    """
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

def load_data_collator(tokenizer, mlm = False):
    """Create a ``DataCollatorForLanguageModeling`` for ``tokenizer``.

    Args:
        tokenizer: Tokenizer handed to the collator.
        mlm: Forwarded to the collator's ``mlm`` flag (default False).

    Returns:
        The constructed collator.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

In [9]:
def train(train_file_path, model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a pretrained GPT-2 checkpoint on a plain-text file.

    Args:
        train_file_path: Path of the text file used to build the dataset.
        model_name: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory that receives the tokenizer, the model and the
            Trainer's outputs.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``. Previously this
            argument was accepted but never used, so the caller's value was
            silently ignored.

    Returns:
        None. The model is saved through ``trainer.save_model()``.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Store the tokenizer in output_dir, the same directory the model is
    # saved to below.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # This writes the weights as loaded, before any training has happened.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [10]:
# Settings for the fine-tuning run; every value below is passed to train().

# Where the data comes from and where the results go.
train_file_path = "output.txt"
output_dir = "custom_q_and_a"
overwrite_output_dir = False

# Base checkpoint to start from.
model_name = "gpt2"

# Training schedule.
num_train_epochs = 50
per_device_train_batch_size = 8
save_steps = 50_000

In [None]:
!pip install accelerate -U

In [None]:
# Start fine-tuning with the settings defined in the configuration cell.
train_kwargs = dict(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
)
train(**train_kwargs)

In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer
def load_model(model_path):
    """Load a ``GPT2LMHeadModel`` from ``model_path`` and return it."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Load a ``GPT2Tokenizer`` from ``tokenizer_path`` and return it."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)

def generate_text(model_path, sequence, max_length):
    """Sample a continuation of ``sequence`` from the model stored at ``model_path``.

    The model and the tokenizer are both loaded from ``model_path`` on every
    call.

    Args:
        model_path: Directory (or name) holding both the model and tokenizer.
        sequence: Prompt text to continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens skipped.
    """
    tokenizer = load_tokenizer(model_path)
    model = load_model(model_path)

    prompt_ids = tokenizer.encode(f'{sequence}', return_tensors='pt')

    # Sampling configuration: top-k 50 combined with top-p 0.95; the EOS token
    # id doubles as the padding id.
    sampling_options = dict(
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    generated = model.generate(prompt_ids, **sampling_options)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [None]:
# Try the fine-tuned model on a single hand-written question.
model2_path = "custom_q_and_a"
max_len = 150
sequence2 = "[Q] There are 37 short bushes and 30 tall trees currently in the park . Park workers will plant 20 short bushes today . How many short bushes will the park have when the workers are finished ?"
answer = generate_text(
    model_path=model2_path,
    sequence=sequence2,
    max_length=max_len,
)
print(answer)

[Q] There are 37 short bushes and 30 tall trees currently in the park. Park workers will plant 20 short bushes today. How many short bushes will the park have when the workers are finished?
[A] Sure, let's work through this step by step!
1. Short bushes:
	* Each short bush will have 37 flowers
	* So, the park will have 37 short trays left over after the workers have left and the flowers have been picked (37 - 20 = 18).
2. Long trays:
	* Each long trays will have 37 flowers
	* So, the park will have 37 long trays left over after the workers have left and the flowers have been picked (37


In [16]:
# Reload the full dataset for evaluation. The `with` block makes sure the
# file handle is closed once the JSON has been parsed.
with open('muiltiarith_training.json') as f:
    test_data = json.load(f)

In [17]:
def _parse_prediction(generated_text):
    """Extract the number between the first "-->" and the next "END".

    Returns:
        An int when the extracted text is an integer literal, a float when it
        is a decimal, and None when the "-->" marker is missing or the text
        is not numeric.
    """
    try:
        raw_value = generated_text.split("-->")[1].split("END")[0].strip()
    except IndexError:
        # No "-->" marker in the generation.
        return None
    # Try int first so whole numbers stay ints; fall back to float so that
    # decimal predictions are no longer rejected outright.
    for convert in (int, float):
        try:
            return convert(raw_value)
        except ValueError:
            continue
    return None


correct_predictions = 0
model2_path = "custom_q_and_a"
max_len = 150

# Evaluate on the records after the first 70% of the file.
eval_start = int(len(test_data)*0.7)
for i in range(eval_start, len(test_data)):
    new_prompt = test_data[i]['prompt']
    answer = generate_text(model2_path, "[Q] "+new_prompt, max_len)

    # None (rather than a numeric sentinel) marks an unparseable generation,
    # so it can never be mistaken for a real answer.
    pred = _parse_prediction(answer)

    raw_ans = test_data[i]['answer']
    ans = float(raw_ans) if '.' in raw_ans else int(raw_ans)

    if pred is not None and pred == ans:
        correct_predictions += 1


In [18]:
# Number of evaluated records: everything after the first 70% of the file.
train_cutoff = int(len(test_data)*0.7)
total_data_points = len(test_data) - train_cutoff
accuracy_percent = (correct_predictions/total_data_points)*100
print("Accuracy = ", accuracy_percent, "%")

Accuracy =  5.555555555555555 %


In [None]:
# Debug view: run the model on the first evaluation record only and show the
# raw generation, the parsed prediction (if any) and the reference answer.
# The loop bound replaces an unconditional `break`, which had left the
# scoring lines after it unreachable; nothing is scored in this cell.
eval_start = int(len(test_data)*0.7)
for i in range(eval_start, min(eval_start + 1, len(test_data))):

    new_prompt = test_data[i]['prompt']
    answer = generate_text(model2_path, "[Q] "+new_prompt, max_len)
    # Print the generation before parsing so it is visible even when no
    # number can be extracted from it.
    print(answer)
    try:
        raw_pred = answer.split("-->")[1].split("END")[0].strip()
        pred = float(raw_pred) if '.' in raw_pred else int(raw_pred)
        print(pred)
    except (IndexError, ValueError):
        # Missing "-->" marker or non-numeric text after it.
        pred = None

    raw_ans = test_data[i]['answer']
    ans = float(raw_ans) if '.' in raw_ans else int(raw_ans)
    print(ans)

[Q] Sally had 39 baseball cards, and 9 were torn. Sara bought 24 of Sally's baseball cards. How many baseball cards does Sally have now?
[A] Sure! Let's break down the problem step by step:
1. Sally had 39 baseball cards.
2. Sara bought 24 of Sally's baseball cards.
So, let's add 24 to 39:
39 + 24 = 85
Therefore, Sally has 85 baseball cards now.--> 85 END

[Q] There are 41 short trees and 32 tall trees currently in the park. Park workers had to cut down 32 short trees that were damaged. How many short trees will the park have when the workers are finished?
[A] Sure
85
15
