In [None]:
import pandas as pd

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the training CSV is read from, and the
# fine-tuned model is saved to, paths below this mount point.
drive.mount('/content/drive')

In [None]:
import os
# The training CSV lives at a fixed absolute location on the mounted Drive, so the
# path is used as-is. os.path.join() discards every earlier component once it meets
# an absolute one, which means prefixing the working directory never had any effect.
file_path = '/content/drive/MyDrive/data/train_data.csv'
print(file_path)
finbot = pd.read_csv(file_path)

In [None]:
# Preview the first rows to check that the CSV was parsed as expected.
finbot.head()


# Prepare data for GPT-2 fine-tuning

In [None]:
# Builds a list of dictionaries from the finbot DataFrame, turning each question/answer
# pair into a single "User: ...\nBot: ..." string.
def prepare_data_from_csv(finbot):
    """Turn each question/answer row of ``finbot`` into one training record.

    Args:
        finbot: DataFrame with ``question`` and ``answer`` columns.

    Returns:
        A list of ``{"text": "User: <question>\\nBot: <answer>"}`` dicts, one per
        row that has both a question and an answer, in DataFrame order.

    Raises:
        KeyError: if the ``question`` or ``answer`` column is missing.
    """
    # A missing question or answer would otherwise be formatted as the literal
    # string "nan" and end up in the training text, so such rows are dropped.
    complete_rows = finbot.dropna(subset=["question", "answer"])
    # Walking the two columns directly avoids building a Series for every row,
    # which is what iterrows() does.
    return [
        {"text": f"User: {question}\nBot: {answer}"}
        for question, answer in zip(complete_rows["question"], complete_rows["answer"])
    ]


In [None]:
# Convert every question/answer row of finbot into a {"text": ...} training record.
prepared_data = prepare_data_from_csv(finbot)

In [None]:
# Write the records to a plain-text file, each one followed by a blank line.
# The encoding is pinned to UTF-8 so non-ASCII characters (currency symbols, typographic
# quotes, ...) are written identically on every platform instead of depending on the
# locale's default encoding.
with open("formatted_qa.txt", "w", encoding="utf-8") as f:
    f.writelines(item["text"] + "\n\n" for item in prepared_data)

# Fine-tuning the GPT-2 model


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer


In [None]:
pip install transformers datasets torch

In [None]:
from datasets import load_dataset


In [None]:
# Load the pretrained GPT-2 tokenizer and reuse its end-of-sequence token as the
# padding token, so padded batches can be built with the base vocabulary.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Load the matching base model and size its token embeddings to the tokenizer.
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the notebook-level ``tokenizer``.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )

In [None]:
from datasets import load_dataset

# formatted_qa.txt stores each "User: ...\nBot: ..." record as a paragraph followed by
# a blank line. The "text" loader yields one example per *line* by default, which
# would split every question from its answer and turn the blank separator lines into
# empty examples, so the file is read paragraph-wise instead.
dataset = load_dataset(
    "text",
    data_files={"train": "formatted_qa.txt"},
    sample_by="paragraph",
)

In [None]:
# Tokenize the dataset in batches; the raw "text" column is removed so that only
# the tokenizer outputs remain.
tokenized_dataset = dataset.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
)


In [None]:
# creating batches
from transformers import DataCollatorForLanguageModeling

In [None]:
# mlm=False requests causal language-modeling batches rather than masked-LM ones.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)


In [None]:
from transformers import TrainingArguments, Trainer


In [None]:
# Settings for the fine-tuning run: 4 epochs with a per-device batch size of 8.
# Output goes to ./gpt2-finetuned-finance (existing contents may be overwritten),
# with save_steps and logging_steps both set to 1000 and at most two checkpoints kept.
# NOTE(review): the recorded output of the training cell shows a Step/Training Loss
# table without any rows - possibly the run had fewer than 1000 steps; confirm and
# consider a smaller logging_steps.
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned-finance",
    overwrite_output_dir=True,
    # optimisation schedule
    num_train_epochs=4,
    per_device_train_batch_size=8,
    # logging
    logging_steps=1000,
    prediction_loss_only=True,
    # checkpointing
    save_steps=1000,
    save_total_limit=2,
)

In [None]:
# Wire the model, the tokenized training split and the collator into a Trainer
# that runs with the arguments defined in training_args.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
)


In [None]:
# Start fine-tuning. The recorded output shows Weights & Biases logging being
# initialised as part of this call.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdimakatso-ntwampe99[0m ([33mdimakatso-ntwampe99-shaper[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


In [None]:
# Store the fine-tuned model and the tokenizer files in the same Drive directory
# so both can later be reloaded from one location.
export_dir = "/content/drive/MyDrive/model/gpt2-finance-chatbot"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch


In [None]:
# Reload the tokenizer and the fine-tuned model from the directory they were saved to.
checkpoint_dir = "/content/drive/MyDrive/model/gpt2-finance-chatbot"
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint_dir)
model = GPT2LMHeadModel.from_pretrained(checkpoint_dir)

In [None]:
def generate_response(prompt: str) -> None:
    """Encode ``prompt`` into input ids (incomplete draft).

    NOTE(review): this version only tokenizes the prompt and returns nothing. The
    next cell defines ``generate_response`` again with the generation and decoding
    steps, and that later definition replaces this one when it is run.
    """
    inputs = tokenizer.encode(prompt, return_tensors="pt")


In [None]:
   def generate_response(prompt):
    inputs = tokenizer.encode(prompt, return_tensors="pt")
    outputs = model.generate(inputs,
        max_length=150,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_k=50,
        top_p=0.95
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
    user_input = input("Please enter your question: ")

    prompt = f"User: {user_input}\nBot:"
    response = generate_response(prompt)
    print(response.split("Bot:")[-1].strip())