In [1]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments


  from .autonotebook import tqdm as notebook_tqdm





In [3]:
# Load your DataFrame with QA pairs
df = pd.read_excel("training_data_assignment.xlsx")  # Replace with your dataset path
df.columns = ['question', 'answer']

# Combine questions and answers into a single string
df['input_text'] = df['question'] + " [SEP] " + df['answer']
texts = df['input_text'].tolist()

In [5]:
# Tokenize your dataset using the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Save the texts to a text file with the correct encoding (e.g., 'utf-8')
with open("training_data_assignment.txt", "w", encoding="utf-8") as file:
    for text in texts:
        file.write(text + "\n")

In [6]:
# Create a TextDataset
text_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="training_data_assignment.txt",  # Specify the path to the dataset file
    block_size=128,  # Adjust the block size based on your data
)

# Fine-tune the GPT-2 model on your QA dataset
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)



In [8]:
training_args = TrainingArguments(
    output_dir="./fine-tuned-gpt2-qa",
    overwrite_output_dir=True,
    num_train_epochs=1,  # Adjust as needed
    per_device_train_batch_size=2,  # Adjust based on your GPU memory
    save_steps=10_000,
    save_total_limit=2,
)

model = GPT2LMHeadModel.from_pretrained("gpt2")

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=text_dataset,  # Use the TextDataset for training
)

trainer.train()

 98%|█████████▊| 500/510 [27:31<00:27,  2.71s/it]

{'loss': 3.0923, 'learning_rate': 9.80392156862745e-07, 'epoch': 0.98}


100%|██████████| 510/510 [27:57<00:00,  3.29s/it]

{'train_runtime': 1677.5683, 'train_samples_per_second': 0.607, 'train_steps_per_second': 0.304, 'train_loss': 3.082860273473403, 'epoch': 1.0}





TrainOutput(global_step=510, training_loss=3.082860273473403, metrics={'train_runtime': 1677.5683, 'train_samples_per_second': 0.607, 'train_steps_per_second': 0.304, 'train_loss': 3.082860273473403, 'epoch': 1.0})

In [9]:
# Save the fine-tuned GPT-2 model and tokenizer
model.save_pretrained("fine-tuned-gpt2-qa")
tokenizer.save_pretrained("fine-tuned-gpt2-qa")

('fine-tuned-gpt2-qa\\tokenizer_config.json',
 'fine-tuned-gpt2-qa\\special_tokens_map.json',
 'fine-tuned-gpt2-qa\\vocab.json',
 'fine-tuned-gpt2-qa\\merges.txt',
 'fine-tuned-gpt2-qa\\added_tokens.json')

In [14]:
prompt = "What is the stocks?"
input_ids = tokenizer.encode(prompt, return_tensors="pt")
output = model.generate(input_ids)

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


What is the stocks? [SEP] The stock market is a market where investors buy and sell
