## Importing Libraries

In [32]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, TrainerCallback
from tqdm import tqdm
from torch.utils.data import DataLoader
from flask import Flask, request, jsonify

## Dataset preprocessing

In [33]:
# 1. Prepare your custom dataset
def load_dataset(file_path, tokenizer, block_size=128):
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )
    return dataset

In [34]:
train_file_path = "/content/Title The Wanderer’s Path.txt.txt"



# Loading pretrained model

In [35]:
# 2. Load the pre-trained GPT-2 model
model_name = "gpt2"  # You can choose "gpt2-medium" or "gpt2-large" if you have more resources
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [36]:
# Set pad_token_id for GPT-2 tokenizer
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id


In [37]:
# 3. Fine-tune the model
# Prepare dataset
train_dataset = load_dataset(train_file_path, tokenizer)




In [38]:
# Set up data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

In [39]:
# Training parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

## Setting Hyperparameters

In [40]:
epochs = 20
batch_size = 4
learning_rate = 5e-5

In [41]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
dataloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=data_collator)

# Model Training

In [42]:
# Training loop
print("Training the model...")
model.train()
total_steps = len(dataloader) * epochs
progress_bar = tqdm(total=total_steps, desc="Training")

for epoch in range(epochs):
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = (input_ids != tokenizer.pad_token_id).to(device)  # Create attention mask
        labels = input_ids.clone()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        progress_bar.update(1)
        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

progress_bar.close()

Training the model...


Training: 100%|██████████| 60/60 [00:08<00:00,  6.81it/s, loss=0.0919]


# Saving Model

In [43]:
# Save the final model
print("Saving the final model...")
model.save_pretrained("./fine_tuned_gpt2_final")
tokenizer.save_pretrained("./fine_tuned_gpt2_final")

print("Training complete!")


Saving the final model...
Training complete!


# Text generating from model

In [44]:
# 4. Generate text
def generate_text(prompt, model, tokenizer, max_length=100):
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, no_repeat_ngram_size=2)
    return tokenizer.decode(output[0], skip_special_tokens=True)



# Loading model

In [45]:
# Load fine-tuned model
fine_tuned_model = GPT2LMHeadModel.from_pretrained("./fine_tuned_gpt2_final").to(device)
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_gpt2_final")



# prompt Completion

In [46]:
# Generate text
prompt = "kadar was nomad"
generated_text = generate_text(prompt, fine_tuned_model, fine_tuned_tokenizer)
print("Generated text:", generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated text: kadar was nomad for many years, and was a guardian of the desert. He was wise and had seen many seasons come and go. His life was one of constant movement, following the stars at night and the whispers of wind by day.

Kadara was the last of a tribe of archers, who had long since forsaken permanent settlements in favor of an ancient, life on the move. Their survival depended on their survival against the odds. They had become one with
