**INSTALL DEPENDENCIES**

In [None]:
!pip install pypdf2

Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf2
Successfully installed pypdf2-3.0.1


In [None]:
# Import Libraries

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

# Step 1: Load Text from File
def load_text_from_file(file_path):
    """Read a whole UTF-8 text file into memory.

    Args:
        file_path: Path of the text file to read.

    Returns:
        str: The complete contents of the file.
    """
    with open(file_path, mode="r", encoding="utf-8") as source:
        return source.read()

# Step 2: Tokenize Data
# Module-level tokenizer shared by the rest of the notebook: tokenize_text()
# encodes the corpus with it, generate_text() uses it to encode the prompt and
# decode the sampled ids, and the main cell reads its vocab_size for the model.
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # Use GPT-2 tokenizer for simplicity
def tokenize_text(text, block_size=128):
    """Encode ``text`` and cut the token ids into fixed-length blocks.

    The text is encoded with the module-level ``tokenizer`` (no special tokens
    added) and split into consecutive, non-overlapping blocks of exactly
    ``block_size`` ids. A trailing remainder shorter than ``block_size`` is
    discarded so that every block has the same length.

    Args:
        text: Raw text to encode.
        block_size: Number of token ids per block. Must be positive.

    Returns:
        list[list[int]]: The full blocks in order; empty when the text encodes
        to fewer than ``block_size`` tokens.

    Raises:
        ValueError: If ``block_size`` is not positive.
    """
    if block_size <= 0:
        raise ValueError(f"block_size must be a positive integer, got {block_size}")

    tokens = tokenizer.encode(text, add_special_tokens=False)
    # The "+ 1" keeps the final block when len(tokens) is an exact multiple of
    # block_size; without it that last full block was silently dropped.
    last_start = len(tokens) - block_size + 1
    return [tokens[start : start + block_size] for start in range(0, last_start, block_size)]

# Step 3: Create Dataset Class
class TextDataset(Dataset):
    """Dataset of equal-length token blocks for next-token prediction.

    Every item is an ``(input, target)`` pair cut from one stored block, where
    ``target[i]`` is the token that follows ``input[i]`` in that block.
    """

    def __init__(self, sequences):
        """Store ``sequences`` as one ``torch.long`` tensor in ``self.data``."""
        self.data = torch.tensor(sequences, dtype=torch.long)

    def __len__(self):
        """Return the number of stored blocks."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(block[:-1], block[1:])`` for the block at ``idx``."""
        block = self.data[idx]
        inputs, targets = block[:-1], block[1:]
        return inputs, targets

# Step 4: Define a Minimal GPT Model
class MiniGPT(nn.Module):
    """Minimal GPT-style causal language model.

    Token embeddings plus a learned positional table are passed through a
    stack of Transformer encoder layers under a causal mask, then projected to
    vocabulary logits.

    Args:
        vocab_size: Number of distinct token ids.
        embed_dim: Width of the embeddings and of the Transformer layers.
        num_heads: Attention heads per layer.
        num_layers: Number of stacked Transformer layers.
        block_size: Maximum sequence length the positional table supports.
    """

    def __init__(self, vocab_size, embed_dim=128, num_heads=4, num_layers=5, block_size=128):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        # Learned positional table, zero-initialised: (1, block_size, embed_dim).
        self.pos_encoding = nn.Parameter(torch.zeros(1, block_size, embed_dim))
        self.transformer_blocks = nn.TransformerEncoder(
            # batch_first=True: forward() feeds (batch, seq, dim). With the
            # default (seq, batch, dim) layout attention would run across the
            # batch axis, mixing unrelated samples instead of attending over
            # the positions of each sequence.
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, batch_first=True),
            num_layers,
        )
        self.fc_out = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Compute next-token logits.

        Args:
            x: Integer token ids of shape ``(batch, seq_len)`` with
                ``seq_len`` no larger than the positional table.

        Returns:
            Tensor of shape ``(batch, seq_len, vocab_size)``; the logits at
            position ``t`` depend only on tokens at positions ``<= t``.

        Raises:
            ValueError: If ``seq_len`` exceeds the positional table length.
        """
        seq_len = x.shape[1]
        max_len = self.pos_encoding.shape[1]
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds the model block size {max_len}"
            )

        x = self.embed(x) + self.pos_encoding[:, :seq_len, :]
        # True above the diagonal = "may not attend": hides future tokens so the
        # model cannot read the token it is being trained to predict.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device), diagonal=1
        )
        x = self.transformer_blocks(x, mask=causal_mask)
        return self.fc_out(x)

# Step 5: Train the Model
def train_model(model, dataloader, epochs=500, lr=1e-3, sample_every=10,
                sample_prompt="Oh!", sample_length=500):
    """Train ``model`` with Adam and cross-entropy, printing progress.

    After every epoch the mean batch loss is printed. Every ``sample_every``
    epochs a text sample is generated with ``generate_text`` and printed.

    Args:
        model: Module mapping token ids ``(batch, seq)`` to logits
            ``(batch, seq, vocab)``. Batches are moved to the device of its
            parameters.
        dataloader: Iterable yielding ``(inputs, targets)`` tensor pairs; it is
            iterated once per epoch.
        epochs: Number of passes over ``dataloader``.
        lr: Adam learning rate.
        sample_every: Print a generated sample every this many epochs; ``0`` or
            ``None`` disables sampling.
        sample_prompt: Prompt used for the periodic samples.
        sample_length: Number of tokens generated for each periodic sample.

    Raises:
        ValueError: If ``dataloader`` yields no batches in an epoch.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()
    # Follow the model's device instead of hard-coding CUDA so CPU runs work too.
    device = next(model.parameters()).device

    for epoch in range(epochs):
        # Sampling may leave the model in eval mode (generate_text calls
        # model.eval()), so switch dropout back on at the start of every epoch.
        model.train()
        total_loss = 0.0
        num_batches = 0
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            outputs = model(x)
            # Flatten (batch, seq, vocab) -> (batch*seq, vocab) for the loss.
            loss = loss_fn(outputs.reshape(-1, outputs.shape[-1]), y.reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("dataloader yielded no batches; nothing to train on")

        print(f"Epoch {epoch + 1}: Loss = {total_loss / num_batches}")
        if sample_every and (epoch + 1) % sample_every == 0:
            print("\nGenerated Text:")
            print(generate_text(model, sample_prompt, length=sample_length))

# Step 6: Generate Text
def generate_text(model, start_text, length=100, temperature=0.8, block_size=None):
    """Sample a continuation of ``start_text`` from ``model``.

    The prompt is encoded with the module-level ``tokenizer``; then ``length``
    tokens are drawn one at a time, each from the model's distribution for the
    last position, and the whole sequence (prompt + samples) is decoded.

    Args:
        model: Module mapping token ids ``(batch, seq)`` to logits
            ``(batch, seq, vocab)``.
        start_text: Prompt text; must encode to at least one token.
        length: Number of tokens to generate.
        temperature: Softmax temperature. ``0`` selects the most likely token
            at every step (greedy decoding).
        block_size: Maximum number of trailing tokens fed to the model. When
            ``None`` it is taken from ``model.pos_encoding`` (second dimension)
            if the model has one, otherwise 128.

    Returns:
        str: The decoded prompt followed by the generated text.

    Raises:
        ValueError: If ``temperature`` is negative or the prompt encodes to no
            tokens.
    """
    if temperature < 0:
        raise ValueError(f"temperature must be non-negative, got {temperature}")

    # Run on whatever device the model lives on rather than assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    if block_size is None:
        pos_table = getattr(model, "pos_encoding", None)
        if isinstance(pos_table, torch.Tensor) and pos_table.dim() == 3:
            block_size = pos_table.shape[1]
        else:
            block_size = 128

    input_ids = tokenizer.encode(start_text, return_tensors="pt").to(device)
    if input_ids.shape[-1] == 0:
        raise ValueError("start_text produced no tokens; provide a non-empty prompt")
    generated = input_ids

    # Disable dropout while sampling, then restore the caller's mode so that
    # sampling in the middle of training does not leave the model in eval mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(length):
                # Only the most recent block_size tokens fit the model's context.
                logits = model(generated[:, -block_size:])[:, -1, :]
                if temperature == 0:
                    next_token = torch.argmax(logits, dim=-1, keepdim=True)
                else:
                    probs = torch.nn.functional.softmax(logits / temperature, dim=-1)
                    next_token = torch.multinomial(probs, num_samples=1)
                generated = torch.cat([generated, next_token], dim=-1)
    finally:
        model.train(was_training)

    return tokenizer.decode(generated[0].tolist())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

### **MAIN EXECUTION STARTS HERE**

In [None]:
# Seed PyTorch's RNG so DataLoader shuffling, weight initialisation and
# sampling start from the same state on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

file_path = "harry_potter.txt"  # Path to training text file
text_data = load_text_from_file(file_path)
sequences = tokenize_text(text_data)
if not sequences:
    # Fail here with a clear message rather than later inside the DataLoader.
    raise ValueError(
        f"tokenize_text returned no blocks: {file_path!r} is too short to fill a single block"
    )

dataset = TextDataset(sequences)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_size = tokenizer.vocab_size
model = MiniGPT(vocab_size).to(device)

Token indices sequence length is longer than the specified maximum sequence length for this model (113795 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
# Fit the model on the prepared batches (prints the per-epoch loss and
# periodic samples as it goes).
train_model(model=model, dataloader=dataloader)

# Once training has finished, print one long sample from the trained model.
print("\nFinal Generated Text:")
print(generate_text(model, start_text="Okay!", length=5000))

Epoch 1: Loss = 7.40033894777298
Epoch 2: Loss = 5.754582107067108
Epoch 3: Loss = 5.220138422080448
Epoch 4: Loss = 4.861935453755515
Epoch 5: Loss = 4.615268341132572
Epoch 6: Loss = 4.426807488713946
Epoch 7: Loss = 4.269371943814414
Epoch 8: Loss = 4.137812878404345
Epoch 9: Loss = 4.0237778680665155
Epoch 10: Loss = 3.9243767857551575

Generated Text:
Oh!Well, but he muttered the hat had a spindor, it's why he wanted to the table on the Slytherin' it's the walls, and Harry thinks he wanted to the first one of the first, from it's too as though it is, I'd all the glass, the middle, they were an'ry in time -- he could see that's the first time away from kting curled up to him. "N-Bind," said Ron said. 


"Just all this is particularly few." 

"What's the hall, and the floors to knock the other around theest," said Ron, "But I want to hear the Stone's hut around the cat was called and a large minutes, but it. "Be trunk, but the library." 

"Oh, "I will -- but much. 

"I've got if yeh