In [1]:
import torch
import torch.nn as nn
import math
# ===============================
# Multi Attention Block
# ===============================
class MultiHeadAttentionCustom(nn.Module):
    def __init__(self, embed_dim, num_heads=12, dropout=0.1):
        super().__init__()
        assert embed_dim % num_heads ==0
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.Wq = nn.Linear(embed_dim,embed_dim)
        self.Wk = nn.Linear(embed_dim,embed_dim)
        self.Wv = nn.Linear(embed_dim,embed_dim)
        self.Wo = nn.Linear(embed_dim,embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):

        B,T,Q = x.size()
        Q = self.Wq(x)
        K = self.Wk(x)
        V = self.Wv(x)

        Q = Q.view(B,T,self.num_heads,self.head_dim).transpose(1,2)
        K = K.view(B,T,self.num_heads,self.head_dim).transpose(1,2)
        V = V.view(B,T,self.num_heads,self.head_dim).transpose(1,2)


        Q_dot_K = Q @ K.transpose(-2,-1) / math.sqrt(self.head_dim)

        Mask = torch.triu(torch.ones(T,T,device = x.device),diagonal = 1)
        #Mask = Mask.unsqueeze(0).unsqueeze(0)

        Q_dot_K = Q_dot_K.masked_fill(Mask == 1, float('-inf'))

        Q_dot_K = torch.softmax(Q_dot_K, dim=-1)
        Q_dot_K = self.dropout(Q_dot_K)
        out = Q_dot_K @ V
        out = out.transpose(1,2).contiguous().view(B,T,self.embed_dim)

        out = self.Wo(out)
        return out

In [None]:


# ===============================
# Transformer Block
# ===============================
class GPTBlock_Torch(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)
        self.attn = MultiHeadAttentionCustom(embed_dim, num_heads, dropout=dropout)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.GELU(),
            nn.Linear(ff_dim, embed_dim)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Self-attention + residual
        attn_out = self.attn(x)
        x = self.ln1(x + self.dropout(attn_out))
        # Feed-forward + residual
        ff_out = self.ff(x)
        x = self.ln2(x + self.dropout(ff_out))
        return x


# ===============================
# GPT Model
# ===============================
class GPTModel_Torch(nn.Module):
    def __init__(self, vocab_size=50257, max_len=1024, embed_dim=768, num_heads=12, ff_dim=3072, num_layers=12, dropout=0.1):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(max_len, embed_dim)
        self.drop = nn.Dropout(dropout)
        self.blocks = nn.ModuleList([
            GPTBlock_Torch(embed_dim, num_heads, ff_dim, dropout) for _ in range(num_layers)
        ])
        self.ln_f = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, vocab_size, bias=False)

        # initialize weights (as in GPT-2)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx):
        B, T = idx.shape
        device = idx.device
        pos = torch.arange(0, T, dtype=torch.long, device=device).unsqueeze(0)  # (1, T)

        # Token + positional embeddings
        x = self.token_emb(idx) + self.pos_emb(pos)
        x = self.drop(x)

        # Causal mask to block future tokens
        mask = torch.triu(torch.ones(T, T, device=device), diagonal=1).bool()
        mask = mask.masked_fill(mask, float('-inf'))

        # Pass through transformer blocks
        for block in self.blocks:
           # x = block(x, attn_mask=mask)
            x = block(x)
        # Final layer norm + output head
        x = self.ln_f(x)
        logits = self.head(x)
        return logits


In [None]:
import torch

def generate(model, tokenizer, prompt, max_new_tokens, temperature=1.0, top_k=None):
    """
    Generate text from a GPT-like PyTorch model.

    Args:
        model: GPT model (torch.nn.Module)
        tokenizer: tokenizer with encode/decode methods
        prompt: initial text prompt
        max_new_tokens: number of tokens to generate
        temperature: randomness scaling (1.0 = normal, <1 = conservative, >1 = creative)
        top_k: if set, only sample from the top_k most likely tokens
    """
    with torch.no_grad():
        model.eval()  # ✅ set model to evaluation mode
        tokens = torch.tensor([tokenizer.encode(prompt)], dtype=torch.long)

        # move to same device as model
        device = next(model.parameters()).device
        tokens = tokens.to(device)

        for _ in range(max_new_tokens):
            with torch.no_grad():
                logits = model(tokens)  # (B, T, vocab)
                next_token_logits = logits[:, -1, :] / temperature  # only last token logits

                # Optionally apply top-k sampling (keeps top k most likely)
                if top_k is not None:
                    values, _ = torch.topk(next_token_logits, k=top_k)
                    min_values = values[:, -1].unsqueeze(1)
                    next_token_logits = torch.where(
                        next_token_logits < min_values,
                        torch.full_like(next_token_logits, float('-inf')),
                        next_token_logits,
                    )

                probs = torch.softmax(next_token_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)

                # Append new token
                tokens = torch.cat([tokens, next_token], dim=1)

        # Decode back to text
        output_text = tokenizer.decode(tokens[0].tolist())
        return output_text


In [None]:
from datasets import load_dataset

# Download and cache locally
#ds = load_dataset("wikitext", "wikitext-2-raw-v1")
ds = load_dataset("openwebtext")
print(ds)
# -> DatasetDict({train: ..., validation: ..., test: ...})


RuntimeError: Dataset scripts are no longer supported, but found openwebtext.py

In [None]:
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
max_len = 1024
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def encode(example):
    tokens = tokenizer.encode(example["text"], truncation=True, max_length=max_len)
    return {"input_ids": tokens}

# Tokenize all text
tokenized_ds = ds["train"].map(encode, remove_columns=["text"])


def collate_fn(batch):

    batch = [torch.tensor(x["input_ids"]) for x in batch if len(x["input_ids"]) > 1]
    if not batch:
        return None
    # pad to same length
    maxlen = max(len(x) for x in batch)
    inputs = torch.zeros(len(batch), maxlen, dtype=torch.long)
    targets = torch.zeros_like(inputs)

    for i, x in enumerate(batch):
        inputs[i, :len(x)-1] = x[:-1]
        targets[i, :len(x)-1] = x[1:]
    return inputs.to(device), targets.to(device)

loader = DataLoader(tokenized_ds, batch_size=8, shuffle=True, collate_fn=collate_fn)



tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

In [None]:
from transformers import GPT2Tokenizer,get_cosine_schedule_with_warmup
import torch
import time
import gc

def encode(text,tokenizer):
    tokens = tokenizer.encode(text,add_special_tokens=False)
    return torch.tensor(tokens)

def train_model(model,loader):

    num_epochs = 4

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
    criterion = torch.nn.CrossEntropyLoss()
    scaler = torch.cuda.amp.GradScaler()  # ✅ for mixed precision



    # -----------------------------
    # Setup dynamic LR scheduler
    # -----------------------------
    total_steps = len(loader) * num_epochs
    warmup_steps = int(0.1 * total_steps)  # 10% warmup
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )



    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        temp_loss =0
        for i,batch in enumerate(loader):
            if batch is None:
                continue
            inputs, targets = batch


            optimizer.zero_grad(set_to_none=True)  # ✅ reduces memory fragmentation
            with torch.amp.autocast(device_type = "cuda",dtype=torch.float16):
                logits = model(inputs)
                loss = criterion(
                          logits.view(-1, logits.size(-1)),
                          targets.view(-1)
                      )
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()  # ✅ dynamically adjust LR

            total_loss += loss.item()

            if i % 20 == 0:

                temp_loss = 0
                current_lr = scheduler.get_last_lr()[0]
                print(f"Epoch {epoch+1} | Step {i}/{len(loader)} | Loss: {loss.item():.4f} | LR: {current_lr:.6f}")

           # if i % 140 == 0:
           #     time.sleep(2)
            if i % 100 == 0:
              del logits, loss, inputs, targets
              torch.cuda.empty_cache()
              gc.collect()

        print(f"Epoch {epoch+1} | Loss: {total_loss:.4f}")




#model_torch = GPTModel_Torch(
#        vocab_size=50257,
#        max_len=1024,
#        embed_dim=768,
#        num_heads=12,
#        ff_dim=3072,
#        num_layers=12
#).to(device)


train_model(model_torch,loader)
torch.save(model_torch, "model_full.pth")


  scaler = torch.cuda.amp.GradScaler()  # ✅ for mixed precision


Epoch 1 | Step 0/4590 | Loss: 11.3397 | LR: 0.000000
Epoch 1 | Step 20/4590 | Loss: 10.2586 | LR: 0.000001
Epoch 1 | Step 40/4590 | Loss: 6.9268 | LR: 0.000002
Epoch 1 | Step 60/4590 | Loss: 4.1492 | LR: 0.000003
Epoch 1 | Step 80/4590 | Loss: 5.1609 | LR: 0.000004
Epoch 1 | Step 100/4590 | Loss: 4.1004 | LR: 0.000006
Epoch 1 | Step 120/4590 | Loss: 3.4439 | LR: 0.000007
Epoch 1 | Step 140/4590 | Loss: 5.0378 | LR: 0.000008
Epoch 1 | Step 160/4590 | Loss: 6.0255 | LR: 0.000009
Epoch 1 | Step 180/4590 | Loss: 5.1341 | LR: 0.000010
Epoch 1 | Step 200/4590 | Loss: 5.6962 | LR: 0.000011
Epoch 1 | Step 220/4590 | Loss: 2.8795 | LR: 0.000012
Epoch 1 | Step 240/4590 | Loss: 4.9617 | LR: 0.000013
Epoch 1 | Step 260/4590 | Loss: 4.6712 | LR: 0.000014
Epoch 1 | Step 280/4590 | Loss: 3.9977 | LR: 0.000015
Epoch 1 | Step 300/4590 | Loss: 3.2619 | LR: 0.000016
Epoch 1 | Step 320/4590 | Loss: 5.0231 | LR: 0.000017
Epoch 1 | Step 340/4590 | Loss: 5.0499 | LR: 0.000019
Epoch 1 | Step 360/4590 | Loss: 

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.03 GiB. GPU 0 has a total capacity of 14.74 GiB of which 568.12 MiB is free. Process 2603 has 14.18 GiB memory in use. Of the allocated memory 13.60 GiB is allocated by PyTorch, and 458.97 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
torch.save(model_torch.state_dict(), save_path + "model_full2.pth")

In [None]:
save_path = "/content/drive/MyDrive/gpt_model_training/"
import os
os.makedirs(save_path, exist_ok=True)
model_torch = GPTModel_Torch(
        vocab_size=50257,
        max_len=1024,
        embed_dim=768,
        num_heads=12,
        ff_dim=3072,
        num_layers=12
).to(device)

#state_dict = torch.load("/content/drive/MyDrive/gpt_model_training/model_full2.pth", map_location=device)
#model_torch.load_state_dict(state_dict)
#torch.save(model_torch.state_dict(), save_path + "model_full2.pth")


In [None]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
#torch.save(model_torch, "model_full_v2.pth")



# Run generation
prompt = "The war during 1940-1941 was"
generated_text = generate(model_torch, tokenizer, prompt, max_new_tokens=100, temperature=0.9, top_k=50)

print(generated_text)


The war during 1940-1941 was the year to a single @-@ year study for the American Association . 
 " was considered a new program , and a new character of an unopposed being the new building that was written with the new version of the original industry . Most of the International and Red Cross made some of the country and was developed . 
 . The series was the first aired on April 4 , 2009 , with the Ustašeir . The release of the film was released by the University and was


In [None]:
# coding=utf-8
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The Open WebText Corpus"""

import re

import datasets


_CITATION = """\
@misc{Gokaslan2019OpenWeb,
  title={OpenWebText Corpus},
  author={Aaron Gokaslan*, Vanya Cohen*, Ellie Pavlick, Stefanie Tellex},
  howpublished{\\url{http://Skylion007.github.io/OpenWebTextCorpus}},
  year={2019}
}
"""

_DESCRIPTION = """\
An open-source replication of the WebText dataset from OpenAI.
"""

_N_DATA_FILES = 21
_DATA_FILES = ["subsets/urlsf_subset{:02d}.tar".format(i) for i in range(_N_DATA_FILES)]


class Openwebtext(datasets.GeneratorBasedBuilder):
    """The Open WebText dataset."""

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="plain_text",
            description="Plain text",
            version=datasets.Version("1.0.0"),
        )
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({"text": datasets.Value("string")}),
            homepage="https://skylion007.github.io/OpenWebTextCorpus/",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        archives = dl_manager.download(_DATA_FILES)
        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={
                "archive_iterators": [
                    dl_manager.iter_archive(archive) for archive in archives
                ],
                "iter_archive": dl_manager.iter_archive
            }),
        ]

    def _generate_examples(self, archive_iterators, iter_archive):
        """Yields examples."""
        for archive_iterator in archive_iterators:
            for xz_filepath, xz_f in archive_iterator:
                if not xz_filepath.endswith(".xz"):
                    continue
                for txt_filepath, txt_f in iter_archive(xz_f):
                    if not txt_filepath.endswith(".txt"):
                        continue
                    idx = f"{xz_filepath}/{txt_filepath}"
                    yield idx, {"text": re.sub("\n\n\n+", "\n\n", txt_f.read().decode("utf-8")).strip()}