In [1]:
import os
import re
import time
import torch

  cpu = _conversion_method_template(device=torch.device("cpu"))


Task 1. Dataset Acquisition

In [None]:
from pathlib import Path

# Folder that is searched for the raw book files (*.txt).
RAW_DIR = Path("data_raw") / "harry_potter_books"
assert RAW_DIR.exists(), f"Folder not found: {RAW_DIR}"

# Sorting by path keeps the book order stable between runs.
txt_files = sorted(RAW_DIR.glob("*.txt"))
print("Found files:", len(txt_files))
for p in txt_files:
    print(f"- {p.name}")


Found files: 3
- 01 Harry Potter and the Sorcerers Stone.txt
- 02 Harry Potter and the Chamber of Secrets.txt
- 03 Harry Potter and the Prisoner of Azkaban.txt


In [3]:
import re

def clean_book_text(text: str) -> str:
    """Normalize line endings and whitespace in one book's raw text.

    Steps, in order:
      1. Convert CRLF and bare CR line endings to LF.
      2. Strip trailing whitespace from every line.
      3. Collapse runs of three or more newlines into one blank line.
      4. Strip leading and trailing whitespace from the whole text.

    Trailing whitespace is removed *before* blank lines are collapsed so that
    lines containing only spaces or tabs are treated as blank lines too.

    Args:
        text: Raw contents of a book file.

    Returns:
        The cleaned text.
    """
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # remove trailing spaces on lines (must happen before collapsing blank lines)
    text = "\n".join(line.rstrip() for line in text.split("\n"))
    # collapse excessive blank lines
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

# Read each book, clean it, and keep the results in file order.
# Undecodable bytes are dropped by errors="ignore".
books = [
    clean_book_text(fp.read_text(encoding="utf-8", errors="ignore"))
    for fp in txt_files
]

print("Loaded books:", len(books))
print("Example preview:\n", books[0][:500])


Loaded books: 3
Example preview:
 M r. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense.

Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amoun


In [4]:
# Output folder for the merged corpus.
DATA_DIR = Path("data")
DATA_DIR.mkdir(exist_ok=True)

# Join the cleaned books with one blank line between them and save the result.
CORPUS_PATH = DATA_DIR / "hp_kaggle_corpus.txt"
corpus_text = "\n\n".join(books)
CORPUS_PATH.write_text(corpus_text, encoding="utf-8")

print(f"Saved: {CORPUS_PATH}")
print(f"Chars: {len(corpus_text)}")
print(f"Tokens (whitespace): {len(corpus_text.split())}")


Saved: data\hp_kaggle_corpus.txt
Chars: 1556494
Tokens (whitespace): 274252


Task 2. Model Training

Tokenize

In [5]:
def simple_tokenize(text: str):
    """Split text into word and punctuation tokens.

    Each of the characters . , ! ? ; : ( ) " ' is surrounded by spaces so it
    becomes its own token; every run of whitespace is then collapsed to a single
    space and the text is split on spaces. Characters outside that set (for
    example typographic quotes) are not separated from neighbouring letters.

    Args:
        text: The text to tokenize.

    Returns:
        A list of token strings; an empty list when ``text`` is empty or
        contains only whitespace.
    """
    text = re.sub(r"([.,!?;:()\"'])", r" \1 ", text)
    text = re.sub(r"\s+", " ", text).strip()
    # "".split(" ") would yield [""], i.e. a bogus empty-string token.
    return text.split(" ") if text else []

# Tokenize the corpus exactly as it was written to disk.
text = CORPUS_PATH.read_text(encoding="utf-8", errors="ignore")
tokens = simple_tokenize(text)

print(f"Num tokens: {len(tokens)}")
print(f"First 40 tokens: {tokens[:40]}")


Num tokens: 326052
First 40 tokens: ['M', 'r', '.', 'and', 'Mrs', '.', 'Dursley', ',', 'of', 'number', 'four', ',', 'Privet', 'Drive', ',', 'were', 'proud', 'to', 'say', 'that', 'they', 'were', 'perfectly', 'normal', ',', 'thank', 'you', 'very', 'much', '.', 'They', 'were', 'the', 'last', 'people', 'you’d', 'expect', 'to', 'be', 'involved']


Build vocab

In [6]:
from collections import Counter

# Special tokens; they always occupy the first two vocabulary slots.
PAD = "<pad>"
UNK = "<unk>"

def build_vocab(tokens, min_freq=2):
    """Build a frequency-ordered vocabulary from a token list.

    Args:
        tokens: Iterable of token strings.
        min_freq: Minimum number of occurrences a token needs to be kept.

    Returns:
        A tuple ``(stoi, itos, counts)``: ``itos`` is the list of vocabulary
        tokens (PAD, UNK, then the kept tokens from most to least frequent),
        ``stoi`` maps each token to its index in ``itos``, and ``counts`` is
        the ``Counter`` over all input tokens.
    """
    counts = Counter(tokens)
    specials = (PAD, UNK)
    # Skip the special tokens if they occur in the data so they are not listed twice.
    frequent = [
        tok
        for tok, freq in counts.most_common()
        if freq >= min_freq and tok not in specials
    ]
    itos = [*specials, *frequent]
    stoi = dict(zip(itos, range(len(itos))))
    return stoi, itos, counts

# Tokens seen fewer than min_freq times are left out of the vocabulary.
min_freq = 2
stoi, itos, counts = build_vocab(tokens, min_freq)
pad_idx, unk_idx = stoi[PAD], stoi[UNK]
vocab_size = len(itos)

print(f"Vocab size: {vocab_size}")
print(f"Most common: {counts.most_common(10)}")


Vocab size: 8705
Most common: [(',', 20667), ('.', 17591), ('the', 11793), ('to', 6467), ('and', 6304), ('a', 5302), ('of', 4923), ('”', 4571), ('Harry', 4467), ('was', 4109)]


Numericalize

In [7]:
def numericalize(tokens, stoi, unk_idx):
    """Map tokens to vocabulary ids.

    Args:
        tokens: Iterable of token strings.
        stoi: Mapping from token string to integer id.
        unk_idx: Id used for any token missing from ``stoi``.

    Returns:
        A list with one id per input token, in the same order.
    """
    ids = []
    for tok in tokens:
        ids.append(stoi.get(tok, unk_idx))
    return ids

# Encode the whole token stream as vocabulary ids.
ids = numericalize(tokens, stoi, unk_idx)
print(f"First 30 ids: {ids[:30]}")



First 30 ids: [2540, 1, 3, 6, 222, 3, 599, 2, 8, 904, 625, 2, 1055, 1056, 2, 39, 2115, 5, 194, 23, 35, 39, 1556, 1319, 2, 2022, 19, 73, 196, 3]


Sequences

In [8]:
import torch

seq_len = 40  # number of tokens in each training sequence

def make_sequences(ids, seq_len):
    """Cut a stream of token ids into fixed-length input/target pairs.

    The stream is split into consecutive, non-overlapping rows of ``seq_len``
    ids. The target row is the input row shifted one position to the right, so
    ``y[r, t]`` is the id that follows ``x[r, t]`` in the stream. Trailing ids
    that do not fill a complete row are dropped.

    Args:
        ids: Sequence of integer token ids.
        seq_len: Number of tokens per row; must be at least 1.

    Returns:
        A tuple ``(x, y)`` of ``torch.long`` tensors, each of shape
        ``(num_rows, seq_len)``. When ``ids`` has fewer than ``seq_len + 1``
        elements, both tensors have shape ``(0, seq_len)``.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError(f"seq_len must be a positive integer, got {seq_len}")

    # One id is held back for the last shifted target, hence len(ids) - 1.
    # Clamping at zero keeps empty input from producing a negative row count.
    num_rows = max(len(ids) - 1, 0) // seq_len
    n = num_rows * seq_len

    # An explicit row count (rather than -1) lets view() work for zero rows.
    x = torch.tensor(ids[:n], dtype=torch.long).view(num_rows, seq_len)
    y = torch.tensor(ids[1:n + 1], dtype=torch.long).view(num_rows, seq_len)
    return x, y

# Inputs X and next-token targets Y, one row per sequence.
X, Y = make_sequences(ids, seq_len)
print(f"X: {X.shape} Y: {Y.shape}")


X: torch.Size([8151, 40]) Y: torch.Size([8151, 40])


DataLoaders

In [9]:
from torch.utils.data import TensorDataset, DataLoader

def make_loaders(X, Y, batch_size=64, train_ratio=0.9, seed=None):
    """Randomly split (X, Y) rows into training and validation DataLoaders.

    Rows are shuffled with one shared permutation so inputs and targets stay
    aligned. The first ``int(n * train_ratio)`` shuffled rows form the training
    set and the rest form the validation set.

    Args:
        X: Input tensor; the first dimension indexes examples.
        Y: Target tensor with the same number of rows as ``X``.
        batch_size: Batch size for both loaders.
        train_ratio: Fraction of rows assigned to the training set.
        seed: Optional integer. When given, the split and the training
            loader's shuffling draw from a dedicated generator seeded with
            this value, which makes them reproducible. When ``None``, the
            global PyTorch random state is used.

    Returns:
        A tuple ``(train_loader, val_loader)``. The training loader reshuffles
        each epoch; the validation loader keeps a fixed order.

    Raises:
        ValueError: If ``X`` and ``Y`` have different numbers of rows.
    """
    n = X.size(0)
    if Y.size(0) != n:
        raise ValueError(
            f"X and Y must have the same number of rows, got {n} and {Y.size(0)}"
        )

    generator = None if seed is None else torch.Generator().manual_seed(seed)

    perm = torch.randperm(n, generator=generator)
    X, Y = X[perm], Y[perm]

    n_train = int(n * train_ratio)
    train_ds = TensorDataset(X[:n_train], Y[:n_train])
    val_ds = TensorDataset(X[n_train:], Y[n_train:])

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, generator=generator)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader

# Split the sequences into training and validation loaders.
train_loader, val_loader = make_loaders(X, Y, batch_size=64)
print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")



Train batches: 115
Val batches: 13


In [None]:
import math
from torch import nn
from torch.nn import functional as F

# Use the GPU when PyTorch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cpu')

Hyperparameters

In [None]:
# --- Model size ---
embed_dim = 128   # width of each token embedding
hidden_dim = 256  # width of the LSTM hidden state
num_layers = 1    # number of stacked LSTM layers
dropout = 0.4     # passed to the LSTM only when num_layers > 1

# --- Optimization ---
learning_rate = 1e-3  # learning rate given to the Adam optimizer
num_epochs = 45       # passes over the training set
grad_clip = 1.0       # maximum gradient norm used for clipping

Define Model

In [None]:
class LanguageModelLSTM(nn.Module):
    """Next-token language model: embedding -> LSTM -> linear layer over the vocabulary."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, dropout, pad_idx):
        """Create the embedding, LSTM and output layers.

        Args:
            vocab_size: Number of tokens in the vocabulary.
            embed_dim: Size of each token embedding.
            hidden_dim: Size of the LSTM hidden state.
            num_layers: Number of stacked LSTM layers.
            dropout: Dropout value handed to the LSTM when it has more than
                one layer; a single-layer LSTM is built with 0.0.
            pad_idx: Vocabulary index of the padding token.
        """
        super().__init__()
        lstm_dropout = 0.0 if num_layers <= 1 else dropout

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout,
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Compute next-token logits for every position of ``x``.

        Args:
            x: [B, T] integer token ids.
            hidden: Optional LSTM state ``(h, c)`` to continue from.

        Returns:
            logits: [B, T, V] scores over the vocabulary.
            hidden: the LSTM state ``(h_n, c_n)`` after the last position.
        """
        embedded = self.embedding(x)
        lstm_out, next_hidden = self.lstm(embedded, hidden)
        return self.fc(lstm_out), next_hidden


Initialize model, loss, optimizer

In [13]:
# Build the network from the hyperparameters and move it to the selected device.
model = LanguageModelLSTM(vocab_size, embed_dim, hidden_dim, num_layers, dropout, pad_idx)
model = model.to(device)

# Cross-entropy loss, optimized with Adam.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

model

LanguageModelLSTM(
  (embedding): Embedding(8705, 128, padding_idx=0)
  (lstm): LSTM(128, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=8705, bias=True)
)

In [None]:
def run_epoch(model, data_loader, train=True):
    """Run one full pass over ``data_loader`` and report the average loss.

    Uses the notebook-level ``device``, ``criterion``, ``optimizer`` and
    ``grad_clip``.

    Args:
        model: Model called as ``model(xb)``, returning ``(logits, hidden)``.
        data_loader: Iterable of ``(xb, yb)`` batches of token ids.
        train: When true, the model is put in training mode and its weights are
            updated after every batch (with gradient-norm clipping). When false,
            the model is put in evaluation mode and no gradients are computed.

    Returns:
        A tuple ``(avg_loss, ppl)``: the token-weighted mean loss over the pass
        and its exponential (perplexity). Perplexity is reported as ``inf``
        when the average loss is 20 or more.

    Raises:
        ValueError: If ``data_loader`` yields no tokens.
    """
    if train:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    total_tokens = 0

    # Autograd is only needed for weight updates; switching it off during
    # evaluation avoids building a graph that is never used.
    with torch.set_grad_enabled(bool(train)):
        for xb, yb in data_loader:
            xb = xb.to(device)
            yb = yb.to(device)

            logits, _ = model(xb)

            # Flatten [B, T, V] logits and [B, T] targets to one row per token.
            loss = criterion(
                logits.view(-1, logits.size(-1)),
                yb.view(-1)
            )

            if train:
                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
                optimizer.step()

            # Weight each batch's loss by its token count so a smaller last
            # batch does not skew the average.
            n_tokens = xb.numel()
            total_loss += loss.item() * n_tokens
            total_tokens += n_tokens

    if total_tokens == 0:
        raise ValueError("data_loader yielded no tokens; cannot compute an average loss")

    avg_loss = total_loss / total_tokens
    # Guard against overflow-sized perplexities for very large losses.
    ppl = math.exp(avg_loss) if avg_loss < 20 else float("inf")
    return avg_loss, ppl


Training Loop

In [None]:
os.makedirs("artifacts", exist_ok=True)

# Best validation result seen so far.
best_val_loss, best_epoch = float("inf"), 0

for epoch in range(1, num_epochs + 1):
    train_loss, train_ppl = run_epoch(model, train_loader, train=True)
    val_loss, val_ppl = run_epoch(model, val_loader, train=False)

    report = " | ".join([
        f"Epoch {epoch:02d}",
        f"Train Loss: {train_loss:.4f}, PPL: {train_ppl:.2f}",
        f"Val Loss: {val_loss:.4f}, PPL: {val_ppl:.2f}",
    ])
    print(report)

    # Only epochs that improve on the best validation loss are checkpointed.
    if not val_loss < best_val_loss:
        continue

    best_val_loss, best_epoch = val_loss, epoch

    # Store the weights together with the vocabulary and hyperparameters.
    checkpoint = {
        "model_state_dict": model.state_dict(),
        "stoi": stoi,
        "itos": itos,
        "pad_idx": pad_idx,
        "unk_idx": unk_idx,
        "vocab_size": vocab_size,
        "embed_dim": embed_dim,
        "hidden_dim": hidden_dim,
        "num_layers": num_layers,
        "dropout": dropout,
        "seq_len": seq_len,
    }
    torch.save(checkpoint, "artifacts/lm_lstm_best.pt")

print(f"Best epoch: {best_epoch}, best val loss: {best_val_loss:.4f}")


Epoch 01 | Train Loss: 6.7126, PPL: 822.67 | Val Loss: 6.1423, PPL: 465.14
Epoch 02 | Train Loss: 5.8888, PPL: 360.96 | Val Loss: 5.7221, PPL: 305.56
Epoch 03 | Train Loss: 5.5252, PPL: 250.93 | Val Loss: 5.4634, PPL: 235.90
Epoch 04 | Train Loss: 5.2760, PPL: 195.59 | Val Loss: 5.2901, PPL: 198.36
Epoch 05 | Train Loss: 5.0969, PPL: 163.52 | Val Loss: 5.1707, PPL: 176.04
Epoch 06 | Train Loss: 4.9580, PPL: 142.31 | Val Loss: 5.0846, PPL: 161.51
Epoch 07 | Train Loss: 4.8441, PPL: 126.99 | Val Loss: 5.0177, PPL: 151.07
Epoch 08 | Train Loss: 4.7455, PPL: 115.06 | Val Loss: 4.9619, PPL: 142.86
Epoch 09 | Train Loss: 4.6587, PPL: 105.50 | Val Loss: 4.9178, PPL: 136.71
Epoch 10 | Train Loss: 4.5812, PPL: 97.63 | Val Loss: 4.8824, PPL: 131.95
Epoch 11 | Train Loss: 4.5098, PPL: 90.90 | Val Loss: 4.8542, PPL: 128.28
Epoch 12 | Train Loss: 4.4442, PPL: 85.13 | Val Loss: 4.8261, PPL: 124.73
Epoch 13 | Train Loss: 4.3835, PPL: 80.12 | Val Loss: 4.8059, PPL: 122.23
Epoch 14 | Train Loss: 4.3256

Validation loss decreased steadily during the early training phase and reached its minimum at epoch 22. After this point, training loss continued to decrease while validation loss began to increase, indicating the onset of overfitting. Training still ran for all 45 epochs, but a checkpoint was saved whenever the validation loss improved, so the final model is the checkpoint with the lowest validation loss (the same effect as an early stopping criterion) rather than the model from the last training epoch.

Task 3. Text Generation - Web Application Development

In [None]:
import torch
from torch import nn
from torch.nn import functional as F
import re

# Use the GPU when one is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the saved checkpoint onto that device.
ckpt = torch.load("artifacts/lm_lstm_best.pt", map_location=device)

# Vocabulary
stoi, itos = ckpt["stoi"], ckpt["itos"]
pad_idx, unk_idx = ckpt["pad_idx"], ckpt["unk_idx"]
vocab_size = ckpt["vocab_size"]

# Hyperparameters stored in the checkpoint
embed_dim, hidden_dim = ckpt["embed_dim"], ckpt["hidden_dim"]
num_layers, dropout = ckpt["num_layers"], ckpt["dropout"]
seq_len = ckpt["seq_len"]

class LanguageModelLSTM(nn.Module):
    """Embedding -> LSTM -> linear language model, redefined here so the checkpoint can be loaded."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, dropout, pad_idx):
        """Create the embedding, LSTM and output layers.

        ``dropout`` is handed to the LSTM only when ``num_layers`` is greater
        than one; otherwise the LSTM is built with 0.0.
        """
        super().__init__()
        use_dropout = num_layers > 1

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if use_dropout else 0.0,
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Return ``(logits, hidden)`` for token ids ``x`` and an optional LSTM state."""
        lstm_out, new_hidden = self.lstm(self.embedding(x), hidden)
        return self.fc(lstm_out), new_hidden

# Rebuild the network with the stored hyperparameters, then load the saved weights.
model = LanguageModelLSTM(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    dropout=dropout,
    pad_idx=pad_idx,
)
model = model.to(device)
model.load_state_dict(ckpt["model_state_dict"])
model.eval()  # evaluation mode for generation

print("Loaded model ✅", "device:", device)


Loaded model ✅ device: cpu


In [23]:
def simple_tokenize(text: str):
    """Split text into word and punctuation tokens.

    The characters . , ! ? ; : ( ) " ' are padded with spaces so each becomes
    its own token, whitespace runs are collapsed, and the result is split on
    single spaces. Empty or whitespace-only input gives an empty list.
    """
    padded = re.sub(r"([.,!?;:()\"'])", r" \1 ", text)
    normalized = re.sub(r"\s+", " ", padded).strip()
    if not normalized:
        return []
    return normalized.split(" ")

In [None]:
def top_k_filter(logits: torch.Tensor, k: int) -> torch.Tensor:
    """Keep the ``k`` largest logits along the last dimension and mask the rest.

    Works for a single logit vector as well as for batched logits of shape
    ``[..., V]``; the top-k selection is done independently for each vector.

    Args:
        logits: Tensor of unnormalized scores; the last dimension is the
            vocabulary.
        k: Number of entries to keep. ``None``, a non-positive value, or a
            value at least as large as the last dimension disables filtering.

    Returns:
        ``logits`` itself when filtering is disabled; otherwise a new tensor of
        the same shape in which every entry outside the top ``k`` is ``-inf``.
    """
    if k is None or k <= 0 or logits.dim() == 0 or k >= logits.size(-1):
        return logits
    values, idx = torch.topk(logits, k, dim=-1)
    filtered = torch.full_like(logits, float("-inf"))
    # scatter_ writes each kept value back at its original position along the
    # last dimension, which is correct for any number of leading dimensions.
    filtered.scatter_(-1, idx, values)
    return filtered

@torch.no_grad()
def generate_text(prompt: str, max_new_tokens: int = 60, temperature: float = 1.0, top_k: int = 50) -> str:
    """Continue ``prompt`` by sampling tokens from the loaded language model.

    Uses the notebook-level ``model``, ``stoi``, ``itos``, ``unk_idx`` and
    ``device``.

    Args:
        prompt: Text to start from; tokenized with ``simple_tokenize``.
        max_new_tokens: Number of tokens to sample after the prompt.
        temperature: Divisor applied to the logits before sampling; values
            below 1e-6 are treated as 1e-6.
        top_k: Number of highest-scoring tokens to sample from (see
            ``top_k_filter``).

    Returns:
        The prompt tokens followed by the sampled tokens, joined by single
        spaces.
    """
    model.eval()

    # Encode the prompt; an empty prompt is replaced by a single unknown token
    # so the model always receives at least one input id.
    token_ids = [stoi.get(tok, unk_idx) for tok in simple_tokenize(prompt)]
    if not token_ids:
        token_ids = [unk_idx]

    # Run the whole prompt through the model once to obtain the LSTM state.
    context = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0)
    logits, hidden = model(context, None)

    for _ in range(max_new_tokens):
        # Scores for the position after the most recent token.
        scaled = logits[0, -1] / max(temperature, 1e-6)
        probs = F.softmax(top_k_filter(scaled, top_k), dim=-1)
        sampled_id = torch.multinomial(probs, num_samples=1).item()
        token_ids.append(sampled_id)

        # Feed only the new token; the carried-over state holds the history.
        step_input = torch.tensor([[sampled_id]], dtype=torch.long, device=device)
        logits, hidden = model(step_input, hidden)

    words = [itos[i] if i < len(itos) else "<unk>" for i in token_ids]
    return " ".join(words)


In [21]:
# Sample a 60-token continuation of a short prompt.
sample = generate_text("Harry Potter is", max_new_tokens=60, temperature=1.0, top_k=50)
print(sample)


Harry Potter is . We’re going to be getting into , while I’d find out of all this . They told him what the most is at Hogwarts School of wizards <unk> you coming . …” Dumbledore came to Harry , who was holding a Slytherin table before he’d gone over , and he had never heard a moment , the Snitch he
