In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

import math
from collections import Counter
from tqdm import tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

#make our work comparable if restarted the kernel
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# torch.cuda.get_device_name(0)

cpu


# 1. Data Acquisition (Task 1)

## Load Data

In [36]:
nltk.download("gutenberg", download_dir="data/nltk_data")
import nltk
nltk.data.path.append("data/nltk_data")

[nltk_data] Downloading package gutenberg to data/nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.


In [37]:
from nltk.corpus import gutenberg

text = gutenberg.raw("austen-emma.txt")
print(len(text), text[:500])

887071 [Emma by Jane Austen 1816]

VOLUME I

CHAPTER I


Emma Woodhouse, handsome, clever, and rich, with a comfortable home
and happy disposition, seemed to unite some of the best blessings
of existence; and had lived nearly twenty-one years in the world
with very little to distress or vex her.

She was the youngest of the two daughters of a most affectionate,
indulgent father; and had, in consequence of her sister's marriage,
been mistress of his house from a very early period.  Her mother
had died t


# 2. Preprocessing

## Split data

In [10]:
import re

# Split into paragraphs
paras = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
print("paragraphs:", len(paras))
print(paras[10][:200])

# Split 90/5/5
# n = len(paras)
# train_paras = paras[: int(0.90*n)]
# valid_paras = paras[int(0.90*n) : int(0.95*n)]
# test_paras  = paras[int(0.95*n) :]

# Split 80/10/10
n = len(paras)
train_paras = paras[: int(0.80 * n)]
valid_paras = paras[int(0.80 * n) : int(0.90 * n)]
test_paras  = paras[int(0.90 * n) :]

train_dataset = [{"text": p} for p in train_paras]
valid_dataset = [{"text": p} for p in valid_paras]
test_dataset  = [{"text": p} for p in test_paras]

len(train_dataset), len(valid_dataset), len(test_dataset)


paragraphs: 2371
The evil of the actual disparity in their ages (and Mr. Woodhouse had
not married early) was much increased by his constitution and habits;
for having been a valetudinarian all his life, without activ


(1896, 237, 238)

## Tokenizing

In [12]:
token_re = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)?|[0-9]+|[^\sA-Za-z0-9]")

def tokenize(text: str):
    return token_re.findall(text.lower())


## Nummericalizing

In [13]:
special = ["<unk>", "<pad>", "<eos>"]

def build_vocab(paragraphs, min_freq=3):
    counter = Counter()
    for p in paragraphs:
        counter.update(tokenize(p))

    itos = list(special)
    for tok, freq in counter.most_common():
        if freq >= min_freq and tok not in special:
            itos.append(tok)

    stoi = {tok:i for i,tok in enumerate(itos)}
    return stoi, itos

stoi, itos = build_vocab(train_paras, min_freq=3)
vocab_size = len(itos)

print("vocab_size:", vocab_size)
print("itos[:20]:", itos[:20])


vocab_size: 2939
itos[:20]: ['<unk>', '<pad>', '<eos>', ',', '.', '-', 'the', 'to', 'and', 'of', '"', 'i', 'a', 'it', ';', 'was', 'her', 'she', 'not', 'in']


In [14]:
UNK = stoi["<unk>"]
EOS = stoi["<eos>"]

def encode_paragraphs(paragraphs):
    ids = []
    for p in paragraphs:
        toks = tokenize(p) + ["<eos>"]
        ids.extend([stoi.get(t, UNK) for t in toks])
    return torch.tensor(ids, dtype=torch.long)

train_ids = encode_paragraphs(train_paras)
valid_ids = encode_paragraphs(valid_paras)
test_ids  = encode_paragraphs(test_paras)

train_ids.shape, valid_ids.shape, test_ids.shape


(torch.Size([155786]), torch.Size([23013]), torch.Size([21859]))

# 3. Prepare the batch loader

## Prepare ddata

In [16]:
def get_data(data, batch_size):
    n_batches = data.size(0) // batch_size
    data = data[: n_batches * batch_size]
    return data.view(batch_size, n_batches)

batch_size = 64
train_data = get_data(train_ids, batch_size)
valid_data = get_data(valid_ids, batch_size)
test_data  = get_data(test_ids,  batch_size)

train_data.shape, valid_data.shape, test_data.shape


(torch.Size([64, 2434]), torch.Size([64, 359]), torch.Size([64, 341]))

In [17]:
def get_batch(data, seq_len, idx):
    x = data[:, idx:idx+seq_len]             # [B, S]
    y = data[:, idx+1:idx+seq_len+1]         # [B, S]
    return x, y


# 4. Modeling

In [18]:
class LSTMLM(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout):
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim = hid_dim

        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(
            emb_dim, hid_dim,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
            batch_first=True
        )
        self.drop = nn.Dropout(dropout)
        self.fc = nn.Linear(hid_dim, vocab_size)

    def init_hidden(self, batch_size, device):
        h = torch.zeros(self.num_layers, batch_size, self.hid_dim, device=device)
        c = torch.zeros(self.num_layers, batch_size, self.hid_dim, device=device)
        return (h, c)

    def detach_hidden(self, hidden):
        return (hidden[0].detach(), hidden[1].detach())

    def forward(self, x, hidden):
        # x: [B, S]
        x = self.drop(self.emb(x))               # [B, S, E]
        out, hidden = self.lstm(x, hidden)       # [B, S, H]
        out = self.drop(out)
        logits = self.fc(out)                    # [B, S, V]
        return logits, hidden


In [19]:
def train_epoch(model, data, optimizer, criterion, seq_len, clip):
    model.train()
    total_loss = 0.0
    T = data.size(1)

    hidden = model.init_hidden(batch_size=data.size(0), device=data.device)

    for idx in tqdm(range(0, T - seq_len - 1, seq_len), desc="train", leave=False):
        optimizer.zero_grad()
        hidden = model.detach_hidden(hidden)

        x, y = get_batch(data, seq_len, idx)
        logits, hidden = model(x, hidden)

        # flatten
        B, S, V = logits.shape
        loss = criterion(logits.reshape(B*S, V), y.reshape(B*S))

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        total_loss += loss.item()

    return total_loss / max(1, (T // seq_len))


@torch.no_grad()
def eval_epoch(model, data, criterion, seq_len):
    model.eval()
    total_loss = 0.0
    T = data.size(1)

    hidden = model.init_hidden(batch_size=data.size(0), device=data.device)

    for idx in range(0, T - seq_len - 1, seq_len):
        hidden = model.detach_hidden(hidden)
        x, y = get_batch(data, seq_len, idx)
        logits, hidden = model(x, hidden)

        B, S, V = logits.shape
        loss = criterion(logits.reshape(B*S, V), y.reshape(B*S))
        total_loss += loss.item()

    return total_loss / max(1, (T // seq_len))


In [24]:
emb_dim = 1024
hid_dim = 1024
num_layers = 2
dropout_rate = 0.65
lr = 1e-3

model = LSTMLM(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

seq_len = 35
clip = 0.25
epochs = 20

best_valid = float("inf")

for ep in range(1, epochs+1):
    tr_loss = train_epoch(model, train_data, optimizer, criterion, seq_len, clip)
    va_loss = eval_epoch(model, valid_data, criterion, seq_len)

    if va_loss < best_valid:
        best_valid = va_loss
        torch.save(model.state_dict(), "model/language_model.pt")

    print(f"Epoch {ep:02d} | Train Perplexity {math.exp(tr_loss):.3f} | Valid Perplexity {math.exp(va_loss):.3f}")


                                                      

Epoch 01 | Train Perplexity 294.472 | Valid Perplexity 135.579


                                                      

Epoch 02 | Train Perplexity 134.846 | Valid Perplexity 92.210


                                                      

Epoch 03 | Train Perplexity 101.764 | Valid Perplexity 77.885


                                                      

Epoch 04 | Train Perplexity 87.037 | Valid Perplexity 71.280


                                                      

Epoch 05 | Train Perplexity 78.249 | Valid Perplexity 67.169


                                                      

Epoch 06 | Train Perplexity 72.003 | Valid Perplexity 64.031


                                                      

Epoch 07 | Train Perplexity 67.152 | Valid Perplexity 61.946


                                                      

Epoch 08 | Train Perplexity 63.368 | Valid Perplexity 60.367


                                                      

Epoch 09 | Train Perplexity 59.929 | Valid Perplexity 59.225


                                                      

Epoch 10 | Train Perplexity 57.252 | Valid Perplexity 58.057


                                                      

Epoch 11 | Train Perplexity 54.626 | Valid Perplexity 57.063


                                                      

Epoch 12 | Train Perplexity 52.387 | Valid Perplexity 56.989


                                                      

Epoch 13 | Train Perplexity 50.243 | Valid Perplexity 56.459


                                                      

Epoch 14 | Train Perplexity 48.452 | Valid Perplexity 55.920


                                                      

Epoch 15 | Train Perplexity 46.668 | Valid Perplexity 55.708


                                                      

Epoch 16 | Train Perplexity 45.144 | Valid Perplexity 55.422


                                                      

Epoch 17 | Train Perplexity 43.471 | Valid Perplexity 55.325


                                                      

Epoch 18 | Train Perplexity 42.054 | Valid Perplexity 55.374


                                                      

Epoch 19 | Train Perplexity 40.762 | Valid Perplexity 55.338


                                                      

Epoch 20 | Train Perplexity 39.506 | Valid Perplexity 55.246


In [38]:
import os

os.makedirs("model", exist_ok=True)

with open("model/vocab_itos.json", "w", encoding="utf-8") as f:
    json.dump(itos, f, ensure_ascii=False)

print("Saved model and vocabulary.")


Saved model and vocabulary.


In [None]:
import json
config = {
    "emb_dim": emb_dim,
    "hid_dim": hid_dim,
    "num_layers": num_layers
}

with open("model/config.json", "w") as f:
    json.dump(config, f)


In [26]:
model.load_state_dict(torch.load("model/language_model.pt", map_location=device))
te_loss = eval_epoch(model, test_data, criterion, seq_len)
print("Test Perplexity:", math.exp(te_loss))


Test Perplexity: 55.899205435391465


In [29]:
import torch.nn.functional as F
UNK = stoi["<unk>"]
EOS = stoi["<eos>"]

@torch.no_grad()
def generate_text(
    model,
    prompt,
    max_new_tokens=60,
    temperature=0.8,
    seed=None
):
    if seed is not None:
        torch.manual_seed(seed)

    model.eval()

    tokens = tokenize(prompt.lower())
    if not tokens:
        tokens = ["the"]

    ids = [stoi.get(t, UNK) for t in tokens]

    hidden = model.init_hidden(batch_size=1, device=device)

    # warm up model with the prompt
    x = torch.tensor([ids], dtype=torch.long, device=device)
    _, hidden = model(x, hidden)

    current_id = ids[-1]

    for _ in range(max_new_tokens):
        x = torch.tensor([[current_id]], dtype=torch.long, device=device)
        logits, hidden = model(x, hidden)

        logits = logits[:, -1, :] / temperature
        probs = F.softmax(logits, dim=-1)

        next_id = torch.multinomial(probs, 1).item()

        if next_id == EOS:
            break

        ids.append(next_id)
        current_id = next_id

    return " ".join(itos[i] for i in ids)


In [35]:
prompt = "emma was "
max_new_tokens = 30
seed = 0

temperatures = [0.5, 0.7, 0.9, 1.0]

for temperature in temperatures:
    text = generate_text(
        model,
        prompt=prompt,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        seed=seed
    )

    print(f"Temperature = {temperature}")
    print(text)
    print()


Temperature = 0.5
emma was pleased by the young lady .

Temperature = 0.7
emma was pleased into the mouth ; and when the <unk> of the churchills , as one of the happiness , was not often <unk> ; mr . knightley was telling her

Temperature = 0.9
emma was pleased into the mouth ; and when the <unk> of the parish , as one of the happiness , was not often <unk> ; mr . knightley must be safely

Temperature = 1.0
emma was pleased into the mouth ; and when the <unk> of the parish , as one of the happiness , was correct by no means mr . knightley , which seemed



Conclusion: <br>
The LSTMs is train using a word level 