Importing data from file

#importing data
import pandas as p

In [None]:
import re
# read file
text = open('linux_data', 'r', encoding='utf-8').read()
text = text.replace('\r\n', '\n').replace('\r', '\n')
text = text[:len(text)//10]  # use only last 90% of the text

In [4]:
import re

# Read the dataset
with open('linux_data.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Normalize line endings (convert \r\n or \r to \n)
text = re.sub(r'\r\n?|\r', '\n', text)

# Split by newline → each line is one statement
statements = text.split('\n')

# Clean each statement (remove leading/trailing spaces)
statements = [re.sub(r'^\s+|\s+$', '', stmt) for stmt in statements]

# Remove empty lines
statements = [stmt for stmt in statements if stmt]

# Example: print first few statements
for i, stmt in enumerate(statements[:10]):
    print(f"{i+1:02d}: {stmt}")

# Save processed dataset
with open('processed_dataset.txt', 'w', encoding='utf-8') as out:
    for stmt in statements:
        out.write(stmt + '\n')


01: /*
02: * linux/kernel/irq/autoprobe.c
03: *
04: * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
05: *
06: * This file contains the interrupt probing code and driver APIs.
07: */
08: #include <linux/irq.h>
09: #include <linux/module.h>
10: #include <linux/interrupt.h>


In [29]:
words
# paragraphs

['well',
 'prince',
 'so',
 'genoa',
 'and',
 'lucca',
 'are',
 'now',
 'just',
 'family',
 'estates',
 'of',
 'the',
 'buonapartes',
 '.',
 'but',
 'i',
 'warn',
 'you',
 'if',
 'you',
 'dont',
 'tell',
 'me',
 'that',
 'this',
 'means',
 'war',
 'if',
 'you',
 'still',
 'try',
 'to',
 'defend',
 'the',
 'infamies',
 'and',
 'horrors',
 'perpetrated',
 'by',
 'that',
 'antichrist',
 'i',
 'really',
 'believe',
 'he',
 'is',
 'antichrist',
 'i',
 'will',
 'have',
 'nothing',
 'more',
 'to',
 'do',
 'with',
 'you',
 'and',
 'you',
 'are',
 'no',
 'longer',
 'my',
 'friend',
 'no',
 'longer',
 'my',
 'faithful',
 'slave',
 'as',
 'you',
 'call',
 'yourself',
 'but',
 'how',
 'do',
 'you',
 'do',
 'i',
 'see',
 'i',
 'have',
 'frightened',
 'you',
 'sit',
 'down',
 'and',
 'tell',
 'me',
 'all',
 'the',
 'news',
 '.',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 '<PAD>',
 'it',
 'was',
 'in',
 'july',
 '1805',
 'and',
 'the',
 'speaker',
 'was',
 'the',
 'well',
 'known',
 'anna',
 'pavlovna',
 'schere

In [30]:
window_size = 4

X, Y = [], []
for i in range(len(words)):
    start_idx = max(0, i - window_size)
    context = words[start_idx:i]
    context = ['<PAD>'] * (window_size - len(context)) + context  # left padding
    target = words[i]
    
    X.append(context)
    Y.append(target)


for i in range(100):
    print(' '.join(X[i]), '--->', Y[i])


<PAD> <PAD> <PAD> <PAD> ---> well
<PAD> <PAD> <PAD> well ---> prince
<PAD> <PAD> well prince ---> so
<PAD> well prince so ---> genoa
well prince so genoa ---> and
prince so genoa and ---> lucca
so genoa and lucca ---> are
genoa and lucca are ---> now
and lucca are now ---> just
lucca are now just ---> family
are now just family ---> estates
now just family estates ---> of
just family estates of ---> the
family estates of the ---> buonapartes
estates of the buonapartes ---> .
of the buonapartes . ---> but
the buonapartes . but ---> i
buonapartes . but i ---> warn
. but i warn ---> you
but i warn you ---> if
i warn you if ---> you
warn you if you ---> dont
you if you dont ---> tell
if you dont tell ---> me
you dont tell me ---> that
dont tell me that ---> this
tell me that this ---> means
me that this means ---> war
that this means war ---> if
this means war if ---> you
means war if you ---> still
war if you still ---> try
if you still try ---> to
you still try to ---> defend
still try t

In [31]:
from collections import Counter

word_counts = Counter(words)
vocab = sorted(word_counts.keys()) 
vocab_size = len(vocab)

print("Vocabulary Size:", vocab_size)

Vocabulary Size: 5808


In [32]:
most_common_10 = word_counts.most_common(10)
least_common_10 = word_counts.most_common()[-10:]

print("10 Most Frequent Words:", most_common_10)
print("10 Least Frequent Words:", least_common_10)


10 Most Frequent Words: [('<PAD>', 5540), ('the', 3190), ('.', 2967), ('and', 2049), ('to', 1604), ('of', 1135), ('a', 1121), ('he', 890), ('in', 870), ('his', 870)]
10 Least Frequent Words: [('scratching', 1), ('witing', 1), ('quicker', 1), ('contents', 1), ('fwiend', 1), ('childwen', 1), ('falls', 1), ('pua', 1), ('cweation', 1), ('abashed', 1)]


In [33]:
import torch

if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using Apple Silicon GPU (MPS)")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using NVIDIA GPU (CUDA)")
else:
    device = torch.device("cpu")
    print("Using CPU")

Using Apple Silicon GPU (MPS)


In [34]:
stoi = {word: i for i, word in enumerate(vocab)}
itos = {i: word for word, i in stoi.items()}
emb_dim = 64
emb = torch.nn.Embedding(len(stoi), emb_dim)
print(stoi)
print(itos)

{'.': 0, '10': 1, '11': 2, '13th': 3, '1805': 4, '18th': 5, '2': 6, '217': 7, '3': 8, '4': 9, '7': 10, '<PAD>': 11, 'a': 12, 'abashed': 13, 'abbe': 14, 'abbes': 15, 'abc': 16, 'ability': 17, 'able': 18, 'abnegation': 19, 'abnormally': 20, 'about': 21, 'above': 22, 'abroad': 23, 'abrupt': 24, 'abruptly': 25, 'absent': 26, 'absolute': 27, 'absolutely': 28, 'abstemious': 29, 'abstract': 30, 'absurd': 31, 'abuses': 32, 'academy': 33, 'accent': 34, 'accept': 35, 'accident': 36, 'accompanied': 37, 'accompany': 38, 'accomplished': 39, 'accord': 40, 'accorded': 41, 'according': 42, 'accoucheur': 43, 'account': 44, 'accounts': 45, 'accustomed': 46, 'aches': 47, 'achilles': 48, 'acknowledged': 49, 'acknowledging': 50, 'acquaintance': 51, 'acquaintances': 52, 'acquainted': 53, 'acrid': 54, 'across': 55, 'act': 56, 'acted': 57, 'action': 58, 'actions': 59, 'active': 60, 'activity': 61, 'actor': 62, 'actress': 63, 'actresses': 64, 'acts': 65, 'acutely': 66, 'added': 67, 'addicted': 68, 'adding': 69

In [35]:
print(emb.weight.shape)  # should be (vocab_size, emb_dim)

torch.Size([5808, 64])


In [36]:
X_encoded = [[stoi[word] for word in seq] for seq in X]
Y_encoded = [stoi[word] for word in Y] 

In [37]:
print(X_encoded[:5])

[[11, 11, 11, 11], [11, 11, 11, 5629], [11, 11, 5629, 3896], [11, 5629, 3896, 4714], [5629, 3896, 4714, 2157]]


In [38]:
stoi['<PAD>']

11

In [39]:
X = torch.tensor(X_encoded, dtype=torch.long).to(device)
Y = torch.tensor(Y_encoded, dtype=torch.long).to(device)

In [40]:
emb.weight

Parameter containing:
tensor([[-0.6156, -0.2750, -0.4615,  ...,  0.4864,  0.3282, -1.6783],
        [ 2.5968,  0.3872,  0.1526,  ..., -0.0506, -0.2860, -0.2362],
        [ 0.7613, -0.0426,  0.2749,  ..., -2.1412,  0.5047,  0.8150],
        ...,
        [ 1.5270,  0.3465, -0.1628,  ...,  0.6438,  0.5925,  1.2328],
        [ 1.6342, -1.6133,  1.0764,  ..., -0.1168, -2.0465,  0.8077],
        [-1.8455, -0.0444, -0.5948,  ..., -0.9066,  0.0647,  0.7223]],
       requires_grad=True)

In [41]:
emb.weight.shape

torch.Size([5808, 64])

In [42]:
import torch
import torch.nn as nn
import torch.nn.functional as F


In [43]:
class Nextword(nn.Module):
    def __init__(self, block_size, vocab_size, emb_dim=64, hidden_size=1024):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.fc1 = nn.Linear(block_size * emb_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        # x: (batch, block_size)
        x = self.emb(x)                           # (batch, block_size, emb_dim)
        x = x.view(x.shape[0], -1)                # flatten
        x = F.tanh(self.fc1(x))
        x = F.tanh(self.fc2(x))
        x = self.fc3(x)                           # logits
        return x



In [44]:
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
train_data = torch.utils.data.TensorDataset(X_train, Y_train)
val_data = torch.utils.data.TensorDataset(X_val, Y_val)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=32)



In [45]:
import time
def count_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def print_model_summary(model):
    print("Model Summary:")
    for name, module in model.named_modules():
        if len(list(module.children())) == 0:
            params = sum(p.numel() for p in module.parameters())
            print(f"{name:<20} {params:,} parameters")
    print(f"\nTotal trainable parameters: {count_params(model):,}\n")


def train_model(model, X, Y, X_val, Y_val, epochs=500, batch_size=1024, lr=1e-3, wd=1e-4, print_every=100):
    """Train the MLP model using AdamW optimizer."""
    model.train()
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
    loss_fn = nn.CrossEntropyLoss()

    losses, times, val_losses = [], [], []

    for epoch in range(epochs):
        start = time.time()
        total_loss, n_batches = 0, 0

        # ---- Training ----
        for i in range(0, X.shape[0], batch_size):
            x_batch = X[i:i+batch_size]
            y_batch = Y[i:i+batch_size]

            logits = model(x_batch)
            loss = loss_fn(logits, y_batch)

            opt.zero_grad()
            loss.backward()
            opt.step()

            total_loss += loss.item()
            n_batches += 1

        avg_loss = total_loss / n_batches
        losses.append(avg_loss)
        times.append(time.time() - start)

        # ---- Validation ----
        model.eval()
        val_total_loss, val_batches = 0, 0
        with torch.no_grad():
            for i in range(0, X_val.shape[0], batch_size):
                x_batch = X_val[i:i+batch_size]
                y_batch = Y_val[i:i+batch_size]

                logits = model(x_batch)
                loss = loss_fn(logits, y_batch)
                val_total_loss += loss.item()
                val_batches += 1

        avg_val_loss = val_total_loss / val_batches
        val_losses.append(avg_val_loss)
        model.train()

        # ---- Logging ----
        if epoch % print_every == 0:
            print(f"Epoch {epoch:4d} | Train Loss: {avg_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Time: {times[-1]:.2f}s")

    return losses, times, val_losses


In [46]:
# Create enhanced model and show before training
block_size = 4
model = Nextword(block_size, len(stoi), emb_dim, hidden_size=64).to(device)
model = torch.compile(model)

## Model parameters:
print_model_summary(model)


Model Summary:
_orig_mod.emb        371,712 parameters
_orig_mod.fc1        16,448 parameters
_orig_mod.fc2        4,160 parameters
_orig_mod.fc3        377,520 parameters

Total trainable parameters: 769,840



In [47]:
import torch

def generate_text(model, stoi, itos, block_size, device, start_context=None, max_len=20):
    """
    Generate a sequence of words from a trained model.

    Args:
        model: Trained PyTorch language model
        stoi: dict, mapping from word → index
        itos: dict, mapping from index → word
        block_size: int, context size expected by the model
        device: torch device ('cuda' or 'cpu')
        start_context: list of str (optional), seed words
        max_len: int, number of words to generate

    Returns:
        str: Generated text sequence
    """

    model.eval()  # evaluation mode (no dropout, etc.)

    # --- Initialize context ---
    if start_context is None:
        context = [stoi['<PAD>']] * block_size  # start with padding
    else:
        # convert words → indices, pad or truncate to block_size
        context = [stoi.get(w, 0) for w in start_context][-block_size:]
        context = [stoi['<PAD>']] * (block_size - len(context)) + context

    generated_words = []

    # --- Generate words one by one ---
    with torch.no_grad():
        for _ in range(max_len):
            x = torch.tensor(context).view(1, -1).to(device)
            y_pred = model(x)  # logits for next word
            ix = torch.distributions.categorical.Categorical(logits=y_pred).sample().item()
            word = itos[ix]

            # stop at end token 
            # if word == eos:
            #     break

            generated_words.append(word)

            # slide the context window forward
            context = context[1:] + [ix]

    model.train()  # restore training mode

    return ' '.join(generated_words)


In [48]:
text = generate_text(model, stoi, itos, block_size=4, device=device)
print(text)

coronation mingled swaggering reflections suite talking extricated sweetly blue tray disappeared golitsyn awful latter connections begun pomerania accompanied coiling importance


In [49]:
losses, timing , val_losses = train_model(model, X_train, Y_train,X_val,Y_val, epochs=400, batch_size=1024, lr=1e-3, wd=1e-4, print_every=100)

Epoch    0 | Train Loss: 7.8493 | Val Loss: 6.6049 | Time: 0.97s
Epoch  100 | Train Loss: 1.6963 | Val Loss: 7.1042 | Time: 0.41s
Epoch  200 | Train Loss: 0.7521 | Val Loss: 9.1304 | Time: 0.39s
Epoch  300 | Train Loss: 0.4621 | Val Loss: 10.8069 | Time: 0.35s


In [50]:
text = generate_text(model, stoi, itos, block_size=4, device=device, start_context="maid of honor and favorite", max_len=50)
print(text)

<PAD> <PAD> <PAD> <PAD> all either <PAD> <PAD> <PAD> <PAD> denisovs face puckered still important friend dolokhov tried sworn down . <PAD> <PAD> <PAD> <PAD> no impossible said the other company as to the money and just to another . as soon wrote natasha like at the door . <PAD>


In [51]:
torch.save(model.state_dict(), 'nextword-model-lightning.pth')