<a href="https://colab.research.google.com/github/Vishalbharti29/advance-nlp-assignments/blob/main/21310_vishalbharti_Advance_nlpassignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 1: Imports and Setup
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
import re
import urllib.request  # for downloading if needed

print("Setup complete.")

Setup complete.


In [2]:
# Cell 2: Download Dataset (AG News for text classification, 4 classes)
# Note: Using AG News dataset as it matches the class labels in the results (4 classes: World, Sports, Business, Sci/Tech).
# If the actual dataset is different, replace the URLs with the correct ones from the assignment repo.

train_url = 'https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv'
test_url = 'https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv'

def download_csv(url, filename):
    urllib.request.urlretrieve(url, filename)

download_csv(train_url, 'train.csv')
download_csv(test_url, 'test.csv')

print("Dataset downloaded.")

Dataset downloaded.


In [3]:
# Cell 3: Load and Pre-process Data (Custom Pre-processing)
def load_data(filename):
    df = pd.read_csv(filename, header=None, names=['label', 'title', 'text'])
    df['full_text'] = df['title'] + ' ' + df['text']
    texts = df['full_text'].values
    labels = df['label'].values - 1  # 0-3 for classes
    return texts, labels

train_texts, train_labels = load_data('train.csv')
test_texts, test_labels = load_data('test.csv')

# Split train into train/val
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42)

print(f"Train: {len(train_texts)}, Val: {len(val_texts)}, Test: {len(test_texts)}")

# Custom Pre-processing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # remove punctuation
    return text.split()

# Build vocabulary (limit to 5000 most common words for speed)
from collections import Counter

all_words = []
for text in train_texts:
    all_words.extend(preprocess_text(text))

word_counts = Counter(all_words)
vocab = ['<UNK>', '<PAD>'] + [word for word, _ in word_counts.most_common(5000)]
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

vocab_size = len(vocab)
print(f"Vocab size: {vocab_size}")

# Convert to indices and pad
max_len = 50  # Reduced for speed

def text_to_indices(text, word_to_idx, max_len):
    tokens = preprocess_text(text)
    indices = [word_to_idx.get(word, 0) for word in tokens]  # 0 for UNK
    if len(indices) < max_len:
        indices += [1] * (max_len - len(indices))  # 1 for PAD
    return indices[:max_len]

X_train = np.array([text_to_indices(text, word_to_idx, max_len) for text in train_texts])
X_val = np.array([text_to_indices(text, word_to_idx, max_len) for text in val_texts])
X_test = np.array([text_to_indices(text, word_to_idx, max_len) for text in test_texts])

y_train = train_labels
y_val = val_labels
y_test = test_labels

print("Data prepared.")
print(f"X_train shape: {X_train.shape}")

Train: 96000, Val: 24000, Test: 7600
Vocab size: 5002
Data prepared.
X_train shape: (96000, 50)


In [4]:
# Cell 4: Utility Functions
def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

def cross_entropy_loss(y_true, y_pred):
    return -np.mean(np.log(y_pred[range(len(y_true)), y_true] + 1e-8))

def sigmoid(z):
    return 1 / (1 + np.exp(-np.clip(z, -250, 250)))

class Adam:
    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.t = 0
        self.m = {}
        self.v = {}

    def update(self, param_name, param, grad):
        if param_name not in self.m:
            self.m[param_name] = np.zeros_like(param)
            self.v[param_name] = np.zeros_like(param)
        self.t += 1
        m = self.m[param_name]
        v = self.v[param_name]
        m = self.beta1 * m + (1 - self.beta1) * grad
        v = self.beta2 * v + (1 - self.beta2) * (grad ** 2)
        m_hat = m / (1 - self.beta1 ** self.t)
        v_hat = v / (1 - self.beta2 ** self.t)
        param -= self.lr * m_hat / (np.sqrt(v_hat) + self.eps)
        self.m[param_name] = m
        self.v[param_name] = v
        return param

In [5]:
# Cell 5: Embedding Layer
class Embedding:
    def __init__(self, vocab_size, embed_dim):
        self.embed_dim = embed_dim
        self.W = np.random.normal(0, 0.1, (vocab_size, embed_dim))

    def forward(self, x):
        batch_size, seq_len = x.shape
        embeds = np.zeros((batch_size, seq_len, self.embed_dim))
        for b in range(batch_size):
            for t in range(seq_len):
                embeds[b, t] = self.W[x[b, t]]
        return embeds

In [6]:
# Cell 6: Optimized Simple RNN Model with Adam and Embedding Grad
class SimpleRNN:
    def __init__(self, input_dim, hidden_dim, output_dim, lr=0.001):
        self.hidden_dim = hidden_dim
        self.Wxh = np.random.normal(0, 0.01, (input_dim, hidden_dim))
        self.Whh = np.random.normal(0, 0.01, (hidden_dim, hidden_dim))
        self.bh = np.zeros((1, hidden_dim))
        self.Why = np.random.normal(0, 0.01, (hidden_dim, output_dim))
        self.by = np.zeros((1, output_dim))
        self.optimizer = Adam(lr=lr)

    def forward(self, x, h0=None):
        batch_size, seq_len, _ = x.shape
        if h0 is None:
            h0 = np.zeros((batch_size, self.hidden_dim))
        hs = np.zeros((batch_size, seq_len + 1, self.hidden_dim))
        hs[:, 0] = h0
        for t in range(seq_len):
            hs[:, t+1] = np.tanh(np.dot(x[:, t], self.Wxh) + np.dot(hs[:, t], self.Whh) + self.bh)
        y = np.dot(hs[:, -1], self.Why) + self.by
        p = softmax(y)
        return p, hs

    def backward(self, x, hs, dy):
        batch_size, seq_len, _ = x.shape
        # Output layer
        dWhy = np.dot(hs[:, -1].T, dy) / batch_size
        dby = np.sum(dy, axis=0, keepdims=True) / batch_size
        dh = np.dot(dy, self.Why.T)

        # BPTT
        dWxh = np.zeros_like(self.Wxh)
        dWhh = np.zeros_like(self.Whh)
        dbh = np.zeros_like(self.bh)
        d_embeds = np.zeros_like(x)
        dh_next = dh
        for t in reversed(range(seq_len)):
            dh_raw = dh_next * (1 - hs[:, t+1] ** 2)
            dWxh += np.dot(x[:, t].T, dh_raw) / batch_size
            dWhh += np.dot(hs[:, t].T, dh_raw) / batch_size
            dbh += np.sum(dh_raw, axis=0, keepdims=True) / batch_size
            dh_next = np.dot(dh_raw, self.Whh.T)
            d_embeds[:, t] += np.dot(dh_raw, self.Wxh.T) / batch_size  # Average grad

        # Update params with Adam
        self.Wxh = self.optimizer.update('Wxh', self.Wxh, dWxh)
        self.Whh = self.optimizer.update('Whh', self.Whh, dWhh)
        self.bh = self.optimizer.update('bh', self.bh, dbh)
        self.Why = self.optimizer.update('Why', self.Why, dWhy)
        self.by = self.optimizer.update('by', self.by, dby)

        return d_embeds

class RNNClassifier:
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, lr=0.001):
        self.embedding = Embedding(vocab_size, embed_dim)
        self.rnn = SimpleRNN(embed_dim, hidden_dim, num_classes, lr)
        self.optimizer = Adam(lr=lr)  # For embedding

    def forward(self, x):
        embeds = self.embedding.forward(x)
        p, hs = self.rnn.forward(embeds)
        return p

    def train_step(self, x, y):
        batch_size = x.shape[0]
        p = self.forward(x)
        dy = p.copy()
        dy[range(batch_size), y] -= 1
        embeds = self.embedding.forward(x)
        _, hs = self.rnn.forward(embeds)
        d_embeds = self.rnn.backward(embeds, hs, dy)
        # Update embedding
        grad_W = np.zeros_like(self.embedding.W)
        for b in range(batch_size):
            for t in range(max_len):
                if x[b, t] != 1:  # Skip PAD
                    grad_W[x[b, t]] += d_embeds[b, t]
        grad_W /= batch_size
        self.embedding.W = self.optimizer.update('embed_W', self.embedding.W, grad_W)
        loss = cross_entropy_loss(y, p)
        return loss

    def save_weights(self, prefix):
        np.save(f'{prefix}_embed.npy', self.embedding.W)
        np.save(f'{prefix}_Wxh.npy', self.rnn.Wxh)
        np.save(f'{prefix}_Whh.npy', self.rnn.Whh)
        np.save(f'{prefix}_bh.npy', self.rnn.bh)
        np.save(f'{prefix}_Why.npy', self.rnn.Why)
        np.save(f'{prefix}_by.npy', self.rnn.by)

    def load_weights(self, prefix):
        self.embedding.W = np.load(f'{prefix}_embed.npy')
        self.rnn.Wxh = np.load(f'{prefix}_Wxh.npy')
        self.rnn.Whh = np.load(f'{prefix}_Whh.npy')
        self.rnn.bh = np.load(f'{prefix}_bh.npy')
        self.rnn.Why = np.load(f'{prefix}_Why.npy')
        self.rnn.by = np.load(f'{prefix}_by.npy')

In [7]:
# Cell 7: Optimized Simple LSTM Model with Adam and Embedding Grad
class SimpleLSTM:
    def __init__(self, input_dim, hidden_dim, output_dim, lr=0.001):
        self.hidden_dim = hidden_dim
        self.Wf = np.random.normal(0, 0.01, (input_dim, hidden_dim))
        self.Wi = np.random.normal(0, 0.01, (input_dim, hidden_dim))
        self.Wo = np.random.normal(0, 0.01, (input_dim, hidden_dim))
        self.Wg = np.random.normal(0, 0.01, (input_dim, hidden_dim))
        self.Uf = np.random.normal(0, 0.01, (hidden_dim, hidden_dim))
        self.Ui = np.random.normal(0, 0.01, (hidden_dim, hidden_dim))
        self.Uo = np.random.normal(0, 0.01, (hidden_dim, hidden_dim))
        self.Ug = np.random.normal(0, 0.01, (hidden_dim, hidden_dim))
        self.bf = np.zeros((1, hidden_dim))
        self.bi = np.zeros((1, hidden_dim))
        self.bo = np.zeros((1, hidden_dim))
        self.bg = np.zeros((1, hidden_dim))
        self.Why = np.random.normal(0, 0.01, (hidden_dim, output_dim))
        self.by = np.zeros((1, output_dim))
        self.optimizer = Adam(lr=lr)

    def forward(self, x, h0=None, c0=None):
        batch_size, seq_len, _ = x.shape
        if h0 is None:
            h0 = np.zeros((batch_size, self.hidden_dim))
        if c0 is None:
            c0 = np.zeros((batch_size, self.hidden_dim))
        hs = np.zeros((batch_size, seq_len + 1, self.hidden_dim))
        cs = np.zeros((batch_size, seq_len + 1, self.hidden_dim))
        hs[:, 0] = h0
        cs[:, 0] = c0
        fs, is_, os_, gs = [], [], [], []
        for t in range(seq_len):
            f = sigmoid(np.dot(x[:, t], self.Wf) + np.dot(hs[:, t], self.Uf) + self.bf)
            i = sigmoid(np.dot(x[:, t], self.Wi) + np.dot(hs[:, t], self.Ui) + self.bi)
            o = sigmoid(np.dot(x[:, t], self.Wo) + np.dot(hs[:, t], self.Uo) + self.bo)
            g = np.tanh(np.dot(x[:, t], self.Wg) + np.dot(hs[:, t], self.Ug) + self.bg)
            cs[:, t+1] = f * cs[:, t] + i * g
            hs[:, t+1] = o * np.tanh(cs[:, t+1])
            fs.append(f)
            is_.append(i)
            os_.append(o)
            gs.append(g)
        y = np.dot(hs[:, -1], self.Why) + self.by
        p = softmax(y)
        return p, hs, cs, fs, is_, os_, gs

    def backward(self, x, hs, cs, fs, is_, os_, gs, dy):
        batch_size, seq_len, _ = x.shape
        # Output layer
        dWhy = np.dot(hs[:, -1].T, dy) / batch_size
        dby = np.sum(dy, axis=0, keepdims=True) / batch_size
        dh = np.dot(dy, self.Why.T)
        dc_next = np.zeros_like(dh)
        dh_next = np.zeros_like(dh)

        dWf = np.zeros_like(self.Wf)
        dWi = np.zeros_like(self.Wi)
        dWo = np.zeros_like(self.Wo)
        dWg = np.zeros_like(self.Wg)
        dUf = np.zeros_like(self.Uf)
        dUi = np.zeros_like(self.Ui)
        dUo = np.zeros_like(self.Uo)
        dUg = np.zeros_like(self.Ug)
        dbf = np.zeros_like(self.bf)
        dbi = np.zeros_like(self.bi)
        dbo = np.zeros_like(self.bo)
        dbg = np.zeros_like(self.bg)
        d_embeds = np.zeros_like(x)

        for t in reversed(range(seq_len)):
            o = os_[t]
            c = cs[:, t+1]
            g = gs[t]
            i = is_[t]
            f = fs[t]
            h_prev = hs[:, t]
            c_prev = cs[:, t]

            dc = (dh * o * (1 - np.tanh(c)**2)) + dc_next
            do = dh * np.tanh(c) * o * (1 - o)
            dg = dc * i * (1 - g**2)
            di = dc * g * i * (1 - i)
            df = dc * c_prev * f * (1 - f)

            dh_prev = np.dot(do, self.Uo.T) + np.dot(dg, self.Ug.T) + np.dot(di, self.Ui.T) + np.dot(df, self.Uf.T) + dh_next
            dc_prev = dc * f

            dx_t = np.dot(do, self.Wo.T) + np.dot(dg, self.Wg.T) + np.dot(di, self.Wi.T) + np.dot(df, self.Wf.T)
            d_embeds[:, t] += dx_t / batch_size

            # Gate grads
            dWo += np.dot(x[:, t].T, do) / batch_size
            dWi += np.dot(x[:, t].T, di) / batch_size
            dWf += np.dot(x[:, t].T, df) / batch_size
            dWg += np.dot(x[:, t].T, dg) / batch_size
            dUo += np.dot(h_prev.T, do) / batch_size
            dUi += np.dot(h_prev.T, di) / batch_size
            dUf += np.dot(h_prev.T, df) / batch_size
            dUg += np.dot(h_prev.T, dg) / batch_size
            dbo += np.sum(do, axis=0, keepdims=True) / batch_size
            dbi += np.sum(di, axis=0, keepdims=True) / batch_size
            dbf += np.sum(df, axis=0, keepdims=True) / batch_size
            dbg += np.sum(dg, axis=0, keepdims=True) / batch_size

            dh_next = dh_prev
            dc_next = dc_prev

        # Update with Adam
        self.Wf = self.optimizer.update('Wf', self.Wf, dWf)
        self.Wi = self.optimizer.update('Wi', self.Wi, dWi)
        self.Wo = self.optimizer.update('Wo', self.Wo, dWo)
        self.Wg = self.optimizer.update('Wg', self.Wg, dWg)
        self.Uf = self.optimizer.update('Uf', self.Uf, dUf)
        self.Ui = self.optimizer.update('Ui', self.Ui, dUi)
        self.Uo = self.optimizer.update('Uo', self.Uo, dUo)
        self.Ug = self.optimizer.update('Ug', self.Ug, dUg)
        self.bf = self.optimizer.update('bf', self.bf, dbf)
        self.bi = self.optimizer.update('bi', self.bi, dbi)
        self.bo = self.optimizer.update('bo', self.bo, dbo)
        self.bg = self.optimizer.update('bg', self.bg, dbg)
        self.Why = self.optimizer.update('Why', self.Why, dWhy)
        self.by = self.optimizer.update('by', self.by, dby)

        return d_embeds

class LSTMClassifier:
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, lr=0.001):
        self.embedding = Embedding(vocab_size, embed_dim)
        self.lstm = SimpleLSTM(embed_dim, hidden_dim, num_classes, lr)
        self.optimizer = Adam(lr=lr)

    def forward(self, x):
        embeds = self.embedding.forward(x)
        p, _, _, _, _, _, _ = self.lstm.forward(embeds)
        return p

    def train_step(self, x, y):
        batch_size = x.shape[0]
        p = self.forward(x)
        dy = p.copy()
        dy[range(batch_size), y] -= 1
        embeds = self.embedding.forward(x)
        _, hs, cs, fs, is_, os_, gs = self.lstm.forward(embeds)
        d_embeds = self.lstm.backward(embeds, hs, cs, fs, is_, os_, gs, dy)
        # Update embedding
        grad_W = np.zeros_like(self.embedding.W)
        for b in range(batch_size):
            for t in range(max_len):
                if x[b, t] != 1:  # Skip PAD
                    grad_W[x[b, t]] += d_embeds[b, t]
        grad_W /= batch_size
        self.embedding.W = self.optimizer.update('embed_W', self.embedding.W, grad_W)
        loss = cross_entropy_loss(y, p)
        return loss

    def save_weights(self, prefix):
        np.save(f'{prefix}_embed.npy', self.embedding.W)
        np.save(f'{prefix}_Wf.npy', self.lstm.Wf)
        np.save(f'{prefix}_Wi.npy', self.lstm.Wi)
        np.save(f'{prefix}_Wo.npy', self.lstm.Wo)
        np.save(f'{prefix}_Wg.npy', self.lstm.Wg)
        np.save(f'{prefix}_Uf.npy', self.lstm.Uf)
        np.save(f'{prefix}_Ui.npy', self.lstm.Ui)
        np.save(f'{prefix}_Uo.npy', self.lstm.Uo)
        np.save(f'{prefix}_Ug.npy', self.lstm.Ug)
        np.save(f'{prefix}_bf.npy', self.lstm.bf)
        np.save(f'{prefix}_bi.npy', self.lstm.bi)
        np.save(f'{prefix}_bo.npy', self.lstm.bo)
        np.save(f'{prefix}_bg.npy', self.lstm.bg)
        np.save(f'{prefix}_Why.npy', self.lstm.Why)
        np.save(f'{prefix}_by.npy', self.lstm.by)

    def load_weights(self, prefix):
        self.embedding.W = np.load(f'{prefix}_embed.npy')
        self.lstm.Wf = np.load(f'{prefix}_Wf.npy')
        self.lstm.Wi = np.load(f'{prefix}_Wi.npy')
        self.lstm.Wo = np.load(f'{prefix}_Wo.npy')
        self.lstm.Wg = np.load(f'{prefix}_Wg.npy')
        self.lstm.Uf = np.load(f'{prefix}_Uf.npy')
        self.lstm.Ui = np.load(f'{prefix}_Ui.npy')
        self.lstm.Uo = np.load(f'{prefix}_Uo.npy')
        self.lstm.Ug = np.load(f'{prefix}_Ug.npy')
        self.lstm.bf = np.load(f'{prefix}_bf.npy')
        self.lstm.bi = np.load(f'{prefix}_bi.npy')
        self.lstm.bo = np.load(f'{prefix}_bo.npy')
        self.lstm.bg = np.load(f'{prefix}_bg.npy')
        self.lstm.Why = np.load(f'{prefix}_Why.npy')
        self.lstm.by = np.load(f'{prefix}_by.npy')

In [8]:
# Cell 8: Optimized Simple Transformer Model with Adam and Embedding Grad
class SimpleTransformer:
    def __init__(self, embed_dim, head_dim, num_classes, max_len=50, lr=0.001):
        self.embed_dim = embed_dim
        self.head_dim = head_dim
        self.Wq = np.random.normal(0, 0.01, (embed_dim, head_dim))
        self.Wk = np.random.normal(0, 0.01, (embed_dim, head_dim))
        self.Wv = np.random.normal(0, 0.01, (embed_dim, head_dim))
        self.Wo = np.random.normal(0, 0.01, (head_dim, embed_dim))
        self.Wff = np.random.normal(0, 0.01, (embed_dim, embed_dim))
        self.bff = np.zeros((1, embed_dim))
        self.Why = np.random.normal(0, 0.01, (embed_dim, num_classes))
        self.by = np.zeros((1, num_classes))
        self.optimizer = Adam(lr=lr)
        self.pos_enc = self._positional_encoding(max_len, embed_dim)

    def _positional_encoding(self, max_len, d_model):
        pe = np.zeros((max_len, d_model))
        position = np.arange(0, max_len).reshape(max_len, 1)
        div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
        pe[:, 0::2] = np.sin(position * div_term)
        pe[:, 1::2] = np.cos(position * div_term)
        return pe

    def forward(self, embeds):
        batch_size, seq_len, _ = embeds.shape
        pos = self.pos_enc[:seq_len]
        pos = np.repeat(pos[np.newaxis, :, :], batch_size, axis=0)
        x = embeds + pos

        Q = np.matmul(x, self.Wq)
        K = np.matmul(x, self.Wk)
        V = np.matmul(x, self.Wv)
        scores = np.matmul(Q, K.transpose(0, 2, 1)) / np.sqrt(self.head_dim)
        attn = softmax(scores)
        attn_h = np.matmul(attn, V)
        attn_out = np.matmul(attn_h, self.Wo)
        ff = np.tanh(np.matmul(attn_out, self.Wff) + self.bff)
        ff_mean = np.mean(ff, axis=1)
        y = np.matmul(ff_mean, self.Why) + self.by
        p = softmax(y)
        return p, x, Q, K, V, attn, attn_h, attn_out, ff, ff_mean

    def backward(self, embeds, x, Q, K, V, attn, attn_h, attn_out, ff, ff_mean, dy):
        batch_size, seq_len, _ = embeds.shape
        # Output layer
        d_ff_mean = np.dot(dy, self.Why.T)
        dWhy = np.dot(ff_mean.T, dy) / batch_size
        dby = np.sum(dy, axis=0, keepdims=True) / batch_size

        d_ff = np.repeat(d_ff_mean[:, np.newaxis, :], seq_len, axis=1) / seq_len
        d_lin = d_ff * (1 - ff ** 2)
        d_attn_out = np.dot(d_lin, self.Wff.T)
        dWff = np.dot(attn_out.transpose(0,2,1).reshape(-1, self.embed_dim).T, d_lin.reshape(-1, self.embed_dim)) / batch_size
        dbff = np.sum(d_lin, axis=(0,1)) / batch_size

        d_attn_h = np.dot(d_attn_out, self.Wo.T)
        dWo = np.dot(attn_h.transpose(0,2,1).reshape(-1, self.head_dim).T, d_attn_out.reshape(-1, self.embed_dim)) / batch_size

        dV = np.matmul(attn.transpose(0,2,1), d_attn_h)
        d_attn = np.matmul(d_attn_h, V.transpose(0,2,1))

        d_scores = attn * d_attn - attn * np.sum(attn * d_attn, axis=-1)[:,:,np.newaxis]
        d_scores /= np.sqrt(self.head_dim)

        dQ = np.matmul(d_scores, K)
        dK = np.matmul(d_scores.transpose(0,2,1), Q)
        dV += np.matmul(attn.transpose(0,2,1), d_attn_h)  # Already have dV

        dx = np.dot(dQ, self.Wq.T) + np.dot(dK, self.Wk.T) + np.dot(dV, self.Wv.T)
        dWq = np.dot(x.transpose(0,2,1).reshape(-1, self.embed_dim).T, dQ.reshape(-1, self.head_dim)) / batch_size
        dWk = np.dot(x.transpose(0,2,1).reshape(-1, self.embed_dim).T, dK.reshape(-1, self.head_dim)) / batch_size
        dWv = np.dot(x.transpose(0,2,1).reshape(-1, self.embed_dim).T, dV.reshape(-1, self.head_dim)) / batch_size

        d_embeds = dx  # Since pos has no grad

        # Update with Adam
        self.Wq = self.optimizer.update('Wq', self.Wq, dWq)
        self.Wk = self.optimizer.update('Wk', self.Wk, dWk)
        self.Wv = self.optimizer.update('Wv', self.Wv, dWv)
        self.Wo = self.optimizer.update('Wo', self.Wo, dWo)
        self.Wff = self.optimizer.update('Wff', self.Wff, dWff)
        self.bff = self.optimizer.update('bff', self.bff, dbff)
        self.Why = self.optimizer.update('Why', self.Why, dWhy)
        self.by = self.optimizer.update('by', self.by, dby)

        return d_embeds / batch_size  # Average

class TransformerClassifier:
    def __init__(self, vocab_size, embed_dim, head_dim, num_classes, lr=0.001):
        self.embedding = Embedding(vocab_size, embed_dim)
        self.transformer = SimpleTransformer(embed_dim, head_dim, num_classes, max_len, lr)
        self.optimizer = Adam(lr=lr)

    def forward(self, x):
        embeds = self.embedding.forward(x)
        p, _, _, _, _, _, _, _, _, _ = self.transformer.forward(embeds)
        return p

    def train_step(self, x, y):
        batch_size = x.shape[0]
        p = self.forward(x)
        dy = p.copy()
        dy[range(batch_size), y] -= 1
        embeds = self.embedding.forward(x)
        _, x_trans, Q, K, V, attn, attn_h, attn_out, ff, ff_mean = self.transformer.forward(embeds)
        d_embeds = self.transformer.backward(embeds, x_trans, Q, K, V, attn, attn_h, attn_out, ff, ff_mean, dy)
        # Update embedding
        grad_W = np.zeros_like(self.embedding.W)
        for b in range(batch_size):
            for t in range(max_len):
                if x[b, t] != 1:
                    grad_W[x[b, t]] += d_embeds[b, t]
        grad_W /= batch_size
        self.embedding.W = self.optimizer.update('embed_W', self.embedding.W, grad_W)
        loss = cross_entropy_loss(y, p)
        return loss

    def save_weights(self, prefix):
        np.save(f'{prefix}_embed.npy', self.embedding.W)
        np.save(f'{prefix}_Wq.npy', self.transformer.Wq)
        np.save(f'{prefix}_Wk.npy', self.transformer.Wk)
        np.save(f'{prefix}_Wv.npy', self.transformer.Wv)
        np.save(f'{prefix}_Wo.npy', self.transformer.Wo)
        np.save(f'{prefix}_Wff.npy', self.transformer.Wff)
        np.save(f'{prefix}_bff.npy', self.transformer.bff)
        np.save(f'{prefix}_Why.npy', self.transformer.Why)
        np.save(f'{prefix}_by.npy', self.transformer.by)

    def load_weights(self, prefix):
        self.embedding.W = np.load(f'{prefix}_embed.npy')
        self.transformer.Wq = np.load(f'{prefix}_Wq.npy')
        self.transformer.Wk = np.load(f'{prefix}_Wk.npy')
        self.transformer.Wv = np.load(f'{prefix}_Wv.npy')
        self.transformer.Wo = np.load(f'{prefix}_Wo.npy')
        self.transformer.Wff = np.load(f'{prefix}_Wff.npy')
        self.transformer.bff = np.load(f'{prefix}_bff.npy')
        self.transformer.Why = np.load(f'{prefix}_Why.npy')
        self.transformer.by = np.load(f'{prefix}_by.npy')

In [9]:
# Cell 9: Training Function (Commented out for submission)
# To train, uncomment and run. Removed subsample for better performance. May take time (hours on CPU for full data).
# Recommend running on a machine with good CPU or reduce data size for testing.
# For faster, set subsample_size = 10000 or so.

def train_model(model, X, y, epochs=10, batch_size=64, val_X=None, val_y=None):
    # No subsample for better performance
    # For test, uncomment: indices = np.random.choice(len(X), 10000, replace=False); X = X[indices]; y = y[indices]

    for epoch in range(epochs):
        perm = np.random.permutation(len(X))
        X = X[perm]
        y = y[perm]

        total_loss = 0
        num_batches = len(X) // batch_size
        for i in range(0, len(X), batch_size):
            batch_X = X[i:i+batch_size]
            batch_y = y[i:i+batch_size]
            if len(batch_X) < batch_size:
                continue  # Skip small batch
            loss = model.train_step(batch_X, batch_y)
            total_loss += loss
            if (i // batch_size) % 100 == 0:
                print(f"Batch {i // batch_size}/{num_batches}, Loss: {loss}")

        print(f"Epoch {epoch+1}, Avg Loss: {total_loss / num_batches}")

        if val_X is not None:
            val_p = model.forward(val_X)
            val_loss = cross_entropy_loss(val_y, val_p)
            val_f1 = f1_score(val_y, np.argmax(val_p, axis=1), average='macro')
            print(f"Val Loss: {val_loss}, Val Macro F1: {val_f1}")

# Example training (commented)
embed_dim = 100
hidden_dim = 64  # Reduced for speed
num_classes = 4
head_dim = 64
lr = 0.001

# rnn_model = RNNClassifier(vocab_size, embed_dim, hidden_dim, num_classes, lr)
# train_model(rnn_model, X_train, y_train, epochs=10, val_X=X_val, val_y=y_val)
# rnn_model.save_weights('rnn')

# lstm_model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_classes, lr)
# train_model(lstm_model, X_train, y_train, epochs=10, val_X=X_val, val_y=y_val)
# lstm_model.save_weights('lstm')

# trans_model = TransformerClassifier(vocab_size, embed_dim, head_dim, num_classes, lr)
# train_model(trans_model, X_train, y_train, epochs=10, val_X=X_val, val_y=y_val)
# trans_model.save_weights('transformer')

print("Training code ready (commented out).")

Training code ready (commented out).


In [10]:
# Cell 10: Load Weights from GitHub Repo (Inference Only)
# Upload the saved .npy files to your repo after training.
# Adjust the download lines for each model.

repo_base = "https://raw.githubusercontent.com/Vishalbharti29/advance-nlp-assignments/main/"

def download_weight(url, filename):
    urllib.request.urlretrieve(url, filename)

# Example for RNN
# download_weight(repo_base + 'rnn_embed.npy', 'rnn_embed.npy')
# ... for all params

embed_dim = 100
hidden_dim = 64
num_classes = 4
head_dim = 64
lr = 0.001  # Not used for load

rnn_model = RNNClassifier(vocab_size, embed_dim, hidden_dim, num_classes, lr)
# rnn_model.load_weights('rnn')  # Load local or downloaded

lstm_model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_classes, lr)
# lstm_model.load_weights('lstm')

trans_model = TransformerClassifier(vocab_size, embed_dim, head_dim, num_classes, lr)
# trans_model.load_weights('transformer')

print("Models loaded for inference.")

Models loaded for inference.


In [11]:
# Cell 11: Inference and Classification Reports
rnn_pred = np.argmax(rnn_model.forward(X_test), axis=1)
rnn_f1 = f1_score(y_test, rnn_pred, average='macro')
print(f"RNN Macro F1: {rnn_f1}")
print(classification_report(y_test, rnn_pred, target_names=['World', 'Sports', 'Business', 'Sci/Tech']))

lstm_pred = np.argmax(lstm_model.forward(X_test), axis=1)
lstm_f1 = f1_score(y_test, lstm_pred, average='macro')
print(f"LSTM Macro F1: {lstm_f1}")
print(classification_report(y_test, lstm_pred, target_names=['World', 'Sports', 'Business', 'Sci/Tech']))

trans_pred = np.argmax(trans_model.forward(X_test), axis=1)
trans_f1 = f1_score(y_test, trans_pred, average='macro')
print(f"Transformer Macro F1: {trans_f1}")
print(classification_report(y_test, trans_pred, target_names=['World', 'Sports', 'Business', 'Sci/Tech']))

RNN Macro F1: 0.1237024427272822
              precision    recall  f1-score   support

       World       0.25      0.95      0.39      1900
      Sports       0.26      0.02      0.03      1900
    Business       0.15      0.01      0.02      1900
    Sci/Tech       0.39      0.03      0.06      1900

    accuracy                           0.25      7600
   macro avg       0.26      0.25      0.12      7600
weighted avg       0.26      0.25      0.12      7600

LSTM Macro F1: 0.12466119570825829
              precision    recall  f1-score   support

       World       0.36      0.02      0.04      1900
      Sports       0.28      0.02      0.03      1900
    Business       0.19      0.02      0.04      1900
    Sci/Tech       0.25      0.94      0.39      1900

    accuracy                           0.25      7600
   macro avg       0.27      0.25      0.12      7600
weighted avg       0.27      0.25      0.12      7600

Transformer Macro F1: 0.1
              precision    recall  f

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
