In [1]:
import os
import re
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm  # Import tqdm for progress bars

In [2]:

# Paths
# The CSV files are looked up in the directory named by the NLP_EX1_DATA_DIR environment
# variable when it is set. When it is not set, the original author-local directory is used,
# so an unchanged environment resolves to the same location as before.
DATA_DIR = os.environ.get("NLP_EX1_DATA_DIR", r"C:\Users\roeyn\Coding_Enviroment\NLP\Ex1\.venv")
train_path = os.path.join(DATA_DIR, "train.csv")
val_path = os.path.join(DATA_DIR, "validation.csv")


# Text preprocessing
def preprocess_text(text):
    """Lower-case ``text``, strip everything except a-z and whitespace, and split into tokens.

    Args:
        text: The raw value of one text cell. Non-string values are converted with ``str``.

    Returns:
        list[str]: The whitespace-separated tokens. Missing values (``None`` or a float
        NaN) yield an empty list.
    """
    # A float NaN is the only float that is not equal to itself. Without this guard,
    # str(NaN) / str(None) would be tokenised into the spurious words "nan" / "none".
    if text is None or (isinstance(text, float) and text != text):
        return []

    lowered = str(text).lower()
    # Characters are deleted, not replaced by a space, so "don't" becomes "dont".
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    return letters_only.split()


# Load data
print("Loading data...")
# Both splits are parsed with pandas' default CSV settings.
train_df = pd.read_csv(filepath_or_buffer=train_path)  # training split
val_df = pd.read_csv(filepath_or_buffer=val_path)  # validation split

Loading data...


In [3]:

# Apply preprocessing with progress bar
# After this step each 'text' cell holds a list of tokens instead of the raw string.
print("Preprocessing text...")
tqdm.pandas(desc="Preprocessing Train")
train_df['text'] = train_df['text'].progress_apply(preprocess_text)

tqdm.pandas(desc="Preprocessing Val")
val_df['text'] = val_df['text'].progress_apply(preprocess_text)

# Build vocabulary
# Only the training split contributes words; validation words outside the vocabulary
# are mapped to '<UNK>' later on.
print("Building vocabulary...")
all_words = [
    word
    for text in tqdm(train_df['text'], desc="Collecting words")
    for word in text
]

word_counts = Counter(all_words)
# Index 0 is the padding token, index 1 the unknown-word token, followed by up to
# 5000 words in descending frequency order.
vocab = ['<PAD>', '<UNK>']
vocab.extend(word for word, _ in word_counts.most_common(5000))
word_to_idx = dict(zip(vocab, range(len(vocab))))
idx_to_word = {position: token for token, position in word_to_idx.items()}


# Embedding table (random initialisation only; no co-occurrence statistics are computed)
def create_embeddings(texts, vocab_size, embed_dim=100, window=5):
    """Return a reproducible random embedding matrix of shape (vocab_size, embed_dim).

    Args:
        texts: Accepted for interface compatibility; not used.
        vocab_size: Number of rows in the returned matrix.
        embed_dim: Number of columns in the returned matrix.
        window: Accepted for interface compatibility; not used.

    Returns:
        numpy.ndarray: Standard-normal samples scaled by 0.01.

    Note:
        Reseeds NumPy's *global* random generator with 42, so every later
        ``np.random`` call in the process is affected by this call.
    """
    np.random.seed(42)
    table = np.random.randn(vocab_size, embed_dim)
    table *= 0.01
    return table


# One row per vocabulary entry; the same table is shared by every model built below.
embeddings = create_embeddings(texts=train_df['text'], vocab_size=len(vocab), embed_dim=100)


# Convert text to sequences
def text_to_sequence(text, word_to_idx, max_len=50):
    """Map tokens to vocabulary ids and force the result to exactly ``max_len`` entries.

    Args:
        text: Iterable of tokens.
        word_to_idx: Mapping from token to integer id; must contain '<UNK>' when
            ``text`` is non-empty and '<PAD>' when padding is required.
        max_len: Target length of the returned list.

    Returns:
        list: Ids for the first ``max_len`` tokens, right-padded with the '<PAD>' id
        when ``text`` is shorter. Tokens missing from ``word_to_idx`` get the '<UNK>' id.
    """
    ids = [word_to_idx.get(token, word_to_idx['<UNK>']) for token in text]
    shortfall = max_len - len(ids)
    if shortfall > 0:
        return ids + [word_to_idx['<PAD>']] * shortfall
    return ids[:max_len]


max_len = 50


def _encode_split(frame, progress_label):
    """Convert the token lists in ``frame['text']`` to an array of fixed-length id rows."""
    token_lists = tqdm(frame['text'], desc=progress_label)
    return np.array([text_to_sequence(tokens, word_to_idx, max_len) for tokens in token_lists])


print("Converting text to sequences...")
X_train = _encode_split(train_df, "Tokenizing Train")
X_val = _encode_split(val_df, "Tokenizing Val")
# Labels are taken as-is from the 'label' column of each split.
y_train = train_df['label'].values
y_val = val_df['label'].values


# Activation functions
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)).

    The input is clipped to [-500, 500] first so that ``np.exp`` cannot overflow.
    """
    bounded = np.clip(x, -500, 500)
    return 1 / (1 + np.exp(-bounded))


def tanh(x):
    """Element-wise hyperbolic tangent of ``x`` after clipping it to [-500, 500]."""
    bounded = np.clip(x, -500, 500)
    return np.tanh(bounded)


def softmax(x):
    """Softmax over the last axis of ``x``.

    The maximum along the last axis is subtracted before exponentiating, which keeps
    ``np.exp`` from overflowing and does not change the result.
    """
    shifted = x - np.max(x, axis=-1, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=-1, keepdims=True)


# LSTM Cell Implementation
class LSTMCell:
    """One LSTM time step with hand-written forward and backward passes.

    The step input and the previous hidden state are concatenated along axis 1 and fed
    to four gates (forget ``f``, input ``i``, candidate ``c``, output ``o``). Each gate
    owns a weight matrix ``W*`` of shape (input_size + hidden_size, hidden_size) and a
    bias row ``b*`` of shape (1, hidden_size).
    """

    def __init__(self, input_size, hidden_size):
        """Draw the gate weights from NumPy's global RNG and initialise the biases."""
        self.hidden_size = hidden_size

        fan = input_size + hidden_size
        scale = np.sqrt(2.0 / fan)
        # Matrices are drawn in the order f, i, c, o. With a seeded global RNG this
        # order decides which random values each gate receives.
        for gate in ('f', 'i', 'c', 'o'):
            setattr(self, 'W' + gate, np.random.randn(fan, hidden_size) * scale)

        bias_shape = (1, hidden_size)
        self.bf = np.ones(bias_shape)  # forget-gate bias starts at 1
        self.bi = np.zeros(bias_shape)
        self.bc = np.zeros(bias_shape)
        self.bo = np.zeros(bias_shape)

    def forward(self, x, h_prev, c_prev):
        """Advance the cell by one step.

        Args:
            x: Step input; concatenated with ``h_prev`` along axis 1.
            h_prev: Hidden state from the previous step.
            c_prev: Cell state from the previous step.

        Returns:
            tuple: ``(h, c, cache)`` where ``cache`` is the 9-tuple
            ``(x, h_prev, c_prev, concat, f, i, c_tilde, o, c)`` consumed by ``backward``.
        """
        joined = np.concatenate([x, h_prev], axis=1)

        forget = sigmoid(joined.dot(self.Wf) + self.bf)
        write = sigmoid(joined.dot(self.Wi) + self.bi)
        candidate = tanh(joined.dot(self.Wc) + self.bc)
        expose = sigmoid(joined.dot(self.Wo) + self.bo)

        c_next = forget * c_prev + write * candidate
        h_next = expose * tanh(c_next)

        return h_next, c_next, (x, h_prev, c_prev, joined, forget, write, candidate, expose, c_next)

    def backward(self, dh, dc_next, cache):
        """Back-propagate through one step.

        Args:
            dh: Gradient of the loss with respect to this step's hidden state.
            dc_next: Gradient flowing into this step's cell state from the following step.
            cache: The tuple returned by ``forward`` for this step.

        Returns:
            tuple: ``(dx, dh_prev, dc_prev)``.

        Side effects:
            Assigns ``dWf, dWi, dWc, dWo, dbf, dbi, dbc, dbo`` for THIS step only.
            Each call replaces the values stored by the previous call; nothing is
            accumulated here.
        """
        x, h_prev, c_prev, joined, forget, write, candidate, expose, c_state = cache
        n_in = x.shape[1]
        squashed = tanh(c_state)

        # Total gradient on the cell state: from the next step plus through h = o * tanh(c).
        dc = dc_next + dh * expose * (1 - squashed ** 2)

        # Gradients with respect to each gate's pre-activation.
        d_expose_pre = dh * squashed * expose * (1 - expose)
        d_candidate_pre = dc * write * (1 - candidate ** 2)
        d_write_pre = dc * candidate * write * (1 - write)
        d_forget_pre = dc * c_prev * forget * (1 - forget)

        d_joined = (d_forget_pre.dot(self.Wf.T) +
                    d_write_pre.dot(self.Wi.T) +
                    d_candidate_pre.dot(self.Wc.T) +
                    d_expose_pre.dot(self.Wo.T))

        joined_t = joined.T
        per_gate = (('f', d_forget_pre), ('i', d_write_pre),
                    ('c', d_candidate_pre), ('o', d_expose_pre))
        for gate, pre in per_gate:
            setattr(self, 'dW' + gate, joined_t.dot(pre))
            setattr(self, 'db' + gate, pre.sum(axis=0, keepdims=True))

        # The first n_in columns of the concatenation belong to x, the rest to h_prev.
        return d_joined[:, :n_in], d_joined[:, n_in:], dc * forget


# GRU Cell Implementation
class GRUCell:
    """One GRU time step with hand-written forward and backward passes.

    Three weight matrices (reset ``Wr``, update ``Wz``, candidate ``Wh``), each of shape
    (input_size + hidden_size, hidden_size), and three zero-initialised bias rows of
    shape (1, hidden_size).
    """

    def __init__(self, input_size, hidden_size):
        """Draw the gate weights from NumPy's global RNG and zero the biases."""
        self.hidden_size = hidden_size

        fan = input_size + hidden_size
        scale = np.sqrt(2.0 / fan)
        # Draw order r, z, h matters when the global RNG has been seeded.
        for gate in ('r', 'z', 'h'):
            setattr(self, 'W' + gate, np.random.randn(fan, hidden_size) * scale)

        self.br, self.bz, self.bh = (np.zeros((1, hidden_size)) for _ in range(3))

    def forward(self, x, h_prev):
        """Advance the cell by one step.

        Args:
            x: Step input; concatenated with ``h_prev`` along axis 1.
            h_prev: Hidden state from the previous step.

        Returns:
            tuple: ``(h, cache)`` where ``cache`` is the 7-tuple
            ``(x, h_prev, concat, concat_r, r, z, h_tilde)`` consumed by ``backward``.
        """
        joined = np.concatenate([x, h_prev], axis=1)

        reset = sigmoid(joined.dot(self.Wr) + self.br)
        update = sigmoid(joined.dot(self.Wz) + self.bz)

        # The candidate sees the previous state only through the reset gate.
        joined_reset = np.concatenate([x, reset * h_prev], axis=1)
        proposal = tanh(joined_reset.dot(self.Wh) + self.bh)

        # update == 1 keeps the old state, update == 0 takes the proposal.
        h_next = update * h_prev + (1 - update) * proposal

        return h_next, (x, h_prev, joined, joined_reset, reset, update, proposal)

    def backward(self, dh, cache):
        """Back-propagate through one step.

        Args:
            dh: Gradient of the loss with respect to this step's hidden state.
            cache: The tuple returned by ``forward`` for this step.

        Returns:
            tuple: ``(dx, dh_prev)``.

        Side effects:
            Assigns ``dWr, dWz, dWh, dbr, dbz, dbh`` for THIS step only. Each call
            replaces the values stored by the previous call; nothing is accumulated here.
        """
        x, h_prev, joined, joined_reset, reset, update, proposal = cache
        n_in = x.shape[1]

        # Pre-activation gradients of the candidate and the update gate.
        d_proposal_pre = dh * (1 - update) * (1 - proposal ** 2)
        d_update_pre = dh * (h_prev - proposal) * update * (1 - update)

        # Split the candidate's input gradient into its x part and its (r * h_prev) part.
        d_joined_reset = d_proposal_pre.dot(self.Wh.T)
        dx_via_proposal = d_joined_reset[:, :n_in]
        d_gated_state = d_joined_reset[:, n_in:]

        d_reset_pre = d_gated_state * h_prev * reset * (1 - reset)

        d_joined = d_reset_pre.dot(self.Wr.T) + d_update_pre.dot(self.Wz.T)
        dx = d_joined[:, :n_in] + dx_via_proposal
        # h_prev is reached via the gates, via r * h_prev, and via the direct z * h_prev term.
        dh_prev = d_joined[:, n_in:] + d_gated_state * reset + dh * update

        joined_t = joined.T
        self.dWr = joined_t.dot(d_reset_pre)
        self.dWz = joined_t.dot(d_update_pre)
        self.dWh = joined_reset.T.dot(d_proposal_pre)

        self.dbr = d_reset_pre.sum(axis=0, keepdims=True)
        self.dbz = d_update_pre.sum(axis=0, keepdims=True)
        self.dbh = d_proposal_pre.sum(axis=0, keepdims=True)

        return dx, dh_prev


# LSTM Network
class LSTMNetwork:
    """Sequence classifier: embedding lookup -> one LSTM cell unrolled over time -> softmax.

    The embedding table is used as a fixed lookup: ``backward`` discards the gradient
    with respect to the inputs and ``update`` never modifies ``self.embeddings``.
    """

    # Gradient attributes that the cell's ``backward`` assigns for a single time step.
    # Dropping the leading 'd' gives the name of the parameter each one belongs to.
    _CELL_GRADS = ('dWf', 'dWi', 'dWc', 'dWo', 'dbf', 'dbi', 'dbc', 'dbo')

    def __init__(self, vocab_size, embed_dim, hidden_size, num_classes, embeddings):
        """Build the recurrent cell and the output layer.

        Args:
            vocab_size: Not used by this class; kept for interface compatibility.
            embed_dim: Width of one embedding row (input size of the cell).
            hidden_size: Width of the hidden and cell states.
            num_classes: Number of output classes.
            embeddings: Array indexed by token id to obtain the step inputs.
        """
        self.embeddings = embeddings
        self.lstm_cell = LSTMCell(embed_dim, hidden_size)
        self.hidden_size = hidden_size

        self.Wy = np.random.randn(hidden_size, num_classes) * 0.01
        self.by = np.zeros((1, num_classes))

    def forward(self, X):
        """Run the cell over every column of ``X`` and classify from the final hidden state.

        Args:
            X: 2-D integer array of token ids, one row per sequence.

        Returns:
            tuple: ``(probs, cache)`` with ``probs`` the softmax class probabilities and
            ``cache = (h, c, caches)`` holding the final states and the per-step caches.
        """
        batch_size, seq_len = X.shape
        h = np.zeros((batch_size, self.hidden_size))
        c = np.zeros((batch_size, self.hidden_size))

        caches = []
        for t in range(seq_len):
            x = self.embeddings[X[:, t]]
            h, c, cache = self.lstm_cell.forward(x, h, c)
            caches.append(cache)

        logits = np.dot(h, self.Wy) + self.by
        probs = softmax(logits)

        return probs, (h, c, caches)

    def backward(self, dout, cache):
        """Back-propagate through the output layer and through every time step.

        Args:
            dout: Gradient of the loss with respect to the logits.
            cache: The second value returned by ``forward``.

        Side effects:
            Sets ``self.dWy`` / ``self.dby`` and leaves on ``self.lstm_cell`` the gradients
            summed over ALL time steps, ready for ``update``.
        """
        h, c, caches = cache
        cell = self.lstm_cell

        self.dWy = np.dot(h.T, dout)
        self.dby = np.sum(dout, axis=0, keepdims=True)

        dh = np.dot(dout, self.Wy.T)
        dc = np.zeros_like(c)

        # The same weights are reused at every step, so their gradient is the sum of the
        # per-step gradients. The cell overwrites its gradient attributes on each call,
        # so they are accumulated here; otherwise only the last call (t == 0) would count.
        totals = {name: np.zeros_like(getattr(cell, name[1:])) for name in self._CELL_GRADS}
        for t in reversed(range(len(caches))):
            _, dh, dc = cell.backward(dh, dc, caches[t])
            for name in self._CELL_GRADS:
                totals[name] += getattr(cell, name)

        for name, total in totals.items():
            setattr(cell, name, total)

    def update(self, lr):
        """Apply one plain SGD step with learning rate ``lr`` using the stored gradients."""
        self.lstm_cell.Wf -= lr * self.lstm_cell.dWf
        self.lstm_cell.Wi -= lr * self.lstm_cell.dWi
        self.lstm_cell.Wc -= lr * self.lstm_cell.dWc
        self.lstm_cell.Wo -= lr * self.lstm_cell.dWo

        self.lstm_cell.bf -= lr * self.lstm_cell.dbf
        self.lstm_cell.bi -= lr * self.lstm_cell.dbi
        self.lstm_cell.bc -= lr * self.lstm_cell.dbc
        self.lstm_cell.bo -= lr * self.lstm_cell.dbo

        self.Wy -= lr * self.dWy
        self.by -= lr * self.dby


# GRU Network
class GRUNetwork:
    """Sequence classifier: embedding lookup -> one GRU cell unrolled over time -> softmax.

    The embedding table is used as a fixed lookup: ``backward`` discards the gradient
    with respect to the inputs and ``update`` never modifies ``self.embeddings``.
    """

    # Gradient attributes that the cell's ``backward`` assigns for a single time step.
    # Dropping the leading 'd' gives the name of the parameter each one belongs to.
    _CELL_GRADS = ('dWr', 'dWz', 'dWh', 'dbr', 'dbz', 'dbh')

    def __init__(self, vocab_size, embed_dim, hidden_size, num_classes, embeddings):
        """Build the recurrent cell and the output layer.

        Args:
            vocab_size: Not used by this class; kept for interface compatibility.
            embed_dim: Width of one embedding row (input size of the cell).
            hidden_size: Width of the hidden state.
            num_classes: Number of output classes.
            embeddings: Array indexed by token id to obtain the step inputs.
        """
        self.embeddings = embeddings
        self.gru_cell = GRUCell(embed_dim, hidden_size)
        self.hidden_size = hidden_size

        self.Wy = np.random.randn(hidden_size, num_classes) * 0.01
        self.by = np.zeros((1, num_classes))

    def forward(self, X):
        """Run the cell over every column of ``X`` and classify from the final hidden state.

        Args:
            X: 2-D integer array of token ids, one row per sequence.

        Returns:
            tuple: ``(probs, cache)`` with ``probs`` the softmax class probabilities and
            ``cache = (h, caches)`` holding the final hidden state and the per-step caches.
        """
        batch_size, seq_len = X.shape
        h = np.zeros((batch_size, self.hidden_size))

        caches = []
        for t in range(seq_len):
            x = self.embeddings[X[:, t]]
            h, cache = self.gru_cell.forward(x, h)
            caches.append(cache)

        logits = np.dot(h, self.Wy) + self.by
        probs = softmax(logits)

        return probs, (h, caches)

    def backward(self, dout, cache):
        """Back-propagate through the output layer and through every time step.

        Args:
            dout: Gradient of the loss with respect to the logits.
            cache: The second value returned by ``forward``.

        Side effects:
            Sets ``self.dWy`` / ``self.dby`` and leaves on ``self.gru_cell`` the gradients
            summed over ALL time steps, ready for ``update``.
        """
        h, caches = cache
        cell = self.gru_cell

        self.dWy = np.dot(h.T, dout)
        self.dby = np.sum(dout, axis=0, keepdims=True)

        dh = np.dot(dout, self.Wy.T)

        # The same weights are reused at every step, so their gradient is the sum of the
        # per-step gradients. The cell overwrites its gradient attributes on each call,
        # so they are accumulated here; otherwise only the last call (t == 0) would count.
        totals = {name: np.zeros_like(getattr(cell, name[1:])) for name in self._CELL_GRADS}
        for t in reversed(range(len(caches))):
            _, dh = cell.backward(dh, caches[t])
            for name in self._CELL_GRADS:
                totals[name] += getattr(cell, name)

        for name, total in totals.items():
            setattr(cell, name, total)

    def update(self, lr):
        """Apply one plain SGD step with learning rate ``lr`` using the stored gradients."""
        self.gru_cell.Wr -= lr * self.gru_cell.dWr
        self.gru_cell.Wz -= lr * self.gru_cell.dWz
        self.gru_cell.Wh -= lr * self.gru_cell.dWh

        self.gru_cell.br -= lr * self.gru_cell.dbr
        self.gru_cell.bz -= lr * self.gru_cell.dbz
        self.gru_cell.bh -= lr * self.gru_cell.dbh

        self.Wy -= lr * self.dWy
        self.by -= lr * self.dby


# Training function
def train_model(model, X_train, y_train, X_val, y_val, epochs=20, batch_size=32, lr=0.001):
    """Train ``model`` with mini-batch SGD on softmax cross-entropy.

    Args:
        model: Object exposing ``forward(X) -> (probs, cache)``, ``backward(dout, cache)``
            and ``update(lr)``.
        X_train: Training inputs, indexable by an integer index array.
        y_train: Integer class labels aligned with ``X_train``.
        X_val: Validation inputs, evaluated in a single forward pass per epoch.
        y_val: Integer class labels aligned with ``X_val``.
        epochs: Number of passes over the training data.
        batch_size: Maximum number of samples per update step.
        lr: Learning rate handed to ``model.update``.

    Returns:
        tuple: ``(train_losses, val_accs)``, one entry per epoch. Each loss is the mean
        cross-entropy over all training samples seen in that epoch.

    Raises:
        ValueError: If ``X_train`` is empty or ``batch_size`` is smaller than 1.
    """
    num_samples = len(X_train)
    if num_samples == 0:
        raise ValueError("X_train is empty; there is nothing to train on")
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Ceiling division: a trailing partial batch is trained on instead of being silently
    # dropped, and a dataset smaller than batch_size still produces one batch.
    num_batches = (num_samples + batch_size - 1) // batch_size

    train_losses, val_accs = [], []

    for epoch in range(epochs):
        # Shuffle training data
        indices = np.random.permutation(num_samples)
        X_train_shuffled = X_train[indices]
        y_train_shuffled = y_train[indices]

        epoch_loss = 0.0

        # Wrap the loop with tqdm
        progress_bar = tqdm(range(num_batches), desc=f"Epoch {epoch + 1}/{epochs}", unit="batch")

        for i in progress_bar:
            start_idx = i * batch_size
            end_idx = start_idx + batch_size

            X_batch = X_train_shuffled[start_idx:end_idx]
            y_batch = y_train_shuffled[start_idx:end_idx]
            # The last batch may be shorter than batch_size, so use its real length.
            current_size = len(X_batch)
            rows = np.arange(current_size)

            # Forward pass
            probs, cache = model.forward(X_batch)

            # Compute loss (1e-8 keeps log() finite when a probability is exactly 0)
            loss = -np.mean(np.log(probs[rows, y_batch] + 1e-8))
            # Weight by batch length so a short final batch does not skew the epoch mean.
            epoch_loss += loss * current_size

            # Backward pass: gradient of mean cross-entropy w.r.t. the logits
            dout = probs.copy()
            dout[rows, y_batch] -= 1
            dout /= current_size

            model.backward(dout, cache)
            model.update(lr)

            # Update progress bar with current loss
            progress_bar.set_postfix({'loss': f'{loss:.4f}'})

        # Validation accuracy
        val_probs, _ = model.forward(X_val)
        val_pred = np.argmax(val_probs, axis=1)
        val_acc = np.mean(val_pred == y_val)

        avg_loss = epoch_loss / num_samples
        train_losses.append(avg_loss)
        val_accs.append(val_acc)

        print(f"Epoch {epoch + 1}/{epochs} Completed. Avg Loss: {avg_loss:.4f}, Val Acc: {val_acc:.4f}")

    return train_losses, val_accs


# Hyperparameter configurations
# Both architectures are swept over the same three (hidden_size, lr) pairs. The two
# lists are built separately so either sweep can be edited without touching the other.
lstm_configs = [
    {'hidden_size': hidden, 'lr': rate}
    for hidden, rate in ((64, 0.001), (128, 0.0005), (256, 0.0001))
]

gru_configs = [
    {'hidden_size': hidden, 'lr': rate}
    for hidden, rate in ((64, 0.001), (128, 0.0005), (256, 0.0001))
]

print("=" * 50, "Training LSTM Models", "=" * 50, sep="\n")

# Every model is built with embedding width 100 and 6 output classes, trained for
# 10 epochs, and scored by the validation accuracy of its final epoch.
lstm_results = []
# Outer loop tqdm for configurations
for i, config in enumerate(tqdm(lstm_configs, desc="LSTM Configs")):
    print(f"\nLSTM Config {i + 1}: {config}")
    model = LSTMNetwork(vocab_size=len(vocab), embed_dim=100,
                        hidden_size=config['hidden_size'], num_classes=6,
                        embeddings=embeddings)
    losses, accs = train_model(model, X_train, y_train, X_val, y_val,
                               epochs=10, lr=config['lr'])
    lstm_results.append({'config': config, 'val_acc': accs[-1]})

print("\n" + "=" * 50, "Training GRU Models", "=" * 50, sep="\n")

gru_results = []
# Outer loop tqdm for configurations
for i, config in enumerate(tqdm(gru_configs, desc="GRU Configs")):
    print(f"\nGRU Config {i + 1}: {config}")
    model = GRUNetwork(vocab_size=len(vocab), embed_dim=100,
                       hidden_size=config['hidden_size'], num_classes=6,
                       embeddings=embeddings)
    losses, accs = train_model(model, X_train, y_train, X_val, y_val,
                               epochs=10, lr=config['lr'])
    gru_results.append({'config': config, 'val_acc': accs[-1]})

print("\n" + "=" * 50, "Results Summary", "=" * 50, sep="\n")

print("\nLSTM Results:")
for i, r in enumerate(lstm_results):
    print(f"Config {i + 1}: Val Acc = {r['val_acc']:.4f}")

print("\nGRU Results:")
for i, r in enumerate(gru_results):
    print(f"Config {i + 1}: Val Acc = {r['val_acc']:.4f}")


Preprocessing text...


Preprocessing Train: 100%|██████████| 16000/16000 [00:00<00:00, 128437.53it/s]
Preprocessing Val: 100%|██████████| 2000/2000 [00:00<00:00, 165668.17it/s]


Building vocabulary...


Collecting words: 100%|██████████| 16000/16000 [00:00<00:00, 679205.14it/s]


Converting text to sequences...


Tokenizing Train: 100%|██████████| 16000/16000 [00:00<00:00, 67259.39it/s] 
Tokenizing Val: 100%|██████████| 2000/2000 [00:00<00:00, 60745.20it/s]


Training LSTM Models


LSTM Configs:   0%|          | 0/3 [00:00<?, ?it/s]


LSTM Config 1: {'hidden_size': 64, 'lr': 0.001}



Epoch 1/10:   0%|          | 0/500 [00:00<?, ?batch/s][A
Epoch 1/10:   0%|          | 0/500 [00:00<?, ?batch/s, loss=1.7917][A
Epoch 1/10:   0%|          | 1/500 [00:00<01:58,  4.22batch/s, loss=1.7917][A
Epoch 1/10:   0%|          | 1/500 [00:00<01:58,  4.22batch/s, loss=1.7916][A
Epoch 1/10:   0%|          | 2/500 [00:00<04:07,  2.01batch/s, loss=1.7916][A
Epoch 1/10:   0%|          | 2/500 [00:01<04:07,  2.01batch/s, loss=1.7915][A
Epoch 1/10:   1%|          | 3/500 [00:01<02:48,  2.95batch/s, loss=1.7915][A
Epoch 1/10:   1%|          | 3/500 [00:01<02:48,  2.95batch/s, loss=1.7915][A
Epoch 1/10:   1%|          | 4/500 [00:01<02:05,  3.94batch/s, loss=1.7915][A
Epoch 1/10:   1%|          | 4/500 [00:01<02:05,  3.94batch/s, loss=1.7915][A
Epoch 1/10:   1%|          | 4/500 [00:01<02:05,  3.94batch/s, loss=1.7912][A
Epoch 1/10:   1%|          | 6/500 [00:01<01:29,  5.55batch/s, loss=1.7912][A
Epoch 1/10:   1%|          | 6/500 [00:01<01:29,  5.55batch/s, loss=1.7909][A
E

KeyboardInterrupt: 