In [1]:
import numpy as np
import pandas as pd
import os
import random
import re
import tensorflow as tf
import tqdm
from tensorflow.keras import layers
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

2025-03-10 15:19:49.631682: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Task 1

In [2]:
#Data
column_names = ["Category","Title","Description"]

train = pd.read_csv("AG_train.csv", names = column_names, header = None)
test = pd.read_csv("AG_test.csv", names = column_names, header = None)

train["Text"] = train["Title"] + " " + train["Description"]
test["Text"] = test["Title"] + " " + test["Description"]
train.drop(columns=["Title", "Description"], inplace=True)
test.drop(columns=["Title", "Description"], inplace=True)

In [3]:
#Prepocessing
stop_words = set(stopwords.words("english"))

def preprocess_text(sentence):
    sentence = re.sub(r"[^\w\s]", "", sentence)  
    tokens = sentence.lower().split()  
    tokens = [word for word in tokens if word not in stop_words] 
    return " ".join(tokens)

train["Text"] = train["Text"].apply(preprocess_text)
test["Text"] = test["Text"].apply(preprocess_text)

In [4]:
#Creating vocabulary
all_tokens = []
for text in train["Text"]:
    all_tokens.extend(text.split())  # Tokenizing by space

# Get unique words
unique_tokens = set(all_tokens)
print(f"Total unique words: {len(unique_tokens)}")

Total unique words: 102031


In [5]:
# Initialize vocab with a padding token
vocab = {"<pad>": 0}  # Start indexing from 1
index = 1  

# Assign an index to each unique word
for token in unique_tokens:
    vocab[token] = index
    index += 1

vocab_size = len(vocab)  # Number of unique words including <pad>
print(f"Vocabulary size: {vocab_size}")

# Reverse dictionary to get index → word mapping
inverse_vocab = {index: token for token, index in vocab.items()}

Vocabulary size: 102032


In [6]:
# Convert dataset text into sequences of word indices
train["Sequences"] = train["Text"].apply(lambda x: [vocab[word] for word in x.split() if word in vocab])
test["Sequences"] = test["Text"].apply(lambda x: [vocab[word] for word in x.split() if word in vocab])

# Show an example
print(train[["Text", "Sequences"]].head())

                                                Text  \
0  wall st bears claw back black reuters reuters ...   
1  carlyle looks toward commercial aerospace reut...   
2  oil economy cloud stocks outlook reuters reute...   
3  iraq halts oil exports main southern pipeline ...   
4  oil prices soar alltime record posing new mena...   

                                           Sequences  
0  [31825, 26071, 69558, 76225, 77386, 68455, 287...  
1  [23682, 21340, 65845, 56306, 80426, 28778, 287...  
2  [36205, 73414, 57035, 8986, 92250, 28778, 2877...  
3  [88395, 42873, 36205, 73514, 68286, 59651, 340...  
4  [36205, 70085, 95566, 33346, 79939, 34931, 152...  


In [7]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
    # Elements of each training example are appended to these lists.
    targets, contexts, labels = [], [], []

    # Build the sampling table for `vocab_size` tokens.
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    # Iterate over all sequences (sentences) in the dataset.
    for sequence in tqdm.tqdm(sequences):

        # Generate positive skip-gram pairs for a sequence (sentence).
        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
              sequence,
              vocabulary_size=vocab_size,
              sampling_table=sampling_table,
              window_size=window_size,
              negative_samples=0,
              seed=seed)

        # Iterate over each positive skip-gram pair to produce training examples
        # with a positive context word and negative samples.
        for target_word, context_word in positive_skip_grams:
            context_class = tf.expand_dims(
              tf.constant([context_word], dtype="int64"), 1)
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
              true_classes=context_class,
              num_true=1,
              num_sampled=num_ns,
              unique=True,
              range_max=vocab_size,
              seed=seed,
              name="negative_sampling")

          # Build context and label vectors (for one target word)
            context = tf.concat([tf.squeeze(context_class,1), negative_sampling_candidates], 0)
            label = tf.constant([1] + [0]*num_ns, dtype="int64")

          # Append each element from the training example to global lists.
            targets.append(target_word)
            contexts.append(context)
            labels.append(label)

    return targets, contexts, labels

In [None]:
#targets, contexts, labels = generate_training_data(
#    sequences=train["Sequences"],
#    window_size=3,
#    num_ns=4,
#    vocab_size=vocab_size,
#    seed=407)

#targets = np.array(targets)
#contexts = np.array(contexts)
#labels = np.array(labels)

#print('\n')
#print(f"targets.shape: {targets.shape}")
#print(f"contexts.shape: {contexts.shape}")
#print(f"labels.shape: {labels.shape}")

 62%|█████████████████████             | 74483/120000 [1:19:34<47:35, 15.94it/s]

In [9]:
targets = np.load('train_targets.npy')
contexts = np.load('train_contexts.npy')
labels = np.load('train_labels.npy')

In [10]:
BATCH_SIZE = 200
BUFFER_SIZE = 5000
AUTOTUNE = tf.data.AUTOTUNE

dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)

In [11]:
#Word2Vec Model
class Word2Vec(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.target_embedding = layers.Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=1,
                                      name="w2v_embedding")
        self.context_embedding = layers.Embedding(vocab_size,
                                       embedding_dim,
                                       input_length=num_ns+1)

    def call(self, pair):
        target, context = pair
        # target: (batch, ) 
        # context: (batch, context)
        if len(target.shape) == 2:
            target = tf.squeeze(target, axis=1)
        # target: (batch,)
        word_emb = self.target_embedding(target)
        # word_emb: (batch, embed)
        context_emb = self.context_embedding(context)
        # context_emb: (batch, context, embed)
        dots = tf.einsum('be,bce->bc', word_emb, context_emb) # 'be, bce ->' indicates the output shape 
        # dots: (batch, context)
        return tf.nn.softmax(dots)

num_ns=4
embedding_dim = 10
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss="CategoricalCrossentropy",
                 metrics=['accuracy'])



In [14]:
word2vec.fit(dataset, epochs=1, verbose=1, batch_size = 1024)

[1m82432/82432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1853s[0m 22ms/step - accuracy: 0.9497 - loss: 0.1700


<keras.src.callbacks.history.History at 0x7fb45c656b50>

In [45]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]

In [16]:
np.save('skipgram_embeddings.npy', weights)

In [46]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense, Dropout, Input


In [47]:
MAX_SEQUENCE_LENGTH = 100  # Truncate/pad sequences to this length
NUM_CLASSES = len(train["Category"].unique())  # Number of categories

# Convert categories to numerical labels
train_labels = train["Category"].values -1

# Pad sequences to ensure consistent input shape
padded_sequences = pad_sequences(train["Sequences"], maxlen=MAX_SEQUENCE_LENGTH, padding="post")

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, train_labels, test_size=0.2, random_state=42)

# Convert labels to categorical (one-hot encoding)
y_train = tf.keras.utils.to_categorical(y_train, num_classes=NUM_CLASSES)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=NUM_CLASSES)

In [49]:
# Load trained embeddings from Word2Vec
embedding_dim = 100  # Ensure this matches your learned embedding size

# Create an embedding matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in vocab.items():
    if word in weights:  # word_vectors is your trained Skip-gram embeddings
        embedding_matrix[i] = weights[word]

  if word in weights:  # word_vectors is your trained Skip-gram embeddings


In [60]:
#Model 1: Deep learning model using skip-gram word embedding


from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, LSTM, GRU, Dense, Dropout, Bidirectional

# Explicit input layer
input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,), name="Input_Layer")

# Embedding layer using Word2Vec embeddings
embedding_layer = Embedding(input_dim=weights.shape[0],  # vocab_size
                            output_dim=weights.shape[1],  # embedding_dim (10)
                            weights=[weights], trainable=False)(input_layer)

# First recurrent layer (Simple RNN)
rnn_layer = SimpleRNN(128, return_sequences=True, activation='relu')(embedding_layer)

# Second recurrent layer (LSTM)
lstm_layer = LSTM(128, return_sequences=True, activation='relu')(rnn_layer)

# Third recurrent layer (GRU)
gru_layer = GRU(128, return_sequences=False, activation='relu')(lstm_layer)

# Dropout for regularization
dropout_layer = Dropout(0.3)(gru_layer)

# Fully connected output layer for classification
output_layer = Dense(NUM_CLASSES, activation="softmax")(dropout_layer)

# Define model using functional API
model = Model(inputs=input_layer, outputs=output_layer, name="Text_Classification_RNN")

# Compile the model
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Display the model summary
model.summary()


In [51]:
# Train the model
history = model.fit(
    X_train, y_train, 
    validation_data=(X_test, y_test), 
    epochs=2,  
    batch_size=256,
    verbose=1
)

# Evaluate performance
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")

Epoch 1/2
[1m 82/375[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m5:40[0m 1s/step - accuracy: 0.2426 - loss: 1.3867

KeyboardInterrupt: 

### GloVe prep

In [55]:
!curl -O https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
!unzip -q glove.6B.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  822M  100  822M    0     0  4925k      0  0:02:50  0:02:50 --:--:-- 5165k3952k      0  0:03:33  0:00:26  0:03:07 5114k766k      0  0:02:56  0:01:35  0:01:21 5136k    0     0  4828k      0  0:02:54  0:01:52  0:01:02 4914k   0  4837k      0  0:02:54  0:01:56  0:00:58 5056k4908k      0  0:02:51  0:02:38  0:00:13 5032k16M    0     0  4924k      0  0:02:50  0:02:49  0:00:01 5182k


In [56]:
path_to_glove_file = "glove.6B.100d.txt"

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [57]:
import numpy as np

# Ensure vocab_size is correctly set
embedding_dim = 100  # Use 100D GloVe embeddings
vocab_size = len(vocab)  # Ensure this matches your tokenized dataset

# Initialize an empty embedding matrix
glove_embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Fill embedding matrix with GloVe vectors where available
for word, i in vocab.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        glove_embedding_matrix[i] = embedding_vector  # Assign GloVe vector

print("GloVe embedding matrix shape:", glove_embedding_matrix.shape)  # Should be (vocab_size, embedding_dim)

GloVe embedding matrix shape: (102032, 100)


In [65]:
#Model 2: # Explicit input layer
# Explicit input layer
input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,), name="Input_Layer")

# Embedding layer using pre-trained GloVe embeddings
embedding_layer = Embedding(input_dim=vocab_size,  # Should match vocab size
                            output_dim=embedding_dim,  # Should match GloVe dim (100)
                            weights=[glove_embedding_matrix],  # Load GloVe weights
                            trainable=False)(input_layer)  # Freeze or set trainable=True

# First recurrent layer (Simple RNN)
rnn_layer = SimpleRNN(128, return_sequences=True, activation='relu')(embedding_layer)

# Second recurrent layer (LSTM)
lstm_layer = Bidirectional(LSTM(128, return_sequences=True, activation='relu'))(rnn_layer)

# Third recurrent layer (GRU)
gru_layer = Bidirectional(GRU(128, return_sequences=False, activation='relu'))(lstm_layer)

# Dropout for regularization
dropout_layer = Dropout(0.3)(gru_layer)

# Fully connected output layer for classification
output_layer = Dense(NUM_CLASSES, activation="softmax")(dropout_layer)

# Define model using functional API
model_GloVe = Model(inputs=input_layer, outputs=output_layer, name="Text_Classification_RNN_GloVe")

# Compile the model
model_GloVe.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Display the model summary
model_GloVe.summary()

In [66]:
# Train the model
history_GloVe = model_GloVe.fit(
    X_train, y_train, 
    validation_data=(X_test, y_test), 
    epochs=1,  
    batch_size=256,
    verbose=1
)

# Evaluate performance
test_loss, test_acc = model_GloVe.evaluate(X_test, y_test)
print(f"GloVe Model Test Accuracy: {test_acc:.4f}")

[1m 18/375[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m14:05[0m 2s/step - accuracy: 0.4074 - loss: 27.0601

KeyboardInterrupt: 

# Task 2

In [None]:
# Find the odd one out!