In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from collections import Counter
from google.colab import drive

# Mount Google Drive at /content/drive so that the input CSV and the saved
# embeddings, both addressed under /content/drive/MyDrive, are reachable.
drive.mount('/content/drive')

# Logistic sigmoid; the input is bounded first so np.exp cannot overflow
def sigmoid(x):
    """Return the element-wise logistic sigmoid of *x*.

    The argument is limited to the range [-15, 15] before exponentiation,
    so very large magnitudes saturate instead of overflowing.
    """
    bounded = np.clip(x, -15, 15)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

# Data Preprocessing
def clean_text(text):
    """Tokenise *text* into lowercase words.

    The text is lowercased, every character that is neither a word
    character nor whitespace is removed, and the remainder is split on
    whitespace.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return without_punctuation.split()

# Building vocabulary
def build_vocab(corpus):
    """Count how often each word occurs across all sentences of *corpus*.

    Returns a Counter keyed by word; keys appear in order of first
    occurrence.
    """
    frequencies = Counter()
    for sentence in corpus:
        for word in sentence:
            frequencies[word] += 1
    return frequencies

# Generating skip-grams
def generate_skip_grams(sentence, window_size=2):
    """Return every (center, context) word pair found in *sentence*.

    For each position, the context consists of the words at most
    *window_size* positions to the left or right; the center position
    itself is excluded. Pairs are ordered by center position, then by
    context position.
    """
    pairs = []
    length = len(sentence)
    for center_pos, center_word in enumerate(sentence):
        first = max(0, center_pos - window_size)
        stop = min(length, center_pos + window_size + 1)
        for neighbour_pos in range(first, stop):
            if neighbour_pos == center_pos:
                continue
            pairs.append((center_word, sentence[neighbour_pos]))
    return pairs

# Training Word2Vec embeddings
def train_word2vec(skip_grams, vocab_size, word_to_idx, embedding_dim=10, lr=0.01, epochs=10, negative_samples=5, batch_size=512):
    """Train skip-gram word embeddings with mini-batch gradient descent.

    Each (center, context) pair is scored against the whole vocabulary
    through an independent sigmoid per output word; the context word is the
    only positive target in its row.

    Args:
        skip_grams: sequence of (center_word, context_word) pairs. It is
            only read; shuffling is done on an index permutation, so the
            caller's sequence keeps its order.
        vocab_size: number of rows of W1 and columns of W2.
        word_to_idx: mapping from word to its integer index.
        embedding_dim: width of each embedding vector.
        lr: learning rate.
        epochs: number of passes over skip_grams.
        negative_samples: accepted for backward compatibility but not used;
            targets are dense over the full vocabulary.
        batch_size: number of pairs per weight update.

    Returns:
        Tuple (W1, W2). W1 has shape (vocab_size, embedding_dim) and W2 has
        shape (embedding_dim, vocab_size).

    Raises:
        KeyError: if a word in skip_grams is missing from word_to_idx.
    """
    W1 = np.random.uniform(-1, 1, (vocab_size, embedding_dim)) / np.sqrt(vocab_size)
    W2 = np.random.uniform(-1, 1, (embedding_dim, vocab_size)) / np.sqrt(embedding_dim)
    epsilon = 1e-10  # keeps np.log away from log(0)

    # The word -> index lookup is the same in every epoch, so do it once.
    centers = np.array([word_to_idx[center] for center, _ in skip_grams], dtype=np.intp)
    targets = np.array([word_to_idx[target] for _, target in skip_grams], dtype=np.intp)
    num_pairs = len(centers)

    for epoch in range(epochs):
        # Shuffle indices rather than the caller's list.
        order = np.random.permutation(num_pairs)
        total_loss = 0.0

        for start in range(0, num_pairs, batch_size):
            batch_idx = order[start:start + batch_size]
            batch_center = centers[batch_idx]
            batch_target = targets[batch_idx]
            rows = np.arange(len(batch_idx))

            h = W1[batch_center]  # Hidden layer (a copy, one row per pair)
            u = np.dot(h, W2)  # Output layer
            y_pred = sigmoid(u)

            # Loss: only the positive (context word) term is accumulated, so
            # this is a progress indicator, not the full objective whose
            # gradient is applied below.
            total_loss -= np.sum(np.log(y_pred[rows, batch_target] + epsilon))

            # Backward pass: error = y_pred - one_hot(target). Subtracting 1
            # at the target columns in place avoids allocating a dense
            # (batch, vocab_size) label matrix; y_pred is not used afterwards.
            error = y_pred
            error[rows, batch_target] -= 1
            grad_w2 = np.dot(h.T, error)
            grad_w1 = np.dot(error, W2.T)

            # Update weights
            W2 -= lr * np.clip(grad_w2, -5, 5)
            # A center word can occur several times in one batch. Plain
            # fancy-index "-=" would keep just one of those updates;
            # subtract.at accumulates all of them.
            np.subtract.at(W1, batch_center, lr * np.clip(grad_w1, -5, 5))

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

    return W1, W2

# Main Code
if __name__ == "__main__":
    # Load the labelled e-mails; rows with a missing value are dropped so
    # clean_text is not handed NaN.
    data = pd.read_csv('/content/drive/MyDrive/spam_or_not_spam.csv').dropna()
    data['clean_text'] = data['email'].apply(clean_text)

    # Balancing the dataset: down-sample whichever class is larger. Sampling
    # more rows than a class contains would raise ValueError, so the smaller
    # class is always kept whole.
    spam = data[data['label'] == 1]
    not_spam = data[data['label'] == 0]
    if len(not_spam) >= len(spam):
        not_spam = not_spam.sample(len(spam), random_state=42)
    else:
        spam = spam.sample(len(not_spam), random_state=42)
    balanced_data = pd.concat([spam, not_spam])

    # Building vocabulary and generating skip-grams
    vocab = build_vocab(balanced_data['clean_text'])
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}

    training_pairs = [pair for sentence in balanced_data['clean_text'] for pair in generate_skip_grams(sentence)]

    # Training Word2Vec
    vocab_size = len(vocab)
    W1, W2 = train_word2vec(training_pairs, vocab_size, word_to_idx)
    # Row idx of W1 is the embedding of the word mapped to idx.
    word_embeddings = {word: W1[idx] for word, idx in word_to_idx.items()}

    # Saving embeddings to a CSV file: one row per word, the word first,
    # followed by its embedding values; no header row.
    embeddings_df = pd.DataFrame.from_dict(word_embeddings, orient='index')
    embeddings_df.to_csv('/content/drive/MyDrive/word_embeddings.csv', header=False, index=True)
    print("Word embeddings saved to 'word_embeddings.csv'.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Epoch 1/10, Loss: 7523462.2106
Epoch 2/10, Loss: 7507331.7654
Epoch 3/10, Loss: 7360076.2931
Epoch 4/10, Loss: 7278070.7343
Epoch 5/10, Loss: 7221587.0452
Epoch 6/10, Loss: 7178375.7396
Epoch 7/10, Loss: 7144252.8949
Epoch 8/10, Loss: 7114454.1199
Epoch 9/10, Loss: 7089366.5954
Epoch 10/10, Loss: 7067221.5159
Word embeddings saved to 'word_embeddings.csv'.
