# Problem 1: SMS Spam Detection with Word2Vec

In [None]:
!pip install nltk gensim tqdm

In [17]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import gensim.downloader as api
from tqdm import tqdm

In [18]:
# Fetch the NLTK resources this notebook relies on: the punkt tokenizer data
# (both the 'punkt' and 'punkt_tab' packages) and the stop-word lists.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


## Download SMS Spam Collection Dataset

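Download `spam.csv` from Kaggle and place it in the notebook's working directory (the loading cell below reads it by that relative file name):
https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset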

## Load Dataset

In [None]:
# Read the CSV, keep only the label column (v1) and the message text column
# (v2), and give both descriptive names.
df = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'Label', 'v2': 'Message'})
)
df.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Preprocessing Functions

In [None]:
# English stop words as a set, so membership tests in preprocess_text are cheap.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Turn a raw message string into a list of cleaned tokens.

    Steps, in order:
      1. lowercase the text;
      2. delete every character that is not a-z or whitespace
         (digits, punctuation and apostrophes all disappear);
      3. tokenize with NLTK's ``word_tokenize``;
      4. drop tokens found in ``stop_words``.

    Note: because apostrophes are removed before the stop-word filter runs,
    contractions such as "don't" become "dont" and are kept.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    kept = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Tokenize every message once and store the token lists next to the raw text.
df['Processed'] = df['Message'].map(preprocess_text)
df.head()

Unnamed: 0,Label,Message,Processed
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t..."


## Load Word2Vec Model

In [None]:
# Pretrained word vectors, loaded by name through gensim's downloader API.
W2V_MODEL_NAME = 'word2vec-google-news-300'
w2v_model = api.load(W2V_MODEL_NAME)



## Convert Messages to Vectors

In [None]:
def get_message_vector(tokens, model, dim=300):
    """Average the embeddings of a message's in-vocabulary tokens.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single (already pre-processed) message.
    model : mapping-like
        Any object supporting ``token in model`` and ``model[token]``,
        where the lookup returns a 1-D vector.
    dim : int, default 300
        Length of the all-zeros fallback vector. Only used when ``model``
        does not expose a ``vector_size`` attribute.

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the vectors of all tokens found in
        ``model``; an all-zeros vector when none of the tokens are found
        (including when ``tokens`` is empty).
    """
    vecs = [model[w] for w in tokens if w in model]
    if vecs:
        return np.mean(vecs, axis=0)
    # Size the fallback from the model itself when possible, so it always has
    # the same length as the averaged vectors even if the caller's ``dim``
    # (or its default) does not match the embedding size.
    return np.zeros(getattr(model, 'vector_size', dim))

# Embed each tokenized message and stack the results into one feature matrix
# (one row per message).
message_vectors = []
for tokens in tqdm(df['Processed']):
    message_vectors.append(get_message_vector(tokens, w2v_model))
X_vectors = np.array(message_vectors)

100%|██████████| 5572/5572 [00:00<00:00, 11264.47it/s]


## Split Data

In [None]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
LABEL_TO_ID = {'ham': 0, 'spam': 1}
y = df['Label'].map(LABEL_TO_ID)

# Hold out 20% of the messages for testing; random_state fixes the shuffle.
split = train_test_split(X_vectors, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

## Train Logistic Regression

In [None]:
# Fit a logistic-regression classifier on the averaged word vectors.
# fit() returns the estimator itself, so the chained form binds the same object.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
clf

## Evaluate the Model

In [None]:
# Score the classifier on the held-out split.
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.9426008968609866


## Prediction Function

In [None]:
def predict_message_class(model, w2v_model, text):
    """Classify a single raw message as 'spam' or 'ham'.

    The text goes through the same pipeline as the training data:
    ``preprocess_text`` for tokenization, then ``get_message_vector`` for the
    averaged embedding, which is reshaped to a single-row 2-D array before
    being passed to ``model.predict``.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict``.
    w2v_model : object
        Word-embedding lookup passed through to ``get_message_vector``.
    text : str
        Raw message text.

    Returns
    -------
    str
        'spam' when the predicted label is truthy, otherwise 'ham'.
    """
    features = get_message_vector(preprocess_text(text), w2v_model)
    predicted = model.predict(features.reshape(1, -1))[0]
    if predicted:
        return 'spam'
    return 'ham'

# Example usage: classify one message of each kind.
sample_messages = (
    "Congratulations! You've won a free ticket!",
    "Hey, are we meeting today?",
)
for sample in sample_messages:
    print(predict_message_class(clf, w2v_model, sample))

spam
ham
