In [None]:
import csv
import io
import zipfile

import numpy as np
import pandas as pd
import requests
import tensorflow as tf
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Download the SMS Spam Collection archive and open it as an in-memory zip.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports HTTP errors here rather than letting an error page
# reach ZipFile and fail with an unrelated BadZipFile.
response = requests.get(url, timeout=30)
response.raise_for_status()
z = zipfile.ZipFile(io.BytesIO(response.content))

In [None]:
# The archive member is plain tab-separated text with no header row and no CSV
# escaping. QUOTE_NONE stops pandas from treating a double quote inside a
# message as the start of a quoted field, which could otherwise swallow tabs
# and newlines and merge several messages into one row.
# The context manager closes the zip member once the frame is built.
with z.open('SMSSpamCollection') as sms_file:
    df = pd.read_csv(
        sms_file,
        sep='\t',
        names=['label', 'message'],
        quoting=csv.QUOTE_NONE,
    )

In [None]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
label_to_id = {'ham': 0, 'spam': 1}
# Guard so the cell can be re-run safely: mapping an already-encoded column a
# second time would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].map(label_to_id)
# Any label outside the mapping becomes NaN; fail here instead of training on it.
if df['label'].isna().any():
    raise ValueError("Unexpected or missing values in the 'label' column")
texts = df['message'].values
labels = df['label'].values

In [None]:
# Hold out 20% of the messages for evaluation; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# TF-IDF features limited to 5000 terms. The vectorizer is fitted on the
# training messages only and then applied to both splits; the sparse results
# are converted to dense arrays.
tfidf_vectorizer = TfidfVectorizer(max_features=5000).fit(X_train)
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(split).toarray() for split in (X_train, X_test)
)

In [None]:
# Build the word -> integer index from the training messages only, then encode
# both splits as sequences of word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
X_train_sequences, X_test_sequences = map(
    tokenizer.texts_to_sequences, (X_train, X_test)
)

In [None]:
# Bring every encoded message to the same length so the splits form
# rectangular arrays.
max_len = 100
X_train_padded, X_test_padded = (
    pad_sequences(seqs, maxlen=max_len)
    for seqs in (X_train_sequences, X_test_sequences)
)

In [None]:
# Train Word2Vec on exactly the tokens the Keras tokenizer produced.
# Splitting the raw text on whitespace keeps case and attached punctuation,
# whereas the tokenizer's vocabulary is normalised, so many `word_index`
# entries would never be found in the Word2Vec vocabulary and their embedding
# rows would stay all-zero. Decoding the already-tokenised sequences back to
# words keeps both vocabularies aligned.
embedding_dim = 100
w2v_sentences = [
    [tokenizer.index_word[idx] for idx in seq] for seq in X_train_sequences
]
w2v_model = Word2Vec(
    sentences=w2v_sentences,
    vector_size=embedding_dim,
    window=5,
    min_count=1,
    workers=4,
)

# Row i holds the vector of the word with tokenizer index i. Row 0 stays zero
# because the loop below only fills indices that appear in `word_index`.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

In [None]:
def create_model(input_dim, embedding_matrix=None, use_embedding=False):
    """Build and compile a small binary classifier.

    Two input front-ends are supported, followed by the same head
    (Dense(64, relu) -> Dense(1, sigmoid)):

    * ``use_embedding=False``: a Dense(128, relu) layer over dense feature
      vectors of width ``input_dim``.
    * ``use_embedding=True``: a frozen Embedding layer initialised from
      ``embedding_matrix`` followed by average pooling over the sequence.

    Args:
        input_dim: Feature width for the dense front-end, or the vocabulary
            size (number of rows of ``embedding_matrix``) for the embedding
            front-end.
        embedding_matrix: 2-D array of shape (vocab_size, embedding_dim).
            Required when ``use_embedding`` is True, ignored otherwise.
        use_embedding: Selects the embedding front-end when True.

    Returns:
        A compiled ``tf.keras.Sequential`` model (binary cross-entropy loss,
        Adam optimizer, accuracy metric).

    Raises:
        ValueError: If ``use_embedding`` is True and ``embedding_matrix`` is
            missing, is not 2-D, or its row count differs from ``input_dim``.
    """
    model = tf.keras.Sequential()
    if use_embedding:
        if embedding_matrix is None:
            raise ValueError("embedding_matrix is required when use_embedding=True")
        embedding_matrix = np.asarray(embedding_matrix)
        if embedding_matrix.ndim != 2:
            raise ValueError("embedding_matrix must be 2-D (vocab_size, embedding_dim)")
        vocab_size, embedding_dim = embedding_matrix.shape
        if input_dim != vocab_size:
            raise ValueError(
                f"input_dim ({input_dim}) does not match the number of rows in "
                f"embedding_matrix ({vocab_size})"
            )

        # Sizes come from the arguments rather than notebook globals, so the
        # function does not depend on `word_index` / `max_len` being defined.
        model.add(tf.keras.Input(shape=(None,), dtype="int32"))
        model.add(tf.keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            # A Constant initializer loads the pretrained vectors without the
            # `weights=` keyword.
            embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
            # Index 0 is padding; masking it keeps the pooling layer from
            # averaging padding positions into the message representation.
            mask_zero=True,
            trainable=False))
        model.add(tf.keras.layers.GlobalAveragePooling1D())
    else:
        # An explicit Input layer replaces the `input_dim=` keyword on Dense.
        model.add(tf.keras.Input(shape=(input_dim,)))
        model.add(tf.keras.layers.Dense(128, activation='relu'))
    model.add(tf.keras.layers.Dense(64, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# Dense classifier on the TF-IDF features. The held-out split is passed as
# validation data so its metrics are logged after every epoch.
n_tfidf_features = X_train_tfidf.shape[1]
tfidf_model = create_model(input_dim=n_tfidf_features)
tfidf_model.fit(
    X_train_tfidf,
    y_train,
    validation_data=(X_test_tfidf, y_test),
    epochs=10,
    batch_size=32,
    verbose=2,
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
140/140 - 5s - 32ms/step - accuracy: 0.9071 - loss: 0.2561 - val_accuracy: 0.9865 - val_loss: 0.0820
Epoch 2/10
140/140 - 3s - 18ms/step - accuracy: 0.9912 - loss: 0.0333 - val_accuracy: 0.9857 - val_loss: 0.0544
Epoch 3/10
140/140 - 4s - 31ms/step - accuracy: 0.9982 - loss: 0.0075 - val_accuracy: 0.9892 - val_loss: 0.0517
Epoch 4/10
140/140 - 2s - 14ms/step - accuracy: 0.9993 - loss: 0.0032 - val_accuracy: 0.9901 - val_loss: 0.0568
Epoch 5/10
140/140 - 3s - 22ms/step - accuracy: 0.9998 - loss: 0.0018 - val_accuracy: 0.9910 - val_loss: 0.0595
Epoch 6/10
140/140 - 3s - 23ms/step - accuracy: 0.9998 - loss: 0.0015 - val_accuracy: 0.9910 - val_loss: 0.0617
Epoch 7/10
140/140 - 2s - 17ms/step - accuracy: 0.9998 - loss: 0.0013 - val_accuracy: 0.9865 - val_loss: 0.0650
Epoch 8/10
140/140 - 2s - 18ms/step - accuracy: 0.9998 - loss: 0.0012 - val_accuracy: 0.9857 - val_loss: 0.0673
Epoch 9/10
140/140 - 2s - 15ms/step - accuracy: 0.9998 - loss: 0.0011 - val_accuracy: 0.9901 - val_loss:

<keras.src.callbacks.history.History at 0x7ac900a75b40>

In [None]:
# Classifier on the padded index sequences, using the Word2Vec-initialised
# embedding matrix. The held-out split is logged as validation data each epoch.
vocab_size = len(word_index) + 1
embedding_model = create_model(
    input_dim=vocab_size,
    embedding_matrix=embedding_matrix,
    use_embedding=True,
)
embedding_model.fit(
    X_train_padded,
    y_train,
    validation_data=(X_test_padded, y_test),
    epochs=10,
    batch_size=32,
    verbose=2,
)

Epoch 1/10




140/140 - 2s - 12ms/step - accuracy: 0.8658 - loss: 0.5229 - val_accuracy: 0.8664 - val_loss: 0.4566
Epoch 2/10
140/140 - 0s - 3ms/step - accuracy: 0.8658 - loss: 0.4136 - val_accuracy: 0.8664 - val_loss: 0.3774
Epoch 3/10
140/140 - 1s - 5ms/step - accuracy: 0.8658 - loss: 0.3642 - val_accuracy: 0.8664 - val_loss: 0.3495
Epoch 4/10
140/140 - 1s - 5ms/step - accuracy: 0.8658 - loss: 0.3481 - val_accuracy: 0.8664 - val_loss: 0.3432
Epoch 5/10
140/140 - 1s - 5ms/step - accuracy: 0.8658 - loss: 0.3429 - val_accuracy: 0.8664 - val_loss: 0.3379
Epoch 6/10
140/140 - 1s - 9ms/step - accuracy: 0.8658 - loss: 0.3406 - val_accuracy: 0.8664 - val_loss: 0.3352
Epoch 7/10
140/140 - 0s - 3ms/step - accuracy: 0.8658 - loss: 0.3388 - val_accuracy: 0.8664 - val_loss: 0.3336
Epoch 8/10
140/140 - 0s - 3ms/step - accuracy: 0.8658 - loss: 0.3383 - val_accuracy: 0.8664 - val_loss: 0.3327
Epoch 9/10
140/140 - 0s - 2ms/step - accuracy: 0.8658 - loss: 0.3372 - val_accuracy: 0.8664 - val_loss: 0.3325
Epoch 10/10

<keras.src.callbacks.history.History at 0x7ac911d544c0>

In [None]:
# Predicted spam scores on the held-out split, one array per model
# (TF-IDF model first, then the embedding model).
tfidf_pred, embedding_pred = (
    model.predict(features)
    for model, features in (
        (tfidf_model, X_test_tfidf),
        (embedding_model, X_test_padded),
    )
)

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [None]:
# Threshold the predicted scores at 0.5 to get hard 0/1 labels, then score
# them against the held-out labels.
tfidf_pred_labels = (tfidf_pred > 0.5).astype(int)
embedding_pred_labels = (embedding_pred > 0.5).astype(int)
tfidf_acc = accuracy_score(y_test, tfidf_pred_labels)
embedding_acc = accuracy_score(y_test, embedding_pred_labels)

In [None]:
# Report both test accuracies as percentages, one per line.
print(
    f"TF-IDF Model Accuracy: {tfidf_acc * 100:.2f}%",
    f"Word2Vec Model Accuracy: {embedding_acc * 100:.2f}%",
    sep="\n",
)

TF-IDF Model Accuracy: 99.01%
Word2Vec Model Accuracy: 86.64%
