In [1]:
import pandas as pd

In [2]:
# Load the SMS dataset and keep only the two columns used below:
# v1 (renamed to `label` in the next cell) and v2 (renamed to `text`).
data = pd.read_csv('spam.csv', encoding='ISO-8859-1')[['v1', 'v2']]
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Replace the opaque v1/v2 headers with descriptive column names.
data = data.set_axis(['label', 'text'], axis=1)
data

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}

# Only map while the column still holds the raw strings. Series.map turns any
# value missing from the dict into NaN, so re-running this cell on labels that
# are already encoded would otherwise silently wipe the whole column.
if data['label'].isin(list(label_codes)).all():
    data['label'] = data['label'].map(label_codes)

# Fail loudly on anything that is neither a known string label nor an
# already-encoded 0/1, instead of passing NaN labels on to the models.
assert data['label'].isin(list(label_codes.values())).all(), "unexpected value in 'label' column"
data.head(3)

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...


In [5]:
import nltk
import string

In [6]:
# Fetch the NLTK resources that the preprocessing cells below rely on.
# The return value of the last call is what the cell displays.
nltk.download('stopwords')  # stop-word lists, read later through nltk.corpus.stopwords
nltk.download('punkt_tab')  # presumably the tokenizer data behind word_tokenize - confirm for your NLTK version
nltk.download('wordnet')  # WordNet corpus, presumably what WordNetLemmatizer looks words up in

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [7]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer,PorterStemmer


In [8]:
# Import the corpus reader under a private alias. The public name `stopwords`
# is rebound to a plain set below (text_cleaning does membership tests on it),
# so reading the word list through that same name would raise AttributeError
# the second time this cell is run.
from nltk.corpus import stopwords as _stopwords_corpus

stopwords = set(_stopwords_corpus.words("english"))  # set for O(1) lookups
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [9]:
def text_cleaning(text):
    """Normalise one message into a space-joined string of processed tokens.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, tokenize, skip tokens present in ``stopwords``,
    Porter-stem each remaining token and finally lemmatize the stem.

    Note that the stop-word check runs on tokens whose punctuation has
    already been removed.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    without_punctuation = text.lower().translate(
        str.maketrans('', '', string.punctuation)
    )
    processed = []
    for token in nltk.word_tokenize(without_punctuation):
        if token in stopwords:
            continue
        # Stem first, then lemmatize the stem.
        processed.append(lemmatizer.lemmatize(ps.stem(token)))
    return ' '.join(processed)


# Add the cleaned version of every message as a new column.
data['clean_text'] = data['text'].apply(text_cleaning)

In [10]:
# Spot-check the new clean_text column next to the original text.
data.head(2)

Unnamed: 0,label,text,clean_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni


In [11]:
from sklearn.model_selection import train_test_split

# Features are the cleaned messages; targets are the integer labels.
X = data['clean_text']
y = data['label']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with the vocabulary capped at 5000 features.
cv = CountVectorizer(max_features=5000)

In [13]:
# Fit the vocabulary on the training messages only, then reuse that fitted
# vectorizer for the test messages so both matrices share the same columns.
X_train_vector = cv.fit_transform(X_train)
X_test_vector = cv.transform(X_test)

## Classical ML model

In [14]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the word-count features.
ML_model = MultinomialNB()
ML_model.fit(X_train_vector, y_train)

# Predicted labels for the held-out messages.
y_pred_ML = ML_model.predict(X_test_vector)
y_pred_ML

array([0, 0, 1, ..., 0, 0, 1])

In [15]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score the Naive Bayes predictions against the held-out labels.
acc_ML = accuracy_score(y_test, y_pred_ML)
cm_ML = confusion_matrix(y_test, y_pred_ML)

print("Accuracy:", acc_ML)
print("Confusion_matrix:\n", cm_ML)

Accuracy: 0.9811659192825112
Confusion_matrix:
 [[960   5]
 [ 16 134]]


## Deep Learning model
LSTM

In [20]:
import tensorflow as tf
from tensorflow.keras.layers import Dense,Input,LSTM,GlobalMaxPool2D,Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [21]:
  # Build the word index from the training messages only.
  # num_words=5000 matches the Embedding layer's input_dim used further down.
  tokenizer = Tokenizer(num_words=5000)
  tokenizer.fit_on_texts(X_train)

In [23]:
# Convert the train and test messages into sequences with the fitted tokenizer.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(messages.tolist())
    for messages in (X_train, X_test)
)

In [24]:
# Bring every sequence to a common length of 100 so the splits can be fed to the network.
X_train_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=100)
    for sequences in (X_train_seq, X_test_seq)
)

In [26]:
# Seed the Python, NumPy and TensorFlow random generators before any weights are
# created, so weight initialisation and batch shuffling repeat from run to run
# (as far as the backend allows). 42 mirrors the random_state of the data split.
tf.keras.utils.set_random_seed(42)

# Embedding -> LSTM -> sigmoid unit producing a single score per message.
# The sequence length is declared with an explicit Input layer: Embedding's
# `input_length` argument is deprecated in Keras 3 and only triggers a warning.
DL_model = Sequential([
    Input(shape=(100,)),                       # padded sequences of length 100
    Embedding(input_dim=5000, output_dim=64),  # 5000 matches Tokenizer(num_words=5000)
    LSTM(64),
    Dense(1, activation='sigmoid'),
])



In [27]:
# Binary cross-entropy pairs with the single sigmoid output unit.
DL_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 5 epochs. The held-out test split doubles as validation data here,
# so the val_* numbers are measured on the same rows as the final evaluation.
DL_model.fit(
    X_train_pad,
    y_train,
    epochs=5,
    batch_size=32,
    validation_data=(X_test_pad, y_test),
)

Epoch 1/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 52ms/step - accuracy: 0.8864 - loss: 0.3253 - val_accuracy: 0.9803 - val_loss: 0.0694
Epoch 2/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 51ms/step - accuracy: 0.9880 - loss: 0.0502 - val_accuracy: 0.9803 - val_loss: 0.0619
Epoch 3/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 56ms/step - accuracy: 0.9960 - loss: 0.0171 - val_accuracy: 0.9830 - val_loss: 0.0662
Epoch 4/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 59ms/step - accuracy: 0.9991 - loss: 0.0056 - val_accuracy: 0.9848 - val_loss: 0.0742
Epoch 5/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 62ms/step - accuracy: 0.9995 - loss: 0.0036 - val_accuracy: 0.9830 - val_loss: 0.0776


<keras.src.callbacks.history.History at 0x7ce8aeeb30e0>

In [33]:
# Evaluate the network on the held-out split; evaluate() yields the loss and the accuracy metric.
test_metrics = DL_model.evaluate(X_test_pad, y_test)
loss, acc_DL = test_metrics

# Report both models side by side.
print(f"Deep Learning Model Accuracy: {acc_DL:.4f}")
print(f"Classical ML Accuracy:{acc_ML:.4f}")


[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.9835 - loss: 0.0669
Deep Learning Model Accuracy: 0.9830
Classical ML Accuracy:0.9812


### Testing on a sample input

In [40]:
sample = ["Congratulations! You won a free ticket. Call now!"]
# Clean every sample with the same preprocessing used for training, so
# sample_clean stays aligned with `sample` if more messages are added.
sample_clean = [text_cleaning(message) for message in sample]

# Classical model: reuse the fitted CountVectorizer, then read the first
# prediction explicitly. Testing the whole prediction array for truthiness
# raises ValueError as soon as it holds more than one element.
sample_vec = cv.transform(sample_clean)
ml_is_spam = ML_model.predict(sample_vec)[0] == 1
print("ML Model predict as spam" if ml_is_spam else "ML model predict as not spam")

# Deep learning model: same tokenizer and padding length as in training.
sample_seq = tokenizer.texts_to_sequences(sample_clean)
sample_pad = pad_sequences(sample_seq, maxlen=100)
prediction = DL_model.predict(sample_pad)
# The sigmoid output is thresholded at 0.5 to decide spam versus not spam.
print(
    "DL Model predicts as spam" if prediction[0][0] > 0.5
    else "DL Model predicts as not spam"
)

ML Model predict as spam
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step
DL Model predicts as spam
