In [378]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [379]:
# Fetch every NLTK resource this notebook relies on (a no-op when already cached).
#   stopwords          - stop-word list read by preprocess_text
#   wordnet            - only needed if the WordNet lemmatizer is switched on
#   punkt / punkt_tab  - sentence/word tokenizer models behind nltk.word_tokenize;
#                        without them a fresh environment raises LookupError.
#                        Newer NLTK releases look up 'punkt_tab', older ones 'punkt'.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kulsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kulsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [380]:
# Load the message corpus and encode the label column: 'spam' -> 1, anything else -> 0.
data = pd.read_csv('spam.csv')
data['Category'] = (data['Category'] == 'spam').astype(int)

In [381]:
# Display the loaded frame (Category now holds the 0/1 label) as the cell output.
data

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [382]:
# Stop-word sets already loaded from NLTK, keyed by language name.
_STOP_WORDS_BY_LANGUAGE = {}


def _stop_words(language='english'):
    """Return the NLTK stop-word set for ``language``, loading it only once."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def preprocess_text(text):
    """Lower-case, tokenize and strip English stop words from a message.

    The text is lower-cased, split with ``nltk.word_tokenize`` and every token
    found in NLTK's English stop-word list is dropped. The remaining tokens are
    joined back with single spaces. No stemming or lemmatization is applied.

    Parameters
    ----------
    text : str
        Raw message. ``None`` and NaN (a missing cell in a pandas column) are
        treated as an empty message; any other non-string value is converted
        with ``str()`` first.

    Returns
    -------
    str
        The space-joined tokens that survived stop-word removal (``''`` when
        nothing is left or the input was missing).
    """
    # A missing value has no ``.lower()``; map it to an empty message instead of
    # failing in the middle of a ``Series.apply``. ``text != text`` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    tokens = nltk.word_tokenize(str(text).lower())

    # The stop-word set is loop-invariant across rows, so it is loaded once and
    # reused rather than re-read from the corpus on every call.
    stop_words = _stop_words()
    return ' '.join(token for token in tokens if token not in stop_words)

In [383]:
# Replace each raw message with its cleaned form (lower-cased, tokenized, stop words removed).
data['Message'] = data['Message'].map(preprocess_text)

In [384]:
# Display the frame again to inspect the result of the text preprocessing.
data

Unnamed: 0,Category,Message
0,0,"go jurong point , crazy .. available bugis n g..."
1,0,ok lar ... joking wif u oni ...
2,1,free entry 2 wkly comp win fa cup final tkts 2...
3,0,u dun say early hor ... u c already say ...
4,0,"nah n't think goes usf , lives around though"
...,...,...
5567,1,2nd time tried 2 contact u. u £750 pound prize...
5568,0,ü b going esplanade fr home ?
5569,0,"pity , * mood . ... suggestions ?"
5570,0,guy bitching acted like 'd interested buying s...


In [385]:
# Features are the cleaned message texts, targets the 0/1 spam labels.
X, y = data['Message'], data['Category']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [386]:
# Learn the TF-IDF vocabulary and weights from the training messages only.
vectorizer = TfidfVectorizer().fit(X_train)
vectorizer

In [387]:
# Convert both splits to dense feature matrices with the already-fitted vectorizer.
X_train, X_test = (
    vectorizer.transform(messages).toarray()
    for messages in (X_train, X_test)
)

In [388]:
# Feed-forward binary classifier: 128 ReLU units -> dropout -> 64 ReLU units -> one sigmoid unit.
model = Sequential([
    Dense(128, activation="relu"),
    Dropout(0.5),
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid"),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [389]:
# Train for five passes over the training matrix in mini-batches of 32.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x21ed5fd57c0>

In [390]:
# Predicted spam probabilities for the held-out set, thresholded at 0.5 into 0/1 labels.
y_pred = (model.predict(X_test) > 0.5).astype(int)



In [391]:
# Share of held-out messages whose predicted label matches the true one.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.9901


In [392]:
# Per-class precision/recall/F1, followed by the raw confusion matrix on its own lines.
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.99      0.94      0.96       149

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115

[[964   2]
 [  9 140]]


In [393]:
# Two unseen example messages to score with the trained model.
new_emails = ["Congrats! You've won a prize! Please call our customer service representative", "Important meeting tomorrow."]

# The vectorizer and the model were fitted on text cleaned by preprocess_text, so
# new messages must go through the same cleaning before being vectorized;
# otherwise inference sees differently tokenized text than training did.
# new_emails itself stays raw because it is printed alongside the predictions.
new_email_sequences = vectorizer.transform(
    [preprocess_text(email) for email in new_emails]
).toarray()
predictions = model.predict(new_email_sequences)



In [394]:
# Report the more likely class for each example together with that class's probability.
# Each prediction row holds a single value: the predicted spam probability.
for email, (spam_probability,) in zip(new_emails, predictions):
    if spam_probability > 0.5:
        print(f"'{email}' is spam with probability {spam_probability:.2f}")
    else:
        print(f"'{email}' is not spam with probability {1 - spam_probability:.2f}")

'Congrats! You've won a prize! Please call our customer service representative' is spam with probability 0.92
'Important meeting tomorrow.' is not spam with probability 1.00


In [395]:
# Read the CSV again so this run starts from the raw messages,
# then encode the label column: 'spam' -> 1, anything else -> 0.
data = pd.read_csv('spam.csv')
data['Category'] = (data['Category'] == 'spam').astype(int)

In [396]:
# Display the freshly reloaded frame (raw messages, 0/1 labels) as the cell output.
data

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [397]:
# Replace each raw message with its cleaned form produced by preprocess_text.
data['Message'] = data['Message'].map(preprocess_text)

In [398]:
# Features are the cleaned message texts, targets the 0/1 spam labels.
X, y = data['Message'], data['Category']

# Same 80/20 split and seed as used for the TF-IDF run above in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [399]:
# Learn the bag-of-words vocabulary from the training messages only.
vectorizer = CountVectorizer().fit(X_train)
vectorizer

In [400]:
# Convert both splits to dense count matrices with the already-fitted vectorizer.
X_train, X_test = (
    vectorizer.transform(messages).toarray()
    for messages in (X_train, X_test)
)

In [401]:
# Fresh, untrained copy of the classifier: 128 ReLU -> dropout -> 64 ReLU -> one sigmoid unit.
model = Sequential([
    Dense(128, activation="relu"),
    Dropout(0.5),
    Dense(64, activation="relu"),
    Dense(1, activation="sigmoid"),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [402]:
# Train for five passes over the count matrix in mini-batches of 32.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x21ed8a34c40>

In [403]:
# Predicted spam probabilities for the held-out set, thresholded at 0.5 into 0/1 labels.
y_pred = (model.predict(X_test) > 0.5).astype(int)



In [404]:
# Share of held-out messages whose predicted label matches the true one.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.9910


In [405]:
# Per-class precision/recall/F1, followed by the raw confusion matrix on its own lines.
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep='\n',
)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       1.00      0.93      0.97       149

    accuracy                           0.99      1115
   macro avg       0.99      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115

[[966   0]
 [ 10 139]]


In [406]:
# Two unseen example messages to score with the trained model.
new_emails = ["Congrats! You've won a prize! Please call our customer service representative", "Important meeting tomorrow."]

# The vectorizer and the model were fitted on text cleaned by preprocess_text, so
# new messages must go through the same cleaning before being vectorized;
# otherwise inference sees differently tokenized text than training did.
# new_emails itself stays raw because it is printed alongside the predictions.
new_email_sequences = vectorizer.transform(
    [preprocess_text(email) for email in new_emails]
).toarray()
predictions = model.predict(new_email_sequences)



In [407]:
# Report the more likely class for each example together with that class's probability.
# Each prediction row holds a single value: the predicted spam probability.
for email, (spam_probability,) in zip(new_emails, predictions):
    if spam_probability > 0.5:
        print(f"'{email}' is spam with probability {spam_probability:.2f}")
    else:
        print(f"'{email}' is not spam with probability {1 - spam_probability:.2f}")

'Congrats! You've won a prize! Please call our customer service representative' is spam with probability 0.95
'Important meeting tomorrow.' is not spam with probability 1.00
