In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import fetch_20newsgroups

In [7]:
url = '/content/sample_data/spam.csv'
df = pd.read_csv(url, encoding='latin-1')

#Just to display first few rows of the content
df.columns = ['label', 'message', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df = df[['label', 'message']]
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
#Training model
# Load dataset
data = fetch_20newsgroups(subset='all')
X = data.data
y = data.target
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', SVC(kernel='linear'))
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
# Evaluate the model
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')


Accuracy: 0.8339228864520694
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.85      0.85       236
           1       0.67      0.77      0.72       287
           2       0.80      0.78      0.79       290
           3       0.67      0.71      0.69       285
           4       0.81      0.78      0.80       312
           5       0.84      0.79      0.81       308
           6       0.78      0.82      0.80       276
           7       0.86      0.88      0.87       304
           8       0.93      0.89      0.91       279
           9       0.85      0.90      0.87       308
          10       0.92      0.90      0.91       309
          11       0.96      0.89      0.92       290
          12       0.74      0.72      0.73       304
          13       0.86      0.85      0.85       300
          14       0.91      0.88      0.89       297
          15       0.86      0.93      0.89       292
          16       0.85      

In [9]:
df['label']=df['label'].map({'spam': 1,'ham': 0})
#Using 20% data for testing purpose
X_train, X_test, y_train, y_test = train_test_split(df['message'], df['label'], test_size=0.2, random_state=42)

#Naive Bayes
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf',MultinomialNB())
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Classification Report:\n{classification_report(y_test, y_pred)}')

#Conclusion:MultinomialNB gives better results than SVM for email spaq detection

Accuracy: 0.9838565022421525
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.89      0.94       150

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [10]:
testing_mail = ["Hurray!Once in a lifetime oppurtunity is unlocked!Here's your free ticket to Switzerland.Click on the link below to claim it.Link to claim:'http://freeswitzerlandtrip.ghh.in'",
                "Hello sir,Here's your requested free materials for IELTS test preparation",
               ]

predict = pipeline.predict(testing_mail)
print(predict)

[1 0]


In [11]:
#To get more clear output(instead of 1 for spam and 0 for ham)
for email, label in zip(testing_mail, predict):
    print(f'Email: {email}\nLabel: {"Spam" if label else "Ham(Not Spam)"}\n')

Email: Hurray!Once in a lifetime oppurtunity is unlocked!Here's your free ticket to Switzerland.Click on the link below to claim it.Link to claim:'http://freeswitzerlandtrip.ghh.in'
Label: Spam

Email: Hello sir,Here's your requested free materials for IELTS test preparation
Label: Ham(Not Spam)

