Spam email classifier

In [1]:
import pandas as pd
import numpy as np
import string 
import re

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
df = pd.read_csv("C:\\Users\\srine\\Downloads\\Machine learning learning\\archive (7)\\spam.csv", encoding='ISO-8859-1')
print(df.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


In [4]:
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [5]:
df.rename(columns={'v1' : 'label' , 'v2' :'messages'}, inplace=True)

In [6]:
df.describe()

Unnamed: 0,label,messages,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [7]:
#converting label column into numerical
df['label_num'] = df['label'].map({'ham' :0, 'spam':1})

TEXT PREPROCESSING

In [8]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"http?://\S+|WWW\.\S+",'',text) #this will remove URLs from data
    text = re.sub(r'\d+','',text) #remove numbers
    text = text.translate(str.maketrans('','',string.punctuation)) #remove punctuations
    text = text.strip()
    return text

df['clean_text'] = df['messages'].apply(preprocess_text)

VECTORIZATION

In [9]:
#convert text into numbers using TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=3000)
X =tfidf.fit_transform(df['clean_text']).toarray()

In [10]:
#Input and output split
Y = df['label_num']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

Model Training

In [11]:
model = LogisticRegression()
model.fit(X_train, Y_train)

Model Evaluation

In [12]:
y_pred = model.predict(X_test)

#performance metrics
print("accuray: " , accuracy_score(Y_test, y_pred))
print("\nClassification report:\n", classification_report(Y_test, y_pred))

#confusion matrix
print("\nConfusion matrix:\n", confusion_matrix(Y_test, y_pred))

accuray:  0.9632286995515695

Classification report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.73      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115


Confusion matrix:
 [[965   0]
 [ 41 109]]


Testing new messages

In [13]:
new_message = ["Congratulations!! You won a free ticket. Call now"]
new_message_clean = [preprocess_text(msg) for msg in new_message]
new_message_vector = tfidf.transform(new_message_clean)

prediction = model.predict(new_message_vector)
print("Prediction (1 = Spam, 0 = Not Spam):", prediction)

Prediction (1 = Spam, 0 = Not Spam): [1]
