# Spam Mail Detection Model

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the dataset from 'spam.csv' (path relative to the working directory),
# decoding the file as latin-1.
# NOTE(review): presumably the file is not valid UTF-8, hence the explicit encoding — confirm.
spam_mails = pd.read_csv('spam.csv',encoding='latin-1')

In [3]:
# Preview the first rows of the raw frame to see which columns were read.
spam_mails.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## Data Preprocessing

In [4]:
# Drop the three 'Unnamed' columns; they hold no values in the rows previewed
# above, so only the label (v1) and the message text (v2) are kept.
spam_mails = spam_mails.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    # Makes the cell idempotent: re-running it after the columns are already
    # gone no longer raises a KeyError.
    errors='ignore',
)

In [5]:
# Check the frame after dropping columns: only v1 (label) and v2 (text) should remain.
spam_mails.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Count missing values per column.
spam_mails.isna().sum()

v1    0
v2    0
dtype: int64

In [7]:
# Encode the labels in place: spam -> 0, ham -> 1.
# A single .loc[row_mask, column] call writes directly into the frame. The
# previous chained form (spam_mails.v1.loc[...] = ...) assigned through an
# intermediate Series, which triggers SettingWithCopyWarning and leaves the
# frame unchanged when pandas copy-on-write is active.
# The column keeps its object dtype, so later cells still cast it to int.
spam_mails.loc[spam_mails.v1 == 'spam', 'v1'] = 0
spam_mails.loc[spam_mails.v1 == 'ham', 'v1'] = 1

In [8]:
# Inspect the frame again to confirm v1 now holds the numeric labels (0 = spam, 1 = ham).
spam_mails.head()

Unnamed: 0,v1,v2
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


## Training and Test Sets

In [9]:
# Features are the raw message texts (v2); targets are the encoded labels (v1).
X = spam_mails['v2']
Y = spam_mails['v1']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

## Converting the textual data to numerical data

In [10]:
# TF-IDF vectorizer: lower-cases the text, drops English stop words and keeps
# every term that occurs in at least one document (min_df=1).
feature_extraction = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    min_df=1,
)

# Learn the vocabulary and IDF weights from the training texts only ...
X_train_extraction = feature_extraction.fit_transform(X_train)
# ... and reuse that fitted state to vectorize the held-out texts.
X_test_extraction = feature_extraction.transform(X_test)

In [11]:
# Show the vocabulary terms the vectorizer learned from the training texts.
print(feature_extraction.get_feature_names_out())
# Print the sparse TF-IDF matrix of the test set; each output line is a
# (row, column) position followed by its weight.
print(X_test_extraction)

['00' '000' '000pes' ... 'ûïharry' 'ûò' 'ûówell']
  (0, 5511)	0.7437236777699796
  (0, 1952)	0.668487166013152
  (1, 7503)	0.5779589423031507
  (1, 3101)	0.3521437100777044
  (1, 2603)	0.7361781499504947
  (2, 7045)	0.25798112082710245
  (2, 6163)	0.3695619725630751
  (2, 5860)	0.2713600032866372
  (2, 5629)	0.25867721226632806
  (2, 5563)	0.2396367460286043
  (2, 5023)	0.28961803789430846
  (2, 1913)	0.3534909409512345
  (2, 1837)	0.3185106937068062
  (2, 1190)	0.3534909409512345
  (2, 763)	0.40454221980750343
  (3, 6162)	0.372347872612999
  (3, 4079)	0.6988568203359501
  (3, 2278)	0.610701405295992
  (4, 7219)	0.23883418991085034
  (4, 7088)	0.33853500562686667
  (4, 6620)	0.3456851754159831
  (4, 6323)	0.366246041031354
  (4, 6183)	0.2898395062422636
  (4, 6081)	0.33853500562686667
  (4, 5764)	0.25770611232488294
  :	:
  (1111, 4114)	0.3199758043577637
  (1111, 3832)	0.28732648680047795
  (1111, 3779)	0.14226662954159577
  (1111, 2929)	0.22016385014817363
  (1111, 2365)	0.1804890523

## Training a Logistic Regression Model

In [12]:
# Logistic regression classifier with scikit-learn's default settings.
model = LogisticRegression()

# The label column has object dtype, so cast it to int before fitting.
train_labels = Y_train.astype('int')
model.fit(X_train_extraction, train_labels)

## Making Predictions and Evaluating the Model

In [13]:
# Predict a label for every vectorized test message (0 = spam, 1 = ham).
prediction = model.predict(X_test_extraction)

In [14]:
# Print a readable label for each test prediction: 1 -> 'ham', anything else -> 'spam'.
# Iterating over `prediction` itself replaces the hard-coded range(1115), which
# raised IndexError or silently truncated the output whenever the size of the
# test set changed.
for predicted_label in prediction:
    print('ham' if predicted_label == 1 else 'spam')
    

ham
ham
ham
ham
spam
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
spam
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
spam
ham
spam
ham
ham
ham
ham
ham
ham
ham
spam
spam
ham
ham
spam
ham
ham
ham
ham
ham
ham
ham
spam
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
spam
ham
ham
ham
spam
ham
ham
spam
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
spam
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
spam
ham
ham
ham
ham
ham
ham
spam
spam
spam
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
spam
ham
ham
ham
ham
ham
spam
ham
ham
ham
spam
ham
ham
ham
ham
ham
ham
ham
spam
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
spam
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
spam
ham
ham
ham
ham
ham
spam
ham
ham
ham
ham
spam
ham
ham
ham
ham
spam
ham
ham
ham
ham
ham
ham
ham
ham
ham
spam
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
ham
spam
ham
ham
ham
h

In [15]:
# Display the true labels of the held-out test rows.
Y_test

1078    1
4028    1
958     1
4642    1
4674    0
       ..
324     1
1163    1
86      1
4214    1
90      1
Name: v1, Length: 1115, dtype: object

In [16]:
# Accuracy on the data the model was fitted on.
train_predictions = model.predict(X_train_extraction)
train_accuracy = accuracy_score(Y_train.astype('int'), train_predictions)

# Accuracy on the held-out test data, using the predictions computed earlier.
test_accuracy = accuracy_score(Y_test.astype('int'), prediction)

print("Train Data Accuracy of Model is: " , train_accuracy)
print("Test Data Accuracy of Model is: " , test_accuracy)

Train Data Accuracy of Model is:  0.9670181736594121
Test Data Accuracy of Model is:  0.9775784753363229


In [17]:
# Classify one hand-written message. transform() expects an iterable of
# documents, so the single text is wrapped in a list.
input_mail = [
    "URGENT! You have won a 1 week FREE membership in our å£100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18"
]

# Vectorize with the already-fitted TF-IDF vocabulary, then predict.
input_data_extraction = feature_extraction.transform(input_mail)
predict = model.predict(input_data_extraction)

# Label 1 means ham; any other label is reported as spam.
print('Ham mail' if predict[0] == 1 else 'Spam mail')

Spam mail


## Exporting Model

In [18]:
import joblib

In [19]:
# Persist the fitted TF-IDF vectorizer next to the classifier. The model was
# trained on vectorized text, so it cannot score raw messages after loading
# unless the same fitted vectorizer is available to transform them first.
joblib.dump(feature_extraction, 'Spam_Mail_Detection_Vectorizer')

# Persist the trained classifier (same file name as before). This stays the
# last expression so the cell still displays the list of written model files.
joblib.dump(model , 'Spam_Mail_Detection_Model')

['Spam_Mail_Detection_Model']