Importing Libraries

In [74]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

Loading the Data

In [75]:
# Read the labelled messages from the CSV file sitting next to the notebook
# and preview the first rows.
DATA_PATH = 'mail_data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Checking the Data

In [76]:
# Quick structural overview of the loaded frame: dimensions, column dtypes,
# and the number of missing values per column.
print(f'Shape of the Dataset : {df.shape}')
print('\n')
print('Dataset Information')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()
print('\n')
print('Empty Datapoints')
print(df.isnull().sum())

Shape of the Dataset : (5572, 2)


Dataset Information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None


Empty Datapoints
Category    0
Message     0
dtype: int64


In [77]:
# Number of messages per label, to see how balanced the two classes are.
category_counts = df['Category'].value_counts()
category_counts

ham     4825
spam     747
Name: Category, dtype: int64

Features and Target

In [78]:
# x holds the raw message text (model input); y holds the label column.
x, y = df['Message'], df['Category']

Encoding the Data

In [79]:
# Encode the labels as integers (ham -> 1, spam -> 0).
# A fresh Series is built from df instead of calling replace(..., inplace=True)
# on a column selected from df: the in-place form can write through to
# df['Category'] (or warn about it, depending on the pandas version), which
# makes earlier cells show different results when re-run. Building from df
# keeps df untouched and makes this cell give the same result on every run.
# Any label other than 'ham'/'spam' becomes NaN here, so the later integer
# cast fails loudly instead of silently keeping an unexpected value.
y = df['Category'].map({'ham': 1, 'spam': 0})

Splitting the Data into Training and Testing Data

In [80]:
# Hold out 20% of the messages for evaluation; the fixed random_state keeps
# the split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

Vectorizing the Text Features into Numerical Form

In [81]:
# Convert the raw message text into TF-IDF feature vectors.
# `lowercase` must be the boolean True: the string 'True' only worked because
# any non-empty string is truthy, and newer scikit-learn versions validate
# estimator parameters and reject non-boolean values for it.
vectorizer = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
# Learn the vocabulary and IDF weights from the training messages only, then
# apply that same fitted transform to the test messages.
x_trainFeature = vectorizer.fit_transform(x_train)
x_testFeature = vectorizer.transform(x_test)

# Ensure the labels are integer-typed before they reach the classifier.
y_train = y_train.astype('int')
y_test = y_test.astype('int')

Training the Model

In [82]:
# Gradient-boosted tree classifier with library-default hyperparameters,
# fitted on the TF-IDF training matrix and the integer labels.
model = XGBClassifier()
model.fit(X=x_trainFeature, y=y_train)

Model Prediction on Testing Data

In [83]:
# Predicted class (0 = spam, 1 = ham) for every held-out message.
y_predTest = model.predict(X=x_testFeature)

Comparing Actual Class vs Predicted Class

In [84]:
# Side-by-side view of the true labels and the model's predictions for the
# test set. The frame takes its index from y_test, so each row keeps the
# original row label of the message it refers to.
dfrt = pd.DataFrame({'Actual Values': y_test, 'Predicted Class': y_predTest})
# Leave the frame as the cell's last expression so the notebook renders it
# with its rich table display (truncated to head/tail) instead of print()'s
# plain-text dump.
dfrt

      Actual Values  Predicted Class
2632              0                0
454               1                1
983               0                0
1282              1                1
4610              1                1
...             ...              ...
4827              1                1
5291              1                1
3325              1                1
3561              1                1
1136              1                1

[1115 rows x 2 columns]


Model Score

In [85]:
# score() on a classifier reports mean accuracy on the given data and labels.
test_accuracy = model.score(x_testFeature, y_test)
print(f"The score for the model is {test_accuracy}")

The score for the model is 0.9721973094170404


Predictive System 

In [98]:
# Classify one already-vectorized message (the first row of the test matrix)
# and report the result in words: label 0 means spam, anything else ham.
x_new = x_testFeature[0]
prediction = model.predict(x_new)
print('Spam Mail' if prediction[0] == 0 else 'Ham Mail')

Spam Mail
