In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

### In this project, I am going to use 3 different datasets and merge them together.

In [2]:
data1 = pd.read_csv('mail_data.csv')
data1.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
data2 = pd.read_csv('spam_dataset.csv')
data2.head()

Unnamed: 0,message_content,is_spam
0,"Hello Lonnie,\r\n\r\nJust wanted to touch base...",0
1,"Congratulations, you've won a prize! Call us n...",1
2,You have been pre-approved for a credit card w...,1
3,"Limited time offer, act now! Only a few spots ...",1
4,Your loan has been approved! Transfer funds to...,1


In [4]:
data2.is_spam.replace({1:'spam',
                       0:'ham'},
                       inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data2.is_spam.replace({1:'spam',


In [5]:
data2.columns = ['Message','Category']

In [6]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Message   1000 non-null   object
 1   Category  1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB


In [7]:
data2.head(1)

Unnamed: 0,Message,Category
0,"Hello Lonnie,\r\n\r\nJust wanted to touch base...",ham


In [8]:
data2 = data2.loc[:,['Category','Message']]
data2.head(1)

Unnamed: 0,Category,Message
0,ham,"Hello Lonnie,\r\n\r\nJust wanted to touch base..."


In [9]:
data3 = pd.read_csv('email_classification.csv')
data3.head(1)

Unnamed: 0,email,label
0,Upgrade to our premium plan for exclusive acce...,ham


In [142]:
data3.columns = ['Message','Category']
data3.head(1)

Unnamed: 0,Message,Category
0,Upgrade to our premium plan for exclusive acce...,ham


In [143]:
data3 = data3.loc[:,['Category','Message']]
data3.head(1)

Unnamed: 0,Category,Message
0,ham,Upgrade to our premium plan for exclusive acce...


#### Joining the 3 datasets

In [144]:
data = pd.concat([data1,data2,data3],
axis=0)

In [145]:
data.head(2)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...


In [146]:
data.Category.value_counts()

Category
ham     5425
spam    1326
Name: count, dtype: int64

In [147]:
1326/(1326+5425)

0.19641534587468523

In [148]:
data.Category.replace({'ham':0,
                       'spam':1},
                       inplace = True)

In [149]:
data.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


##### Separating the data into x and y

In [150]:
x = data.Message
y = data.Category


In [151]:
#vectorization is the process of converting the text data into tokens
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(x)

In [152]:
feature_names = vectorizer.get_feature_names_out()
print("Features:\n", feature_names)

Features:
 ['00' '000' '000pes' ... 'èn' 'ú1' '〨ud']


In [153]:
import pickle

pickle.dump(vectorizer,open('vect.pkl','wb'))

In [154]:
np.random.seed(42)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [155]:
x_train.shape,x_test.shape,y_test.shape,y_train.shape

((5400, 9537), (1351, 9537), (1351,), (5400,))

In [156]:
model = MultinomialNB()
model.fit(x_train,y_train)

In [157]:
model.score(x_test,y_test)

0.9785344189489267

In [158]:
pickle.dump(model,open('model.pkl','wb'))

In [159]:
from sklearn.metrics import classification_report
y_pred = model.predict(x_test)

In [160]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1095
           1       0.92      0.97      0.94       256

    accuracy                           0.98      1351
   macro avg       0.96      0.98      0.97      1351
weighted avg       0.98      0.98      0.98      1351



In [162]:
import sklearn
sklearn.__version__

'1.5.1'

In [163]:
import flask
flask.__version__

'2.0.3'

In [164]:
np.__version__

'1.26.4'