In [1]:
!kaggle datasets download -d ashfakyeafi/spam-email-classification

Dataset URL: https://www.kaggle.com/datasets/ashfakyeafi/spam-email-classification
License(s): Apache 2.0
Downloading spam-email-classification.zip to /content
  0% 0.00/207k [00:00<?, ?B/s]
100% 207k/207k [00:00<00:00, 15.5MB/s]


In [2]:
!unzip spam-email-classification.zip

Archive:  spam-email-classification.zip
  inflating: email.csv               


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [6]:
df = pd.read_csv('email.csv')

df.head(10)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [23]:
!pip install plotly==5.13.1

Collecting plotly==5.13.1
  Downloading plotly-5.13.1-py2.py3-none-any.whl (15.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.2/15.2 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: plotly
  Attempting uninstall: plotly
    Found existing installation: plotly 5.15.0
    Uninstalling plotly-5.15.0:
      Successfully uninstalled plotly-5.15.0
Successfully installed plotly-5.13.1


In [27]:
import plotly.express as px


fig = px.bar(df['Category'].value_counts(),
             x=df['Category'].value_counts().index,
             y=df['Category'].value_counts().values,
             labels={'x':'Category', 'y':'Count'},
             title='Number of Spam and Non-Spam Emails',)
fig.show()

In [35]:
import plotly.express as px

fig = px.pie(df['Category'].value_counts().reset_index(name='Count'),
             values='Count',
             names='Category',  # Use 'Category' instead of 'index'
             title='Percent of Spam and Non-Spam Emails',
             labels={'Category':'Category', 'Count':'Count'}) # Update labels accordingly
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

In [7]:
 df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4
"{""mode"":""full""",1,1,isActive:false},1


## We need to convert the first column to which contatins texts to numbers



In [8]:
df['spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)

df.head()


Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


### convert message text column to numbers using count vectorization technique

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['Message'], df['spam'], test_size=0.2)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_count = v.fit_transform(X_train.values)

X_train_count.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [13]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

model.fit(X_train_count, y_train)

In [14]:
emails = [

    'Hey mohan, can we get together to watch football game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'

      ]

emails_count = v.transform(emails)

model.predict(emails_count)

array([0, 1])

let make prediction

In [15]:
X_test_count = v.transform(X_test)

model.score(X_test_count, y_test)

0.9847533632286996

# We could also use SKLEARN pipeLine

We are trying to simplify the code word base

In [16]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()), #convet test into vectors
    ('nb', MultinomialNB())  # apply the mulitnomial Naive Baiyes on the numbers
])



here we can directly feed the X_train contents into the model unlike before where we had to create a list to hold the counts of massage

In [17]:
clf.fit(X_train, y_train)

The model has a score of <h2> **98.5% accuracy** <h2>

In [18]:
clf.score(X_test, y_test)

0.9847533632286996

In [19]:
clf.predict(emails)

array([0, 1])