## Importing all libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import string
import io
import requests

## Importing csv

In [None]:
url="https://raw.githubusercontent.com/Pramukh660/AI-ML/main/datasets/emails.csv"
s=requests.get(url).content
df=pd.read_csv(io.StringIO(s.decode('utf-8')))

## Data analysis

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [4]:
df.tail()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0
5170,4807,spam,Subject: important online banking alert\r\ndea...,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [6]:
df.shape

(5171, 4)

In [7]:
df.columns

Index(['Unnamed: 0', 'label', 'text', 'label_num'], dtype='object')

In [8]:
df.drop_duplicates(inplace=True)

In [9]:
df.shape

(5171, 4)

In [10]:
df.isnull().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [11]:
df = df.drop(['Unnamed: 0', 'label'], axis=1)

In [12]:
df.rename(columns={'text': 'text', 'label_num': 'spam'}, inplace=True)

In [13]:
df.head(2)

Unnamed: 0,text,spam
0,Subject: enron methanol ; meter # : 988291\r\n...,0
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",0


## NLP 

In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
def process_text(text):
    #1 remove punct
    #2 remove stopword
    #3 return list of clean words
    
    #1 
    nopunct = [char for char in text if char not in string.punctuation]
    nopunct = ''.join(nopunct)
    
    #2
    clean_words = [word for word in nopunct.split() if word.lower() not in stopwords.words('english')]
    
    #3
    return clean_words

In [16]:
#tokenization
df['text'].head().apply(process_text)

0    [Subject, enron, methanol, meter, 988291, foll...
1    [Subject, hpl, nom, january, 9, 2001, see, att...
2    [Subject, neon, retreat, ho, ho, ho, around, w...
3    [Subject, photoshop, windows, office, cheap, m...
4    [Subject, indian, springs, deal, book, teco, p...
Name: text, dtype: object

In [17]:
#convert to matrix of tokens
from sklearn.feature_extraction.text import CountVectorizer
msg_bow = CountVectorizer(analyzer = process_text).fit_transform(df['text'])

## Splitting data

In [18]:
#split to train n test data
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(msg_bow, df['spam'], test_size=0.20, random_state=0)

In [19]:
msg_bow.shape

(5171, 50381)

## Naive Bayes Classifier

In [20]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB().fit(X_train, Y_train)

In [21]:
print(classifier.predict(X_train))

[0 0 0 ... 1 0 0]


In [22]:
print(Y_train.values)

[0 0 0 ... 1 0 0]


## Evaluating model

In [23]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
pred = classifier.predict(X_train)

## Training data

In [24]:
print(classification_report(Y_train, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(Y_train, pred))
print()
print("Accuracy: ", accuracy_score(Y_train, pred))
print()

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2940
           1       0.98      0.97      0.98      1196

    accuracy                           0.99      4136
   macro avg       0.99      0.98      0.98      4136
weighted avg       0.99      0.99      0.99      4136


Confusion Matrix: 
 [[2918   22]
 [  30 1166]]

Accuracy:  0.9874274661508704



## Testing data

In [25]:
pred = classifier.predict(X_test)
print(classification_report(Y_test, pred))
print()
print("Confusion Matrix: \n", confusion_matrix(Y_test, pred))
print()
print("Accuracy: ", accuracy_score(Y_test, pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       732
           1       0.95      0.96      0.96       303

    accuracy                           0.97      1035
   macro avg       0.97      0.97      0.97      1035
weighted avg       0.97      0.97      0.97      1035


Confusion Matrix: 
 [[718  14]
 [ 13 290]]

Accuracy:  0.9739130434782609
