# Import Libraries

In [None]:
# Analysis Libraries
import numpy as np
import pandas as pd
# Regex
import re 
# Natural Language Toolkit
import nltk
# Lemmatization
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
# Stopwords
from nltk.corpus import stopwords
# vectorization
from sklearn.feature_extraction.text import CountVectorizer
# Model Building
## Feature Engineering
from sklearn.model_selection import train_test_split
## BernoulliNB Model
from sklearn.naive_bayes import BernoulliNB
## Logistic Regression Model
from sklearn.linear_model import LogisticRegression
# save the model
import pickle

# Download the Packages

In [None]:
from sqlalchemy.sql.expression import text
# Tokenizer data for nltk's word tokenizer
nltk.download('punkt')
# WordNet corpus used by the lemmatizer
nltk.download('wordnet')
# English stop-word list
nltk.download('stopwords')
# Keep the stop words in a set: they are only used for membership tests
# (one test per token of every e-mail), which is O(1) on a set versus a
# linear scan on the list returned by stopwords.words().
stopword = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Read Dataset

In [None]:
# Load the e-mail dataset CSV into a DataFrame and print a preview of it
spam_email=pd.read_csv('/content/spam_ham_dataset.csv')
print(spam_email)

      Unnamed: 0 label                                               text  \
0            605   ham  Subject: enron methanol ; meter # : 988291\r\n...   
1           2349   ham  Subject: hpl nom for january 9 , 2001\r\n( see...   
2           3624   ham  Subject: neon retreat\r\nho ho ho , we ' re ar...   
3           4685  spam  Subject: photoshop , windows , office . cheap ...   
4           2030   ham  Subject: re : indian springs\r\nthis deal is t...   
...          ...   ...                                                ...   
5166        1518   ham  Subject: put the 10 on the ft\r\nthe transport...   
5167         404   ham  Subject: 3 / 4 / 2000 and following noms\r\nhp...   
5168        2933   ham  Subject: calpine daily gas nomination\r\n>\r\n...   
5169        1409   ham  Subject: industrial worksheets for august 2000...   
5170        4807  spam  Subject: important online banking alert\r\ndea...   

      label_num  
0             0  
1             0  
2             0  
3  

In [None]:
# (rows, columns) of the loaded DataFrame
spam_email.shape

(5171, 4)

# Data Preprocessing

In [None]:
# count the missing (null) values in each column
spam_email.isnull().sum()

Unnamed: 0    0
label         0
text          0
label_num     0
dtype: int64

In [None]:
# drop the columns that are not used below: the unnamed index column and
# the string label (the numeric 'label_num' column is kept as the target)
spam_email=spam_email.drop(columns=['Unnamed: 0' , 'label'])

In [None]:
# inspect the raw text of the first e-mail before any cleaning
spam_email['text'][0]

"Subject: enron methanol ; meter # : 988291\r\nthis is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary\r\nflow data provided by daren } .\r\nplease override pop ' s daily volume { presently zero } to reflect daily\r\nactivity you can obtain from gas control .\r\nthis change is needed asap for economics purposes ."

# Text Data Preprocessing

In [None]:
def text_preprocess(text):
  """Strip markup, URLs, punctuation and non-letters from a raw e-mail string.

  The steps run from the most structured pattern to the least: tags and
  URLs are recognised by their punctuation (``<``, ``>``, ``:``, ``/``),
  so they have to be removed before the punctuation itself is stripped.

  Args:
    text: raw text of one e-mail.

  Returns:
    The text reduced to ASCII letters, with every run of other characters
    collapsed to a single space. Case is preserved and leading/trailing
    spaces are not trimmed.
  """
  # remove the tags first, while '<' and '>' are still in the text
  cleaned_tag = re.sub(r'<[^<]+?>', ' ', text)
  # remove the url while it is still delimited by whitespace
  cleaned_url = re.sub(r'http\S+', '', cleaned_tag)
  # remove the punctuation
  cleaned_punc = re.sub(r'[^\w\s]+', '', cleaned_url)
  # replace digits, underscores and whitespace runs with one space
  cleaned_spec_number = re.sub(r'[^A-Za-z.]+', ' ', cleaned_punc)
  return cleaned_spec_number

In [None]:
# Clean every e-mail in the 'text' column. Iterating the column directly
# avoids one label lookup per row and does not depend on the index labels.
spam_email['text'] = [text_preprocess(raw_text) for raw_text in spam_email['text']]

In [None]:
# preview the first five rows after cleaning
spam_email.head(5)

Unnamed: 0,text,label_num
0,Subject enron methanol meter this is a follow ...,0
1,Subject hpl nom for january see attached file ...,0
2,Subject neon retreat ho ho ho we re around to ...,0
3,Subject photoshop windows office cheap main tr...,1
4,Subject re indian springs this deal is to book...,0


### Tokenization and Lower Case

In [None]:
from nltk.tokenize import word_tokenize
# Function to tokenize a text and lower-case every token
def Token(text):
  """Return the lower-cased word tokens of *text* as a list."""
  return [piece.lower() for piece in word_tokenize(text)]

# Tokenize every e-mail in the 'text' column. Iterating the column directly
# avoids one label lookup per row and does not depend on the index labels.
spam_email['text'] = [Token(cleaned_text) for cleaned_text in spam_email['text']]

### Lemmatization

In [None]:
def lem(text):
  """Lemmatize every word of *text* (an iterable of words).

  Returns a new list holding the lemmatizer's result for each word, in
  the original order.
  """
  return list(map(wordnet_lemmatizer.lemmatize, text))

# Lemmatize the token list of every e-mail. Iterating the column directly
# avoids one label lookup per row and does not depend on the index labels.
spam_email['text'] = [lem(tokens) for tokens in spam_email['text']]

### Stopwords

In [None]:
def stop_words(text):
  """Return the words of *text* that are not in the ``stopword`` collection.

  *text* is an iterable of words; the result is a new list that keeps the
  original order.
  """
  return list(filter(lambda candidate: candidate not in stopword, text))

# Remove the stop words from every token list. Iterating the column directly
# avoids one label lookup per row and does not depend on the index labels.
spam_email['text'] = [stop_words(tokens) for tokens in spam_email['text']]

In [None]:
def join_text(text):
  """Join an iterable of words into one space-separated string."""
  return ' '.join(text)

# Turn every token list back into a single string. Iterating the column
# directly avoids one label lookup per row and does not depend on the index labels.
spam_email['text'] = [join_text(tokens) for tokens in spam_email['text']]

In [None]:
# Separate the model input from the target column
x=spam_email['text']   # input: the preprocessed e-mail text
y=spam_email['label_num']   # target: the numeric label

In [None]:
# x and y should have the same number of rows
print(x.shape)
print(y.shape)

(5171,)
(5171,)


# Text Feature Extraction 

### Count Vectorization

In [None]:
# Bag-of-words counts; min_df=MIN_DF drops terms that appear in fewer than
# MIN_DF e-mails
MIN_DF = 10
vec = CountVectorizer(min_df=MIN_DF)
# NOTE(review): the vocabulary is fitted on the full dataset, before the
# train/test split further down, so held-out e-mails influence the features.
vec_class = vec.fit_transform(x)
# densify the sparse count matrix so it can be wrapped in a DataFrame
vec_arr = vec_class.toarray()
# one column per vocabulary term; x is rebound from the text to the counts
x = pd.DataFrame(vec_arr,columns=vec.get_feature_names_out())
x.head()

Unnamed: 0,aa,abdominal,ability,able,absence,absolute,absolutely,ac,accept,acceptance,...,yr,yvette,zajac,zenith,zero,zeroed,zivley,zone,zonedubai,zyban
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# (number of e-mails, number of vocabulary terms)
x.shape

(5171, 4038)

# Model Building

### Feature Engineering

In [None]:
# Hold out 25% of the rows for evaluation. A fixed random_state makes the
# split (and therefore the reported scores) reproducible between runs;
# stratify=y keeps the class ratio the same in both parts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42, stratify=y
)

In [None]:
# Shapes of the training features/labels, then of the test features/labels
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(3878, 4038)
(3878,)
(1293, 4038)
(1293,)


## Logistic Regression Model

In [None]:
# Train a logistic-regression classifier on the training split
# (fit() returns the estimator itself, so creation and fitting are chained)
model_LR = LogisticRegression().fit(x_train, y_train)

# Mean accuracy on the held-out split
score = model_LR.score(x_test, y_test)
print("Accuracy score:", score)


Accuracy score: 0.9721577726218097


## Testing Through Logistic Regression Model

In [None]:
comment1 = ["100% of daily 1.50 GB data quota exhausted as on 26-May-23 00:31. Jio Number : Daily high speed data quota will be restored on 26-May-23 01:04.To know where you have consumed your data quota, click "]
# Run the new message through the same cleaning steps the training text
# went through, so it is tokenized, lemmatized and filtered identically.
comment1_clean = [
    join_text(stop_words(lem(Token(text_preprocess(message)))))
    for message in comment1
]
# Wrap the counts in a DataFrame carrying the vocabulary as column names:
# the model was fitted on a DataFrame, and predicting on a bare array makes
# scikit-learn warn that X does not have valid feature names.
vect = pd.DataFrame(
    vec.transform(comment1_clean).toarray(),
    columns=vec.get_feature_names_out(),
)
model_LR.predict(vect)




array([0])

## BernoulliNB Model

In [None]:
# Train a Bernoulli naive-Bayes classifier on the training split
# (fit() returns the estimator itself, so creation and fitting are chained)
model_BL = BernoulliNB().fit(x_train, y_train)
# Mean accuracy on the held-out split
score = model_BL.score(x_test, y_test)
print("Accuracy score:", score)

Accuracy score: 0.9195668986852281


## Testing Through BernoulliNB Model

In [None]:
comment1 = ["100% of daily 1.50 GB data quota exhausted as on 26-May-23 00:31. Jio Number : Daily high speed data quota will be restored on 26-May-23 01:04.To know where you have consumed your data quota, click "]
# Run the new message through the same cleaning steps the training text
# went through, so it is tokenized, lemmatized and filtered identically.
comment1_clean = [
    join_text(stop_words(lem(Token(text_preprocess(message)))))
    for message in comment1
]
# Wrap the counts in a DataFrame carrying the vocabulary as column names:
# the model was fitted on a DataFrame, and predicting on a bare array makes
# scikit-learn warn that X does not have valid feature names.
vect = pd.DataFrame(
    vec.transform(comment1_clean).toarray(),
    columns=vec.get_feature_names_out(),
)
model_BL.predict(vect)



array([0])

In [None]:
# save the Logistic Regression Model to disk
filename = 'model_LR.sav'
# the context manager closes (and flushes) the file even if dump() raises
with open(filename, 'wb') as model_file:
  pickle.dump(model_LR, model_file)

# The fitted vectorizer holds the vocabulary the model's feature columns
# refer to; persist it as well so the saved model can be used on new text.
with open('vectorizer.sav', 'wb') as vectorizer_file:
  pickle.dump(vec, vectorizer_file)

In [None]:
# save the BernoulliNB Model to disk
filename = 'model_BN.sav'
# the context manager closes (and flushes) the file even if dump() raises
with open(filename, 'wb') as model_file:
  pickle.dump(model_BL, model_file)

# Conclusion <br>

Our focus was on implementing and evaluating two classification models, Logistic Regression and BernoulliNB, to predict spam and non-spam emails. We used a labeled dataset containing instances of both spam and non-spam emails for training and testing purposes.<br>

After training the models on the dataset, we achieved an accuracy of 97% with Logistic Regression and 92% with BernoulliNB. These high accuracy scores indicate that both models performed well in distinguishing between spam and non-spam emails.<br>

We adopted the convention of assigning the label 0 to represent non-spam emails and 1 to represent spam emails. By utilizing these models, we were able to effectively predict the spam or non-spam status of new, unseen emails.<br>