<a href="https://colab.research.google.com/github/Nikitachand/AI_projects/blob/main/Email_Spam_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import string                                                                        #importing libraries
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [None]:
# Download the NLTK 'stopwords' corpus; it is read later via stopwords.words('english').
# The call's return value is shown as the cell output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Path of the spam/ham CSV (an absolute path under /content, presumably the
# Colab runtime's working directory — adjust when running elsewhere).
DATA_PATH = '/content/spam_ham_dataset.csv'

# Load the whole dataset into a DataFrame.
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to check the columns that were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [None]:
# Replace Windows line breaks (CRLF) with single spaces so each email is one line.
# The vectorised .str accessor performs a literal (non-regex) replacement and
# propagates missing values as NaN instead of raising inside a Python lambda.
df['text'] = df['text'].str.replace('\r\n', ' ', regex=False)


In [None]:
# Spot-check: display the text of the row at position 2.
df.text.iloc[2]

"Subject: neon retreat ho ho ho , we ' re around to that most wonderful time of the year - - - neon leaders retreat time ! i know that this time of year is extremely hectic , and that it ' s tough to think about anything past the holidays , but life does go on past the week of december 25 through january 1 , and that ' s what i ' d like you to think about for a minute . on the calender that i handed out at the beginning of the fall semester , the retreat was scheduled for the weekend of january 5 - 6 . but because of a youth ministers conference that brad and dustin are connected with that week , we ' re going to change the date to the following weekend , january 12 - 13 . now comes the part you need to think about . i think we all agree that it ' s important for us to get together and have some time to recharge our batteries before we get to far into the spring semester , but it can be a lot of trouble and difficult for us to get away without kids , etc . so , brad came up with a pote

In [None]:
# Print the column dtypes, non-null counts and memory usage of the DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [None]:
# Text normalisation: every email is lower-cased, stripped of ASCII punctuation,
# split on whitespace, filtered against the NLTK English stop-word list and
# reduced to Porter stems; the surviving stems are re-joined with single spaces.
stemmer = PorterStemmer()
stopwords_set = set(stopwords.words('english'))

# Translation table that deletes every character in string.punctuation.
# Built once here instead of once per email inside the loop.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Return ``text`` as a space-joined string of stemmed, non-stop-word tokens.

    Parameters
    ----------
    text : str
        Raw email text.

    Returns
    -------
    str
        Lower-cased, punctuation-free tokens with stop words removed and the
        remainder Porter-stemmed, joined by single spaces. Empty string when
        no token survives.
    """
    tokens = text.lower().translate(_PUNCT_TABLE).split()
    # The stop-word test is applied to the unstemmed token, then the token is stemmed.
    return ' '.join(stemmer.stem(tok) for tok in tokens if tok not in stopwords_set)


# One cleaned document per row, in the same order as df.
corpus = [preprocess_text(message) for message in df['text']]

In [None]:
# Bag-of-words features: one column per vocabulary term, values are term counts.
vectorizer = CountVectorizer()

# Keep the document-term matrix sparse: almost every entry is zero, and both
# train_test_split and the tree-based estimator accept SciPy sparse input, so a
# dense copy would only multiply memory use.
X = vectorizer.fit_transform(corpus)

# Target: the numeric label column (in the previewed rows, ham is 0 and spam is 1).
y = df.label_num

# Hold out 20% of the emails for evaluation.
#  - random_state pins the shuffle so every run evaluates on the same split
#    and the reported metrics are reproducible;
#  - stratify=y keeps the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# Random-forest classifier on the bag-of-words features.
#  - n_jobs=-1 lets scikit-learn use all available CPU cores;
#  - random_state fixes the forest's internal randomness (bootstrap samples and
#    per-split feature subsets) so re-running the notebook gives the same model.
model = RandomForestClassifier(n_jobs=-1, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Mean accuracy of the fitted classifier on the held-out test split.
model.score(X_test,y_test)

0.9768115942028985

In [None]:
# Pick the email at position 10 as a sample input for a single prediction.
# NOTE(review): the train/test split is random over all rows, so this row may
# have been part of the training data — treat the result as a smoke test only.
email_to_classify = df['text'].iloc[10]

In [None]:
# Display the raw text of the selected email.
email_to_classify

"Subject: vocable % rnd - word asceticism vcsc - brand new stock for your attention vocalscape inc - the stock symbol is : vcsc vcsc will be our top stock pick for the month of april - stock expected to bounce to 12 cents level the stock hit its all time low and will bounce back stock is going to explode in next 5 days - watch it soar watch the stock go crazy this and next week . breaking news - vocalscape inc . announces agreement to resell mix network services current price : $ 0 . 025 we expect projected speculative price in next 5 days : $ 0 . 12 we expect projected speculative price in next 15 days : $ 0 . 15 vocalscape networks inc . is building a company that ' s revolutionizing the telecommunications industry with the most affordable phone systems , hardware , online software , and rates in canada and the us . vocalscape , a company with global reach , is receiving international attention for the development of voice over ip ( voip ) application solutions , including the award 

In [None]:
# Normalise the single email with the same steps used for the training corpus:
# lower-case, delete punctuation, split on whitespace, drop stop words, stem.
punct_table = str.maketrans('', '', string.punctuation)
email_tokens = email_to_classify.lower().translate(punct_table).split()
email_text = ' '.join(
    stemmer.stem(token) for token in email_tokens if token not in stopwords_set
)

# The vectorizer works on an iterable of documents, so wrap the string in a list.
email_corpus = [email_text]

# transform (not fit_transform): map the email onto the vocabulary already
# learned from the corpus so its columns line up with the training features.
x_email = vectorizer.transform(email_corpus)

In [None]:
# Predict the label of the single vectorized email; the result is an array with
# one entry, using the same encoding as label_num.
model.predict(x_email)

array([1])

In [None]:
# Predicted labels for every row of the test split; the metric cells further
# down compare these against y_test.
y_predict=model.predict(X_test)

In [None]:
# Ground-truth label of the sample email (position 10), for comparison with the
# prediction made for it.
df['label_num'].iloc[10]

1

In [None]:
# Fraction of test emails classified correctly, computed from the predictions
# already stored in y_predict rather than predicting the test set again.
accuracy = accuracy_score(y_test, y_predict)
accuracy

0.9777777777777777

In [None]:
# Precision for the positive class (label 1): TP / (TP + FP).
# pos_label and average are written out explicitly; both equal the defaults.
precision = precision_score(y_test, y_predict, pos_label=1, average='binary')
precision


0.9622641509433962

In [None]:
# Recall for the positive class (label 1): TP / (TP + FN).
# Binary averaging (the default) is used so this figure is directly comparable
# with the binary precision computed earlier. Support-weighted recall is
# numerically identical to overall accuracy, so it would only repeat that number.
recall = recall_score(y_test, y_predict)
recall

0.9777777777777777

In [None]:
# F1 for the positive class (label 1): the harmonic mean of that class's
# precision and recall. Binary averaging (the default) matches the averaging
# used for the precision score; a support-weighted F1 mixes both classes and is
# not the harmonic mean of the single-class figures.
f1 = f1_score(y_test, y_predict)
f1

0.977787532655979

In [None]:
# Per-class precision, recall, F1 and support for the test split, plus averages.
report = classification_report(y_test, y_predict)
print("\nClassification Report:", report, sep="\n")


Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       718
           1       0.96      0.97      0.96       317

    accuracy                           0.98      1035
   macro avg       0.97      0.97      0.97      1035
weighted avg       0.98      0.98      0.98      1035

