# Project - Email Spam Classification
## Aim: Classify emails as spam or genuine based on the body of the email, using various machine learning techniques and algorithms
### About the dataset:
### 1. text : Contains the text of each email message, beginning with its subject line
### 2. spam : Classified as spam or genuine

## Import the dependencies

In [1]:
import pickle
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

#### Downloading the stopwords

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vansh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Printing stopwords in english

In [3]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## Loading the dataset

In [34]:
data = pd.read_csv('emails.csv')

In [35]:
data

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,Spam
1,Subject: the stock trading gunslinger fanny i...,Spam
2,Subject: unbelievable new homes made easy im ...,Spam
3,Subject: 4 color printing special request add...,Spam
4,"Subject: do not have money , get software cds ...",Spam
...,...,...
5721,Subject: re : research and development charges...,Genuine
5722,"Subject: re : receipts from visit jim , than...",Genuine
5723,Subject: re : enron case study update wow ! a...,Genuine
5724,"Subject: re : interest david , please , call...",Genuine


In [36]:
# Give the label column a more descriptive name, then preview the first rows.
data = data.rename(columns={'spam': 'result'})
data.head()

Unnamed: 0,text,result
0,Subject: naturally irresistible your corporate...,Spam
1,Subject: the stock trading gunslinger fanny i...,Spam
2,Subject: unbelievable new homes made easy im ...,Spam
3,Subject: 4 color printing special request add...,Spam
4,"Subject: do not have money , get software cds ...",Spam


In [37]:
data['result'].value_counts()

result
Genuine    4358
Spam       1368
Name: count, dtype: int64

## Data Preprocessing

### Using Natural Language Processing on the text column

In [38]:
# Checking the null values 
data.isnull().sum()

text      0
result    0
dtype: int64

### Applying stemming, i.e. reducing each word to its root form

In [39]:
port_stem = PorterStemmer()

In [40]:
# Build the stop-word lookup and the cleaning pattern once, rather than
# re-reading the NLTK word list for every single word of every email.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
NON_LETTER_PATTERN = re.compile('[^a-zA-Z]')


def stemming(content):
    """Normalise an email's text into a string of stemmed, stop-word-free tokens.

    Parameters
    ----------
    content : str
        Raw email text.

    Returns
    -------
    str
        Lower-cased alphabetic tokens with English stop words removed, each
        reduced by ``port_stem`` to its stem and joined by single spaces.
    """
    # Anything that is not an ASCII letter (digits, punctuation, ...) becomes a separator.
    letters_only = NON_LETTER_PATTERN.sub(' ', content)
    tokens = letters_only.lower().split()
    # Set membership is O(1); the original scanned a freshly built list per token.
    stems = [port_stem.stem(token) for token in tokens if token not in ENGLISH_STOPWORDS]
    return ' '.join(stems)

In [41]:
data['text'] = data['text'].apply(stemming)

In [42]:
data['text']

0       subject natur irresist corpor ident lt realli ...
1       subject stock trade gunsling fanni merril muzo...
2       subject unbeliev new home made easi im want sh...
3       subject color print special request addit info...
4       subject money get softwar cd softwar compat gr...
                              ...                        
5721    subject research develop charg gpg forward shi...
5722    subject receipt visit jim thank invit visit ls...
5723    subject enron case studi updat wow day super t...
5724    subject interest david pleas call shirley cren...
5725    subject news aurora updat aurora version faste...
Name: text, Length: 5726, dtype: object

### Separating data and label

In [43]:
x = data['text']
y = data['result']

In [44]:
print(x)

0       subject natur irresist corpor ident lt realli ...
1       subject stock trade gunsling fanni merril muzo...
2       subject unbeliev new home made easi im want sh...
3       subject color print special request addit info...
4       subject money get softwar cd softwar compat gr...
                              ...                        
5721    subject research develop charg gpg forward shi...
5722    subject receipt visit jim thank invit visit ls...
5723    subject enron case studi updat wow day super t...
5724    subject interest david pleas call shirley cren...
5725    subject news aurora updat aurora version faste...
Name: text, Length: 5726, dtype: object


In [45]:
print(y)

0          Spam
1          Spam
2          Spam
3          Spam
4          Spam
         ...   
5721    Genuine
5722    Genuine
5723    Genuine
5724    Genuine
5725    Genuine
Name: result, Length: 5726, dtype: object


### Converting text into numerical values

In [70]:
# Learn the TF-IDF vocabulary and weights from the corpus and encode it as a
# document-term matrix in a single step.
# NOTE(review): the vectorizer is fitted on every email before the train/test
# split further down, so the IDF weights also reflect the test emails;
# consider fitting on the training portion only.
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(x)

In [71]:
print(x)

  (0, 372)	0.09611145440683241
  (0, 477)	0.09764864656172705
  (0, 810)	0.07675149660769723
  (0, 1521)	0.14089546197020977
  (0, 1961)	0.14985988502484038
  (0, 2059)	0.07666832744153454
  (0, 2669)	0.08563124228583953
  (0, 2866)	0.09083300948423838
  (0, 2962)	0.09604550727103181
  (0, 3357)	0.11531736520667987
  (0, 3587)	0.11428941978497648
  (0, 3854)	0.13173997357408934
  (0, 3990)	0.08344437770415174
  (0, 4179)	0.09276206560678589
  (0, 4298)	0.15920197987906035
  (0, 4575)	0.15451756602525055
  (0, 4622)	0.07531048596160969
  (0, 4718)	0.06907233726808736
  (0, 4884)	0.08215944878019439
  (0, 5343)	0.04733142571892892
  (0, 6038)	0.10645096248329243
  (0, 6194)	0.0702507941549344
  (0, 6294)	0.08149556091461164
  (0, 6560)	0.08075088132858242
  (0, 6561)	0.09338373219690309
  :	:
  (5725, 21194)	0.05828695969958239
  (5725, 21234)	0.03594228646230994
  (5725, 21570)	0.008310218937670791
  (5725, 21963)	0.04282370035945728
  (5725, 22145)	0.034366862349842445
  (5725, 22179)	

In [72]:
# Encode the labels numerically: 1 = spam, 0 = genuine.
# Reassign instead of mutating in place, and cast explicitly so the labels are
# integers without depending on replace() inferring a new dtype on its own.
y = y.replace({'Spam': 1, 'Genuine': 0}).astype('int64')
print(y)

0       1
1       1
2       1
3       1
4       1
       ..
5721    0
5722    0
5723    0
5724    0
5725    0
Name: result, Length: 5726, dtype: int64


## Validating the data - Splitting into train and test

In [73]:
# Hold out 20% of the emails for testing. Stratifying on the label keeps the
# spam/genuine ratio the same in both parts; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=40,
)

In [74]:
x_train.shape

(4580, 25536)

In [75]:
x_test.shape

(1146, 25536)

## Training the model - Logistic Regression

In [76]:
model = LogisticRegression()

In [79]:
model.fit(x_train, y_train)

## Evaluation

### Testing accuracy on training set

In [80]:
# Predict on the emails the model was trained on and score against their labels.
# accuracy_score takes its arguments in the order (y_true, y_pred).
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(y_train, x_train_prediction)

In [85]:
print('Accuracy score of the training data : ', training_data_accuracy* 100, "%")

Accuracy score of the training data :  99.65065502183405 %


### Testing accuracy on testing set

In [82]:
# Predict on the held-out emails and score against their labels.
# accuracy_score takes its arguments in the order (y_true, y_pred).
x_test_prediction = model.predict(x_test)
testing_data_accuracy = accuracy_score(y_test, x_test_prediction)

In [84]:
print('Accuracy score of the testing data : ', testing_data_accuracy * 100, "%")

Accuracy score of the testing data :  98.42931937172776 %


## Making a predictive system

In [87]:
# Classify a single held-out email (row 45 of the test matrix).
x_new = x_test[45]

prediction = model.predict(x_new)
print(prediction)

# Label encoding used in this notebook: 0 = genuine, 1 = spam.
if prediction[0] == 0:
    print('The email is not spam')
else:
    print('The email is spam')

[0]
The email is not spam


In [89]:
print(y_test.iloc[45])

0


## Saving model using pickle

In [90]:
# Persist the trained classifier together with the fitted TF-IDF vectorizer.
# The model only accepts vectors produced by this exact vectorizer, so new raw
# emails cannot be scored after reloading unless it is saved as well.
with open('email_spam_detection.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('email_spam_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
