# **EMAIL SPAM DETECTION USING MACHINE LEARNING**

In [3]:
import string
import pandas as pd
import nltk
from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer

## Getting Dataset

In [4]:
path = '/content/drive/MyDrive/Datasets/spam.csv'
df = pd.read_csv(path, encoding='latin-1')
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


## Cleaning unnecessary data

In [5]:
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


## Removing Stopwords

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


## Stemmerizing the mails and removing all special characters

In [8]:
stemmer = PorterStemmer()

stopwords_ = set(stopwords.words('english'))

corpus = []

for i in range(len(df)):
  text = df['v2'].iloc[i].lower()
  text = text.translate(str.maketrans('', '', string.punctuation)).split()
  text = [stemmer.stem(word) for word in text if word not in stopwords_]
  text = ' '.join(text)
  corpus.append(text)

## Encoding Mails into matrix form

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
vect = CountVectorizer()

x = vect.fit_transform(corpus).toarray()
y = df['v1']
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [11]:
y

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: v1, Length: 5572, dtype: object

## Splitting data into training and testing variables and training the model

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [14]:
rfc = RandomForestClassifier(n_jobs=-1)

In [15]:
rfc.fit(x_train, y_train)

## Checking the Accuracy score

In [16]:
print(f'Accuracy score is: {rfc.score(x_test, y_test)}')

Accuracy score is: 0.967713004484305


## Predicting the outcome

In [17]:
y_pred = rfc.predict(x_test)
y_pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)

## Cross Checking / Evaluating the model

In [18]:
mail = df['v2'].iloc[2]
text = mail.lower()
text = text.translate(str.maketrans('', '', string.punctuation)).split()
text = [stemmer.stem(word) for word in text if word not in stopwords_]
text = ' '.join(text)

corpus_mail = [text]

x_mail = vect.transform(corpus_mail)

In [19]:
rfc.predict(x_mail)

array(['spam'], dtype=object)

In [20]:
df['v1'].iloc[2]

'spam'