# Spam Classifier

Import Packages

In [1]:
import pandas as pd
import numpy as np
import os
import nltk
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

Read Spam-Ham csv file

In [2]:
#Get file path
root = os.path.dirname( os.path.abspath('SpamClassifier.ipynb'))
data_path = "{0}/spam_ham_dataset.csv".format(root)
#Read csv file
data = pd.read_csv(data_path)[['label', 'text']]
#Print first five rows of csv
data.head()

Unnamed: 0,label,text
0,ham,enron methanol ; meter # : 988291\r\nthis is ...
1,ham,"hpl nom for january 9 , 2001\r\n( see attache..."
2,ham,"neon retreat\r\nho ho ho , we ' re around to ..."
3,spam,"photoshop , windows , office . cheap . main t..."
4,ham,re : indian springs\r\nthis deal is to book t...


In [3]:
#Dataframe Describe

print("There are {0} datapoints and {1} feature(s) in this dataset. \n".format(data.shape[0],int(data.shape[1])-1))


There are 5171 datapoints and 1 feature(s) in this dataset. 



# Pre-Processing Text

Remove stopwords, punctuations and stemm the tokens

In [4]:
def rem_punctuations(text):
    new_text =  "".join([t for t in text if t not in string.punctuation])
    return new_text


def text_cleaning(text):
    new_text =  "".join([t for t in text if t not in string.punctuation])
    tokens = nltk.tokenize.word_tokenize(new_text)
    ps = nltk.PorterStemmer()
    stopwords = nltk.corpus.stopwords.words('english')
    new_text = [ps.stem(word) for word in tokens if word not in stopwords]
    return new_text



Test text_cleaning method

In [5]:
print(data['text'][3])
data['clean_text'] = data['text'].apply(lambda x: text_cleaning(x))
print(data.head())
#print(text_cleaning(rem_punctuations(data['text'][0:4])))


 photoshop , windows , office . cheap . main trending
abasements darer prudently fortuitous undergone
lighthearted charm orinoco taster
railroad affluent pornographic cuvier
irvin parkhouse blameworthy chlorophyll
robed diagrammatic fogarty clears bayda
inconveniencing managing represented smartness hashish
academies shareholders unload badness
danielson pure caffein
spaniard chargeable levin

  label                                               text  \
0   ham   enron methanol ; meter # : 988291\r\nthis is ...   
1   ham   hpl nom for january 9 , 2001\r\n( see attache...   
2   ham   neon retreat\r\nho ho ho , we ' re around to ...   
3  spam   photoshop , windows , office . cheap . main t...   
4   ham   re : indian springs\r\nthis deal is to book t...   

                                          clean_text  
0  [enron, methanol, meter, 988291, follow, note,...  
1  [hpl, nom, januari, 9, 2001, see, attach, file...  
2  [neon, retreat, ho, ho, ho, around, wonder, ti...  
3  [photos

Data Analysis

In [83]:
#Check for null values

print(data['clean_text'].isna().sum())



0


Train Test Split of data

In [13]:

train_X, test_X, train_y, test_y = train_test_split(data[['clean_text']], data['label'], test_size=0.20, random_state=42)
train_y.head()

5132     ham
2067    spam
4716     ham
4710     ham
2268    spam
Name: label, dtype: object

Text Vectorizing

In [85]:
tfidf_vect = TfidfVectorizer(analyzer=text_cleaning)
tfidf_vect_fit = tfidf_vect.fit(train_X['clean_text'])
tfidf_train = tfidf_vect_fit.transform(train_X['clean_text'])
tfidf_test = tfidf_vect_fit.transform(test_X['clean_text'])
tfidf_train_vect = pd.DataFrame(tfidf_train.toarray())
tfidf_test_vect = pd.DataFrame(tfidf_test.toarray())
tfidf_train_vect.head()



[1. 1. 1. ... 1. 1. 1.]


Building a Logistic Regression model for Classification. Calculate fitting time and prediction time for the given dataset

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support as score
import time
start = time.time()
logReg = LogisticRegression(random_state=10, max_iter=500, n_jobs = -1).fit(tfidf_train, train_y)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = logReg.predict(tfidf_test)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, support = score(test_y, y_pred, pos_label='spam', average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==test_y).sum()/len(y_pred), 3)))

Fit time: 1.978 / Predict time: 0.001 ---- Precision: 0.0 / Recall: 0.0 / Accuracy: 0.717


  _warn_prf(average, modifier, msg_start, len(result))


Building a Random Forest model for Classification. Calculate fitting time and prediction time for the given dataset

In [94]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [97]:
start = time.time()
randFor = RandomForestClassifier(n_estimators=3000, max_depth=None, n_jobs = -1).fit(tfidf_train, train_y)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = randFor.predict(tfidf_test)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, support = score(test_y, y_pred, pos_label='spam', average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==test_y).sum()/len(y_pred), 3)))

Fit time: 48.563 / Predict time: 10.266 ---- Precision: 0.306 / Recall: 1.0 / Accuracy: 0.357
