In [177]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import string
import matplotlib.pyplot as plt
%matplotlib inline

### Data processing using panda library

In [178]:
dset = pd.read_csv("spam.csv",encoding="latin_1")
dset.head(8)

Unnamed: 0,class,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...


### EDA

In [179]:
dinfo=dset.info()
dinfo

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
class    5572 non-null object
SMS      5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [180]:
dset.describe()

Unnamed: 0,class,SMS
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


As from continuosly performing EDA it leads to start thinking about the features we are going to be using.Here comes the general idea of feature engineering comes. The better your domain knowledge on the data, the better your ability to engineer more features from it. Feature engineering is a very large part of spam detection in general.

Let's make a new column to detect how long the text messages are:

In [181]:
dset['Length'] = dset['SMS'].apply(len)

In [182]:
dset.head(8)

Unnamed: 0,class,SMS,Length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61
5,spam,FreeMsg Hey there darling it's been 3 week's n...,148
6,ham,Even my brother is not like to speak with me. ...,77
7,ham,As per your request 'Melle Melle (Oru Minnamin...,160


In [183]:
dset.groupby('class').count()

Unnamed: 0_level_0,SMS,Length
class,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,4825,4825
spam,747,747


### Data Visualization

In [184]:
dset['Length'].describe()

count    5572.000000
mean       80.118808
std        59.690841
min         2.000000
25%        36.000000
50%        61.000000
75%       121.000000
max       910.000000
Name: Length, dtype: float64

See what we found, A 910 character long message.
Let's use masking to find this message:

In [185]:
dset[dset['Length']==910]['SMS'].iloc[0]

"For me the love should start with attraction.i should feel that I need her every time around me.she should be the first thing which comes in my thoughts.I would start the day and end it with her.she should be there every time I dream.love will be then when my every breath has her name.my life should happen around her.my life will be named to her.I would cry for her.will give all my happiness and take all her sorrows.I will be ready to fight with anyone for her.I will be in love when I will be doing the craziest things for her.love will be when I don't have to proove anyone that my girl is the most beautiful lady on the whole planet.I will always be singing praises for her.love will be when I start up making chicken curry and end up makiing sambar.life will be the most beautiful then.will get every morning and thank god for the day because she is with me.I would like to say a lot..will tell later.."

### Text Pre-processing

In [186]:
dObject = dset['class'].values
dObject

array(['ham', 'ham', 'spam', ..., 'ham', 'ham', 'ham'], dtype=object)

In [187]:
dset.loc[dset['class']=="ham","class"] = 1

In [188]:
dset.loc[dset['class']=="spam","class"] = 0

In [189]:
dObject2=dset['class'].values
dObject2

array([1, 1, 0, ..., 1, 1, 1], dtype=int64)

In [190]:
dset.head(8)

Unnamed: 0,class,SMS,Length
0,1,"Go until jurong point, crazy.. Available only ...",111
1,1,Ok lar... Joking wif u oni...,29
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,1,U dun say so early hor... U c already then say...,49
4,1,"Nah I don't think he goes to usf, he lives aro...",61
5,0,FreeMsg Hey there darling it's been 3 week's n...,148
6,1,Even my brother is not like to speak with me. ...,77
7,1,As per your request 'Melle Melle (Oru Minnamin...,160


First removing punctuation. We can just take advantage of Python's built-in string library to get a quick list of all the possible punctuation:

In [191]:
#clean message from punctuations
def cleanMessage(message):
    nonPunc = [char for char in message if char not in string.punctuation]
    nonPunc = "".join(nonPunc)
    return nonPunc

Tokenization-( process of converting the normal text strings in to a list of tokens(also known as lemmas)).

In [192]:
dset['SMS'] = dset['SMS'].apply(cleanMessage)

In [193]:
dset.head(8)

Unnamed: 0,class,SMS,Length
0,1,Go until jurong point crazy Available only in ...,111
1,1,Ok lar Joking wif u oni,29
2,0,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,1,U dun say so early hor U c already then say,49
4,1,Nah I dont think he goes to usf he lives aroun...,61
5,0,FreeMsg Hey there darling its been 3 weeks now...,148
6,1,Even my brother is not like to speak with me T...,77
7,1,As per your request Melle Melle Oru Minnaminun...,160


Now we need to convert each of those messages into a vector the SciKit Learn's algorithm models can work with and machine learning model which we will gonig to  use can understand.

In [194]:
CV = CountVectorizer(stop_words="english",lowercase = True)

In [195]:
new_x = dset['SMS'].values
new_y = dset['class'].values

### Splitting Train and Test Data

In [196]:
new_x_train,new_x_test,y_train,y_test = train_test_split(new_x,new_y,test_size=0.2)

In [197]:
new_x_train = CV.fit_transform(new_x_train)

### Training a model

With messages represented as vectors, we can finally train our spam/ham classifier. Now we can actually use almost any sort of classification algorithms. For a variety of reasons, the Naive Bayes classifier algorithm is a good choice.

In [198]:
NB = MultinomialNB()

In [199]:
NB.fit(new_x_train_CV,new_y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Data Munging (or Data Wrangling)- which means preparing the data for dedicated purpose, taking the data from its raw state and transforming and mapping into another format.

In [200]:
new_x_test = CV.transform(new_x_test)

In [201]:
new_y_predict = NB.predict(new_x_test_CV)

In [202]:
accuracyScore = accuracy_score(new_y_test,new_y_predict)*100

In [203]:
print("Prediction Accuracy :",accuracyScore)

Prediction Accuracy : 85.38116591928251


## Linear Regression

In [204]:
from sklearn.linear_model import LogisticRegression

Regressor1 = LogisticRegression()
Regressor1.fit(new_x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [205]:
y_pred1 = Regressor1.predict(new_x_test)

In [206]:
accuracyScore = accuracy_score(new_y_test,y_pred1)*100

In [207]:
print("Prediction Accuracy :",accuracyScore)

Prediction Accuracy : 78.56502242152466


# KNN

In [208]:
from sklearn.neighbors import KNeighborsClassifier

Regressor2 = KNeighborsClassifier()
Regressor2.fit(new_x_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [209]:
y_pred2 = Regressor2.predict(new_x_test)


In [210]:
accuracyScore = accuracy_score(new_y_test,y_pred2)*100

In [211]:
print("Prediction Accuracy :",accuracyScore)

Prediction Accuracy : 83.85650224215246


# SVM With Linear Kernel

In [212]:

from sklearn.svm import SVC

Regressor3 = SVC(kernel='linear')
Regressor3.fit(new_x_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [213]:
y_pred3 = Regressor3.predict(new_x_test)


In [214]:
accuracyScore = accuracy_score(new_y_test,y_pred3)*100

In [215]:
print("Prediction Accuracy :",accuracyScore)

Prediction Accuracy : 78.20627802690584


# DECISION TREE

In [216]:
from sklearn.tree import DecisionTreeClassifier

Regressor6 = DecisionTreeClassifier(criterion='gini')
Regressor6.fit(new_x_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [217]:
y_pred6 = Regressor6.predict(new_x_test)


In [218]:
accuracyScore = accuracy_score(new_y_test,y_pred6)*100

In [219]:
print("Prediction Accuracy :",accuracyScore)

Prediction Accuracy : 78.9237668161435


# RANDOM FOREST

In [220]:
from sklearn.ensemble import RandomForestClassifier

Regressor7 = RandomForestClassifier(criterion='gini')
Regressor7.fit(new_x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [221]:
y_pred7 = Regressor7.predict(new_x_test)


In [222]:
accuracyScore = accuracy_score(new_y_test,y_pred7)*100

In [223]:
print("Prediction Accuracy :",accuracyScore)

Prediction Accuracy : 79.28251121076232
