In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import pickle

In [2]:
#Loading the data
data=pd.read_csv('spam.csv',encoding='ISO-8859-1')

In [3]:
#Just looking at first five data points.
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
#Dropping the extra three columns as they are NAN type and are not useful for us.
data=data.drop(["Unnamed: 2","Unnamed: 3","Unnamed: 4"],axis=1)

In [5]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
#Renaming the columns as v1-->label and v2-->SMS
data.columns=['label','SMS']

In [7]:
#Remapping the values of ham and spam as 1 and 0 as Machine Learning can work on numbers only.
new_labels=data['label'].map({'ham':0,'spam':1})

In [8]:
#Replacing older label with new one.
data['new_labels']=new_labels

In [9]:
g = data.groupby('new_labels')
data=g.apply(lambda x: x.sample(747).reset_index(drop=True))


In [10]:
data['new_labels'].value_counts()

1    747
0    747
Name: new_labels, dtype: int64

In [11]:
targets=data.iloc[:,[2]]

In [12]:
#Initialsing count vectorizer.
Count_Vec=CountVectorizer(decode_error='ignore')

In [13]:
#Converting each SMS into Vectors(Bag of Words.)
data_vectors=Count_Vec.fit_transform(data['SMS'])

In [14]:
#Splitting the data.
X_train,X_test,Y_train,Y_test=train_test_split(data_vectors,targets,test_size=0.33)

In [15]:
clf=LogisticRegression()
clf.fit(X_train,Y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [16]:
from sklearn.metrics import accuracy_score
pred=clf.predict(X_test)
print("Accuracy_score=",accuracy_score(pred,Y_test))

Accuracy_score= 0.9392712550607287


In [17]:
#creating a dataframe with a single column for testing.
cust_data=pd.DataFrame(['Thanks for your Ringtone Order, Reference T91. You will be charged GBP 4 per week. You can unsubscribe at anytime by calling customer services on 09057039994'],columns=['SMS'])

In [18]:
cust_data.head()

Unnamed: 0,SMS
0,"Thanks for your Ringtone Order, Reference T91...."


In [19]:
#Transforming the data into vectors.
cnt_vec=CountVectorizer(decode_error='ignore')
cnt_vec.fit(data['SMS'])
vec=cnt_vec.transform(cust_data['SMS'])

In [20]:
predicted_label=pd.DataFrame([clf.predict(vec)],columns=['pred'])

In [21]:
predicted_label.head()

Unnamed: 0,pred
0,1


In [22]:
predicted_label=predicted_label['pred'].map({1:'Spam',0:'Not Spam'}).values

In [23]:
print("The above message is=",predicted_label[0])

The above message is= Spam


In [24]:
re=clf.predict(X_test)

In [25]:
#Lets save the model.
from sklearn.externals import joblib
joblib.dump(clf,"model.pkl")


['model.pkl']

In [26]:
#Lets test whether the model has been saved successfully or not.
model=joblib.load('model.pkl')
model.predict(X_test)

array([0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1,

In [27]:
#Lets also dump the count vectorizer for later using it.
pickle.dump(cnt_vec.vocabulary_,open("count_vec.pkl","wb"))
