In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# Load the spam/ham messages, keeping only the first two columns of the file.
# skiprows=1 skips the file's first line (presumably its own header row) so the
# column names supplied via `names` are used instead.
data = pd.read_csv(
    'spam.csv',
    encoding="ISO-8859-1",
    skiprows=1,
    usecols=[0, 1],
    names=["label", "message"],
)

In [4]:
# Display the loaded DataFrame (columns: label, message) as the cell output.
data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
# Count missing values per column. Had any been present, they would be replaced
# with an empty string ('') before vectorizing.
data.isnull().sum()

label      0
message    0
dtype: int64

In [7]:
# Numeric target column: spam -> 0, every other label (i.e. ham) -> 1.
# The original text label column is left untouched.
data['Id'] = data['label'].map(lambda label: 0 if label == 'spam' else 1)

In [8]:
# Show the first rows to confirm the new Id column lines up with the labels.
data.head()

Unnamed: 0,label,message,Id
0,ham,"Go until jurong point, crazy.. Available only ...",1
1,ham,Ok lar... Joking wif u oni...,1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0
3,ham,U dun say so early hor... U c already then say...,1
4,ham,"Nah I don't think he goes to usf, he lives aro...",1


In [9]:
# Model input is the raw message text; the target is the numeric Id column
# (0 = spam, 1 = ham).
x, y = data['message'], data['Id']

In [10]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=3,
)

In [11]:
#feature extraction: convert each message into a TF-IDF weighted term vector
#min_df - minimum document frequency: terms appearing in fewer than min_df
#         messages are dropped (min_df=1 keeps every term); it is not a score
#stop_words='english' - common English stop words are ignored
#lowercase=True - all letters are changed to lowercase before tokenizing
feature=TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)

In [12]:
# Learn the vocabulary and IDF weights from the training messages only, then
# apply that same fitted vectorizer to the test messages so both sets share one
# feature space and nothing from the test set influences the fit.
x_train_features=feature.fit_transform(x_train)
x_test_features=feature.transform(x_test)

In [13]:
# Print the sparse training matrix: each output line is a (row, column) index
# pair followed by the stored TF-IDF weight.
print(x_train_features)

  (0, 741)	0.3219352588930141
  (0, 3979)	0.2410582143632299
  (0, 4296)	0.3891385935794867
  (0, 6599)	0.20296878731699391
  (0, 3386)	0.3219352588930141
  (0, 2122)	0.38613577623520473
  (0, 3136)	0.440116181574609
  (0, 3262)	0.25877035357606315
  (0, 3380)	0.21807195185332803
  (0, 4513)	0.2909649098524696
  (1, 4061)	0.380431198316959
  (1, 6872)	0.4306015894277422
  (1, 6417)	0.4769136859540388
  (1, 6442)	0.5652509076654626
  (1, 7443)	0.35056971070320353
  (2, 933)	0.4917598465723273
  (2, 2109)	0.42972812260098503
  (2, 3917)	0.40088501350982736
  (2, 2226)	0.413484525934624
  (2, 5825)	0.4917598465723273
  (3, 6140)	0.4903863168693604
  (3, 1599)	0.5927091854194291
  (3, 1842)	0.3708680641487708
  (3, 7453)	0.5202633571003087
  (4, 2531)	0.7419319091456392
  :	:
  (4452, 2122)	0.31002103760284144
  (4453, 999)	0.6760129013031282
  (4453, 7273)	0.5787739591782677
  (4453, 1762)	0.45610005640082985
  (4454, 3029)	0.42618909997886
  (4454, 2086)	0.3809693742808703
  (4454, 3088)

In [14]:
# Logistic regression classifier created with scikit-learn's default settings.
log_reg=LogisticRegression()

In [15]:
# Train the classifier on the TF-IDF features of the training messages.
log_reg.fit(x_train_features,y_train)

In [16]:
# Mean accuracy on the held-out test messages.
log_reg.score(x_test_features,y_test)

0.9623318385650225

In [17]:
# Mean accuracy on the training messages, for comparison with the test score.
log_reg.score(x_train_features,y_train)

0.9661207089970832

In [18]:
#The train and test accuracies are close (about 0.966 vs 0.962), so the model shows no sign of overfitting

In [19]:
# Training accuracy computed explicitly from the predicted labels; this is the
# same quantity log_reg.score reports for the training data.
y_train_pred = log_reg.predict(x_train_features)
accuracy_train = accuracy_score(y_true=y_train, y_pred=y_train_pred)
accuracy_train

0.9661207089970832

In [20]:
# Test accuracy computed explicitly from the predicted labels; this is the
# same quantity log_reg.score reports for the test data.
y_test_pred = log_reg.predict(x_test_features)
accuracy_test = accuracy_score(y_true=y_test, y_pred=y_test_pred)
accuracy_test

0.9623318385650225

In [21]:
# Building a predictive system: classify one raw message end to end.
input_mail = ["Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030"]
# Reuse the already-fitted vectorizer so the message is mapped onto the
# vocabulary learned from the training data.
input_features = feature.transform(input_mail)
pred = log_reg.predict(input_features)
# Label encoding used in this notebook: 1 = ham, anything else (0) = spam.
print("Not a Spam Mail" if pred[0] == 1 else "Spam Mail")

Spam Mail


In [22]:
# Predict a label for every message in the dataset. This covers both the
# training and the test rows, so these predictions are for inspection/export
# rather than an unbiased performance estimate.
x_features=feature.transform(x)
y_pred=log_reg.predict(x_features)

In [27]:
# Attach the numeric predictions, then derive a readable label from them
# (0 -> 'spam', anything else -> 'ham'), and display the augmented frame.
data['Id_pred'] = y_pred
data['label_pred'] = data['Id_pred'].map(lambda code: 'spam' if code == 0 else 'ham')
data

Unnamed: 0,label,message,Id,Id_pred,label_pred
0,ham,"Go until jurong point, crazy.. Available only ...",1,1,ham
1,ham,Ok lar... Joking wif u oni...,1,1,ham
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,0,0,spam
3,ham,U dun say so early hor... U c already then say...,1,1,ham
4,ham,"Nah I don't think he goes to usf, he lives aro...",1,1,ham
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,0,0,spam
5568,ham,Will Ì_ b going to esplanade fr home?,1,1,ham
5569,ham,"Pity, * was in mood for that. So...any other s...",1,1,ham
5570,ham,The guy did some bitching but I acted like i'd...,1,1,ham


In [29]:
# Export the true label, the message text and the predicted label side by side.
# The DataFrame index is written as the first CSV column (to_csv default).
result = {
    'Label': data['label'],
    'Message': data['message'],
    'Label_pred': data['label_pred'],
}
pd.DataFrame(result).to_csv("Result.csv")