In [54]:
import string

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer


In [55]:
# Load the SMS spam dataset (ISO 8859-1 encoded CSV).
# The path is a raw string so the backslashes of the Windows path are taken
# literally; in a normal string "\M", "\d" and "\s" are invalid escape
# sequences and trigger a SyntaxWarning on current Python versions.
# NOTE(review): this is an absolute, machine-specific path — consider making it configurable.
df = pd.read_csv(r"H:\ML\dataset\spam.csv", encoding="ISO 8859-1")

In [56]:
# Preview the first two rows of the freshly loaded frame.
df.head(2)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,


In [57]:
# Summarise the columns: dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
v1            5572 non-null object
v2            5572 non-null object
Unnamed: 2    50 non-null object
Unnamed: 3    12 non-null object
Unnamed: 4    6 non-null object
dtypes: object(5)
memory usage: 217.8+ KB


# Merge the non-null overflow columns into the message text

In [58]:
# Replace every missing cell with a single space so the string concatenation
# below never meets a NaN.
df = df.fillna(' ')
# Append the three overflow columns to the message text, left to right.
for overflow_col in ('Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'):
    df['v2'] = df['v2'] + df[overflow_col]

In [59]:
# Preview the frame after the overflow columns were appended to v2.
df.head(2)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,


# Drop the extra columns

In [60]:
# Remove the three unnamed overflow columns from the frame.
extra_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df = df.drop(columns=extra_columns)

In [61]:
# Preview the frame now that only v1 and v2 remain.
df.head(2)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...


In [62]:
# Re-check the column summary after dropping the overflow columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
v1    5572 non-null object
v2    5572 non-null object
dtypes: object(2)
memory usage: 87.2+ KB


# Strip leading and trailing whitespace

In [63]:
# Trim leading and trailing whitespace from every message.
df = df.assign(v2=df['v2'].str.strip())

# Converting all words to lowercase

In [64]:
# Lower-case every message.
df = df.assign(v2=lambda frame: frame['v2'].str.lower())

In [65]:
# Preview the stripped, lower-cased messages.
df.head(2)

Unnamed: 0,v1,v2
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...


# Remove punctuation

In [66]:
# Delete every ASCII punctuation character (string.punctuation) from the messages.
# Requires the standard-library `string` module to be imported in the import cell.
punc = string.punctuation
# With a third argument, str.maketrans builds a table that maps those characters to None.
table = str.maketrans('', '', punc)
# Vectorised string accessor instead of a per-element apply/lambda.
df['v2'] = df['v2'].str.translate(table)

# tokenize each message

In [67]:
# Split each message into a list of tokens on single spaces
# (consecutive spaces produce empty-string tokens, as str.split(" ") does).
df['v2'] = df['v2'].map(lambda message: message.split(" "))

In [68]:
# Preview the tokenised messages (v2 now holds lists).
df.head(2)

Unnamed: 0,v1,v2
0,ham,"[go, until, jurong, point, crazy, available, o..."
1,ham,"[ok, lar, joking, wif, u, oni]"


# Stemming

In [69]:
# Reduce every token of every message to its Porter stem.
ps = PorterStemmer()
df['v2'] = df['v2'].map(lambda tokens: [ps.stem(token) for token in tokens])

In [70]:
# Preview the stemmed tokens.
df.head(2)

Unnamed: 0,v1,v2
0,ham,"[go, until, jurong, point, crazi, avail, onli,..."
1,ham,"[ok, lar, joke, wif, u, oni]"


# remove stopwords

In [71]:
# Drop English stop words from every token list.
# The stop-word list is materialised once as a set; the previous expression
# evaluated stopwords.words('english') again for every single token and then
# searched the resulting list linearly.
# NOTE(review): the tokens are already stemmed at this point, so stop words whose
# stem differs from the listed form (e.g. "only" -> "onli") are not removed —
# consider filtering before stemming.
english_stopwords = frozenset(stopwords.words('english'))
df['v2'] = df['v2'].map(
    lambda tokens: [word for word in tokens if word not in english_stopwords]
)

In [72]:
# Preview the token lists after stop-word removal.
df.head(2)

Unnamed: 0,v1,v2
0,ham,"[go, jurong, point, crazi, avail, onli, bugi, ..."
1,ham,"[ok, lar, joke, wif, u, oni]"


# Remove single-character words and re-join the tokens

In [73]:
# Keep only tokens longer than one character and glue them back together
# into a single space-separated string per message.
df['v2'] = df['v2'].map(
    lambda tokens: ' '.join(token for token in tokens if len(token) > 1)
)

In [74]:
# Preview the cleaned messages (v2 is a plain string again).
df.head(2)

Unnamed: 0,v1,v2
0,ham,go jurong point crazi avail onli bugi great wo...
1,ham,ok lar joke wif oni


# Rename columns

In [75]:
# Give the two columns descriptive names: v1 -> Class, v2 -> Text.
column_names = {'v1': 'Class', 'v2': 'Text'}
df = df.rename(columns=column_names)

In [76]:
# Preview the frame with the renamed columns.
df.head(2)

Unnamed: 0,Class,Text
0,ham,go jurong point crazi avail onli bugi great wo...
1,ham,ok lar joke wif oni


# Labelling

In [77]:
# Encode the class as an integer column: ham -> 0, spam -> 1.
df = df.assign(labels=df['Class'].map({'ham': 0, 'spam': 1}))

In [78]:
# Preview the frame with the new numeric labels column.
df.head(2)

Unnamed: 0,Class,Text,labels
0,ham,go jurong point crazi avail onli bugi great wo...,0
1,ham,ok lar joke wif oni,0


# Count Spam and Ham mail

In [79]:
# Split the frame by label: 1 selects the spam rows, 0 the ham rows.
spam = df.loc[df['labels'].eq(1)]
ham = df.loc[df['labels'].eq(0)]

In [80]:
# Report how many rows of the spam subset have a non-null Class.
spam_total = spam['Class'].count()
print("Number of spam message: ", spam_total)

Number of spam message:  747


In [81]:
# Report how many rows of the ham subset have a non-null Class.
ham_total = ham['Class'].count()
print("Number of ham message: ", ham_total)

Number of ham message:  4825


# vectorize

In [82]:
# Turn the cleaned messages into a TF-IDF weighted unigram matrix.
# Requires TfidfVectorizer (sklearn.feature_extraction.text) to be imported in the import cell.
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
# Text already holds plain Python strings (it was built with ' '.join), so the
# column is passed straight to the vectorizer instead of wrapping each value in np.str_.
X = vectorizer.fit_transform(df['Text'])
# Targets are the original string classes ('ham' / 'spam'), not the numeric `labels` column.
y = df['Class']

In [83]:
# Print the sparse TF-IDF matrix as "(row, column)  weight" entries.
print(X)

  (0, 7671)	0.19177805016257746
  (0, 1146)	0.34521951095024594
  (0, 3380)	0.16200861823714563
  (0, 2022)	0.2916428378784585
  (0, 1741)	0.32954933862004965
  (0, 4260)	0.2916428378784585
  (0, 7873)	0.23460288784037356
  (0, 3418)	0.19214437897428346
  (0, 1743)	0.2916428378784585
  (0, 5268)	0.1664763353788876
  (0, 1335)	0.2625115173167652
  (0, 2241)	0.2673487474020604
  (0, 5604)	0.23572318078087146
  (0, 4116)	0.34521951095024594
  (0, 3327)	0.138659833241173
  (1, 5265)	0.5633498837724461
  (1, 7786)	0.44483654514496557
  (1, 4082)	0.4773478663822099
  (1, 4295)	0.42081977871680865
  (1, 5234)	0.2825014776211812
  (2, 71)	0.23427765466356648
  (2, 1218)	0.16757634584983688
  (2, 5888)	0.23427765466356648
  (2, 7365)	0.12401193821639256
  (2, 5842)	0.23427765466356648
  :	:
  (5568, 3130)	0.5770406346252941
  (5568, 3679)	0.3753220393700477
  (5568, 3327)	0.2963254483546816
  (5569, 6541)	0.5201967728029802
  (5569, 5541)	0.5201967728029802
  (5569, 4850)	0.43260510603313906
  

In [84]:
# Print the target classes (the Class column).
print(y)

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: Class, Length: 5572, dtype: object
