# SMS Spam Collection Classification

In [48]:
import chardet
import pandas as pd

# Sniff the character encoding from the file's raw bytes.
with open('spam.csv', 'rb') as raw_file:
    raw_bytes = raw_file.read()
result = chardet.detect(raw_bytes)
detected_encoding = result['encoding']
print(detected_encoding)

# Parse the CSV with the encoding chardet reported.
messages = pd.read_csv("spam.csv", encoding=detected_encoding)


Windows-1252


In [49]:
# Drop the three extra 'Unnamed' columns; only v1 and v2 are used afterwards.
# `columns=` already selects the column axis, so no separate axis argument is needed.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
messages = messages.drop(columns=unused_columns)

In [50]:
# Give the raw v1/v2 columns descriptive names: v1 -> label, v2 -> message.
column_names = {
    'v1': 'label',
    'v2': 'message',
}
messages = messages.rename(columns=column_names)

In [51]:
# Preview the first rows to confirm the frame now holds just the label and message columns.
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Data Cleaning and Preprocessing

In [52]:
import re
import nltk
# Fetch NLTK's stop-word lists (reported as already up-to-date when present).
# The call's return value is the last expression, so it is shown as the cell output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vinay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [53]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Shared Porter stemmer instance used by the corpus-building loop in the next cell.
ps=PorterStemmer()


In [54]:
# Build the stemmed corpus: one cleaned, space-joined string per SMS message.
# Per message: replace every character that is not an ASCII letter with a space,
# lowercase, split on whitespace, drop English stop words, Porter-stem the rest.
#
# The stop-word list is loaded ONCE into a set. The original evaluated
# stopwords.words('english') inside the comprehension (once per token) and
# tested membership against a list, which made this cell needlessly slow.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate the column values directly instead of messages['message'][i]: the latter
# is a label lookup and only works while the index is exactly 0..len-1.
for message in messages['message']:
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(' '.join(stems))

In [55]:
# Display the stemmed corpus (one cleaned string per message).
# NOTE(review): this prints every message; corpus[:5] would keep the output short.
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl week word back like fun still tb ok xxx std chg send rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'search right word thank breather

### Create the Bag of Words

In [56]:
from sklearn.feature_extraction.text import CountVectorizer
# Count-based bag of words, keeping at most 2500 terms in the vocabulary.
# For a binary (presence/absence) bag of words, also pass binary=True.
cv = CountVectorizer(
    max_features=2500,
)

In [57]:
# Learn the vocabulary from the corpus, then expand the sparse result to a dense array.
sparse_counts = cv.fit_transform(corpus)
X = sparse_counts.toarray()

In [58]:
# Shape of the dense bag-of-words matrix: (number of messages, vocabulary size).
X.shape

(5572, 2500)

In [59]:
# Show the dense count matrix; the array repr elides the middle rows and columns.
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [60]:
# Learned vocabulary: maps each kept term to its column index in X.
# NOTE(review): this prints the whole mapping; consider showing only a few entries.
cv.vocabulary_

{'go': np.int64(871),
 'point': np.int64(1592),
 'crazi': np.int64(484),
 'avail': np.int64(158),
 'bugi': np.int64(294),
 'great': np.int64(903),
 'world': np.int64(2426),
 'la': np.int64(1163),
 'cine': np.int64(391),
 'got': np.int64(888),
 'wat': np.int64(2355),
 'ok': np.int64(1488),
 'lar': np.int64(1171),
 'joke': np.int64(1126),
 'wif': np.int64(2393),
 'oni': np.int64(1496),
 'free': np.int64(811),
 'entri': np.int64(681),
 'wkli': np.int64(2414),
 'comp': np.int64(431),
 'win': np.int64(2397),
 'fa': np.int64(724),
 'cup': np.int64(498),
 'final': np.int64(767),
 'tkt': np.int64(2182),
 'st': np.int64(2015),
 'may': np.int64(1315),
 'text': np.int64(2130),
 'receiv': np.int64(1704),
 'question': np.int64(1663),
 'std': np.int64(2026),
 'txt': np.int64(2257),
 'rate': np.int64(1684),
 'appli': np.int64(114),
 'dun': np.int64(620),
 'say': np.int64(1838),
 'earli': np.int64(626),
 'hor': np.int64(1013),
 'alreadi': np.int64(73),
 'nah': np.int64(1425),
 'think': np.int64(2146),

### N-Grams

In [61]:
from sklearn.feature_extraction.text import CountVectorizer
# Binary (presence/absence) unigram features, vocabulary capped at 100 terms.
# X is kept as the sparse matrix returned by fit_transform (no .toarray() here).
cv = CountVectorizer(
    max_features=100,
    binary=True,
    ngram_range=(1, 1),
)
X = cv.fit_transform(corpus)

In [62]:
# Term -> column-index mapping learned by the 100-feature unigram vectorizer.
cv.vocabulary_

{'go': np.int64(22),
 'great': np.int64(25),
 'got': np.int64(24),
 'wat': np.int64(90),
 'ok': np.int64(56),
 'free': np.int64(18),
 'win': np.int64(94),
 'text': np.int64(77),
 'txt': np.int64(85),
 'say': np.int64(67),
 'alreadi': np.int64(0),
 'think': np.int64(80),
 'hey': np.int64(28),
 'week': np.int64(92),
 'back': np.int64(3),
 'like': np.int64(38),
 'still': np.int64(73),
 'send': np.int64(69),
 'even': np.int64(15),
 'friend': np.int64(19),
 'prize': np.int64(62),
 'claim': np.int64(7),
 'call': np.int64(4),
 'mobil': np.int64(47),
 'co': np.int64(8),
 'home': np.int64(30),
 'want': np.int64(89),
 'today': np.int64(82),
 'cash': np.int64(6),
 'day': np.int64(12),
 'repli': np.int64(64),
 'www': np.int64(96),
 'right': np.int64(65),
 'thank': np.int64(78),
 'take': np.int64(75),
 'time': np.int64(81),
 'use': np.int64(87),
 'messag': np.int64(44),
 'oh': np.int64(55),
 'ye': np.int64(97),
 'make': np.int64(42),
 'way': np.int64(91),
 'feel': np.int64(16),
 'dont': np.int64(14

In [63]:
from sklearn.feature_extraction.text import CountVectorizer
# Binary features over unigrams and bigrams together, vocabulary capped at 500 entries.
# X is kept as the sparse matrix returned by fit_transform (no .toarray() here).
cv = CountVectorizer(
    max_features=500,
    binary=True,
    ngram_range=(1, 2),
)
X = cv.fit_transform(corpus)

In [64]:
# Term -> column-index mapping learned by the 500-feature unigram+bigram vectorizer.
cv.vocabulary_

{'go': np.int64(155),
 'point': np.int64(332),
 'great': np.int64(165),
 'world': np.int64(483),
 'got': np.int64(163),
 'wat': np.int64(463),
 'ok': np.int64(297),
 'lar': np.int64(217),
 'wif': np.int64(473),
 'free': np.int64(142),
 'entri': np.int64(126),
 'win': np.int64(475),
 'final': np.int64(135),
 'st': np.int64(399),
 'may': np.int64(253),
 'text': np.int64(414),
 'receiv': np.int64(353),
 'question': np.int64(344),
 'txt': np.int64(446),
 'rate': np.int64(346),
 'appli': np.int64(18),
 'dun': np.int64(115),
 'say': np.int64(367),
 'earli': np.int64(117),
 'alreadi': np.int64(9),
 'think': np.int64(420),
 'goe': np.int64(157),
 'live': np.int64(235),
 'around': np.int64(20),
 'though': np.int64(422),
 'hey': np.int64(187),
 'week': np.int64(466),
 'word': np.int64(481),
 'back': np.int64(29),
 'like': np.int64(231),
 'fun': np.int64(149),
 'still': np.int64(402),
 'xxx': np.int64(490),
 'send': np.int64(375),
 'even': np.int64(127),
 'brother': np.int64(46),
 'speak': np.int

In [65]:
from sklearn.feature_extraction.text import CountVectorizer
# Binary features over bigrams only, vocabulary capped at 100 entries.
# X is kept as the sparse matrix returned by fit_transform (no .toarray() here).
cv = CountVectorizer(
    max_features=100,
    binary=True,
    ngram_range=(2, 2),
)
X = cv.fit_transform(corpus)

In [66]:
# Bigram -> column-index mapping learned by the 100-feature bigram-only vectorizer.
cv.vocabulary_

{'free entri': np.int64(32),
 'claim call': np.int64(17),
 'call claim': np.int64(3),
 'free call': np.int64(31),
 'call mobil': np.int64(9),
 'chanc win': np.int64(16),
 'txt word': np.int64(90),
 'let know': np.int64(54),
 'go home': np.int64(36),
 'mobil free': np.int64(59),
 'pleas call': np.int64(68),
 'lt gt': np.int64(58),
 'want go': np.int64(97),
 'like lt': np.int64(55),
 'sorri call': np.int64(81),
 'call later': np.int64(8),
 'ur award': np.int64(91),
 'call custom': np.int64(4),
 'custom servic': np.int64(24),
 'cash prize': np.int64(15),
 'tri contact': np.int64(87),
 'draw show': np.int64(29),
 'show prize': np.int64(79),
 'prize guarante': np.int64(73),
 'guarante call': np.int64(43),
 'valid hr': np.int64(95),
 'select receiv': np.int64(76),
 'privat account': np.int64(71),
 'account statement': np.int64(0),
 'statement show': np.int64(82),
 'call identifi': np.int64(5),
 'identifi code': np.int64(50),
 'code expir': np.int64(21),
 'urgent mobil': np.int64(94),
 'call 

In [67]:
from sklearn.feature_extraction.text import CountVectorizer
# Binary features over bigrams and trigrams together, vocabulary capped at 100 entries.
# X is kept as the sparse matrix returned by fit_transform (no .toarray() here).
cv = CountVectorizer(
    max_features=100,
    binary=True,
    ngram_range=(2, 3),
)
X = cv.fit_transform(corpus)

In [68]:
# N-gram -> column-index mapping learned by the 100-feature bigram+trigram vectorizer.
cv.vocabulary_

{'free entri': np.int64(33),
 'claim call': np.int64(18),
 'call claim': np.int64(4),
 'free call': np.int64(32),
 'chanc win': np.int64(17),
 'txt word': np.int64(91),
 'let know': np.int64(54),
 'go home': np.int64(36),
 'pleas call': np.int64(70),
 'lt gt': np.int64(61),
 'want go': np.int64(97),
 'like lt': np.int64(55),
 'like lt gt': np.int64(56),
 'sorri call': np.int64(83),
 'call later': np.int64(12),
 'sorri call later': np.int64(84),
 'ur award': np.int64(92),
 'call custom': np.int64(5),
 'custom servic': np.int64(25),
 'cash prize': np.int64(16),
 'call custom servic': np.int64(6),
 'tri contact': np.int64(89),
 'draw show': np.int64(29),
 'show prize': np.int64(81),
 'prize guarante': np.int64(75),
 'guarante call': np.int64(42),
 'valid hr': np.int64(95),
 'draw show prize': np.int64(30),
 'show prize guarante': np.int64(82),
 'prize guarante call': np.int64(76),
 'select receiv': np.int64(78),
 'privat account': np.int64(72),
 'account statement': np.int64(0),
 'call id

In [69]:
from sklearn.feature_extraction.text import CountVectorizer
# Binary features over trigrams only, vocabulary capped at 100 entries.
# X is kept as the sparse matrix returned by fit_transform (no .toarray() here).
cv = CountVectorizer(
    max_features=100,
    binary=True,
    ngram_range=(3, 3),
)
X = cv.fit_transform(corpus)

In [70]:
# Trigram -> column-index mapping learned by the 100-feature trigram-only vectorizer.
cv.vocabulary_

{'call claim code': np.int64(8),
 'like lt gt': np.int64(44),
 'sorri call later': np.int64(81),
 'pleas call custom': np.int64(66),
 'call custom servic': np.int64(9),
 'custom servic repres': np.int64(24),
 'guarante cash prize': np.int64(36),
 'draw show prize': np.int64(25),
 'show prize guarante': np.int64(79),
 'prize guarante call': np.int64(71),
 'special select receiv': np.int64(83),
 'speak live oper': np.int64(82),
 'live oper claim': np.int64(46),
 'privat account statement': np.int64(69),
 'account statement show': np.int64(0),
 'call identifi code': np.int64(10),
 'identifi code expir': np.int64(41),
 'bonu caller prize': np.int64(5),
 'select receiv award': np.int64(78),
 'match pleas call': np.int64(55),
 'urgent tri contact': np.int64(97),
 'lt decim gt': np.int64(48),
 'secret admir look': np.int64(77),
 'admir look make': np.int64(1),
 'look make contact': np.int64(47),
 'make contact find': np.int64(54),
 'contact find reveal': np.int64(22),
 'find reveal think': np

### TF-IDF Using Lemmatization

In [71]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# WordNetLemmatizer looks words up in NLTK's WordNet corpus. The notebook only
# downloads 'stopwords', so on a fresh environment lemmatize() would fail with a
# LookupError. Fetch the corpus here; quiet=True avoids log noise when it is
# already installed.
# NOTE(review): some NLTK releases also require the 'omw-1.4' resource - confirm
# against the NLTK version in use.
nltk.download('wordnet', quiet=True)
wordlemmatize = WordNetLemmatizer()

In [72]:
# Rebuild the corpus with lemmatization instead of stemming: one cleaned,
# space-joined string per SMS message.
# Per message: replace every character that is not an ASCII letter with a space,
# lowercase, split on whitespace, drop English stop words, lemmatize the rest
# with WordNetLemmatizer's default settings.
#
# The stop-word list is loaded ONCE into a set. The original evaluated
# stopwords.words('english') inside the comprehension (once per token) and
# tested membership against a list, which made this cell needlessly slow.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate the column values directly instead of messages['message'][i]: the latter
# is a label lookup and only works while the index is exactly 0..len-1.
for message in messages['message']:
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    lemmas = [
        wordlemmatize.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    ]
    corpus.append(' '.join(lemmas))

In [73]:
# Display the lemmatized corpus (one cleaned string per message).
# NOTE(review): this prints every message; corpus[:5] would keep the output short.
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code kl valid hour',
 'mobile month u r entitled update latest colour mobile camera free call mobile update co free',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash pound txt csh send cost p day day tsandcs apply reply hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw'

In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weights with the vocabulary capped at 100 terms,
# fitted on the lemmatized corpus and expanded to a dense array.
tfidf = TfidfVectorizer(
    max_features=100,
)
tfidf_sparse = tfidf.fit_transform(corpus)
X = tfidf_sparse.toarray()

In [75]:
# Show the dense TF-IDF matrix; the array repr elides the middle rows and columns.
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weights over bigrams only, with the vocabulary capped at 100 entries,
# fitted on the lemmatized corpus and expanded to a dense array.
tfidf = TfidfVectorizer(
    max_features=100,
    ngram_range=(2, 2),
)
tfidf_sparse = tfidf.fit_transform(corpus)
X = tfidf_sparse.toarray()

In [77]:
# Bigram -> column-index mapping learned by the bigram TF-IDF vectorizer.
tfidf.vocabulary_

{'free entry': np.int64(30),
 'claim call': np.int64(15),
 'call claim': np.int64(3),
 'free call': np.int64(29),
 'chance win': np.int64(14),
 'txt word': np.int64(88),
 'let know': np.int64(52),
 'please call': np.int64(64),
 'lt gt': np.int64(56),
 'want go': np.int64(97),
 'like lt': np.int64(53),
 'sorry call': np.int64(79),
 'call later': np.int64(8),
 'ur awarded': np.int64(90),
 'hi hi': np.int64(45),
 'call customer': np.int64(4),
 'customer service': np.int64(22),
 'guaranteed cash': np.int64(40),
 'cash prize': np.int64(13),
 'trying contact': np.int64(85),
 'draw show': np.int64(27),
 'show prize': np.int64(77),
 'prize guaranteed': np.int64(71),
 'guaranteed call': np.int64(39),
 'valid hr': np.int64(95),
 'selected receive': np.int64(74),
 'private account': np.int64(69),
 'account statement': np.int64(0),
 'statement show': np.int64(80),
 'call identifier': np.int64(5),
 'identifier code': np.int64(48),
 'code expires': np.int64(19),
 'urgent mobile': np.int64(94),
 'cal

In [78]:
# Show the dense bigram TF-IDF matrix; the array repr elides the middle rows and columns.
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])