In [1]:
import pandas as pd

In [2]:
# Load the tab-separated SMS dataset. Column names are supplied explicitly,
# so the first line of the file is read as data rather than as a header.
messages=pd.read_csv(
    '/content/SMSSpamCollection',
    sep='\t',
    names=["label","message"],
)

In [3]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
import re
import nltk
# Fetch the NLTK stop-word lists used when cleaning the messages below
# (a no-op when the package is already present locally).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

**Using Bag of Words and Stemming**

In [6]:
# Stemmer applied to every token when building the bag-of-words corpus.
ps=PorterStemmer()

In [7]:
# Will hold one cleaned, space-joined string per message.
corpus=[]

In [8]:
# Build the stemmed corpus: for every message keep letters only, lower-case,
# tokenise on whitespace, drop English stop words and stem what remains.
#
# The stop-word set is built once here instead of once per token; the old
# form re-read the NLTK word list for every word of every message.
stop_words=set(stopwords.words('english'))

# Reset in this cell so that re-running it does not append a second copy of
# every message to an already populated list.
corpus=[]
for message in messages['message']:
  # Anything that is not an ASCII letter becomes a separator.
  words=re.sub('[^a-zA-Z]',' ',message).lower().split()
  corpus.append(' '.join(ps.stem(word) for word in words if word not in stop_words))

In [9]:
corpus[0:5]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though']

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features: learn the vocabulary from the cleaned corpus, then
# encode each message as a dense row of token counts.
# NOTE(review): the vocabulary is learned from the full corpus before the
# train/test split below; consider fitting on the training portion only.
cv=CountVectorizer()
cv.fit(corpus)
X=cv.transform(corpus).toarray()

In [11]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [12]:
# One-hot encode the label column: one indicator column per distinct label.
y=pd.get_dummies(messages['label'])

In [13]:
# Binary target as a 1-D array: 1 for spam, 0 for ham.
# Derived directly from the label column so this cell can be re-run on its
# own; the previous form re-assigned y from itself and failed on a second
# execution because y was already a NumPy array.
y=(messages['label']=='spam').astype(int).values

In [14]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train,X_test,y_train,y_test=train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [15]:
from sklearn.naive_bayes import MultinomialNB

In [16]:
# Train a multinomial Naive Bayes classifier on the training split.
# fit() is the last expression of the cell, so its return value (the fitted
# estimator) is what the notebook displays.
spam=MultinomialNB()
spam.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
# Predicted labels for the held-out test features.
y_pred=spam.predict(X_test)

In [18]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true vs. predicted test labels
# (scikit-learn convention: rows are true classes, columns are predictions).
cm=confusion_matrix(y_test,y_pred)

In [19]:
cm

array([[940,  15],
       [  8, 152]])

In [20]:
from sklearn.metrics import accuracy_score
# Fraction of test messages whose predicted label matches the true label.
score=accuracy_score(y_test,y_pred)

In [21]:
score

0.979372197309417

**Using TF-IDF and Lemmatization**

In [22]:
from nltk.stem import WordNetLemmatizer

In [23]:
# Lemmatizer applied to every token when building the TF-IDF corpus.
wordnet=WordNetLemmatizer()

In [24]:
# Start a fresh corpus; this discards the stemmed strings built earlier.
corpus=[]

In [31]:
# Fetch the WordNet data that the lemmatizer looks words up in.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [32]:
# Build the lemmatized corpus: for every message keep letters only,
# lower-case, tokenise on whitespace, drop English stop words and lemmatize
# what remains.
#
# The stop words are loaded once into a set; the old form re-read the NLTK
# word list for every token and tested membership with a linear list scan.
stop_words=set(stopwords.words('english'))

# Reset in this cell so that re-running it does not append a second copy of
# every message to an already populated list.
corpus=[]
for message in messages['message']:
  # Anything that is not an ASCII letter becomes a separator.
  words=re.sub('[^a-zA-Z]',' ',message).lower().split()
  corpus.append(' '.join(wordnet.lemmatize(word) for word in words if word not in stop_words))

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features: learn vocabulary and document frequencies from the cleaned
# corpus, then encode each message as a dense row of weights.
# NOTE(review): the statistics are learned from the full corpus before the
# train/test split below; consider fitting on the training portion only.
tfidf=TfidfVectorizer()
tfidf.fit(corpus)
X=tfidf.transform(corpus).toarray()

In [34]:
# Binary target as a 1-D array: 1 for spam, 0 for ham.
# The previous single-column DataFrame made the classifier's fit() flatten a
# column vector and emit a DataConversionWarning; a flat array avoids that
# and matches how the target was built for the bag-of-words model.
y=(messages['label']=='spam').astype(int).values

In [35]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train,X_test,y_train,y_test=train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [36]:
from sklearn.naive_bayes import MultinomialNB

In [37]:
# Train a fresh multinomial Naive Bayes classifier on the TF-IDF training
# split; this rebinds `spam`, replacing the bag-of-words model.
spam=MultinomialNB()
spam.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [38]:
# Predicted labels for the held-out test features.
y_pred=spam.predict(X_test)

In [39]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true vs. predicted test labels
# (scikit-learn convention: rows are true classes, columns are predictions).
cm=confusion_matrix(y_test,y_pred)

In [40]:
cm

array([[955,   0],
       [ 31, 129]])

In [41]:
from sklearn.metrics import accuracy_score
# Fraction of test messages whose predicted label matches the true label.
score=accuracy_score(y_test,y_pred)

In [42]:
score

0.9721973094170404