In [81]:
import pandas as pd
import numpy as np
import nltk
import re

In [82]:
# Fetch the NLTK resources used by the preprocessing cells below.
nltk.download('punkt')      # NOTE(review): no tokenizer call is visible in this notebook (text is split with str.split) — confirm this is needed
nltk.download('stopwords')  # word lists read through stopwords.words('english')
nltk.download('wordnet')    # data used by WordNetLemmatizer
nltk.download('omw-1.4')    # presumably a companion resource for the wordnet corpus — confirm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [83]:
# Load the SMS dataset from a tab-separated file. No row is treated as a
# header; the two column names are supplied here instead.
df = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=["label", "message"],
)

In [84]:
# Show the loaded frame as the cell output to eyeball the label/message columns.
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [85]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Shared word normalisers for the two corpus-building cells.
ps = PorterStemmer()        # used to build the stemmed `corpus`
lem = WordNetLemmatizer()   # used to build the lemmatized `corpus2`

In [86]:
# Build the stemmed corpus: one cleaned, space-joined string per SMS message.
#
# The English stop-word set is built once here. The list does not change
# between words, so rebuilding it inside the comprehension (once per word)
# was pure loop-invariant overhead.
stop_words = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly instead of df['message'][i], which
# is a label lookup and only works while the index is the default 0..n-1 range.
for message in df['message']:
  # Keep letters only, lower-case, and split on whitespace.
  words = re.sub('[^a-zA-Z]', ' ', message).lower().split()
  # Drop stop words, then reduce each remaining word to its Porter stem.
  stems = [ps.stem(word) for word in words if word not in stop_words]
  corpus.append(' '.join(stems))

In [87]:
# Build the lemmatized corpus: one cleaned, space-joined string per SMS message.
#
# The English stop-word set is built once here. The list does not change
# between words, so rebuilding it inside the comprehension (once per word)
# was pure loop-invariant overhead.
stop_words = set(stopwords.words('english'))

corpus2 = []
# Iterate over the column values directly instead of df['message'][i], which
# is a label lookup and only works while the index is the default 0..n-1 range.
for message in df['message']:
  # Keep letters only, lower-case, and split on whitespace.
  words = re.sub('[^a-zA-Z]', ' ', message).lower().split()
  # Drop stop words, then lemmatize each remaining word with WordNet.
  lemmas = [lem.lemmatize(word) for word in words if word not in stop_words]
  corpus2.append(' '.join(lemmas))

### Creating a Bag-of-Words (BoW) Model:

In [88]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the stemmed corpus, vocabulary capped at 2000 terms.
# NOTE(review): the vocabulary is learned from the full corpus before the
# train/test split, so the held-out rows influence the feature set.
cv = CountVectorizer(max_features=2000)
cv.fit(corpus)
x = cv.transform(corpus).toarray()  # dense array, one row per message

In [89]:
# Binary target: 1 where the label is 'spam', 0 otherwise.
#
# The label is compared by name rather than taking column 1 of
# pd.get_dummies(...). That positional pick depends on the sorted set of label
# values (an unexpected extra label would silently shift the column), and its
# dtype varies with the pandas version. This form always yields an int array.
y = df['label'].eq('spam').astype(int).values  # spam column

In [90]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=42,
)

In [91]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

# Train multinomial Naive Bayes on the stemmed bag-of-words features.
model = MultinomialNB()
model.fit(x_train, y_train)

# Predict on the held-out rows and report accuracy plus the confusion matrix.
y_pred = model.predict(x_test)
print("\t-----STEMMING BoW-----\t \n")
print("Accuracy :", accuracy_score(y_test, y_pred))
print('\n')
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))

	-----STEMMING BoW-----	 

Accuracy : 0.9847533632286996


Confusion Matrix: 
 [[956  10]
 [  7 142]]


In [92]:
# Bag-of-words counts over the lemmatized corpus, vocabulary capped at 2000 terms.
# Rebinding `cv` discards the vectorizer fitted on the stemmed corpus.
cv = CountVectorizer(max_features=2000)
cv.fit(corpus2)
x1 = cv.transform(corpus2).toarray()  # dense array, one row per message

In [93]:
# 80/20 split of the lemmatized BoW features with the same test size and seed
# as the stemmed split. y_train / y_test are rebound here.
x1_train, x1_test, y_train, y_test = train_test_split(
    x1, y,
    test_size=0.2,
    random_state=42,
)

In [94]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

# Train multinomial Naive Bayes on the lemmatized bag-of-words features
# (rebinds `model`).
model = MultinomialNB()
model.fit(x1_train, y_train)

# Predict on the held-out rows and report accuracy plus the confusion matrix.
y_pr = model.predict(x1_test)
print("\t-----LEMMATIZER BoW-----\t \n")
print("Accuracy :", accuracy_score(y_test, y_pr))
print('\n')
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pr))

	-----LEMMATIZER BoW-----	 

Accuracy : 0.9847533632286996


Confusion Matrix: 
 [[955  11]
 [  6 143]]


### Creating a TF-IDF Model:


In [95]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the stemmed corpus, vocabulary capped at 2000 terms.
# Unlike the BoW cells, the result is not converted with .toarray().
tf = TfidfVectorizer(max_features=2000)
tf.fit(corpus)
X = tf.transform(corpus)

In [96]:
from sklearn.model_selection import train_test_split

# 80/20 split of the stemmed TF-IDF features, using the same test size and seed
# as the BoW splits. y_train / y_test are rebound here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

In [97]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

# Train multinomial Naive Bayes on the stemmed TF-IDF features.
model2 = MultinomialNB().fit(X_train, y_train)

# Predict on the held-out rows and report accuracy plus the confusion matrix.
y_preds = model2.predict(X_test)
print("\t-----STEMMING TFIDF-----\t \n")
print("Accuracy :", accuracy_score(y_test, y_preds))
print('\n')
# Start the matrix on its own line, as the BoW report cells do. Without the
# line break the first row prints next to the label and the rows no longer
# line up.
print("Confusion Matrix: \n", confusion_matrix(y_test, y_preds))

	-----STEMMING TFIDF-----	 

Accuracy : 0.9820627802690582


Confusion Matrix: [[964   2]
 [ 18 131]]


In [98]:
# TF-IDF features over the lemmatized corpus, vocabulary capped at 2000 terms.
# Rebinding `tf` discards the vectorizer fitted on the stemmed corpus.
tf = TfidfVectorizer(max_features=2000)
tf.fit(corpus2)
X1 = tf.transform(corpus2)

In [99]:
# 80/20 split of the lemmatized TF-IDF features, using the same test size and
# seed as the earlier splits. y_train / y_test are rebound here.
X1_train, X1_test, y_train, y_test = train_test_split(
    X1, y,
    test_size=0.2,
    random_state=42,
)

In [100]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

# Train multinomial Naive Bayes on the lemmatized TF-IDF features
# (rebinds `model2`).
model2 = MultinomialNB().fit(X1_train, y_train)

# Predict on the held-out rows and report accuracy plus the confusion matrix.
y_p = model2.predict(X1_test)
print("\t-----LEMMATIZER TFIDF-----\t \n")
print("Accuracy :", accuracy_score(y_test, y_p))
print('\n')
# Start the matrix on its own line, as the BoW report cells do. Without the
# line break the first row prints next to the label and the rows no longer
# line up.
print("Confusion Matrix: \n", confusion_matrix(y_test, y_p))

	-----LEMMATIZER TFIDF-----	 

Accuracy : 0.9802690582959641


Confusion Matrix: [[963   3]
 [ 19 130]]
