# Importing the dataset

In [1]:
import pandas as pd

# Tab-separated file; column names are supplied explicitly, so the first
# line of the file is read as data rather than as a header.
df = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=["label", "message"],
)

In [2]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Encode the target as an integer column: 1 for 'spam', 0 for any other label.
# A vectorised comparison replaces the per-row Python lambda; int64 matches the
# dtype the original apply() produced.
df['y'] = df['label'].eq('spam').astype('int64')

In [4]:
# The text label is now carried by the numeric `y` column, so drop it.
df = df.drop(columns=['label'])

In [5]:
df.head()

Unnamed: 0,message,y
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


# Data cleaning and preprocessing

In [6]:
# Show the full text of one ham message (row 0) and one spam message (row 2).
for idx in (0, 2):
    print(df.message[idx])

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's


In [7]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [8]:
# Porter stemmer applied to every token by the cleaning loop.
stemming = PorterStemmer()
# WordNet lemmatizer; created here but not referenced by the cells visible in this section.
lemma = WordNetLemmatizer()

In [9]:
# Build the cleaned corpus: one space-joined string of stemmed tokens per message.
# The stop-word list is loaded once into a set. Previously stopwords.words('english')
# was evaluated for every token of every message and searched as a list.
english_stopwords = set(stopwords.words('english'))

final = []
for message in df.message:
    # Replace every character that is not an ASCII letter with a space.
    letters_only = re.sub('[^a-zA-Z]', ' ', message)
    tokens = letters_only.split()
    # NOTE(review): this membership test is case-sensitive, so capitalised stop
    # words (e.g. "The") are not removed -- confirm whether the text should be
    # lowercased first; doing so would change the resulting features.
    stems = [stemming.stem(token) for token in tokens if token not in english_stopwords]
    final.append(' '.join(stems))


### Comparing a raw message with its preprocessed form

In [10]:
df.message[2]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [11]:
final[2]

'free entri wkli comp win FA cup final tkt st may text FA receiv entri question std txt rate T C appli'

### Creating the Bag of Words

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Cap the number of columns/features at 2500, because some words may be
# present only once or twice in the corpus.
ccv = CountVectorizer(max_features=2500)

In [13]:
# Learn the vocabulary from the cleaned messages and encode them as a count matrix.
# NOTE(review): the vectorizer is fitted on every message, before the train/test
# split performed later in the notebook, so the vocabulary is chosen using the
# test messages as well -- confirm this is acceptable for the evaluation.
X=ccv.fit_transform(final)

In [14]:
# Display X as a dense array (this materialises the full matrix just for the preview).
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [15]:
X.shape

(5572, 2500)

### Train Test Split

In [16]:
df.head()

Unnamed: 0,message,y
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [17]:
# Dense view of the feature matrix again, shown next to the target selected in the following cell.
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [18]:
# Select the target with a list of labels so that Y is a one-column DataFrame.
Y = df.loc[:, ['y']]

In [19]:
Y.head()

Unnamed: 0,y
0,0
1,0
2,1
3,0
4,0


In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=0,
)

In [21]:
X.shape

(5572, 2500)

In [22]:
X_train.shape

(4457, 2500)

In [23]:
X_test.shape

(1115, 2500)

### Training the model using a Naive Bayes classifier

In [24]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with its default settings; fitted in the next cell.
mnb = MultinomialNB()

In [25]:
import warnings  # kept in case later cells reference the module

# Fit on a flat 1-D label array. y_train is a one-column DataFrame; passing it
# unchanged is presumably what the former blanket warnings.filterwarnings("ignore")
# was hiding (scikit-learn warns when y is a column vector). Flattening the labels
# removes that cause, so warnings are no longer silenced for the rest of the session.
mnb.fit(X_train, y_train.values.ravel())

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
# Accuracy on the held-out test rows (agrees with the confusion matrix computed further down: 1099 of 1115 correct).
mnb.score(X_test,y_test)

0.9856502242152466

In [27]:
# Predicted class (0 or 1) for every test row.
y_pred = mnb.predict(X_test)

In [28]:
import numpy as np

In [29]:
# Side-by-side table of the true labels and the model's predictions for the test rows.
# y_test is two-dimensional (one column), so it is flattened to line up with y_pred.
comparison = pd.DataFrame({'actual': np.ravel(y_test), 'predicted': y_pred})

In [30]:
comparison.head(10)

Unnamed: 0,actual,predicted
0,0,0
1,1,1
2,0,0
3,0,0
4,0,0
5,0,0
6,1,1
7,0,0
8,0,0
9,0,0


In [31]:
from sklearn.metrics import confusion_matrix

# Rows correspond to the actual class, columns to the predicted class.
confusionmatrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
confusionmatrix

array([[947,   8],
       [  8, 152]], dtype=int64)

In [32]:
# Row = actual output, column = predicted output (0 = not spam, 1 = spam).
pd.DataFrame(
    confusionmatrix,
    columns=['Predicted 0', 'Predicted 1'],
)

Unnamed: 0,Predicted 0,Predicted 1
0,947,8
1,8,152
