In [58]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer     # STEMMING
from sklearn.feature_extraction.text import CountVectorizer      # BOW
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import pickle

In [18]:
df = pd.read_csv('SMSSpamCollection',sep='\t',names=['label','message'])
df.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [16]:
ps = PorterStemmer()   # for the stemming

In [19]:
for i in range(0,len(df)):
    mesage = re.sub('[^a-zA-Z]',' ',df['message'][i])
    mesage = mesage.lower()
    mesage = mesage.split()
    mesage = [ps.stem(word) for word in mesage if word not in set(stopwords.words('english'))]
    mesage = ' '.join(mesage)
    df['message'][i] = mesage
df.head(10)

# cleaned data of message

Unnamed: 0,label,message
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri wkli comp win fa cup final tkt st m...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah think goe usf live around though
5,spam,freemsg hey darl week word back like fun still...
6,ham,even brother like speak treat like aid patent
7,ham,per request mell mell oru minnaminungint nurun...
8,spam,winner valu network custom select receivea pri...
9,spam,mobil month u r entitl updat latest colour mob...


# pre-processing

In [20]:
x = df['message']
y = df['label']

In [21]:
x.head()

0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri wkli comp win fa cup final tkt st m...
3                  u dun say earli hor u c alreadi say
4                 nah think goe usf live around though
Name: message, dtype: object

In [22]:
y.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: label, dtype: object

In [25]:
train = x.values
train[0:5]

array(['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
       'ok lar joke wif u oni',
       'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
       'u dun say earli hor u c alreadi say',
       'nah think goe usf live around though'], dtype=object)

In [26]:
train.shape

(5572,)

In [28]:
CV = CountVectorizer(max_features=2500)      # maximum 2500 words
final_train = CV.fit_transform(train).toarray()
final_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [29]:
final_train.shape

(5572, 2500)

In [32]:
y.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: label, dtype: object

In [33]:
label = pd.get_dummies(y,drop_first=True)

In [35]:
label.head()

Unnamed: 0,spam
0,0
1,0
2,1
3,0
4,0


In [36]:
y.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: label, dtype: object

In [37]:
x_train,x_test,y_train,y_test = train_test_split(final_train,label, random_state=0,test_size=0.2)

In [39]:
x_train.shape,x_test.shape

((4457, 2500), (1115, 2500))

In [45]:
round(x_train.shape[0]/final_train.shape[0]*100)    # 80% training data size

80

In [46]:
round(x_test.shape[0]/final_train.shape[0]*100)    # 20% testing data size

20

# MultinomialNB

In [54]:
mb  = MultinomialNB()

In [55]:
mb.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


## accuracy

In [56]:
mb.score(x_train,y_train).round(2)*100     # on training data

99.0

In [57]:
mb.score(x_test,y_test).round(2)*100        # on testing data

99.0

In [59]:
pickle.dump(mb,open('spamclassifier_MnB.pkl','wb'))

In [63]:
model=pickle.load(open('spamclassifier_MnB.pkl','rb'))


In [65]:
y_pred_Mb = model.predict(x_test)
y_pred_Mb

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

# RandomForest

In [49]:
rdf = RandomForestClassifier(n_estimators=200)

In [50]:
rdf.fit(x_train,y_train)

  rdf.fit(x_train,y_train)


## accuracy

In [53]:
rdf.score(x_train,y_train).round(2)*100     # on training data

100.0

In [52]:
rdf.score(x_test,y_test).round(2)*100        # on testing data

98.0

## save the model

In [66]:
pickle.dump(rdf,open('spamclassifier_RdmFOr.pkl','wb'))


## load the model

In [68]:
model2 = pickle.load(open('spamclassifier_RdmFOr.pkl','rb'))

In [71]:
rdf.predict(x_test)

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

In [73]:
y_pred_rd = model2.predict(x_test)
y_pred_rd

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

 ## confusion matrix

In [74]:
confusion_matrix(y_test,y_pred_rd)

array([[954,   1],
       [ 18, 142]], dtype=int64)

In [76]:
print(classification_report(y_test,y_pred_rd))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       955
           1       0.99      0.89      0.94       160

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [77]:
y_test.shape

(1115, 1)

In [81]:
{}

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

In [83]:
y_test['randomforest']=y_pred_rd

In [86]:
y_test['MultinomialNB']=y_pred_Mb

In [87]:
matching_data = y_test
matching_data.head(20)

Unnamed: 0,spam,randomforest,MultinomialNB
4456,0,0,0
690,1,1,1
944,0,0,0
3768,0,0,0
1189,0,0,0
4437,0,0,0
3587,1,1,1
1982,0,0,0
2038,0,0,0
2078,0,0,0


## Thankyou  😁