In [3]:
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.tokenize import word_tokenize
import sklearn.metrics as m
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pushk\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pushk\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [6]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pushk\AppData\Roaming\nltk_data...


True

In [7]:
dataset=pd.read_csv('spam.csv',encoding='latin-1')
dataset

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [8]:
sent=dataset.iloc[:,[1]]['v2']

In [9]:
sent

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object

In [10]:
label=dataset.iloc[:,[0]]['v1']

In [11]:
label

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: v1, Length: 5572, dtype: object

In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
le=LabelEncoder()
label=le.fit_transform(label)

In [14]:
label

array([0, 0, 1, ..., 0, 0, 0])

In [15]:
le.classes_

array(['ham', 'spam'], dtype=object)

In [16]:
import re

In [17]:
len(set(stopwords.words('english')))

179

In [18]:
stem=PorterStemmer()

In [19]:
sent

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object

In [20]:
sentences=[]
for sen in sent:
  senti=re.sub('[^A-Za-z]',' ',sen)
  senti=senti.lower()
  words=word_tokenize(senti)
  word=[stem.stem(i) for i in words if i not in stopwords.words('english')]
  senti=' '.join(word)
  sentences.append(senti)


In [21]:
sentences

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl week word back like fun still tb ok xxx std chg send rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free',
 'gon na home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'search right word thank breathe

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
cv=CountVectorizer(max_features=5000)

In [24]:
features=cv.fit_transform(sentences)

In [25]:
features=features.toarray()

In [26]:
features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [28]:
len(cv.get_feature_names_out())

5000

In [29]:
feature_train,feature_test,label_train,label_test=train_test_split(features,label,test_size=0.2,random_state=7)

#Naive Bayies

In [30]:
model=MultinomialNB()
model.fit(feature_train,label_train)

In [31]:
label_pred=model.predict(feature_test)

In [32]:
label_pred

array([0, 0, 0, ..., 0, 0, 0])

In [33]:
label_test

array([0, 0, 0, ..., 0, 0, 0])

In [34]:
m.accuracy_score(label_test,label_pred)

0.9856502242152466

In [35]:
print(m.classification_report(label_test,label_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       970
           1       0.93      0.96      0.95       145

    accuracy                           0.99      1115
   macro avg       0.96      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [36]:
print(m.confusion_matrix(label_test,label_pred))

[[960  10]
 [  6 139]]


In [37]:
import pickle


In [38]:
with open('sms_spam_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [39]:
with open('preprocessing_objects.pkl', 'wb') as preprocessing_file:
    preprocessing_objects = {
        'label_encoder': le,
        'stemmer': stem,
        'count_vectorizer': cv
    }
    pickle.dump(preprocessing_objects, preprocessing_file)

#SVC

In [3]:
model=SVC(kernel='linear')
model.fit(feature_train,label_train)

NameError: name 'SVC' is not defined

In [None]:
label_pred=model.predict(feature_test)

In [4]:
m.accuracy_score(label_test,label_pred)

NameError: name 'm' is not defined

In [None]:
label_pred

array([0, 0, 0, ..., 0, 0, 0])

In [5]:
label_test

NameError: name 'label_test' is not defined

In [6]:
print(m.classification_report(label_test,label_pred))

NameError: name 'm' is not defined

In [7]:
print(m.confusion_matrix(label_test,label_pred))

NameError: name 'm' is not defined

#LogisticRegression

In [8]:
model=LogisticRegression()
model.fit(feature_train,label_train)

NameError: name 'LogisticRegression' is not defined

In [9]:
label_pred=model.predict(feature_test)

NameError: name 'model' is not defined

In [10]:
m.accuracy_score(label_test,label_pred)

NameError: name 'm' is not defined

In [11]:
label_pred

NameError: name 'label_pred' is not defined

In [12]:
label_test

NameError: name 'label_test' is not defined

In [13]:
print(m.classification_report(label_test,label_pred))

NameError: name 'm' is not defined

In [None]:
print(m.confusion_matrix(label_test,label_pred))

[[969   1]
 [ 14 131]]


#Decision Tree

In [14]:
model=DecisionTreeClassifier()
model.fit(feature_train,label_train)


NameError: name 'DecisionTreeClassifier' is not defined

In [15]:
label_pred=model.predict(feature_test)

NameError: name 'model' is not defined

In [None]:
m.accuracy_score(label_test,label_pred)

0.97847533632287

In [16]:
label_pred

NameError: name 'label_pred' is not defined

In [17]:
label_test

NameError: name 'label_test' is not defined

In [18]:
print(m.classification_report(label_test,label_pred))

NameError: name 'm' is not defined

In [19]:
print(m.confusion_matrix(label_test,label_pred))

NameError: name 'm' is not defined