In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the spam dataset from the CSV that sits next to this notebook.
df = pd.read_csv('spam_data.csv')

In [3]:
# Drop the leftover 'Unnamed: 0' column. Reassigning (rather than inplace=True)
# together with errors='ignore' keeps this cell safe to re-run: a second run
# would otherwise raise KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [5]:
# (rows, columns) left after dropping the 'Unnamed: 0' column and the rows with nulls.
df.shape

(5160, 6)

In [57]:
# Vectorization: turn each message in 'transformed_text' into a TF-IDF feature vector.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Plain bag-of-words alternative, left disabled:
#cv=CountVectorizer()
# max_features=3000 caps the vocabulary size.
tfidf = TfidfVectorizer(max_features=3000)
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so the IDF weights are computed with the held-out messages
# included — confirm this is acceptable for the reported scores.
tfidf_matrix = tfidf.fit_transform(df['transformed_text'])
# Convert the sparse result to a dense NumPy array for the models below.
x = tfidf_matrix.toarray()

In [58]:
# Preview the dense TF-IDF matrix produced by the vectorization cell.
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [59]:
# Shape of the feature matrix; the second dimension is bounded by max_features.
x.shape

(5160, 3000)

In [60]:
# Labels to predict, taken from the 'target' column.
y=df['target']

In [61]:
from sklearn.model_selection import train_test_split

In [62]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [63]:
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix

In [12]:
# Gaussian Naive Bayes: fit on the training split, then print accuracy, the
# confusion matrix and precision for the held-out split (in that order).
gnb = GaussianNB()
gnb.fit(x_train, y_train)
y_pred = gnb.predict(x_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred))

0.876937984496124
[[800 106]
 [ 21 105]]
0.4976303317535545


In [13]:
# Bernoulli Naive Bayes: fit on the training split, then print accuracy, the
# confusion matrix and precision for the held-out split (in that order).
bnb = BernoulliNB()
bnb.fit(x_train, y_train)
y_pred1 = bnb.predict(x_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred1))

0.9718992248062015
[[904   2]
 [ 27  99]]
0.9801980198019802


In [64]:
# Multinomial Naive Bayes: fit on the training split, then print accuracy, the
# confusion matrix and precision for the held-out split (in that order).
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
y_pred2 = mnb.predict(x_test)
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred2))

0.9748062015503876
[[906   0]
 [ 26 100]]
1.0


In [15]:
# MultinomialNB gives the best results of the three: highest accuracy (0.975) and precision 1.0

In [16]:
# Trying other algorithms as well

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [43]:
# Candidate classifiers for the comparison. Lines that are commented out are
# candidates excluded from this run; they are never instantiated.
svc = SVC(kernel='sigmoid',gamma=1.0,probability=True)
knc = KNeighborsClassifier()
# Rebinds `mnb` to a fresh, unfitted estimator; the instance fitted in the
# earlier MultinomialNB cell is no longer reachable through this name.
mnb = MultinomialNB()
#dtc = DecisionTreeClassifier(max_depth=5)
#lrc = LogisticRegression(penalty='l1',solver='liblinear')
# random_state is fixed so the tree ensembles give the same result on every run.
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
#abc = AdaBoostClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
#bgc = BaggingClassifier(n_estimators=50, random_state=2)
#gbc = GradientBoostingClassifier(n_estimators=50, random_state=2)

In [19]:
# Display label -> estimator for every model included in the comparison.
clfs = {
    'SVC': svc,
    'KNC': knc,
    'NB': mnb,
    'RFC': rfc,
    'ETC': etc,
}

In [20]:
def training_classifiers(clf,x_train,x_test,y_train,y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place, so the caller's object is trained afterwards.
        x_train: Training features.
        x_test: Held-out features.
        y_train: Training labels.
        y_test: Held-out labels.

    Returns:
        A ``(accuracy, precision)`` tuple computed with ``accuracy_score`` and
        ``precision_score`` on the predictions for ``x_test``.
    """
    clf.fit(x_train, y_train)
    predicted = clf.predict(x_test)
    return (
        accuracy_score(y_test, predicted),
        precision_score(y_test, predicted),
    )

In [21]:
# Train and score every classifier once; results follow the iteration order of
# `clfs`, so position i in both lists belongs to the i-th classifier.
scores = [
    training_classifiers(clf, x_train, x_test, y_train, y_test)
    for clf in clfs.values()
]
accuracy_scores = [accuracy for accuracy, _ in scores]
precision_scores = [precision for _, precision in scores]

In [22]:
# Baseline comparison table, one row per classifier, ordered from the highest
# precision to the lowest.
baseline_scores = pd.DataFrame(
    {
        'Algorithm name': clfs.keys(),
        'Accuracy': accuracy_scores,
        'Precision': precision_scores,
    }
)
performance_df = baseline_scores.sort_values(by='Precision', ascending=False)


In [23]:
# Show the baseline comparison table (sorted by precision, highest first).
performance_df

Unnamed: 0,Algorithm name,Accuracy,Precision
1,KNC,0.912791,1.0
2,NB,0.959302,1.0
3,RFC,0.966085,1.0
4,ETC,0.971899,0.980198
0,SVC,0.97093,0.98


In [24]:
# improvement

In [25]:
# 1. --> Cap the vocabulary size used for vectorization (max_features)
# NOTE(review): the baseline vectorization cell earlier in this notebook also
# passes max_features=3000, so on a fresh Run All this step rebuilds the same
# features; the differing baseline numbers in the comparison table presumably
# came from an earlier setting — confirm and restore the intended baseline.
tfidf = TfidfVectorizer(max_features=3000)
tfidf_matrix = tfidf.fit_transform(df['transformed_text'])
x = tfidf_matrix.toarray()

In [26]:
# Re-split the re-vectorized features with the same test size and seed as the
# baseline split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [27]:
# Re-train and re-score every classifier on the new split; results follow the
# iteration order of `clfs`.
scores = [
    training_classifiers(clf, x_train, x_test, y_train, y_test)
    for clf in clfs.values()
]
accuracy_scores = [accuracy for accuracy, _ in scores]
precision_scores = [precision for _, precision in scores]

In [28]:
# Scores from the max_features=3000 run, one row per classifier, keyed by name
# so they can be merged into the comparison table.
temp_df = pd.DataFrame(
    {
        "Algorithm name": clfs.keys(),
        "accuracy_3000": accuracy_scores,
        'precision_3000': precision_scores,
    }
)

In [29]:
# Append the max_features=3000 scores to the comparison table.
# Columns that an earlier run of this cell already merged in are dropped first,
# so re-running the cell does not pile up suffixed duplicates
# (accuracy_3000_x / accuracy_3000_y). On a first run the drop is a no-op.
stale_cols = temp_df.columns.difference(['Algorithm name'])
performance_df = (
    performance_df
    .drop(columns=stale_cols, errors='ignore')
    .merge(temp_df, on='Algorithm name')
)

In [30]:
# Show the comparison table with the accuracy_3000 / precision_3000 columns added.
performance_df

Unnamed: 0,Algorithm name,Accuracy,Precision,accuracy_3000,precision_3000
0,KNC,0.912791,1.0,0.922481,1.0
1,NB,0.959302,1.0,0.974806,1.0
2,RFC,0.966085,1.0,0.969961,0.970297
3,ETC,0.971899,0.980198,0.972868,0.99
4,SVC,0.97093,0.98,0.973837,0.980583


In [31]:
# Accuracy of NB increased from about 96% to about 97.5% (0.959 -> 0.975)

In [32]:
# 2. --> Apply scaling (use MinMaxScaler so the features stay non-negative, which MultinomialNB requires)

In [33]:
# Disabled experiment: the MinMax-scaling code is wrapped in a string literal,
# so none of it runs; the cell only echoes the string as its output.
'''tfidf=TfidfVectorizer(max_features=3000)
x=tfidf.fit_transform(df['transformed_text']).toarray()
from sklearn.preprocessing import MinMaxScaler
ms = MinMaxScaler()
x=ms.fit_transform(x)'''

"tfidf=TfidfVectorizer(max_features=3000)\nx=tfidf.fit_transform(df['transformed_text']).toarray()\nfrom sklearn.preprocessing import MinMaxScaler\nms = MinMaxScaler()\nx=ms.fit_transform(x)"

In [34]:
#x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=2)

In [35]:
# Disabled experiment: the scoring loop for the scaled features is wrapped in a
# string literal, so none of it runs; the cell only echoes the string.
'''accuracy_scores=[]
precision_scores=[]

for name,clf in clfs.items():
    current_accuracy,current_precision=training_classifiers(clf,x_train,x_test,y_train,y_test)
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)'''

'accuracy_scores=[]\nprecision_scores=[]\n\nfor name,clf in clfs.items():\n    current_accuracy,current_precision=training_classifiers(clf,x_train,x_test,y_train,y_test)\n    accuracy_scores.append(current_accuracy)\n    precision_scores.append(current_precision)'

In [36]:
#temp_df2 = pd.DataFrame({"Algorithm name":clfs.keys(),"accuracy_on_scaling":accuracy_scores,'precision_on_scaling':precision_scores})

In [37]:
#performance_df=performance_df.merge(temp_df2,on='Algorithm name')

In [38]:
#performance_df  #Performance of model reduced

In [39]:
# We get the best results with NB, ETC and SVC when max_features=3000

In [48]:
# Naive Bayes gives the best results of all

In [49]:
import pickle

In [53]:
# Persist the fitted TF-IDF vectorizer. The `with` block closes the file as
# soon as the dump finishes, so the pickle is fully flushed to disk and the
# handle is not left open for the life of the kernel.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

In [65]:
# Persist the MultinomialNB model bound to `mnb`. The `with` block closes the
# file as soon as the dump finishes, so the pickle is fully flushed to disk and
# the handle is not left open for the life of the kernel.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(mnb, model_file)