In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [17]:
data = pd.read_csv('indian_spam.csv')
data.head()

Unnamed: 0,v1,v2
0,ham,zyada itrao mat . english meko bi aati hai :-P
1,ham,is ajnbi dunia me akle 1 khwab hu swalon se k...
2,ham,Husband n wife were opening joint bank ac Husb...
3,ham,Husband Suhag raat pe- Tum ne kabhi Blue Film ...
4,ham,indagi me 5 cheez kabhi bhi aa sakti h 'Hum' '...


In [19]:
data.columns = ['result','text']

In [21]:
data.head()

Unnamed: 0,result,text
0,ham,zyada itrao mat . english meko bi aati hai :-P
1,ham,is ajnbi dunia me akle 1 khwab hu swalon se k...
2,ham,Husband n wife were opening joint bank ac Husb...
3,ham,Husband Suhag raat pe- Tum ne kabhi Blue Film ...
4,ham,indagi me 5 cheez kabhi bhi aa sakti h 'Hum' '...


In [23]:
data['result'].value_counts()

result
ham     1000
spam    1000
Name: count, dtype: int64

In [35]:
data['result'] = data['result'].map({'ham': 0, 'spam': 1})

In [37]:
data.head()

Unnamed: 0,result,text
0,0,zyada itrao mat . english meko bi aati hai :-P
1,0,is ajnbi dunia me akle 1 khwab hu swalon se k...
2,0,Husband n wife were opening joint bank ac Husb...
3,0,Husband Suhag raat pe- Tum ne kabhi Blue Film ...
4,0,indagi me 5 cheez kabhi bhi aa sakti h 'Hum' '...


## Data Preprocessing

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [39]:
vec = TfidfVectorizer()
x = vec.fit_transform(data['text'])
y = data['result']

In [49]:
x[1]

<1x10047 sparse matrix of type '<class 'numpy.float64'>'
	with 24 stored elements in Compressed Sparse Row format>

## Crossvalidating which model works best for our data

In [43]:
models = [SVC(), MultinomialNB(), LogisticRegression()]

In [53]:
score_list = []
score = []
for i in range(3):
    score.append(cross_val_score(models[i], x, y, cv = 10))
    score_list.append(np.mean(score[i]))
score_list

[0.9549999999999998, 0.9484999999999999, 0.9515]

According to above scores, we get SVC as best model

## Hyperparameter Tuning

In [60]:
parameters = {'C' : [1,5,10,15,20],
             'kernel':['linear', 'poly', 'rbf']}

In [66]:
from sklearn.model_selection import GridSearchCV
classifier = GridSearchCV(SVC(), parameters, cv=10)

In [68]:
classifier.fit(x,y)

In [69]:
classifier.best_params_

{'C': 5, 'kernel': 'linear'}

In [70]:
classifier.best_score_

0.959

'C': 5, 'kernel': 'linear' 
are the best parameters

In [75]:
model = SVC(kernel = 'linear', C = 5)

In [77]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.2)

In [79]:
model.fit(xtrain, ytrain)

In [81]:
model.score(xtest,ytest)

0.965

In [85]:
pred = model.predict(xtest)
from sklearn.metrics import classification_report
rep = classification_report(pred, ytest, output_dict=True)
rep = pd.DataFrame(rep)
rep

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.979592,0.95098,0.965,0.965286,0.965429
recall,0.950495,0.979798,0.965,0.965147,0.965
f1-score,0.964824,0.965174,0.965,0.964999,0.964997
support,202.0,198.0,0.965,400.0,400.0
