In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [4]:
data = pd.read_csv('indian_spam.csv', encoding='latin')
data2 = pd.read_csv('spam.csv', encoding='latin')
data = pd.concat([data,data2], ignore_index=True)
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,zyada itrao mat . english meko bi aati hai :-P,,,
1,ham,is ajnbi dunia me akle 1 khwab hu swalon se k...,,,
2,ham,Husband n wife were opening joint bank ac Husb...,,,
3,ham,Husband Suhag raat pe- Tum ne kabhi Blue Film ...,,,
4,ham,indagi me 5 cheez kabhi bhi aa sakti h 'Hum' '...,,,


In [5]:
data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],axis=1, inplace=True)

In [6]:
data.tail()

Unnamed: 0,v1,v2
7567,spam,This is the 2nd time we have tried 2 contact u...
7568,ham,Will Ì_ b going to esplanade fr home?
7569,ham,"Pity, * was in mood for that. So...any other s..."
7570,ham,The guy did some bitching but I acted like i'd...
7571,ham,Rofl. Its true to its name


In [7]:
data.columns = ['result','text']

In [8]:
data.head()

Unnamed: 0,result,text
0,ham,zyada itrao mat . english meko bi aati hai :-P
1,ham,is ajnbi dunia me akle 1 khwab hu swalon se k...
2,ham,Husband n wife were opening joint bank ac Husb...
3,ham,Husband Suhag raat pe- Tum ne kabhi Blue Film ...
4,ham,indagi me 5 cheez kabhi bhi aa sakti h 'Hum' '...


In [9]:
data['result'].value_counts()

result
ham     5825
spam    1747
Name: count, dtype: int64

In [10]:
data['result'] = data['result'].map({'ham': 0, 'spam': 1})

In [11]:
data.head()

Unnamed: 0,result,text
0,0,zyada itrao mat . english meko bi aati hai :-P
1,0,is ajnbi dunia me akle 1 khwab hu swalon se k...
2,0,Husband n wife were opening joint bank ac Husb...
3,0,Husband Suhag raat pe- Tum ne kabhi Blue Film ...
4,0,indagi me 5 cheez kabhi bhi aa sakti h 'Hum' '...


## Data Preprocessing

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
vec = TfidfVectorizer()
x = vec.fit_transform(data['text'])
y = data['result']

In [14]:
x[1]

<1x15640 sparse matrix of type '<class 'numpy.float64'>'
	with 24 stored elements in Compressed Sparse Row format>

## Crossvalidating which model works best for our data

In [15]:
models = [SVC(), MultinomialNB(), LogisticRegression()]

In [16]:
score_list = []
score = []
for i in range(3):
    score.append(cross_val_score(models[i], x, y, cv = 10))
    score_list.append(np.mean(score[i]))
score_list

KeyboardInterrupt: 

According to above scores, we get SVC as best model

## Hyperparameter Tuning

In [19]:
parameters = {'C' : [1,5,10,15,20],
             'kernel':['linear', 'poly', 'rbf']}

In [20]:
from sklearn.model_selection import RandomizedSearchCV
classifier = RandomizedSearchCV(SVC(), parameters, cv=10)

In [21]:
classifier.fit(x,y)

In [22]:
classifier.best_params_

{'kernel': 'linear', 'C': 1}

In [23]:
classifier.best_score_

0.9665890562315486

'C': 5, 'kernel': 'linear' 
are the best parameters

In [17]:
model = SVC(kernel = 'linear', C = 1)

In [18]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.2)

In [19]:
model.fit(xtrain, ytrain)

In [20]:
model.score(xtest,ytest)

0.9762376237623762

In [21]:
pred = model.predict(xtest)
from sklearn.metrics import classification_report
rep = classification_report(pred, ytest, output_dict=True)
rep = pd.DataFrame(rep)
rep

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.995737,0.909357,0.976238,0.952547,0.97772
recall,0.974145,0.984177,0.976238,0.979161,0.976238
f1-score,0.984823,0.945289,0.976238,0.965056,0.976577
support,1199.0,316.0,0.976238,1515.0,1515.0
