In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('model.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,sim_content,sin_comment,word_count,duplicate_word_ratio,no_of_sentences,length_of_comment,num_of_punctuations,is_period_sequence,stop_word_ratio,post_coment_gap,black_word_count,is_link,is_youtube_link,is_number,is_mail,comment_duplication,classifier_val
0,0,0.372249,0.280355,3.057751,0.083333,3,6.612096,4.0,0,0.0,8.750086,1.0,1,0,0,0,0.0,1
1,1,0.354255,0.341604,2.410947,0.0,1,4.897796,1.0,0,0.0,15.380144,2.0,0,0,0,0,0.715092,0
2,2,0.487752,0.314966,1.578192,0.0,1,3.791178,0.0,0,0.0,14.181022,0.0,0,0,0,0,0.0,0
3,3,0.468095,0.368193,2.328976,0.0,1,5.041488,0.0,0,0.090909,13.636652,2.0,0,0,0,0,0.0,0
4,4,0.366191,0.258061,2.238987,0.0,1,5.174158,4.0,0,0.0,13.5295,0.0,1,1,0,0,0.0,0


In [4]:
df.rename(columns={'Unnamed: 0':'id'},inplace=True)
df.columns

Index(['id', 'sim_content', 'sin_comment', 'word_count',
       'duplicate_word_ratio', 'no_of_sentences', 'length_of_comment',
       'num_of_punctuations', 'is_period_sequence', 'stop_word_ratio',
       'post_coment_gap', 'black_word_count', 'is_link', 'is_youtube_link',
       'is_number', 'is_mail', 'comment_duplication', 'classifier_val'],
      dtype='object')

### Standardization

In [5]:
dfc = df.copy()

In [6]:
y = dfc['classifier_val']
X = dfc.drop(['classifier_val','id'],axis=1)

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X)
X_train = scaler.transform(X) 

### Handle imbalance dataset

In [8]:
dfc['classifier_val'].value_counts()

0    2540
1     585
Name: classifier_val, dtype: int64

In [9]:
from imblearn.combine import SMOTETomek
from collections import Counter

os=SMOTETomek()
X_os,y_os=os.fit_sample(X,y)
print("The number of classes before fit {}".format(Counter(y)))
print("The number of classes after fit {}".format(Counter(y_os)))

X = X_os
y = y_os

The number of classes before fit Counter({0: 2540, 1: 585})
The number of classes after fit Counter({1: 2486, 0: 2486})


### Hyperparameter Tuning

In [11]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()

In [12]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [19]:
grid = {
   'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [int(x) for x in range(1,70,2)],
 
    'n_neighbors': [int(x) for x in range(5,80)],
    'p': [1,2],
    'weights':['uniform', 'distance']
}

In [20]:
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=10, shuffle=True) 

In [21]:
from sklearn.model_selection import RandomizedSearchCV
knn_random = RandomizedSearchCV(estimator=knn, param_distributions=grid, n_jobs=-1,n_iter = 100, cv=cv, scoring='accuracy',error_score=0,return_train_score=True,verbose=2)

In [22]:
knn_random.fit(X,y)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   14.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   38.1s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  4.0min finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=True),
                   error_score=0, estimator=KNeighborsClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'algorithm': ['auto', 'ball_tree',
                                                      'kd_tree', 'brute'],
                                        'leaf_size': [1, 3, 5, 7, 9, 11, 13, 15,
                                                      17, 19, 21, 23, 25, 27,
                                                      29, 31, 33, 35, 37, 39,
                                                      41, 43, 45, 47, 49, 51,
                                                      53, 55, 57, 59, ...],
                                        'n_neighbors': [5, 6, 7, 8, 9, 10, 11,
                                                        12, 13, 14, 15, 16, 17,
                                                        18, 19, 20, 21, 22, 23,
                         

### Evaluvate

In [23]:
knn_random.best_score_

0.9143196528569003

In [26]:
knn_random.best_params_

{'weights': 'distance',
 'p': 1,
 'n_neighbors': 7,
 'leaf_size': 51,
 'algorithm': 'brute'}

In [29]:
best_model = knn_random.best_estimator_

In [30]:
best_model

KNeighborsClassifier(algorithm='brute', leaf_size=51, n_neighbors=7, p=1,
                     weights='distance')

In [31]:
# for save the model if you want
#import pickle
# open a file, where you ant to store the data
#file = open('random_forest_regression_model_v2.pkl', 'wb')
# dump information to that file
#pickle.dump(rf_random, file)

In [32]:
# Best score 0.9143196528569003