### Import Libraries

In [29]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

### Load Data

In [30]:
# Directory that holds every pre-processed input file used by this notebook.
# Kept in one place so the notebook can be pointed at a different location
# (e.g. a local copy of the dataset) by editing a single line.
DATA_DIR = "/kaggle/input/hit-prediction-processed-data/Hit Prediction"

# Feature tables (CSV). The "_selected" files are presumably the
# feature-selected variants of the same tables -- confirm against the
# preprocessing notebook that produced them.
X = pd.read_csv(DATA_DIR + "/X.csv")
X_train = pd.read_csv(DATA_DIR + "/X_train.csv")
X_test = pd.read_csv(DATA_DIR + "/X_test.csv")
X_selected = pd.read_csv(DATA_DIR + "/X_selected.csv")
X_train_selected = pd.read_csv(DATA_DIR + "/X_train_selected.csv")
X_test_selected = pd.read_csv(DATA_DIR + "/X_test_selected.csv")

# Label arrays (NumPy .npy files).
y = np.load(DATA_DIR + "/y.npy")
y_train = np.load(DATA_DIR + "/y_train.npy")
y_test = np.load(DATA_DIR + "/y_test.npy")
y_selected = np.load(DATA_DIR + "/y_selected.npy")

### Fitting Basic Model (gamma='auto', all other Parameters default)

#### Building Base Model

In [31]:
# Baseline: an SVC with gamma='auto', trained on the selected training
# features. fit() returns the estimator itself, so it can be chained.
clf = SVC(gamma='auto').fit(X_train_selected, y_train)

# Mean accuracy on the held-out selected test features.
clf.score(X_test_selected, y_test)

0.7752262333365768

### Cross Validation Score on Basic Model

In [32]:
# 5-fold cross-validation of the current classifier on the selected data.
# cross_val_score fits clones of the estimator, so `clf` itself is unchanged.
scores = cross_val_score(
    estimator=clf,
    X=X_selected,
    y=y_selected,
    cv=5,
)
scores

array([0.7724398 , 0.77387179, 0.76997932, 0.77618295, 0.77070916])

In [33]:
# Summarise the per-fold scores as (mean, standard deviation).
cv_summary = (scores.mean(), scores.std())
print("%0.5f accuracy with a standard deviation of %0.5f" % cv_summary)

0.77264 accuracy with a standard deviation of 0.00223


### Finding Best Parameters using Grid Search

#### Hyper Parameter List

In [34]:
# Search space handed to the grid search: every combination of these
# SVC constructor arguments is evaluated (4 * 2 * 3 * 2 = 48 candidates).
parameters = dict(
    kernel=["linear", "poly", "rbf", "sigmoid"],
    gamma=["scale", "auto"],
    C=[0.5, 1.0, 1.5],
    class_weight=["balanced", None],
)

#### Base Model

In [35]:
# Fresh, unfitted SVC with all-default settings. Note that this rebinds `clf`,
# replacing the previously fitted baseline model.
clf = SVC()

#### Grid Search

In [36]:
# Exhaustive search over `parameters` with 5-fold cross-validation,
# using all available CPU cores (n_jobs=-1) and no progress output.
gcv = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    cv=5,
    verbose=False,
    n_jobs=-1,
)

In [37]:
# Run the grid search on the selected training features and training labels.
gcv.fit(X_train_selected,y_train)

GridSearchCV(cv=5, estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.5, 1.0, 1.5],
                         'class_weight': ['balanced', None],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             verbose=False)

In [38]:
# Parameter combination that achieved the best mean cross-validation score.
gcv.best_params_

{'C': 1.5, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'rbf'}

#### Grid Search Cross Validation Mean Score

In [39]:
# Mean cross-validated score obtained by the best parameter combination.
gcv.best_score_

0.7722274598503317

### Test Accuracy for Model with best hyperparameters

In [40]:
# The grid search uses the default refit=True, so best_estimator_ has already
# been re-trained on the whole of X_train_selected / y_train with the winning
# parameters. Calling fit() on it again would only repeat that (expensive)
# training on the same data, so the fitted estimator is used as-is.
clf = gcv.best_estimator_

# Mean accuracy of the tuned model on the held-out selected test features.
clf.score(X_test_selected, y_test)

0.7759073659628296

In [41]:
# Predicted labels for the held-out test features; `pred` is reused by the
# classification report below.
pred = clf.predict(X_test_selected)

# Confusion matrix: rows are true classes, columns are predicted classes.
print(confusion_matrix(y_true=y_test, y_pred=pred))

[[3424 1715]
 [ 588 4550]]


In [42]:
# Per-class precision, recall, F1 and support for the test-set predictions.
report_text = classification_report(y_true=y_test, y_pred=pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.85      0.67      0.75      5139
           1       0.73      0.89      0.80      5138

    accuracy                           0.78     10277
   macro avg       0.79      0.78      0.77     10277
weighted avg       0.79      0.78      0.77     10277



### Cross Validation Score on best model 

In [43]:
# 5-fold cross-validation of the current classifier on the selected data.
# cross_val_score fits clones of the estimator, so `clf` itself is unchanged.
scores = cross_val_score(
    estimator=clf,
    X=X_selected,
    y=y_selected,
    cv=5,
)
scores

array([0.77596692, 0.77484491, 0.7708308 , 0.77642623, 0.77253376])

In [44]:
# Summarise the per-fold scores as (mean, standard deviation).
cv_summary = (scores.mean(), scores.std())
print("%0.5f accuracy with a standard deviation of %0.5f" % cv_summary)

0.77412 accuracy with a standard deviation of 0.00213


#### Inference -
1. The cross validation score for the base model (`SVC(gamma='auto')`, all other parameters left at their defaults) is **0.77264**. 

2. The cross validation score for the best model with hyper parameters {'C': 1.5, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'rbf'} is **0.77412**.