## Classification Algorithm
## SVM - Grid

In [1]:
import pandas as pd

### Data collection

In [2]:
dataset = pd.read_csv('Social_Network_Ads.csv')

In [3]:
dataset

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


### Data preprocessing

In [4]:
dataset=pd.get_dummies(dataset,dtype=int,drop_first=True)

In [5]:
dataset

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased,Gender_Male
0,15624510,19,19000,0,1
1,15810944,35,20000,0,1
2,15668575,26,43000,0,0
3,15603246,27,57000,0,0
4,15804002,19,76000,0,1
...,...,...,...,...,...
395,15691863,46,41000,1,0
396,15706071,51,23000,1,1
397,15654296,50,20000,1,0
398,15755018,36,33000,0,1


In [6]:
dataset.shape

(400, 5)

In [7]:
dataset = dataset.drop('User ID', axis=1)

### Check the No of Unique classes and its count

In [8]:
dataset['Purchased'].value_counts()

Purchased
0    257
1    143
Name: count, dtype: int64

### Input Output Split

In [9]:
indep = dataset[['Age','EstimatedSalary','Gender_Male']]
dep = dataset[['Purchased']]

### Train Test Split

In [10]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(indep,dep,test_size=0.20,random_state=0)

### Standardization

In [11]:
from sklearn.preprocessing import StandardScaler
SC = StandardScaler()
X_train = SC.fit_transform(X_train)
X_test = SC.transform(X_test)

In [12]:
X_train

array([[ 1.92295008e+00,  2.14601566e+00,  1.02532046e+00],
       [ 2.02016082e+00,  3.78719297e-01, -9.75304830e-01],
       [-1.38221530e+00, -4.32498705e-01, -9.75304830e-01],
       [-1.18779381e+00, -1.01194013e+00, -9.75304830e-01],
       [ 1.92295008e+00, -9.25023920e-01, -9.75304830e-01],
       [ 3.67578135e-01,  2.91803083e-01, -9.75304830e-01],
       [ 1.73156642e-01,  1.46942725e-01, -9.75304830e-01],
       [ 2.02016082e+00,  1.74040666e+00,  1.02532046e+00],
       [ 7.56421121e-01, -8.38107706e-01, -9.75304830e-01],
       [ 2.70367388e-01, -2.87638347e-01, -9.75304830e-01],
       [ 3.67578135e-01, -1.71750061e-01,  1.02532046e+00],
       [-1.18475597e-01,  2.20395980e+00, -9.75304830e-01],
       [-1.47942605e+00, -6.35303205e-01, -9.75304830e-01],
       [-1.28500455e+00, -1.06988428e+00,  1.02532046e+00],
       [-1.38221530e+00,  4.07691369e-01,  1.02532046e+00],
       [-1.09058306e+00,  7.55356227e-01, -9.75304830e-01],
       [-1.47942605e+00, -2.00722133e-01

### Train Set

### Model Creation in GridSearchCV

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# param_grid - This enables searching over any sequence of parameter settings.
# Scoring - Strategy to evaluate the performance of the cross-validated model on the test set.
# n_jobs - -1 means using all processors
# CV - Determines the cross-validation splitting strategy

param_grid = {'kernel':['linear','rbf','poly','sigmoid'],
              'C' : [1,10,100,1000,2000,3000], 
              'gamma' : ['scale','auto']}

grid = GridSearchCV(SVC(probability=True), param_grid, refit = True,verbose = 3, n_jobs=-1,scoring = 'f1_weighted')

grid.fit(X_train,Y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


  y = column_or_1d(y, warn=True)


### Test Set

### Evaluation Metrics 
### Confusion Matix
### Classification Report

In [22]:
# print the best parameter after tuning
results = grid.cv_results_

grid_predictions = grid.predict(X_test)

from sklearn.metrics import confusion_matrix
CM = confusion_matrix(Y_test,grid_predictions)

from sklearn.metrics import classification_report
Clf_report = classification_report(Y_test,grid_predictions)

In [23]:
print("Confusion Matrix:\n",CM)

Confusion Matrix:
 [[55  3]
 [ 1 21]]


In [24]:
print("Classification Report:\n",Clf_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.95      0.96        58
           1       0.88      0.95      0.91        22

    accuracy                           0.95        80
   macro avg       0.93      0.95      0.94        80
weighted avg       0.95      0.95      0.95        80



### ADDITIONAL : Check for the best params & f1_score 

In [26]:
from sklearn.metrics import f1_score

# Parameters : average{‘micro’, ‘macro’, ‘samples’, ‘weighted’, ‘binary
f1_macro_ave = f1_score(Y_test,grid_predictions, average='macro')

print('f1_macro_value of best parameter{}:'.format(grid.best_params_),f1_macro_ave)

f1_macro_value of best parameter{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}: 0.938977879481312


### ADDITIONAL : Check roc_auc_score -Receiver Operating Characterstic (Area Under Curve)

In [27]:
# ROC_AUC : A performance metric used primarily for binary classification problems to evaluate the quality of a classification model.

from sklearn.metrics import roc_auc_score
roc_auc_score(Y_test,grid.predict_proba(X_test)[:,1])

0.969435736677116

### Tabulation of Parameters in GridSearchCV

In [28]:
Table=pd.DataFrame(results)
Table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.031157,0.005452,0.020426,0.004978,1,scale,linear,"{'C': 1, 'gamma': 'scale', 'kernel': 'linear'}",0.796088,0.758892,0.740864,0.820367,0.902824,0.803807,0.056776,35
1,0.036996,0.01017,0.017787,0.002237,1,scale,rbf,"{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}",0.860542,0.890137,0.85992,0.907389,0.96875,0.897348,0.040016,1
2,0.029007,0.019137,0.022637,0.005565,1,scale,poly,"{'C': 1, 'gamma': 'scale', 'kernel': 'poly'}",0.794154,0.783708,0.769318,0.873807,0.919631,0.828124,0.058387,23
3,0.026175,0.00449,0.023362,0.005843,1,scale,sigmoid,"{'C': 1, 'gamma': 'scale', 'kernel': 'sigmoid'}",0.766401,0.795968,0.720769,0.726744,0.842259,0.770428,0.045172,37
4,0.035492,0.019792,0.022765,0.002932,1,auto,linear,"{'C': 1, 'gamma': 'auto', 'kernel': 'linear'}",0.796088,0.758892,0.740864,0.820367,0.902824,0.803807,0.056776,35
5,0.042063,0.016497,0.024326,0.005817,1,auto,rbf,"{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}",0.860542,0.890137,0.85992,0.907389,0.96875,0.897348,0.040016,1
6,0.03736,0.014478,0.018175,0.001589,1,auto,poly,"{'C': 1, 'gamma': 'auto', 'kernel': 'poly'}",0.794154,0.783708,0.769318,0.873807,0.919631,0.828124,0.058387,23
7,0.030495,0.009201,0.020702,0.007707,1,auto,sigmoid,"{'C': 1, 'gamma': 'auto', 'kernel': 'sigmoid'}",0.766401,0.795968,0.720769,0.726744,0.842259,0.770428,0.045172,37
8,0.033662,0.00862,0.016266,0.001768,10,scale,linear,"{'C': 10, 'gamma': 'scale', 'kernel': 'linear'}",0.796088,0.758892,0.740864,0.83804,0.887483,0.804273,0.053313,33
9,0.035639,0.008244,0.024284,0.003988,10,scale,rbf,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}",0.875759,0.858747,0.875897,0.907389,0.96875,0.897308,0.039026,3


### Check the Model by getting User Input

In [30]:
Age_input = int(input("AGE: "))
Salary_input = int(input("Salary: "))
Gender_input = int(input("Gender Male 0 or 1: "))

AGE:  25
Salary:  20000
Gender Male 0 or 1:  1


### Prediction

In [31]:
Prediction=grid.predict([[Age_input,Salary_input,Gender_input]]) 
Prediction


array([1], dtype=int64)