## Model Selection and Cross Validation

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('Social_Network_Ads.csv')
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [3]:
df.shape

(400, 5)

In [4]:
#seperating x and y
X = df[['Age','EstimatedSalary']]
y = df['Purchased']

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

## Without CrossValidation

In [6]:
#by changing the random state accuracy will change a lot
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size = 0.25,random_state = 10)
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(x_train,y_train)
y_pred = knn_model.predict(x_test)
accuracy_score(y_test,y_pred)

0.87

## 1) With K-Fold Cross Validation

### a) KNN model

In [7]:
from sklearn.model_selection import cross_val_score #k-fold cross validation

In [8]:
knn_model = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn_model,  x_train, y_train, cv=15, scoring ='accuracy')
print('Scores = ',scores)
print('Avg Score = ',scores.mean())

Scores =  [0.75 0.85 0.6  0.8  0.8  0.8  0.85 0.95 0.8  0.75 0.8  0.7  0.75 0.8
 0.7 ]
Avg Score =  0.7799999999999999


In [10]:
knn_model.score(x_test,y_test)

NotFittedError: This KNeighborsClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

### b) Logistic regression model

In [9]:
from sklearn.linear_model import LogisticRegression
logreg_model = LogisticRegression()
scores = cross_val_score(logreg_model,  x_train, y_train, cv=10, scoring ='accuracy')
print('Scores = ',scores)
print('Avg Score = ',scores.mean())

Scores =  [0.63333333 0.63333333 0.63333333 0.63333333 0.63333333 0.63333333
 0.63333333 0.63333333 0.53333333 0.6       ]
Avg Score =  0.6199999999999999


## 2) Stratified K-Fold Cross Validation

In [10]:
#import library
from sklearn.model_selection import StratifiedKFold

In [11]:
#model
skf = StratifiedKFold(n_splits=5)
scores = []
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    knn_model = KNeighborsClassifier(n_neighbors=5)
    knn_model.fit(X_train,y_train)
    y_pred = knn_model.predict(X_test)
    score = accuracy_score(y_test,y_pred)
    scores.append(score)
    
print('Scores = ',scores)
print('Average Score = ',np.array(scores).mean())

Scores =  [0.8125, 0.8625, 0.725, 0.7625, 0.675]
Average Score =  0.7674999999999998
