**Important: This code will be done without any feature engineering. We are only focused on the concepts of model selection and optimizations.**

## **Cross validation**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [None]:
data = pd.read_csv('pima-indians-diabetes.csv')
print(data.shape)
data.head()

In [None]:
X = data.drop(columns='class', axis=1)
y = data['class']

In [None]:
k = 10

kfold = KFold(n_splits=k, shuffle=True, random_state=42)
model = LogisticRegression(max_iter=500)
results = cross_val_score(model, X, y, cv=kfold)
print(results)
print('Accuracy: %.3f%% (%.3f%%)' % (results.mean()*100, results.std()*100))

Our model perform on average 77%. And could vary between 70.806% and 83.346%. 

## **Leave one out cross validation**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from numpy import mean
from numpy import absolute
from numpy import sqrt
import pandas as pd

In [None]:
data = pd.DataFrame({'x1': [2, 5, 4, 3, 4, 6, 7, 5, 8, 9],
                     'x2': [14, 12, 12, 13, 7, 8, 7, 4, 6, 5],
                     'y': [6, 8, 12, 14, 14, 15, 17, 22, 24, 23]})

data

In [None]:
X = data.iloc[:, 0:2]
y = data.iloc[:, 2]

In [None]:
# cross validation method

cv = LeaveOneOut()

In [None]:
model = LinearRegression()

In [None]:
# using LOOCV to evaluate the model

scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1) 

In [None]:
# Mean Absolute Error

mean(absolute(scores))

## **GridSearchCV for model optimizations**

In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split 

In [None]:
data = pd.read_csv('pima-indians-diabetes.csv')
print(data.shape)
data.head()

In [None]:
X = data.drop('class', axis=1)
y = data['class']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Splitting the validation data from the training data

X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [None]:
print('Total dataset:', len(data.values))
print('Total test data:', len(X_test))
print('Total training data:', len(X_train2))
print('Total validation data:', len(X_val))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()

In [None]:
KNeighborsClassifier().get_params()

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [None]:
param_grid = {'n_neighbors': list(range(1,9)),
              'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
              'leaf_size': [10,20,30,40,50],
              'weights': ('uniform', 'distance')
              }

In [None]:
from sklearn.model_selection import GridSearchCV

GS = GridSearchCV(knn_clf, param_grid, cv=20)

In [None]:
GS.fit(X_train2, y_train2)

In [None]:
GS.best_params_

In [None]:
GS.best_estimator_

In [None]:
GS.cv_results_['mean_test_score']

In [None]:
GS.best_score_

In [None]:
# Exporting the results to evaluate which one was the best result

df = pd.DataFrame(GS.cv_results_)
df = df.sort_values('rank_test_score')
df.to_csv('cv_results.csv', encoding='utf-8', sep=',')

In [None]:
# Making the model with the best hyper parameters

from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(algorithm='auto', leaf_size=10, n_neighbors=8, weights='uniform')

In [None]:
knn_clf.fit(X_train2, y_train2)

In [None]:
from sklearn import metrics

# Since we don't wanna go back and change any parameters we can use the test data now otherwise we need
# to use the validation data to check

y_pred = knn_clf.predict(X_val)

In [None]:
print('Accuracy: ', metrics.accuracy_score(y_val, y_pred))

In [None]:
# Changing more hyper parameters to see if the accuracy score get better

param_grid2 = {'n_neighbors': list(range(1,12)),
              'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
              'leaf_size': [5,10,20,30,40,50,60],
              'weights': ('uniform', 'distance')
              }


In [None]:
from sklearn.model_selection import GridSearchCV

GS2 = GridSearchCV(knn_clf, param_grid2, cv=20)

In [None]:
GS2.fit(X_train2, y_train2)

In [None]:
GS2.best_params_

In [None]:
GS2.best_estimator_

In [None]:
GS2.best_score_

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf2 = KNeighborsClassifier(algorithm='auto', leaf_size=5, n_neighbors=11, weights='distance')

In [None]:
knn_clf2.fit(X_train2, y_train2)

In [None]:
from sklearn import metrics

y_pred2 = knn_clf2.predict(X_val)

In [None]:
print('Accuracy: ', metrics.accuracy_score(y_val, y_pred2))

In [None]:
# Since we don't wanna go back and change any parameters we can use the test data now otherwise we need
# to use the validation data to check

y_pred3 = knn_clf2.predict(X_test)

In [None]:
print('Accuracy: ', metrics.accuracy_score(y_test, y_pred3))

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


conf_matrix = confusion_matrix(y_test,y_pred3)

plt.figure(figsize=(6, 6))
sns.heatmap(conf_matrix, annot=True)
 
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

In [None]:
# Our model is good in detecting the '0's but is not doing good in detecting the '1's
# maybe we need to tune the hyper parameters again to make it better 

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred3))

In [None]:
# Computing the ROC and the AOC for the first model

from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt

y_scores = knn_clf.predict_proba(X_test)
fpr1, tpr1, threshold1 = roc_curve(y_test, y_scores[:, 1])
roc_auc1 = auc(fpr1, tpr1)

In [None]:
# Computing the ROC and the AOC for the second model

y_scores2 = knn_clf2.predict_proba(X_test)
fpr2, tpr2, threshold2 = roc_curve(y_test, y_scores2[:, 1])
roc_auc2 = auc(fpr2, tpr2)

In [None]:
plt.plot(fpr1, tpr1, 'b', label = 'AUC = %0.2f' % roc_auc1)
plt.plot(fpr2, tpr2, 'g', label = 'AUC = %0.2f' % roc_auc2)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve of kNN')
plt.show()

## **GridSearchCV with pipeline**

In [None]:
data = pd.read_csv('wisc_bc_data.csv')
print(data.shape)
data.head()

In [None]:
X = data.iloc[:, 2:].values
y = data.iloc[:, 1].values

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

In [None]:
y = le.fit_transform(y)

In [None]:
y

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Splitting the validation data from the training data

X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [None]:
print('Total dataset:', len(data.values))
print('Total test data:', len(X_test))
print('Total training data:', len(X_train2))
print('Total validation data:', len(X_val))

### **Implementing pipeline**

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [None]:
pipe_svc = Pipeline([('scl', StandardScaler()), ('pca', PCA(n_components=2)), ('svc', SVC())])

In [None]:
pipe_svc.fit(X_train2, y_train2)
print('Test accuracy: %.3f' % pipe_svc.score(X_test, y_test))

In [None]:
print(PCA().get_params())

In [None]:
print(SVC().get_params())

In [None]:
param_grid = {'pca__n_components': [14,15],
              'svc__C': [0.001, 0.01, 0.1, 1, 10, 100],
              'svc__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
              'svc__kernel': ['rbf', 'poly']}

In [None]:
grid = GridSearchCV(pipe_svc, param_grid=param_grid, cv=5)

In [None]:
grid.fit(X_train2, y_train2)

In [None]:
print('Best cross-validation accuracy: {:.2f}'.format(grid.best_score_))
print('Best parameters: ', grid.best_params_)
print('Validation set accuracy: {:.2f}'.format(grid.score(X_val, y_val)))

In [None]:
# Now doing again but tuning the hyper parameters to evaluate with the test data

pipe_svc2 = Pipeline([('scl', StandardScaler()), ('pca', PCA(n_components=15)), ('svc', SVC(C=10, gamma=0.01, kernel='rbf', probability=True))])

In [None]:
pipe_svc2.fit(X_train2, y_train2)

In [None]:
# Probably the model is overfitting

print('Test accuracy: %.3f' % pipe_svc2.score(X_test, y_test))

In [None]:
# Now doing again but tuning the hyper parameters to reduce the overfitting

pipe_svc3 = Pipeline([('scl', StandardScaler()), ('pca', PCA(n_components=3)), ('svc', SVC(C=10, gamma=0.01, kernel='rbf', probability=True))])

In [None]:
pipe_svc3.fit(X_train2, y_train2)

In [None]:
print('Test accuracy: %.3f' % pipe_svc3.score(X_test, y_test))

In [None]:
y_pred = pipe_svc3.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


conf_matrix = confusion_matrix(y_test,y_pred)

plt.figure(figsize=(6, 6))
sns.heatmap(conf_matrix, annot=True)
 
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt

y_scores = pipe_svc3.predict_proba(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_scores[:, 1])
roc_auc = auc(fpr, tpr)

In [None]:
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.3f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC Curve of kNN')
plt.show()

## **RandomSearchCV**

Taking Boston house price dataset to check accuracy of Random Forest Regression model and tuning hyperparameters-number of estimators and max depth of the tree to find the best value. 

In [2]:
import pandas as pd
import numpy as np

Source: "http://lib.stat.cmu.edu/datasets/boston"

 Variables in order:
 CRIM     per capita crime rate by town,
 ZN       proportion of residential land zoned for lots over 25,000 sq.ft.,
 INDUS    proportion of non-retail business acres per town,
 CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise),
 NOX      nitric oxides concentration (parts per 10 million),
 RM       average number of rooms per dwelling,
 AGE      proportion of owner-occupied units built prior to 1940,
 DIS      weighted distances to five Boston employment centres,
 RAD      index of accessibility to radial highways,
 TAX      full-value property-tax rate per $10,000,
 PTRATIO  pupil-teacher ratio by town,
 B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town,
 LSTAT    % lower status of the population,
 MEDV     Median value of owner-occupied homes in $1000's,

In [3]:
data = pd.read_csv('boston.txt', sep="\s+", skiprows=22, header=None)
print(data.shape)
data.head(15)

(1012, 11)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3
1,396.9,4.98,24.0,,,,,,,,
2,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8
3,396.9,9.14,21.6,,,,,,,,
4,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8
5,392.83,4.03,34.7,,,,,,,,
6,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7
7,394.63,2.94,33.4,,,,,,,,
8,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7
9,396.9,5.33,36.2,,,,,,,,


In [4]:
# Jumping one row concatenate with the following line until the third columns

df = pd.DataFrame(np.concatenate([data.iloc[::2, :], data.iloc[1::2, :3]], axis=1), columns=['CRIM',
                                                                                             'ZN',
                                                                                             'INDUS',
                                                                                             'CHAS',
                                                                                             'NOX',
                                                                                             'RM',
                                                                                             'AGE',
                                                                                             'DIS',
                                                                                             'RAD',
                                                                                             'TAX', 
                                                                                             'PTRATIO',  
                                                                                             'B',
                                                                                             'LSTAT',
                                                                                             'MEDV'])
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [5]:
X = df.iloc[:,:-1]
y = df['MEDV']

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
print('Total dataset:', len(df.values))
print('Total test data:', len(X_test))
print('Total training data:', len(X_train))

Total dataset: 506
Total test data: 152
Total training data: 354


In [8]:
from sklearn import ensemble

regr = ensemble.RandomForestRegressor(n_estimators=5, max_depth=3)
regr.fit(X_train, y_train)

print('training score: ', regr.score(X_train, y_train))
print('test score: ', regr.score(X_test, y_test))

training score:  0.859900456526567
test score:  0.8159654794586295


**Cross validation**

In [9]:
from sklearn.model_selection import cross_val_score

# The results of cross validation aren't so good
scores1 = cross_val_score(ensemble.RandomForestRegressor(n_estimators=5, max_depth=3), X_train, y_train, cv=10)
np.average(scores1)

0.7673394246485313

**Combining CV with GridSearch to achieve better results**

In [13]:
from sklearn.model_selection import GridSearchCV
model = ensemble.RandomForestRegressor() 

params = {'n_estimators': [20,30,40,60,100],
           'max_depth': [5,10,15,20],
           'max_features': [2,5,8]}

GS = GridSearchCV(estimator=model, param_grid=params, cv=10, n_jobs=-1)               

In [15]:
grid = GS.fit(X_train, y_train)

In [16]:
grid.best_score_

0.8535174419993815

In [17]:
grid.best_params_

{'max_depth': 20, 'max_features': 5, 'n_estimators': 20}

In [19]:
# Results before grid search: training score:  0.859900456526567
#                             test score:  0.8159654794586295

regr2 = ensemble.RandomForestRegressor(max_depth=20, max_features=5, n_estimators=20)
regr2.fit(X_train, y_train)

print('training score: ', regr2.score(X_train, y_train))
print('test score: ', regr2.score(X_test, y_test))

training score:  0.9741879906113879
test score:  0.8578455623531034


In [None]:
# Starting the code for Random Search CV