<a href="https://colab.research.google.com/github/SARA3SAEED/ML-Mu/blob/main/s07_ml_model_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model Selection & Evaluation

### Importing Libraries

In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Apply seaborn's default theme to every matplotlib figure drawn afterwards.
# `sns.set()` is only an alias of `set_theme()` and may be removed in a future release.
sns.set_theme()

In [81]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): no cell in this notebook appears to read from /content/drive
# (both datasets come from sklearn.datasets) — confirm whether the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


==========

# Model Selection

In [82]:
from sklearn.datasets import load_iris

In [83]:
# Load the Iris dataset that ships with scikit-learn.
iris = load_iris()

In [84]:
# List the fields available on the loaded dataset object.
print(iris.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])


In [85]:
# Print the dataset's built-in description text.
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [86]:
# Feature matrix: one row per flower, one column per measurement.
X = iris["data"]
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [87]:
# Class labels encoded as integers 0, 1, 2 (one per species).
y = iris["target"]
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [88]:
# Wrap the feature matrix in a DataFrame labelled with the dataset's feature names.
iris_df = pd.DataFrame(data=X, columns=iris["feature_names"])
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [89]:
# Append the class labels as a 'target' column (re-running the cell just overwrites it).
iris_df = iris_df.assign(target=iris.target)

In [90]:
# Show the frame again, now including the 'target' column.
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [91]:
# Features only: every column except 'target', as a NumPy array.
X = iris_df.drop(columns='target').to_numpy()
X

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [92]:
# Labels only: the 'target' column as a NumPy array.
y = iris_df['target'].to_numpy()
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [93]:
from sklearn.model_selection import train_test_split

In [94]:
# Hold out 33% of the samples for testing; the fixed seed keeps the split repeatable,
# so all four models below are compared on the same train/test partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [95]:
# SVC -> Model # 1
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC

# Cubic polynomial kernel with penalty parameter C=10
clf = SVC(kernel='poly', degree=3, C=10)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Evaluation: accuracy, one blank line, then the confusion matrix
print(accuracy_score(y_test, y_pred), end='\n\n')
print(confusion_matrix(y_test, y_pred))

0.98

[[19  0  0]
 [ 0 15  0]
 [ 0  1 15]]


In [96]:
# KNN -> Model # 2
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Library defaults are used: no hyperparameter is overridden
clf = KNeighborsClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Evaluation: accuracy and confusion matrix separated by one blank line
print(accuracy_score(y_test, y_pred), confusion_matrix(y_test, y_pred), sep='\n\n')

0.98

[[19  0  0]
 [ 0 15  0]
 [ 0  1 15]]


In [97]:
# Logistic Regression -> Model # 3
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# fit() returns the estimator itself, so construction and training are chained
clf = LogisticRegression(max_iter=10000).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Evaluation
test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
print(test_accuracy)
print()
print(test_confusion)

1.0

[[19  0  0]
 [ 0 15  0]
 [ 0  0 16]]


In [98]:
# Random Forests -> Model # 4
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# A random forest draws random bootstrap samples and feature subsets while training.
# Fixing random_state makes the fitted forest, and therefore the scores printed
# below, identical on every Restart & Run All.
clf = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    max_leaf_nodes=5,
    random_state=42,
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Evaluation
print(accuracy_score(y_test, y_pred))
print()
print(confusion_matrix(y_test, y_pred))

0.98

[[19  0  0]
 [ 0 15  0]
 [ 0  1 15]]


==========

# Model Evaluation & Improvement

## Splitter (Cross-Validation)

##### Importing Dataset

In [99]:
from sklearn.datasets import load_breast_cancer

In [100]:
# Load the breast cancer dataset that ships with scikit-learn.
cancer = load_breast_cancer()

##### Selecting Model

In [101]:
from sklearn.svm import SVC

In [102]:
# Support vector classifier with default parameters; the validation cells below reuse it.
clf = SVC()

### Cross-Validation with `train_test_split`

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [103]:
# Features and labels of the breast cancer dataset.
X, y = cancer.data, cancer.target

In [104]:
from sklearn.model_selection import train_test_split

In [105]:
# Alternative kept for reference: one 70/30 train/test split with no validation set.
# The next two cells build a train/validation/test split instead.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [106]:
# Step 1 of the three-way split: keep 80% for training, park the other 20% in a temporary set.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

In [107]:
# Step 2: cut the temporary 20% in half, giving equally sized validation and test sets.
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [108]:
# Size of the full dataset (samples, features).
X.shape

(569, 30)

In [109]:
# Size of the training portion.
X_train.shape

(455, 30)

In [110]:
# Size of the validation portion.
X_valid.shape

(57, 30)

In [111]:
# Size of the test portion.
X_test.shape

(57, 30)

In [112]:
# training (the "class" stage of the course analogy): fit on the training split only
clf.fit(X_train, y_train)

##### Evaluating Model

In [113]:
from sklearn.metrics import accuracy_score

In [114]:
# validation (the "midterm" stage): predict labels for the validation split
y_pred_valid = clf.predict(X_valid)
y_pred_valid

array([1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0])

In [115]:
# True validation labels, shown for a side-by-side comparison with the predictions.
y_valid

array([1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0])

In [116]:
# Fraction of validation samples predicted correctly.
print(accuracy_score(y_valid, y_pred_valid))

0.9122807017543859


In [117]:
# testing (the "final" stage): predict labels for the test split
y_pred_test = clf.predict(X_test)
y_pred_test

array([0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1])

In [118]:
# True test labels, shown for a side-by-side comparison with the predictions.
y_test

array([0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1])

In [119]:
# Fraction of test samples predicted correctly.
print(accuracy_score(y_test, y_pred_test))

0.9824561403508771


### Cross-Validation with `cross_val_score`

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html

In [120]:
from sklearn.model_selection import cross_val_score

In [121]:
# Fresh split for this section: 10% held out for testing. This overwrites the
# X_train / X_test / y_train / y_test variables created in the previous section.
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.1, random_state=42)

In [122]:
# Cross-validate on the training split only. No `cv` argument is passed;
# the call returns one score per fold (five of them, as the output shows).
scores = cross_val_score(clf, X_train, y_train)
scores

array([0.93203883, 0.9223301 , 0.93137255, 0.91176471, 0.85294118])

In [123]:
# 3-fold cross-validation. NOTE: this call is given the complete dataset
# (cancer.data / cancer.target), not X_train, so the rows held out above take part
# in the folds. It also overwrites the `scores` array from the previous cell.
scores = cross_val_score(clf, cancer.data, cancer.target, cv=3)
scores

array([0.85263158, 0.93157895, 0.94708995])

In [124]:
# Average the fold scores in `scores` into a single summary number.
scores.mean()

0.9104334911352455

In [125]:
# clf.score(X_test, y_test)  # score() takes (X, y) and predicts internally; it is not called as (y_true, y_pred)

### Cross-Validation with `cross_validate`

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html

In [126]:
from sklearn.model_selection import cross_validate

Scoring Options: https://scikit-learn.org/stable/modules/model_evaluation.html

In [127]:
# 10-fold cross-validation that records two metrics per fold, plus fit and score timings.
scores = cross_validate(
    clf,
    X,
    y,
    scoring=['accuracy', 'average_precision'],
    cv=10,
)
scores

{'fit_time': array([0.0301888 , 0.02135253, 0.04790974, 0.02168798, 0.02919674,
        0.00982785, 0.02040243, 0.07729459, 0.01742744, 0.0562408 ]),
 'score_time': array([0.01814699, 0.02326345, 0.01541281, 0.01358986, 0.00930405,
        0.00883937, 0.02805018, 0.05614758, 0.00554585, 0.02914405]),
 'test_accuracy': array([0.89473684, 0.84210526, 0.89473684, 0.92982456, 0.92982456,
        0.92982456, 0.94736842, 0.92982456, 0.92982456, 0.91071429]),
 'test_average_precision': array([0.97979197, 0.97076669, 0.99210049, 0.98058628, 0.99348603,
        0.98882819, 0.98557846, 0.9727489 , 0.99089446, 0.9942693 ])}

In [128]:
# Tabulate the cross_validate result: one row per fold, one column per timing/metric array.
pd.DataFrame(scores)  # columns here are test_accuracy / test_average_precision; there is no 'test_score' key

Unnamed: 0,fit_time,score_time,test_accuracy,test_average_precision
0,0.030189,0.018147,0.894737,0.979792
1,0.021353,0.023263,0.842105,0.970767
2,0.04791,0.015413,0.894737,0.9921
3,0.021688,0.01359,0.929825,0.980586
4,0.029197,0.009304,0.929825,0.993486
5,0.009828,0.008839,0.929825,0.988828
6,0.020402,0.02805,0.947368,0.985578
7,0.077295,0.056148,0.929825,0.972749
8,0.017427,0.005546,0.929825,0.990894
9,0.056241,0.029144,0.910714,0.994269


In [129]:
# Mean accuracy over the ten folds.
pd.DataFrame(scores)['test_accuracy'].mean()

0.9138784461152882

In [130]:
# Mean average-precision over the ten folds.
pd.DataFrame(scores)['test_average_precision'].mean()

0.9849050763460452

==========

## Hyperparameters Tuning (Grid Search)

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [131]:
from sklearn.model_selection import GridSearchCV

In [132]:
# Both SVC hyperparameters are searched over the same powers of ten (1e-3 ... 1e2);
# each key gets its own list object.
param_grid = {
    name: [0.001, 0.01, 0.1, 1, 10, 100]
    for name in ('C', 'gamma')
}
param_grid

{'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}

In [133]:
from sklearn.svm import SVC

In [134]:
# Exhaustive search over every C/gamma pair in param_grid, each scored with 5-fold CV.
grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5)

In [135]:
# New train/test split for the grid search. No test_size is given, so the library
# default proportion applies; the test split stays out of the search entirely.
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=42)

In [136]:
# Run the search on the training split only (features are not scaled here).
grid_search.fit(X_train, y_train)

In [137]:
# Score the fitted search object on the held-out test split.
grid_search.score(X_test, y_test)

0.9230769230769231

In [138]:
# Hyperparameter combination that achieved the best cross-validation score.
grid_search.best_params_

{'C': 1, 'gamma': 0.001}

In [139]:
# Best cross-validation score found by the search (computed on training folds, not on the test split).
grid_search.best_score_

0.9177838577291382

In [140]:
# The estimator object configured with the winning hyperparameters.
grid_search.best_estimator_

In [141]:
# Per-combination search results as a table: timings, per-split scores, mean/std and rank.
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.036844,0.005959,0.016523,0.006609,0.001,0.001,"{'C': 0.001, 'gamma': 0.001}",0.627907,0.635294,0.635294,0.623529,0.623529,0.629111,0.005296,6
1,0.023979,0.018066,0.007105,0.003778,0.001,0.01,"{'C': 0.001, 'gamma': 0.01}",0.627907,0.635294,0.635294,0.623529,0.623529,0.629111,0.005296,6
2,0.018764,0.008407,0.006975,0.004437,0.001,0.1,"{'C': 0.001, 'gamma': 0.1}",0.627907,0.635294,0.635294,0.623529,0.623529,0.629111,0.005296,6
3,0.016566,0.003689,0.00571,0.000656,0.001,1.0,"{'C': 0.001, 'gamma': 1}",0.627907,0.635294,0.635294,0.623529,0.623529,0.629111,0.005296,6
4,0.020324,0.007298,0.006872,0.001671,0.001,10.0,"{'C': 0.001, 'gamma': 10}",0.627907,0.635294,0.635294,0.623529,0.623529,0.629111,0.005296,6
5,0.014859,0.000215,0.005652,0.000595,0.001,100.0,"{'C': 0.001, 'gamma': 100}",0.627907,0.635294,0.635294,0.623529,0.623529,0.629111,0.005296,6
6,0.01359,0.001717,0.005095,0.00028,0.01,0.001,"{'C': 0.01, 'gamma': 0.001}",0.627907,0.635294,0.635294,0.623529,0.623529,0.629111,0.005296,6
7,0.021655,0.010743,0.00803,0.002898,0.01,0.01,"{'C': 0.01, 'gamma': 0.01}",0.627907,0.635294,0.635294,0.623529,0.623529,0.629111,0.005296,6
8,0.018423,0.003926,0.005823,0.000504,0.01,0.1,"{'C': 0.01, 'gamma': 0.1}",0.627907,0.635294,0.635294,0.623529,0.623529,0.629111,0.005296,6
9,0.020501,0.006347,0.005926,0.001289,0.01,1.0,"{'C': 0.01, 'gamma': 1}",0.627907,0.635294,0.635294,0.623529,0.623529,0.629111,0.005296,6


==========

# Algorithms Chain & Pipeline

### Traditional Pipeline

In [142]:
# loading data (the same breast cancer dataset as in the previous section, loaded again)
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [143]:
# splitting data: default split proportion with seed 0 (the earlier sections used seed 42)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

In [144]:
# scaling data: learn each feature's min/max from the training split only,
# then rescale that same split in one step
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [145]:
# building model: support vector classifier with default parameters
from sklearn.svm import SVC
svm = SVC()

In [146]:
# learn an SVM on the scaled training data
svm.fit(X_train_scaled, y_train)

In [147]:
# scale the test data with the scaler that was fitted on the training split
# (the scaler is not re-fitted here; scoring happens in the next cell)
X_test_scaled = scaler.transform(X_test)

In [148]:
# Score the SVM on the scaled test data.
svm.score(X_test_scaled, y_test)

0.972027972027972

### Building Pipeline

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

In [149]:
from sklearn.pipeline import Pipeline

In [150]:
# Chain scaling and classification into one estimator; the step names ("scaler", "svm")
# are what the grid-search parameter keys refer to later.
pipe = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
        ("svm", SVC()),
    ]
)

In [151]:
# Fit the whole pipeline on the raw (unscaled) training data; scaling happens inside it.
pipe.fit(X_train, y_train)

In [152]:
# Score on the raw test data; the result matches the manual scale-then-fit version above.
pipe.score(X_test, y_test)

0.972027972027972

### Using Pipelines in Grid Searches

In [153]:
# Pipeline parameters are addressed as "<step name>__<parameter>", hence the svm__ prefix.
param_grid = dict(
    svm__C=[0.001, 0.01, 0.1, 1, 10, 100],
    svm__gamma=[0.001, 0.01, 0.1, 1, 10, 100],
)

In [154]:
# Grid search over the whole pipeline, scored with 5-fold CV.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5)

In [155]:
# Run the pipeline grid search on the raw training split.
grid.fit(X_train, y_train)

In [156]:
# Best cross-validation score found by the pipeline search.
grid.best_score_

0.9812311901504789

In [157]:
# Score the fitted search object on the held-out test split.
grid.score(X_test, y_test)

0.972027972027972

In [158]:
# Winning hyperparameters, keyed by their pipeline-prefixed names.
grid.best_params_

{'svm__C': 1, 'svm__gamma': 1}

==========

# THANK YOU!