In [1]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

# ML
from sklearn.neighbors import KNeighborsClassifier

# import libraries for model validation
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import LeavePOut
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.model_selection import cross_val_score


In [2]:
# load the iris dataset that ships with scikit-learn's `datasets` module
iris = datasets.load_iris()

In [3]:
# echo the loaded dataset object
# NOTE(review): this prints every sample; inspecting a slice (e.g. iris.data[:3]) keeps the output readable
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [4]:
# first three rows of the feature array
iris.data[:3]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2]])

In [5]:
# target values of the first three samples
iris.target[:3]

array([0, 0, 0])

In [6]:
# names of the target classes
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [7]:
# print the dataset's bundled description text
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [9]:
# feature array and target vector used by the grid search
X, y = iris.data, iris.target

In [10]:
# KNN classifier created with the library defaults, i.e.
# metric='minkowski', n_neighbors=5, p=2, weights='uniform'
clf = KNeighborsClassifier()

In [12]:
# show the classifier's current parameters as a name -> value dict
clf.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [13]:
# list only the parameter names (these are the keys a parameter grid may use)
clf.get_params().keys()

dict_keys(['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights'])

In [14]:
# Candidate values for each KNeighborsClassifier hyper-parameter.
n_neighbors = [3, 5, 7, 9, 11, 13, 15, 19, 23, 29]
algos       = ['ball_tree', 'kd_tree', 'brute']
dist_metric = ['minkowski']
# Exponent of the Minkowski distance. The value 4 is required for a fresh run
# to reproduce the recorded search: 10 n_neighbors x 4 p x 2 weights = 80
# candidates (400 fits over 5 folds).
p_root      = [1, 2, 3, 4]
weights     = ['uniform', 'distance']
leaf_size   = [15, 30, 40, 50, 60]

In [15]:
# Hyper-parameter grid handed to the search. `algorithm` (algos) and
# `leaf_size` are currently left out of the grid.
parameters = {
    'n_neighbors': n_neighbors,
    'metric': dist_metric,
    'p': p_root,
    'weights': weights,
}

print(parameters)

{'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 19, 23, 29], 'metric': ['minkowski'], 'p': [1, 2, 3, 4], 'weights': ['uniform', 'distance']}


In [16]:
# define splits
n_splits = 5

# Plain K-fold: shuffle the samples once (seeded) before cutting the folds.
kf  = KFold(n_splits=n_splits, shuffle=True, random_state=100)

# Stratified K-fold. shuffle=True is required for random_state to have any
# effect; without it the seed is ignored, and scikit-learn >= 0.24 raises a
# ValueError for random_state combined with shuffle=False.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=100)

# alternative splitters (not used):
#sf  = ShuffleSplit(n_splits =n_splits, test_size=0.2, random_state=100)
#ssf = StratifiedShuffleSplit(n_splits =n_splits, test_size=0.2, random_state=100)

In [17]:
# Exhaustive search over `parameters`: every candidate is scored by accuracy
# on the folds produced by `kf`; verbose=4 logs each individual fit.
grid = GridSearchCV(
    estimator=clf,
    param_grid=parameters,
    scoring='accuracy',
    cv=kf,
    verbose=4,
)

In [18]:
# run the grid search on the full X, y; each candidate is cross-validated
# and the fitted search object is displayed as the cell output
grid.fit(X, y)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV] metric=minkowski, n_neighbors=3, p=1, weights=uniform ...........
[CV]  metric=minkowski, n_neighbors=3, p=1, weights=uniform, score=1.0, total=   0.0s
[CV] metric=minkowski, n_neighbors=3, p=1, weights=uniform ...........
[CV]  metric=minkowski, n_neighbors=3, p=1, weights=uniform, score=0.9666666666666667, total=   0.0s
[CV] metric=minkowski, n_neighbors=3, p=1, weights=uniform ...........
[CV]  metric=minkowski, n_neighbors=3, p=1, weights=uniform, score=0.8666666666666667, total=   0.0s
[CV] metric=minkowski, n_neighbors=3, p=1, weights=uniform ...........
[CV]  metric=minkowski, n_neighbors=3, p=1, weights=uniform, score=0.9333333333333333, total=   0.0s
[CV] metric=minkowski, n_neighbors=3, p=1, weights=uniform ...........


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s


[CV]  metric=minkowski, n_neighbors=3, p=1, weights=uniform, score=1.0, total=   0.0s
[CV] metric=minkowski, n_neighbors=3, p=1, weights=distance ..........
[CV]  metric=minkowski, n_neighbors=3, p=1, weights=distance, score=1.0, total=   0.0s
[CV] metric=minkowski, n_neighbors=3, p=1, weights=distance ..........
[CV]  metric=minkowski, n_neighbors=3, p=1, weights=distance, score=0.9666666666666667, total=   0.0s
[CV] metric=minkowski, n_neighbors=3, p=1, weights=distance ..........
[CV]  metric=minkowski, n_neighbors=3, p=1, weights=distance, score=0.8666666666666667, total=   0.0s
[CV] metric=minkowski, n_neighbors=3, p=1, weights=distance ..........
[CV]  metric=minkowski, n_neighbors=3, p=1, weights=distance, score=0.9333333333333333, total=   0.0s
[CV] metric=minkowski, n_neighbors=3, p=1, weights=distance ..........
[CV]  metric=minkowski, n_neighbors=3, p=1, weights=distance, score=1.0, total=   0.0s
[CV] metric=minkowski, n_neighbors=3, p=2, weights=uniform ...........
[CV]  me

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:    3.7s finished


GridSearchCV(cv=KFold(n_splits=5, random_state=100, shuffle=True),
       error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 19, 23, 29], 'metric': ['minkowski'], 'p': [1, 2, 3, 4], 'weights': ['uniform', 'distance']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=4)

In [19]:
# Report the best estimator and the parameter combination that produced it.
for label, value in (
    ('Estimator: \n', grid.best_estimator_),
    ('Best params : \n', grid.best_params_),
):
    print(label, value)

# Class labels seen during fitting, then the best candidate's
# cross-validated accuracy score.
print(grid.classes_)
print(grid.best_score_)

Estimator: 
 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=13, p=3,
           weights='uniform')
Best params : 
 {'metric': 'minkowski', 'n_neighbors': 13, 'p': 3, 'weights': 'uniform'}
[0 1 2]
0.9866666666666667
