# Scikit-learn Session Lab

## Import libraries

In [1]:
import numpy as np
from sklearn import datasets

## Load and parse the data file

In [2]:
# Load the iris dataset through sklearn.datasets.
iris = datasets.load_iris()
# Feature matrix (sample characteristics) and target vector (flower variety).
iris_X, iris_y = iris.data, iris.target
# Display the distinct class labels found in the target.
np.unique(iris_y)

array([0, 1, 2])

# Split the data into training and test sets

## Split the iris data into train and test data

## A random permutation, to split the data randomly

In [3]:
# Seed NumPy's global RNG so the shuffle is reproducible, then draw a random
# permutation of the row indices 0 .. len(iris_X) - 1.
np.random.seed(0)
n_samples = len(iris_X)
indices = np.random.permutation(n_samples)
print (indices)

[114  62  33 107   7 100  40  86  76  71 134  51  73  54  63  37  78  90
  45  16 121  66  24   8 126  22  44  97  93  26 137  84  27 127 132  59
  18  83  61  92 112   2 141  43  10  60 116 144 119 108  69 135  56  80
 123 133 106 146  50 147  85  30 101  94  64  89  91 125  48  13 111  95
  20  15  52   3 149  98   6  68 109  96  12 102 120 104 128  46  11 110
 124  41 148   1 113 139  42   4 129  17  38   5  53 143 105   0  34  28
  55  75  35  23  74  31 118  57 131  65  32 138  14 122  19  29 130  49
 136  99  82  79 115 145  72  77  25  81 140 142  39  58  88  70  87  36
  21   9 103  67 117  47]


In [4]:
# The last 10 shuffled indices form the test split; everything before them
# is used for training.
train_idx, test_idx = indices[:-10], indices[-10:]
iris_X_train, iris_y_train = iris_X[train_idx], iris_y[train_idx]
iris_X_test, iris_y_test = iris_X[test_idx], iris_y[test_idx]
# Peek at a single sample: the row stored at shuffled position 10.
print (iris_X[indices[10]])

[ 6.1  2.6  5.6  1.4]


# Train a k-nearest-neighbor model

## Create and fit a k-nearest-neighbor classifier

In [5]:
from sklearn.neighbors import KNeighborsClassifier
# n_neighbors=5 is the library default; with 8 neighbours this particular
# test split is classified with 100% accuracy.
knn = KNeighborsClassifier(n_neighbors=5)
# Fit the classifier on the training features and their labels.
knn.fit(X=iris_X_train, y=iris_y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

## evaluate model on test instances and compute test error

In [6]:
from sklearn.metrics import accuracy_score
# Predicted class labels for the held-out test samples.
knn_test_pred = knn.predict(iris_X_test)
knn_test_pred

array([1, 2, 1, 0, 0, 0, 2, 1, 2, 0])

### The actual labels:

In [7]:
# True labels of the held-out test samples, shown for comparison with the
# predictions displayed by the previous cell.
iris_y_test

array([1, 1, 1, 0, 0, 0, 2, 1, 2, 0])

In [8]:
# Fraction of test samples whose predicted label equals the true label.
accuracy_score(y_true=iris_y_test, y_pred=knn.predict(iris_X_test))

0.90000000000000002

# Write a jupyter notebook with the following tasks:

## Report the error of the classifier

In [9]:
# Error rate is the complement of the accuracy on the test split.
knn_accuracy = accuracy_score(iris_y_test, knn.predict(iris_X_test))
error = 1 - knn_accuracy

In [10]:
# Report the error rate computed for the 5-neighbour classifier.
error_message = "when the number of neghbor is 5, error rate : {0} ".format(error)
print (error_message)

when the number of neghbor is 5, error rate : 0.09999999999999998 


## What is the optimal parameter k of the k-nearest-neighbor classifier for this dataset?

In [11]:
# Sweep the neighbour count over every feasible value (1 .. number of training
# samples) and record the accuracy on the test split for each candidate.
# NOTE(review): k is being selected on the test split, so the winning score is
# an optimistic estimate; cross-validation on the training data would be a
# sounder way to choose k.
tab = {}
for i in range(1, len(iris_X_train) + 1):
    candidate = KNeighborsClassifier(n_neighbors=i)
    candidate.fit(iris_X_train, iris_y_train)
    tab[i] = accuracy_score(iris_y_test, candidate.predict(iris_X_test))

# Compute the best score once, rather than once per entry in the report loop.
best_accuracy = max(tab.values())
# Several values of k can tie for the best score; list every one of them.
for x, score in tab.items():
    if score == best_accuracy:
        print ("The optimal k is : {0} with {1}".format(x, score))

The optimal k is : 8 with 1.0
The optimal k is : 9 with 1.0
The optimal k is : 10 with 1.0
The optimal k is : 11 with 1.0
The optimal k is : 12 with 1.0
The optimal k is : 14 with 1.0
The optimal k is : 16 with 1.0
The optimal k is : 17 with 1.0
The optimal k is : 18 with 1.0
The optimal k is : 19 with 1.0
The optimal k is : 20 with 1.0
The optimal k is : 21 with 1.0
The optimal k is : 22 with 1.0
The optimal k is : 23 with 1.0
The optimal k is : 24 with 1.0
The optimal k is : 26 with 1.0
The optimal k is : 27 with 1.0
The optimal k is : 28 with 1.0
The optimal k is : 29 with 1.0
The optimal k is : 30 with 1.0
The optimal k is : 31 with 1.0
The optimal k is : 32 with 1.0
The optimal k is : 33 with 1.0
The optimal k is : 34 with 1.0
The optimal k is : 35 with 1.0
The optimal k is : 36 with 1.0
The optimal k is : 37 with 1.0
The optimal k is : 38 with 1.0
The optimal k is : 39 with 1.0
The optimal k is : 40 with 1.0
The optimal k is : 41 with 1.0
The optimal k is : 42 with 1.0
The optima

# Write a jupyter notebook with the following tasks

# With the iris dataset:

## Use two other classifiers

### 1> Naive Bayes

In [12]:
import numpy as np 
from sklearn.naive_bayes import MultinomialNB

In [13]:
# init
clf = MultinomialNB(alpha=0.01)
clf.fit(iris_X_train, iris_y_train)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [14]:
from sklearn.metrics import accuracy_score
# Accuracy of the naive Bayes model on the held-out test samples.
bayes_test_pred = clf.predict(iris_X_test)
accuracy_bayes = accuracy_score(iris_y_test, bayes_test_pred)

In [15]:
# Report the naive Bayes test accuracy.
bayes_message = "Accuracy is {0}".format(accuracy_bayes)
print (bayes_message)

Accuracy is 0.9


### 2> Logistic Regression Classifier 

In [16]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# L1-penalised logistic regression. The solver is pinned to 'liblinear'
# because it supports the L1 penalty; relying on the library's default solver
# fails on scikit-learn releases whose default ('lbfgs') rejects penalty='l1'.
lr = LogisticRegression(penalty='l1', solver='liblinear')
lr.fit(iris_X_train, iris_y_train)

# Accuracy of the fitted model on the held-out test samples.
accuracy_Regression = accuracy_score(iris_y_test, lr.predict(iris_X_test))
print ("Accuracy is {0}".format(accuracy_Regression))

Accuracy is 0.9


### 3>Decision Tree Classifier

In [17]:
import numpy as np
from sklearn import tree
from sklearn.metrics import accuracy_score

# Fit a decision tree with default hyperparameters on the training split.
dtree = tree.DecisionTreeClassifier()
dtree.fit(iris_X_train, iris_y_train)

# Score the decision tree itself on the held-out samples. The predictions
# must come from `dtree`, not from a previously fitted model, otherwise the
# reported accuracy belongs to that other model.
accuracy_Decision = accuracy_score(iris_y_test, dtree.predict(iris_X_test))
print ("Accuracy is {0}".format(accuracy_Decision))

Accuracy is 0.9


## Use cross-validation to evaluate the classifiers

In [18]:
from sklearn.model_selection import cross_val_score

In [19]:
# Cross-validate the KNN classifier on the full dataset (cross_val_score's
# default fold setting) and report the mean of the per-fold scores.
knn_score = cross_val_score(knn, iris_X, iris_y)
knn_mean = knn_score.mean()
print("knn: {0}".format(knn_mean))

knn: 0.9869281045751634


In [20]:
# Cross-validate the decision tree on the full dataset and report the mean
# of the per-fold scores.
decision_score = cross_val_score(dtree, iris_X, iris_y)
decision_mean = decision_score.mean()
print("Decision Tree Classifier: {0}".format(decision_mean))

Decision Tree Classifier: 0.960375816993464


In [21]:
# Cross-validate the logistic regression model on the full dataset and
# report the mean of the per-fold scores.
regression_score = cross_val_score(lr, iris_X, iris_y)
regression_mean = regression_score.mean()
print("Logistic Regression Classifier: {0}".format(regression_mean))

Logistic Regression Classifier: 0.9399509803921569


In [22]:
# Cross-validate the naive Bayes model on the full dataset and report the
# mean of the per-fold scores.
bayes_score = cross_val_score(clf, iris_X, iris_y)
bayes_mean = bayes_score.mean()
print("Naive Bayes Classifier: {0}".format(bayes_mean))

Naive Bayes Classifier: 0.9607843137254902


# Best one:

## KNN: mean cross-validation score of 0.9869281045751634 with k=5 (the default)