# Introduction to Scikit-learn

In [1]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [2]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, random_state=0)

In [3]:
X_train.shape

(1347, 64)

In [4]:
np.bincount(y_train)

array([141, 139, 133, 138, 143, 134, 129, 131, 126, 133], dtype=int64)

Really Simple API
-------------------
0) Import your model class

In [5]:
from sklearn.svm import LinearSVC

1) Instantiate an object and set the parameters

In [6]:
svm = LinearSVC()

2) Fit the model

In [7]:
svm.fit(X_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

3) Apply / evaluate

In [8]:
print(svm.predict(X_train))
print(y_train)

[2 8 9 ... 7 7 8]
[2 8 9 ... 7 7 8]


In [9]:
svm.score(X_train, y_train)

0.9948032665181886

In [10]:
svm.score(X_test, y_test)

0.9311111111111111

And again
---------

In [11]:
from sklearn.ensemble import RandomForestClassifier

In [12]:
rf = RandomForestClassifier(n_estimators=100)

In [13]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [14]:
rf.score(X_train, y_train)

1.0

In [15]:
rf.score(X_test, y_test)

0.9711111111111111

# Exercises

## Exercise 1
Load the iris dataset from the ``sklearn.datasets`` module using the ``load_iris`` function.

Split it into training and test set using ``train_test_split``.

## Exercise 2
Then train an evaluate ``sklearn.neighbors.KNeighborsClassifier``, the RandomForestClassifier and  ``sklearn.linear_model.LogisticRegression`` on the iris dataset.
How do these perform on the training set vs the test set? Which one is the best on the training set, which one is the best on the test set?

## Exercise 3 (extra)
Can you construct a binary classification dataset (using np.random for example) on which ``sklearn.linear_model.LogisticRegression`` achieves an accuracy of 1? Can you construct a binary classification dataset on which it achieves accuracy 0.5?

In [None]:
# %load solutions/train_iris.py

# Exercise 1 - solution

In [17]:
# imports
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [18]:
# loading iris data
iris = load_iris()

In [20]:
# split into train and test sets
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# Exercise 2 - solution

In [21]:
# imports
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [26]:
# create all models
kNeighbors = KNeighborsClassifier()
randForest = RandomForestClassifier()
logisticReg = LogisticRegression()

# training all models
kNeighbors.fit(X_train, y_train)
randForest.fit(X_train, y_train)
logisticReg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [32]:
# training and test data scores for KNeighbors Classifier
print('KNeighbors classifier')
print('\tTraining score: {}'.format(kNeighbors.score(X_train, y_train)))
print('\tTesting score: {}\n'.format(kNeighbors.score(X_test, y_test)))

# training and test data scores for Random Forest Classifier
print('Random Forest classifier')
print('\tTraining score: {}'.format(randForest.score(X_train, y_train)))
print('\tTesting score: {}\n'.format(randForest.score(X_test, y_test)))

# training and test data scores for Logistic Regression Classifier
print('Logistic Regression classifier')
print('\tTraining score: {}'.format(logisticReg.score(X_train, y_train)))
print('\tTesting score: {}'.format(logisticReg.score(X_test, y_test)))

KNeighbors classifier
	Training score: 0.9714285714285714
	Testing score: 1.0

Random Forest classifier
	Training score: 0.9904761904761905
	Testing score: 0.9555555555555556

Logistic Regression classifier
	Training score: 0.9333333333333333
	Testing score: 0.9555555555555556


In [33]:
# predictions for each model
kneighborsPred = kNeighbors.predict(X_test)
randForestPred = randForest.predict(X_test)
logisticRegPred = logisticReg.predict(X_test)

In [34]:
# imports for metrics
from sklearn.metrics import classification_report, confusion_matrix

In [38]:
# metrics for k-neighbors classifier
print('Metrics for KNeighbors Classifier\n')
print(confusion_matrix(y_test, kneighborsPred))
print(classification_report(y_test, kneighborsPred))

Metrics for KNeighbors Classifier

[[13  0  0]
 [ 0 20  0]
 [ 0  0 12]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00        20
           2       1.00      1.00      1.00        12

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



In [39]:
# metrics for Random Forest classifier
print('Metrics for Random Forest Classifier\n')
print(confusion_matrix(y_test, randForestPred))
print(classification_report(y_test, randForestPred))

Metrics for Random Forest Classifier

[[13  0  0]
 [ 0 19  1]
 [ 0  1 11]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.95      0.95      0.95        20
           2       0.92      0.92      0.92        12

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0.96        45



In [40]:
# metrics for Logistic Regression classifier
print('Metrics for Logistic Regression Classifier\n')
print(confusion_matrix(y_test, logisticRegPred))
print(classification_report(y_test, logisticRegPred))

Metrics for Logistic Regression Classifier

[[13  0  0]
 [ 0 18  2]
 [ 0  0 12]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      0.90      0.95        20
           2       0.86      1.00      0.92        12

    accuracy                           0.96        45
   macro avg       0.95      0.97      0.96        45
weighted avg       0.96      0.96      0.96        45

