# Scikit-learn

Scikit-learn contains simple and efficient tools for data mining and data analysis.  It implements a wide variety of machine learning algorithms and processes to conduct advanced analytics.

Library documentation: <a href="https://scikit-learn.org/stable/">https://scikit-learn.org/stable/</a>

Credit: John Wittenauer

### General

In [64]:
import numpy as np
import pandas as pd
from sklearn import datasets

### Classification

In [2]:
# Load the bundled iris dataset; its `.data` and `.target` arrays are what the
# classification and clustering cells below are fit on.
iris = datasets.load_iris()

In [3]:
# k-nearest-neighbours classifier with default settings, trained on the whole iris set
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier().fit(iris.data, iris.target)  # fit() hands back the estimator
knn

KNeighborsClassifier()

In [4]:
# decision-tree classifier with default settings, trained on the whole iris set
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier().fit(iris.data, iris.target)  # fit() hands back the estimator
dtree

DecisionTreeClassifier()

In [5]:
# Gaussian naive Bayes: train on the full iris set, predict the same samples,
# and count how many training points end up with the wrong label
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(iris.data, iris.target)
y_pred = gnb.predict(iris.data)
print("Number of mislabeled points : %d" % np.count_nonzero(iris.target != y_pred))

Number of mislabeled points : 6


In [20]:
# load the digits sample dataset, then show the feature matrix together with its shape
digits = datasets.load_digits()
print(digits.data.shape, digits.data)

(1797, 64) [[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  6.  0.  0.]
 [ 0.  0.  2. ... 12.  0.  0.]
 [ 0.  0. 10. ... 12.  1.  0.]]


In [19]:
# show the label vector (one target value per sample) together with its shape
print(digits.target.shape, digits.target)

(1797,) [0 1 2 ... 8 9 8]


In [8]:
# Gaussian naive Bayes trained on every digit sample except the final one,
# which is held back so that it can be predicted further down
classifier_gnb = GaussianNB().fit(digits.data[:-1], digits.target[:-1])
classifier_gnb

GaussianNB()

In [22]:
# feature vector of the last example, i.e. the one left out of the training slice above
digits.data[-1]

array([ 0.,  0., 10., 14.,  8.,  1.,  0.,  0.,  0.,  2., 16., 14.,  6.,
        1.,  0.,  0.,  0.,  0., 15., 15.,  8., 15.,  0.,  0.,  0.,  0.,
        5., 16., 16., 10.,  0.,  0.,  0.,  0., 12., 15., 15., 12.,  0.,
        0.,  0.,  4., 16.,  6.,  4., 16.,  6.,  0.,  0.,  8., 16., 10.,
        8., 16.,  8.,  0.,  0.,  1.,  8., 12., 14., 12.,  1.,  0.])

In [25]:
# predict the target of the last example; slicing with [-1:] keeps the 2-D
# single-row shape that predict() takes, so no list wrapping is needed
classifier_gnb.predict(digits.data[-1:])[0]

8

In [32]:
# show how KFold partitions sample indices into train / validation folds
# (no estimator is trained here; the per-fold slices are built only for illustration)
from sklearn.model_selection import KFold
X = np.tile([[1, 2], [3, 4]], (5, 1))[:9]  # nine rows alternating [1, 2] and [3, 4]
y = np.arange(1, 10)                       # targets 1..9
kf = KFold(n_splits=3)
for train_index, validation_index in kf.split(X):
    print("TRAIN:", train_index, "VALIDATION:", validation_index)
    X_train, y_train = X[train_index], y[train_index]
    X_validation, y_validation = X[validation_index], y[validation_index]

TRAIN: [3 4 5 6 7 8] VALIDATION: [0 1 2]
TRAIN: [0 1 2 6 7 8] VALIDATION: [3 4 5]
TRAIN: [0 1 2 3 4 5] VALIDATION: [6 7 8]


In [33]:
# 3-fold cross-validation of the naive Bayes model on the digits data (one score per fold)
from sklearn.model_selection import cross_val_score
cross_val_score(estimator=classifier_gnb, X=digits.data, y=digits.target, cv=3)

array([0.8263773 , 0.79799666, 0.8163606 ])

In [35]:
# use the grid search module to optimize model parameters
from sklearn.model_selection import GridSearchCV
# The base estimator is passed in unfitted: GridSearchCV clones it for every
# parameter combination and refits the best one itself, so fitting it up front
# would be discarded work (and, done on digits.data[:-1], would have touched
# the rows that are later used as the test set).
classifier_knn = KNeighborsClassifier()
k_vals = np.array([3,9,13])
dist_metrics=np.array(["manhattan", "euclidean", "chebyshev"])
classifier = GridSearchCV(estimator=classifier_knn, param_grid={'n_neighbors':k_vals, 'metric':dist_metrics})
# search on the first 1000 samples only; the remainder stays unseen for testing
classifier.fit(digits.data[:1000], digits.target[:1000])

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'metric': array(['manhattan', 'euclidean', 'chebyshev'], dtype='<U9'),
                         'n_neighbors': array([ 3,  9, 13])})

In [36]:
# cross-validated score achieved by the best parameter combination in the grid
classifier.best_score_

0.9530000000000001

In [37]:
# report the winning hyper-parameters, read off the best estimator found by the search
best_knn = classifier.best_estimator_
print('K=', best_knn.n_neighbors, 'distance-metric:', best_knn.metric)

K= 3 distance-metric: euclidean


In [14]:
# run against the test set: the samples from index 1000 onward were not part of
# the grid-search fit, which only used the first 1000
classifier.score(digits.data[1000:], digits.target[1000:])

0.94228356336260977

In [38]:
# nested cross-validation example: the grid search itself is the estimator being
# cross-validated, so the parameter search is repeated inside every outer fold
cross_val_score(classifier, digits.data, digits.target)

array([0.95555556, 0.95833333, 0.96657382, 0.98328691, 0.96657382])

### Regression

In [39]:
# load another sample dataset: diabetes, used by the regression cells below
diabetes = datasets.load_diabetes()

In [40]:
# linear regression with default settings, fit on the full diabetes dataset
from sklearn import linear_model
regr = linear_model.LinearRegression().fit(diabetes.data, diabetes.target)  # fit() hands back the estimator
regr

LinearRegression()

In [43]:
# fitted model parameters: the coefficient vector (compare its length with the
# number of feature columns printed first) followed by the intercept
print(diabetes.data.shape)
print(regr.coef_.shape, regr.coef_)
print(regr.intercept_)

(442, 10)
(10,) [ -10.01219782 -239.81908937  519.83978679  324.39042769 -792.18416163
  476.74583782  101.04457032  177.06417623  751.27932109   67.62538639]
152.1334841628965


In [44]:
# mean squared error of the model's predictions on the data it was fit on
np.mean(np.square(regr.predict(diabetes.data) - diabetes.target))

2859.6903987680657

In [45]:
# explained variance (r^2), computed on the same data the model was fit on
regr.score(diabetes.data, diabetes.target)

0.5177494254132934

### Preprocessing

In [58]:
# feature scaling: standardise each column of a small example matrix
from sklearn import preprocessing
X = np.array([
    [1.0, -1.0, 2.0],
    [2.0, 0.0, 0.0],
    [0.0, 1.0, -1.0],
])
X_scaled = preprocessing.scale(X)  # scale() performs standardization by default
X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [56]:
# save the scaling transform to apply to new data later
scaler_standardization = preprocessing.StandardScaler().fit(X)
print('mean:',scaler_standardization.mean_)
# the scaler stores per-column variance; take the square root to show the std
print('std:',np.sqrt(scaler_standardization.var_))
# display the fitted scaler itself (the name `scaler` is never defined in this
# notebook, so referencing it raised NameError on a fresh kernel)
scaler_standardization

mean: [1.         0.         0.33333333]
std: [0.81649658 0.81649658 1.24721913]


StandardScaler()

In [57]:
# re-apply the saved standardization; on the matrix it was fit on, this gives the
# same values as the preprocessing.scale(X) result shown earlier
scaler_standardization.transform(X)

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [60]:
# min-max scaling: learn each column's minimum and maximum, then rescale the columns
min_max_scaler = preprocessing.MinMaxScaler()
X_minmax = min_max_scaler.fit(X).transform(X)
print('min:', min_max_scaler.data_min_)
print('max:', min_max_scaler.data_max_)
X_minmax

min: [ 0. -1. -1.]
max: [2. 1. 2.]


array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

### Clustering

In [61]:
# k means clustering
from sklearn import cluster
# KMeans starts from randomly chosen centres, so without a fixed random_state the
# label numbering (and potentially the clusters) differ from run to run.
# n_init is set explicitly because its default differs between scikit-learn releases.
k_means = cluster.KMeans(n_clusters=3, n_init=10, random_state=0)
k_means.fit(iris.data)

KMeans(n_clusters=3)

In [62]:
# cluster index assigned to each iris sample, in dataset order
print(k_means.labels_)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1 2 2 2 2
 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 2 2 2 2 1 2 2 2 1 2 2 2 1 2
 2 1]


In [63]:
# coordinates of the fitted cluster centres: one row per cluster, one column per feature
print(k_means.cluster_centers_)

[[5.006      3.428      1.462      0.246     ]
 [5.9016129  2.7483871  4.39354839 1.43387097]
 [6.85       3.07368421 5.74210526 2.07105263]]


### Decomposition

In [74]:
# create a signal with 2 useful dimensions
# A seeded generator makes the samples (and therefore the PCA results below)
# identical on every Restart & Run All.
rng = np.random.default_rng(0)
x1 = rng.normal(size=100)
x2 = rng.normal(size=100)
x3 = x1 + x2  # exact linear combination of x1 and x2: adds no new information
X = np.c_[x1, x2, x3]  # stack as columns -> 100 samples x 3 features

In [79]:
# principal component analysis of the 3-column signal, keeping every component
from sklearn import decomposition
pca = decomposition.PCA().fit(X)  # fit() hands back the estimator
pca

PCA()

In [80]:
# variance captured by each principal component; the last one is numerically zero
# because x3 was constructed as x1 + x2
pca.explained_variance_ # the eigenvalues

array([3.23030446e+00, 1.04904711e+00, 1.28602087e-31])

In [82]:
# only the first 2 components are useful: keep those, refit, and project the data
X_reduced = pca.set_params(n_components=2).fit_transform(X)
X_reduced.shape

(100, 2)