In [1]:
# Practice from this link: https://github.com/lytemar/Applied-Machine-Learning-in-Python--University-of-Michigan---Coursera/blob/master/Module%203.ipynb
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
import pandas as pd
import seaborn as sns

dataset = load_digits()
X, y = dataset.data, dataset.target

In [2]:
for class_name, class_count in zip(dataset.target_names, np.bincount(dataset.target)):
    print(class_name, class_count)

0 178
1 182
2 177
3 183
4 181
5 182
6 181
7 179
8 174
9 180


In [3]:
# No lets create an imbalance binary class.
# We define digit 1 as positive class and other digits as negative class
y1 = y.copy()
y1[y1 != 1] = 0

In [4]:
for class_name, class_count in zip(dataset.target_names, np.bincount(y1)):
    print(class_name, class_count)

0 1615
1 182


In [5]:
print(y[0:20] ,'\n',  y1[0:20])

[0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9] 
 [0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]


In [6]:
# For this imbalance binary class we want to see how good is the SVC model.
from sklearn.svm import SVC
X_train, X_test, y_train, y_test = train_test_split(X, y1, random_state = 0)
clf = SVC(kernel='rbf', C = 1, gamma='auto').fit(X_train, y_train)
print('SVC score on train data of an imbalanced binary class {:.3f}'.format(clf.score(X_train, y_train)))
print('SVC score on test data of an imbalanced binary class {:.3f}'.format(clf.score(X_test, y_test)))

SVC score on train data of an imbalanced binary class 1.000
SVC score on test data of an imbalanced binary class 0.909


# Dummy Classifier

In [7]:
# Lets compare with the results of a Dummy classifier
from sklearn.dummy import DummyClassifier
clf = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
print('Dummy score on train data of an imbalanced binary class {:.2f}'.format(clf.score(X_train, y_train)))
print('Dummy score on test data of an imbalanced binary class {:.2f}'.format(clf.score(X_test, y_test)))

Dummy score on train data of an imbalanced binary class 0.90
Dummy score on test data of an imbalanced binary class 0.90


In [8]:
# How the prediction of the Dummy classifier looks like?
Dummy_prediction = clf.predict(X_test)
Dummy_prediction
# You see it only has 0! Not very suprised by score of dummy classifier

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [9]:
# SVC with kernel = linear
clf = SVC(kernel='linear', C=1).fit(X_train, y_train)
print('SVC with linear kernel score on an imbalanced binary class {:.2f}'.format(clf.score(X_test, y_test)))

SVC with linear kernel score on an imbalanced binary class 0.98


# Confusion matrix

In [10]:
from sklearn.metrics import confusion_matrix
dummy_most_frequenct = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
y_predicted = dummy_most_frequenct.predict(X_test)
confusion_matrix(y_test, y_predicted)
# Look dummyclassifier predicts 407 negative class correctly

array([[407,   0],
       [ 43,   0]], dtype=int64)

In [11]:
from sklearn.metrics import confusion_matrix
clf = DummyClassifier(strategy='stratified').fit(X_train, y_train)
y_predicted_stra_strati = clf.predict(X_test)
confusion = confusion_matrix(y_test, y_predicted_stra_strati)
print('Random class-proportional prediction (dummy classifier): \n', confusion)

Random class-proportional prediction (dummy classifier): 
 [[368  39]
 [ 38   5]]


In [12]:
from sklearn.svm import SVC
clf_SVC = SVC(kernel='linear', C =1).fit(X_train, y_train)
y_SVC_prediction = clf_SVC.predict(X_test)
confusion = confusion_matrix(y_test, y_SVC_prediction)
print('SVC with linear kernel prediction: \n', confusion)

SVC with linear kernel prediction: 
 [[402   5]
 [  5  38]]


In [13]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression().fit(X_train, y_train)
y_logreg_prediction = logreg.predict(X_test)
confusion = confusion_matrix(y_test, y_logreg_prediction)
print('LogisticRegression confusion matric: \n', confusion)

LogisticRegression confusion matric: 
 [[401   6]
 [  8  35]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [14]:
from sklearn.tree import DecisionTreeClassifier
clf_tree = DecisionTreeClassifier(max_depth=4).fit(X_train, y_train)
y_tree_prediction = clf_tree.predict(X_test)
confusion = confusion_matrix(y_test, y_tree_prediction)
print('Decision tree classifier confusion matric: \n', confusion)

Decision tree classifier confusion matric: 
 [[405   2]
 [  9  34]]


In [15]:
from sklearn.svm import SVC
clf_SVC_rbf = SVC(kernel='rbf', gamma='auto').fit(X_train, y_train)
y_SVC_rbf_prediction = clf_SVC_rbf.predict(X_test)
confusion = confusion_matrix(y_test, y_SVC_rbf_prediction)
print('SVC with rbf kernel classifier confusion matric: \n', confusion)

SVC with rbf kernel classifier confusion matric: 
 [[407   0]
 [ 41   2]]


# Accuracy, Precision, Recall and F1 scores

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
clf_tree = DecisionTreeClassifier(max_depth=4).fit(X_train ,y_train)
y_predicted_clf_tree = clf_tree.predict(X_test)
confusion = confusion_matrix(y_test, y_predicted_clf_tree)
print('Accuracy: {:.2f}'.format(accuracy_score(y_test, y_predicted_clf_tree)))
print('Precision: {:.2f}'.format(precision_score(y_test, y_predicted_clf_tree)))
print('Recall: {:.2f}'.format(recall_score(y_test, y_predicted_clf_tree)))
print('F1 score: {:.2f}'.format(f1_score(y_test, y_predicted_clf_tree)))

Accuracy: 0.97
Precision: 0.89
Recall: 0.79
F1 score: 0.84


In [17]:
from sklearn.dummy import DummyClassifier
clf_dummy = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
y_predicted_clf_dummy = clf_dummy.predict(X_test)
print('Dummy Accuracy: {:.2f}'.format(accuracy_score(y_test, y_predicted_clf_dummy)))
print('Dummy Precision: {:.2f}'.format(precision_score(y_test, y_predicted_clf_dummy)))
print('Dummy Recall: {:.2f}'.format(recall_score(y_test, y_predicted_clf_dummy)))
print('Dummy F1 score: {:.2f}'.format(f1_score(y_test, y_predicted_clf_dummy)))

Dummy Accuracy: 0.90
Dummy Precision: 0.00
Dummy Recall: 0.00
Dummy F1 score: 0.00


  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predicted_clf_tree, target_names=['other numbers', '1']))

               precision    recall  f1-score   support

other numbers       0.98      0.99      0.98       407
            1       0.89      0.79      0.84        43

     accuracy                           0.97       450
    macro avg       0.94      0.89      0.91       450
 weighted avg       0.97      0.97      0.97       450

