## Preparation

In [2]:
%load_ext watermark
%watermark  -d -u -a 'Christine Hou' -v -p numpy,scipy,matplotlib,sklearn

Author: Christine Hou

Last updated: 2021-11-22

Python implementation: CPython
Python version       : 3.9.2
IPython version      : 7.28.0

numpy     : 1.19.5
scipy     : 1.6.1
matplotlib: 3.3.4
sklearn   : 0.0



In [9]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## Input

In [8]:
# picture = pd.read_csv("...")
# picture.tail()
#X, y = picture.data[:, 2:], picture.target
#X_train, X_test, y_train, y_test = train_test_split(X, y, 
#                                                    test_size=0.3,
#                                                    random_state=123,
#                                                    shuffle=True)
#print('X_train.shape:', X_train.shape)
#print('y_train.shape:', y_train.shape)
#print('X_test.shape:', X_test.shape)
#print('y_test.shape:', y_test.shape)
#X_train_sub, X_valid, y_train_sub, y_valid = \
#    train_test_split(X_train, y_train, test_size=0.2,\
#                     random_state=1, stratify=y_train)

#print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0], y_test.shape[0])

# Model

## 1. KNN 

Original KNN

In [None]:
error.rate = []

for i in range(1,30):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error.rate.append(np.mean(pred_i != y_test))

plt.figure(figsize = (10,6))
plt.plot(range(1,30),error_rate,color = "blue",
        linstyle = "dashed",marker = "o",
        markerfacecolor = "red", marksize = 10)

plt.title("Error rate vs. K value")
plt.xlable("K value")
plt.ylable("Error rate")

# Find the k-value with lowest Error Rate - k
# for example, k = 3
knn_1 = KNeighborsClassifier(n_neighbors=k,metric = "manhattan",p =2,weights= "uniform")
knn_1.fit(X_train,y_train)
y_pred_test = knn_1.predict(X_test)
print("Training Accuracy: %0.4f" % knn_1.score(X_train, y_train))
print("Validation Accuracy: %0.4f" % knn_1.score(X_valid, y_valid))
print("Test Accuracy: %0.4f" % knn_1.score(X_test, y_test))

Tuning Strategy

In [None]:
params =  {
    'min_samples_split': scipy.stats.randint(2, 12),
    'min_impurity_decrease': scipy.stats.uniform(0.0, 0.5),
    'max_depth': [6, 16, None]
}

knn_new = RandomizedSearchCV(
    estimator=knn_1,
    param_distributions=params,
    n_iter=10,
    cv=3,
    verbose=2,
    n_jobs=1)

knn_new.fit(X_train, y_train)
knn_new.best_score_

In [None]:
knn_new.best_params_
print("Training Accuracy: %0.4f" % knn_new.best_estimator_.score(X_train, y_train))
print("Validation Accuracy: %0.4f" % knn_new.best_estimator_.score(X_valid, y_valid))
print("Test Accuracy: %0.4f" % knn_new.best_estimator_.score(X_test, y_test))

Visualization

In [None]:
# Train
plot_decision_regions(X_train, y_train, knn_new)
plt.xlabel('TBD')
plt.ylabel('TBD')
plt.legend(loc='upper left')
plt.show()

# Test
plot_decision_regions(X_test, y_test, knn_new)
plt.xlabel('TBD')
plt.ylabel('TBD')
plt.legend(loc='upper left')
plt.show()

## 2. Decision Tree

Original Decision Tree

In [None]:
tree = DecisionTreeClassifier(random_state=123)
tree.fit(X_train, y_train)
print("Training Accuracy: %0.4f" % tree.score(X_train, y_train))
print("Validation Accuracy: %0.4f" % tree.score(X_valid, y_valid))
print("Test Accuracy: %0.4f" % tree.score(X_test, y_test))

Tuning

In [None]:
params =  {
    'min_samples_split': scipy.stats.randint(2, 12),
    'min_impurity_decrease': scipy.stats.uniform(0.0, 0.5),
    'max_depth': [6, 16, None]
}


tree_new = RandomizedSearchCV(
    estimator=tree,
    param_distributions=params,
    n_iter=10,
    cv=3,
    verbose=2,
    n_jobs=1)

tree_new.fit(X_train, y_train)
tree_new.best_score_

In [None]:
tree_new.best_params_
print("Training Accuracy: %0.4f" % tree_new.best_estimator_.score(X_train, y_train))
print("Validation Accuracy: %0.4f" % tree.best_estimator_.score(X_valid, y_valid))
print("Test Accuracy: %0.4f" % tree.best_estimator_.score(X_test, y_test))

Visualization

In [None]:
# Train
plot_decision_regions(X_train, y_train, tree_new)
plt.xlabel('TBD')
plt.ylabel('TBD')
plt.legend(loc='upper left')
plt.show()

# Test
plot_decision_regions(X_test, y_test, tree_new)
plt.xlabel('TBD')
plt.ylabel('TBD')
plt.legend(loc='upper left')
plt.show()

## 3. Random Forest

Original Random Forest

In [None]:
forest = RandomForestClassifier(n_estimators=100,
                                random_state=123)

forest.fit(X_train, y_train)
    
print("Training Accuracy: %0.4f" % forest.score(X_train, y_train))
print("Validation Accuracy: %0.4f" % forest.score(X_valid, y_valid))
print("Test Accuracy: %0.4f" % forest.score(X_test, y_test))

Tuning

In [12]:
params =  {
    'min_samples_split': scipy.stats.randint(2, 12),
    'min_impurity_decrease': scipy.stats.uniform(0.0, 0.5),
    'max_depth': [6, 16, None]
}

forest_new = RandomizedSearchCV(
    estimator=forest,
    param_distributions=params,
    n_iter=10,
    cv=3,
    verbose=2,
    n_jobs=1)

forest_new.fit(X_train, y_train)

forest_new.best_score_

NameError: name 'scipy' is not defined

In [None]:
forest_new.best_params_
print("Training Accuracy: %0.4f" % forest_new.best_estimator_.score(X_train, y_train))
print("Validation Accuracy: %0.4f" % forest_new.best_estimator_.score(X_valid, y_valid))
print("Test Accuracy: %0.4f" % forest_new.best_estimator_.score(X_test, y_test))

Visualization

In [None]:
# Train
plot_decision_regions(X_train, y_train, forest_new)
plt.xlabel('TBD')
plt.ylabel('TBD')
plt.legend(loc='upper left')
plt.show()

# Test
plot_decision_regions(X_test, y_test, forest_new)
plt.xlabel('TBD')
plt.ylabel('TBD')
plt.legend(loc='upper left')
plt.show()