In [20]:
# Set up
%pylab
%pylab inline
%matplotlib inline
%load_ext autoreload
%autoreload 2
import tqdm
import random
import pandas as pd
from collections import Counter
from itertools import cycle

from sklearn import datasets, metrics, tree
from sklearn.cluster import KMeans, MiniBatchKMeans, Birch, DBSCAN
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

import spectral
import seaborn as sns 
import tqdm
import copy 
import scipy

import DecisionTree

Using matplotlib backend: TkAgg
Populating the interactive namespace from numpy and matplotlib
Populating the interactive namespace from numpy and matplotlib
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
def KNN(train_X, train_Y, test_X, ks):
    train_X = train_X.astype(np.float32)
    test_X = test_X.astype(np.float32)
    dists = -2 * np.dot(train_X, test_X.T) + np.sum(test_X**2, axis=1) + np.sum(train_X**2, axis=1)[:, np.newaxis]
    closest = np.argsort(dists, axis=0)
    targets = train_Y[closest]
    preds = {}
    for k in ks:
        predictions = scipy.stats.mode(targets[:k])[0] 
        predictions = predictions.ravel()
        preds[k] = predictions
    return preds

In [43]:
X_train = pd.read_csv('data/Smartphones/X_train.txt', delim_whitespace=True, header=None)
y_train = pd.read_csv('data/Smartphones/y_train.txt', delim_whitespace=True, header=None).values.ravel()

X_test = pd.read_csv('data/Smartphones/X_test.txt', delim_whitespace=True, header=None)
y_test = pd.read_csv('data/Smartphones/y_test.txt', delim_whitespace=True, header=None).values.ravel()

In [44]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,551,552,553,554,555,556,557,558,559,560
0,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,-0.923527,-0.934724,...,-0.074323,-0.298676,-0.710304,-0.112754,0.0304,-0.464761,-0.018446,-0.841247,0.179941,-0.058627
1,0.278419,-0.016411,-0.12352,-0.998245,-0.9753,-0.960322,-0.998807,-0.974914,-0.957686,-0.943068,...,0.158075,-0.595051,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317
2,0.279653,-0.019467,-0.113462,-0.99538,-0.967187,-0.978944,-0.99652,-0.963668,-0.977469,-0.938692,...,0.414503,-0.390748,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118
3,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.98275,-0.989302,-0.938692,...,0.404573,-0.11729,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663
4,0.276629,-0.01657,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,-0.990441,-0.942469,...,0.087753,-0.351471,-0.699205,0.12332,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892


In [55]:
# KNN
ks = np.arange(1, 21, 2)
predictions = KNN(np.array(X_train), np.array(y_train), np.array(X_test), ks)

In [56]:
best_k, best_err = 1, 100
for k, pred in predictions.items():
    err = list(pred == np.array(y_test)).count(False) / len(y_test)
    print(f'For k = {k}, err = {err * 100:.2f}%')
    if err < best_err:
        best_err = err
        best_k = k

For k = 1, err = 12.15%
For k = 3, err = 10.93%
For k = 5, err = 9.84%
For k = 7, err = 9.67%
For k = 9, err = 9.47%
For k = 11, err = 9.54%
For k = 13, err = 9.37%
For k = 15, err = 9.57%
For k = 17, err = 9.47%
For k = 19, err = 9.40%


In [60]:
print(f'Confusion matrix for KNN with k = {best_k}')
metrics.confusion_matrix(predictions[best_k], y_test)

Confusion matrix for KNN with k = 13


array([[488,  37,  48,   0,   0,   0],
       [  0, 430,  43,   4,   0,   0],
       [  8,   4, 329,   0,   0,   0],
       [  0,   0,   0, 391,  33,   2],
       [  0,   0,   0,  96, 499,   1],
       [  0,   0,   0,   0,   0, 534]], dtype=int64)

In [61]:
# Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [62]:
random_forest = RandomForestClassifier(n_estimators=20)
extra_random_forest = ExtraTreesClassifier(n_estimators=20)

random_forest.fit(X_train, y_train)
extra_random_forest.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [64]:
print('Random Forest:')
print(metrics.confusion_matrix(random_forest.predict(X_test), y_test))
print(f'Score: {random_forest.score(X_test, y_test) * 100:.2f}%\n')

Random Forest:
[[480  46  23   0   0   0]
 [  7 417  49   0   0   0]
 [  9   8 348   0   0   0]
 [  0   0   0 427  46   0]
 [  0   0   0  64 486   0]
 [  0   0   0   0   0 537]]
Score: 91.45%



In [65]:
print('Extremely Randomized Trees:')
print(metrics.confusion_matrix(extra_random_forest.predict(X_test), y_test))
print(f'Score: {extra_random_forest.score(X_test, y_test) * 100:.2f}%\n')

Extremely Randomized Trees:
[[480  26  25   0   0   0]
 [  6 439  50   1   0   0]
 [ 10   6 345   0   0   0]
 [  0   0   0 439  27   0]
 [  0   0   0  51 505   0]
 [  0   0   0   0   0 537]]
Score: 93.15%

