In [51]:
import pandas as pd
import numpy as np
from math import sqrt
import time

import numba
from numba import int32, float64
from numba.experimental import jitclass

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.utils import shuffle


import matplotlib.pyplot as plt
#https://stackoverflow.com/questions/21154643/python-line-profiler-installation
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [52]:
digits = load_digits()
print(digits.data.shape)


#image representatio of the data
# plt.gray() 
# plt.matshow(digits.images[0]) 
# plt.show() 


X = digits.data 
y = digits.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

(1797, 64)


## KNN

In [53]:
len(X_train),len(X_test)

(1347, 450)

In [68]:
class KNN:
    def __init__(self, K=3):
        self.K = K

    def fit(self, x_train, y_train):
        self.X_train = x_train
        self.Y_train = y_train

    
    def euc_dist(self, x1, x2):
        return np.sqrt(np.sum((x1-x2)**2))
        
    
    def predict(self, X_test):
        predictions = [] 
        for i in range(len(X_test)):
            dist = np.array([self.euc_dist(X_test[i], x_t) for x_t in self.X_train])
            dist_sorted = dist.argsort()[:self.K]
            neigh_count = {}
            print(dist_sorted)
            for idx in dist_sorted:
                if self.Y_train[idx] in neigh_count:
                    neigh_count[self.Y_train[idx]] += 1
                else:
                    neigh_count[self.Y_train[idx]] = 1
           
            sorted_neigh_count = sorted(neigh_count.items(), reverse=True)
            predictions.append(sorted_neigh_count[0][0]) 
        return predictions


In [69]:
k = 3
start = time.time()
model = KNN(K = k)
model.fit(X_train, y_train)
pred = model.predict(X_test)
acc = accuracy_score(y_test, pred)
end = time.time()
print(f"Time Taken: {end-start} sec")
print("K = "+str(k)+"; Accuracy: "+str(acc))

[469 584 264]
[469 812 767]
[ 131  126 1305]
[316 911 689]
[ 136 1021  284]
[223 254 683]
[122 477 459]
[464 719 539]
[1234  819  152]
[637 242 525]
[309 197 199]
[866 787 668]
[431 419 611]
[ 38 147   2]
[ 557 1129  500]
[828 693 486]
[ 767  812 1048]
[ 84  18 140]
[889  17 309]
[1066  761  140]
[1039  856 1253]
[441 923 235]
[1256 1064  350]
[1145 1245 1168]
[427 820 525]
[1277   56  886]
[341 503 568]
[ 429 1232 1165]
[768 263 323]
[333 150 990]
[483 226 587]
[690 701 407]
[ 686 1261  573]
[ 763 1019 1129]
[1322  564  646]
[ 211 1325  576]
[ 445  976 1242]
[662 822 161]
[ 457 1097  704]
[ 628 1205  748]
[270 453 914]
[761 643  44]
[261 283 714]
[486 464 926]
[ 917 1187  629]
[933 366  83]
[ 393 1140  259]
[1110  145 1324]
[1333  545  635]
[ 716 1278  900]
[ 146  290 1318]
[934 646 532]
[ 628 1242  684]
[833 186 778]
[546  18 671]
[1222 1156  735]
[1295   43  731]
[1156  421  178]
[352  23 717]
[947 427 766]
[1302  770  542]
[322 983 867]
[1200  709  761]
[1134  962 1279]
[781 107 28

In [58]:
%timeit -n 5 model.predict(X_test)

14.4 s ± 576 ms per loop (mean ± std. dev. of 7 runs, 5 loops each)


In [59]:
%lprun -f  model.predict model.predict(X_test)

## Logistic regression

In [61]:
alpha = 1e-2
class_of_interest = 10
max_iter = 1000


def _sigmoid(x):
    """Sigmoide function"""

    return 1.0 / (1.0 + np.exp(-x))

def predict(x_bar, params):
    """predict the probability of a class"""  

    return _sigmoid(np.dot(params, x_bar))

def _compute_cost(input_var, output_var, params):
    """Compute the log likelihood cost"""

    cost = 0
    for x, y in zip(input_var, output_var):
        x_bar = np.array(np.insert(x, 0, 1))
        y_hat = self.predict(x_bar, params)

        y_binary = 1.0 if y == class_of_interest else 0.0
        cost += y_binary * np.log(y_hat) + (1.0 - y_binary) * np.log(1 - y_hat)

    return cost

def train(input_var, label, initial_params, print_iter = 5000):
    """Train the model using batch gradient ascent"""

    iteration = 1
    while iteration < max_iter:
        if iteration % print_iter == 0:
            print(f'iteration: {iteration}')
            print(f'cost: {_compute_cost(input_var, label, initial_params)}')
            print('--------------------------------------------')

        for i, xy in enumerate(zip(input_var, label)):
            x_bar = np.array(np.insert(xy[0], 0, 1))
            y_hat = predict(x_bar, initial_params)

            y_binary = 1.0 if xy[1] == class_of_interest else 0.0
            gradient = (y_binary - y_hat) * x_bar
            initial_params += alpha * gradient

        iteration +=1

    return initial_params

def test(input_test, label_test,trained_params):
    """Test the accuracy of the model using test data"""
    total_classifications = 0
    correct_classifications = 0

    for x,y in zip(input_test, label_test):
        total_classifications += 1
        x_bar = np.array(np.insert(x, 0, 1))
        y_hat = predict(x_bar, trained_params)
        
        y_binary = 1.0 if y == class_of_interest else 0.0

        if y_hat >= 0.5 and  y_binary == 1:
            # correct classification of class_of_interest
            correct_classifications += 1

        if y_hat < 0.5 and  y_binary != 1:
            # correct classification of an other class
            correct_classifications += 1

    accuracy = correct_classifications / total_classifications

    return accuracy


In [62]:
digits_train, digits_test, digits_label_train, digits_label_test = train_test_split(X, y, test_size=0.30)

In [63]:
start = time.time()
initial_params = np.zeros(len(digits.data[0]) + 1)
for clas in range(10):
    class_of_interest = clas
    if clas == 0:
        trained_params = initial_params
    trained_params = train(digits_train / 16.0, digits_label_train, trained_params,1000)
digits_accuracy = test(digits_test / 16.0, digits_label_test,trained_params)
end = time.time()

print(f'Accuracy of prediciting in test set: {digits_accuracy}')
print(f'Total time taken: {end- start}sec')

Accuracy of prediciting in test set: 0.9851851851851852
Total time taken: 969.3034505844116sec


In [65]:
%timeit -n 1 train(digits_train / 16.0, digits_label_train, initial_params,1000)

1min 35s ± 10.8 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [66]:
%lprun -f train train(digits_train / 16.0, digits_label_train, initial_params,1000)