#  Implementation of KNN from scratch

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from scipy.stats import mode
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm

In [2]:
iris = datasets.load_iris()
x  = pd.DataFrame( data = iris['data'],columns = ['sl','sw','pl','pw'])
y = iris['target']
# Normalize the x 
transformer  = Normalizer()
x = transformer.fit_transform(x)

In [3]:
x[0:5]

array([[0.80377277, 0.55160877, 0.22064351, 0.0315205 ],
       [0.82813287, 0.50702013, 0.23660939, 0.03380134],
       [0.80533308, 0.54831188, 0.2227517 , 0.03426949],
       [0.80003025, 0.53915082, 0.26087943, 0.03478392],
       [0.790965  , 0.5694948 , 0.2214702 , 0.0316386 ]])

In [4]:
x_train,x_test ,y_train, y_test =  train_test_split(x, y , test_size = 0.2,random_state = 100)

In [5]:
mode([1,1,0]).mode[0]

1

In [6]:
import math

In [10]:
# Implementing the KNN

def euclidian_distance(a,b):
    distance = 0
    for i in range(len(a)):
        distance = np.square(a[i] - b[i]) +distance
    distance = np.sqrt(distance)
    return distance

In [14]:
def KNN(test_instance , training_data , training_label , k):
    distances = []
    
    for index in range(len(training_data)):
        distance_info = (training_label[index] , euclidian_distance(training_data[index] , test_instance))
        distances.append(distance_info)
    distances = sorted(distances , key = lambda k : k[1])
    top_k_labels = [label[0] for label in distances[0:k]]
    label = mode(top_k_labels).mode[0]
    return label

In [15]:
KNN(x_test[17] , x_train , y_train , 5)

1

 # Implementation of logistic regression from scratch 

In [16]:
from sklearn.datasets import make_classification
x, y = make_classification(n_samples=50000, n_features=15, n_informative=10, n_redundant=5,
                           n_classes=2, weights=[0.7], class_sep=0.7, random_state=15)

In [17]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=15)

In [18]:
learning_rate = 0.0001
iterations = 36000
intercept = 0
weights = np.zeros_like(x_train[0])
lamda = 0
N = len(x_train)

In [19]:
def sigmoid(x , weights , intercept):
    return 1/(1 + np.exp(-((intercept + np.dot(weights, x)))))

In [20]:
def calculate_gradient(z , x , y ):
    gradient = (y - sigmoid(x , weights , intercept)) 
    return gradient

In [21]:
#def gradient_descent(gradient):
#    weights = weights + leaning_rate*gradient
#    intercept = intercept + learning_rate*()
#    return weights

In [22]:
for i in tqdm(range(iterations)):
    z = sigmoid(x[i] , weights , intercept)
    gradient = calculate_gradient( z , x_train[i] , y[i])
    weights = weights + learning_rate*x[i]*(gradient - (lamda*weights)/N)
    intercept = intercept + learning_rate*gradient

100%|██████████| 36000/36000 [00:00<00:00, 41149.01it/s]


In [23]:
weights

array([-1.13905745,  0.42572788,  0.46469008,  0.71064658,  0.28259081,
        1.14624626, -1.1195814 , -0.26880231,  1.46189023,  0.03784674,
       -0.08143955,  0.31358978,  0.37034735,  0.74001357,  0.56322186])

In [24]:
intercept

-0.3753332212297188

In [105]:
def pred(w,b, x):
    N = len(x)
    predict = []
    for i in range(N):
        if sigmoid(w , b, x[i]) >= 0.5: # sigmoid(w,x,b) returns 1/(1+exp(-(dot(x,w)+b)))
            predict.append(1)
        else:
            predict.append(0)
    return np.array(predict)
#print(1-np.sum(y_train - pred(w,b,x_train))/len(X_train))
#print(1-np.sum(y_test  - pred(w,b,x_test))/len(X_test))

In [26]:
y_pred = pred(weights , intercept , x_test)

In [27]:
from sklearn.metrics import accuracy_score

In [29]:
accuracy_score(y_test , y_pred)

0.73648

#  Yet another way of doing it folks

In [159]:
from tqdm import tqdm
from sklearn.metrics import log_loss
import math

train_loss=[]
test_loss=[]
epochs=100
eta0 = 0.0001
alpha = 0.0001
w = np.zeros_like(x_train[0])
b= 0

def sigmoid(w,x,b):
    return 1/(1+np.exp(-(np.dot(x,w)+b)))

for i in tqdm(range(epochs)):
    for batch in range(N):
        
        #batch = np.random.choice(len(x_train))

        w = ((1-((eta0*alpha)/N)) * w)+((alpha*x_train[batch])*(y_train[batch]-sigmoid(w,x_train[batch],b)))

        
        b=b+eta0*(y_train[batch]-sigmoid(w,x_train[batch],b))

    ytrain_pred=sigmoid(w,x_train,b)
    ytest_pred=sigmoid(w,x_test,b) 
    Train_loss=0
    Test_loss=0
    
    for a in range(len(x_train)): 
        Train_loss+=-((y_train[a]*(math.log(ytrain_pred[a])))+ ((1-y_train[a])*(math.log(1-ytrain_pred[a]))))
    Normalized_train_loss=(Train_loss)/N
    train_loss.append(Normalized_train_loss)
    
    
    for c in range(len(x_test)):      
        Test_loss+=-((y_test[c]*(math.log(ytest_pred[c])))+ ((1-y_test[c])*(math.log(1-ytest_pred[c])))) 
    Normalized_test_loss=(Test_loss)/len(y_test)
    test_loss.append(Normalized_test_loss)
    
    
    
    #print("NORM Train loss :",Normalized_train_loss)
    #print('NORM test loss :',Normalized_test_loss)
    
    #print('epoch {}'.format(i),'\ntrain _loss:',Train_loss)
    #print('epoch {}'.format(i),'\ntest _loss:',Test_loss)

100%|██████████| 100/100 [02:02<00:00,  1.22s/it]


In [158]:
w

array([-4.29756022e-01,  1.93023835e-01, -1.48464492e-01,  3.38103414e-01,
       -2.21229065e-01,  5.69932661e-01, -4.45183637e-01, -8.99209544e-02,
        2.21804886e-01,  1.73809503e-01,  1.98727752e-01, -5.59489815e-04,
       -8.13106734e-02,  3.39094300e-01,  2.29785009e-02])

In [156]:
b

-0.8930132160174784

In [50]:
np.random.choice(len([x for x in range(5)]))

4

#  My new approach friends to solve this now 

In [51]:
x_train.shape

(37500, 15)

In [135]:
alpha = 0.0001   # This will be the learining rate of the gradient descent 
intercept = 0
weights = np.zeros_like(x_train[0])
lamda = 0.0001   # this is the constant we use with the regulariser
iterations = 300
N = len(x_train)

In [136]:
weights

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [137]:
def sigmoid(weights , intercept, x):
    return 1/(1 + np.exp(-(np.dot(weights,x) + intercept)))

In [138]:
sigmoid(np.array([x for x in range(4)]), 0.1 , [np.random.rand() for x in range(4)])

0.961423254056835

In [139]:
# here we will iterate and constantly update the weights and the intercept
for i in tqdm(range(iterations)):
    for batch_index in range(N):
        #batch_index = np.random.randint(0,N)
        weights = (1 - alpha*lamda/N)*weights + alpha*x_train[batch_index]*(y[batch_index] - sigmoid(weights,intercept,x[batch_index]))
        intercept = intercept +alpha*(y[batch_index] - sigmoid(weights,intercept,x[batch_index]))

100%|██████████| 300/300 [03:58<00:00,  1.17it/s]


In [140]:
weights

array([-4.58703486, -3.43977361,  2.48557237,  6.3279082 ,  2.25036457,
        5.92522705, -4.37834416, -4.12770115,  1.55107821, -0.45223257,
       -0.20733484,  4.17269546,  2.97894235, -1.00659106, 10.17721666])

In [141]:
intercept

-5.352551303734417

In [142]:
y_pred = pred(weights , intercept , x_test)

In [143]:
accuracy_score(y_test , y_pred)

0.69864

In [70]:
# computing the log loss of the results given by our model
def compute_log_loss(weigts , intercept , predicted_y , y_test):
    pass

In [None]:
# y*log(prediction) - (1-y)*log(1-prediction)