In [24]:
import numpy as np
import pandas as pd
from scipy import stats
from typing import Dict, Any
from abc import ABC,abstractmethod

In [25]:
# Column layout of the CSV files, which are read without a header row:
# six wrist/thigh axis readings followed by the activity label.
cname = ['wrist x value', 'wrist y value', 'wrist z value', 'thigh x value', 'thigh y value', 'thigh z value', 'class']
# Same layout without the trailing label column; derived from cname so the two lists cannot drift apart.
cname2 = cname[:-1]
train = pd.read_csv('trainingB.csv', names=cname, header=None)
# read_csv already returns a DataFrame. Wrapping it in pd.DataFrame(...) does not copy the
# data, so take an explicit copy to keep in-place edits of df_train from leaking into train.
df_train = train.copy()
df_train

Unnamed: 0,wrist x value,wrist y value,wrist z value,thigh x value,thigh y value,thigh z value,class
0,-0.468750,-1.156250,0.093750,-1.140625,0.109375,-0.562500,downstairs
1,-2.625000,0.281250,0.421875,-0.109375,0.125000,0.687500,jogging
2,-0.234375,-0.718750,0.093750,-1.984375,-0.046875,-0.625000,downstairs
3,-0.125000,-0.796875,0.093750,-0.734375,0.093750,-0.343750,downstairs
4,0.046875,-0.765625,0.015625,-0.640625,-0.125000,-0.468750,downstairs
...,...,...,...,...,...,...,...
23057,-0.312500,-1.109375,-0.109375,-0.843750,-0.390625,-0.046875,upstairs
23058,0.109375,-1.281250,0.187500,-2.609375,-1.062500,0.203125,jogging
23059,0.593750,-0.359375,-0.734375,-2.578125,-0.609375,-0.250000,jogging
23060,0.093750,0.578125,-0.031250,-0.562500,-0.703125,1.875000,jogging


In [26]:
# Load the samples to classify. The file is read without a header row,
# so the feature column names are supplied explicitly.
test = pd.read_csv(
    'SoalB.csv',
    header=None,
    names=cname2,
)
test

Unnamed: 0,wrist x value,wrist y value,wrist z value,thigh x value,thigh y value,thigh z value
0,-0.296875,-0.703125,0.031250,-0.265625,-0.031250,0.000000
1,-0.171875,-1.140625,0.328125,-2.531250,-0.828125,0.203125
2,-0.343750,-1.031250,0.781250,-0.703125,0.500000,-0.125000
3,-0.234375,-0.671875,0.093750,-1.546875,-0.046875,-0.828125
4,0.265625,-1.000000,-0.062500,-1.875000,0.515625,0.281250
...,...,...,...,...,...,...
95,-0.656250,-1.812500,-0.031250,-1.140625,-0.234375,-0.296875
96,-0.296875,-1.093750,0.140625,-0.890625,0.250000,-0.546875
97,-0.468750,-1.453125,0.109375,-1.062500,-0.203125,-0.375000
98,-0.125000,-0.546875,-0.046875,-0.906250,0.031250,-0.343750


In [27]:
# Encode the activity names in the 'class' column as integer codes.
class_to_code = {'downstairs': -1, 'jogging': 0, 'upstairs': 1}
# Only encode while the column still holds the raw names: re-running this cell on the
# already-encoded integers would look them up in a dict keyed by strings and turn every
# label into NaN.
if not pd.api.types.is_numeric_dtype(df_train['class']):
    encoded = df_train['class'].map(class_to_code)
    # map() yields NaN for any name missing from the dict; fail loudly instead of
    # silently carrying NaN labels into training.
    if encoded.isna().any():
        unknown = sorted(df_train.loc[encoded.isna(), 'class'].astype(str).unique())
        raise ValueError("Unknown class labels in training data: {}".format(unknown))
    df_train['class'] = encoded.astype(int)
df_train

Unnamed: 0,wrist x value,wrist y value,wrist z value,thigh x value,thigh y value,thigh z value,class
0,-0.468750,-1.156250,0.093750,-1.140625,0.109375,-0.562500,-1
1,-2.625000,0.281250,0.421875,-0.109375,0.125000,0.687500,0
2,-0.234375,-0.718750,0.093750,-1.984375,-0.046875,-0.625000,-1
3,-0.125000,-0.796875,0.093750,-0.734375,0.093750,-0.343750,-1
4,0.046875,-0.765625,0.015625,-0.640625,-0.125000,-0.468750,-1
...,...,...,...,...,...,...,...
23057,-0.312500,-1.109375,-0.109375,-0.843750,-0.390625,-0.046875,1
23058,0.109375,-1.281250,0.187500,-2.609375,-1.062500,0.203125,0
23059,0.593750,-0.359375,-0.734375,-2.578125,-0.609375,-0.250000,0
23060,0.093750,0.578125,-0.031250,-0.562500,-0.703125,1.875000,0


In [28]:
# Split the training frame into NumPy arrays: the first six columns are the
# features, the seventh column is the label.
X_train = df_train.iloc[:, :6].to_numpy()
y_train = df_train.iloc[:, 6].to_numpy()
# `data` is a second name bound to the same feature array (no copy is made).
data = X_train

In [29]:
class KNN(ABC):
    """
    Base class for KNN implementations.

    Stores the training data, computes the distance from every query point to every
    training point and selects the K closest training points. Subclasses implement
    _generate_predictions to turn the neighbour indices into predictions.
    """
    def __init__(self, K: int = 3, metric: str = 'minkowski', p: int = 2) -> None:
        """
        Initializer function. Ensure that input parameters are compatible.
        Inputs:
        K -> integer specifying number of neighbours to consider (must be >= 1)
        metric -> string to indicate the distance metric to use (valid entries are 'minkowski' or 'cosine')
        p -> order of the minkowski metric, must be > 0 (checked only when metric == 'minkowski')
        Raises:
        ValueError -> if metric is not supported, if p <= 0 for the minkowski metric,
                      or if K is not an integer >= 1
        """
        # check distance is a valid entry
        valid_distance = ['minkowski','cosine']
        if metric not in valid_distance:
            msg = "Entered value for metric is not valid. Pick one of {}".format(valid_distance)
            raise ValueError(msg)

        # check minkowski p parameter (the message states the same bound the check enforces)
        if (metric == 'minkowski') and (p <= 0):
            msg = "Entered value for p is not valid. For metric = 'minkowski', p > 0"
            raise ValueError(msg)

        # check K: it is used as a slice bound in predict, so it must be a positive integer
        if not isinstance(K, (int, np.integer)) or K < 1:
            msg = "Entered value for K is not valid. K must be an integer >= 1"
            raise ValueError(msg)

        # store/initialise input parameters
        self.K = K
        self.metric = metric
        self.p = p
        self.X_train = np.array([])
        self.y_train = np.array([])

    def __del__(self) -> None:
        """
        Destructor function. Drops the stored parameters and training data.
        """
        # __del__ also runs on instances whose __init__ raised before the attributes were
        # assigned, so remove only what is actually present instead of using `del self.x`.
        for name in ('K', 'metric', 'p', 'X_train', 'y_train'):
            self.__dict__.pop(name, None)

    def __minkowski(self, x: np.ndarray) -> np.ndarray:
        """
        Private function to compute the minkowski distance between point x and the training data X
        Inputs:
        x -> numpy data point of predictors to consider
        Outputs:
        np.ndarray -> numpy array of the computed distances (one entry per training point)
        """
        return np.power(np.sum(np.power(np.abs(self.X_train - x),self.p),axis=1),1/self.p)

    def __cosine(self, x: np.ndarray) -> np.ndarray:
        """
        Private function to compute the cosine distance between point x and the training data X
        Inputs:
        x -> numpy data point of predictors to consider
        Outputs:
        np.ndarray -> numpy array of the computed distances (one entry per training point)
        """
        # NOTE: an all-zero x or an all-zero training row makes the denominator 0,
        # so the corresponding distance is not a finite number.
        return (1 - (np.dot(self.X_train,x)/(np.linalg.norm(x)*np.linalg.norm(self.X_train,axis=1))) )

    def __distances(self, X: np.ndarray) -> np.ndarray:
        """
        Private function to compute distances to each point x in X[x,:]
        Inputs:
        X -> 2-D numpy array of points [x], one point per row
        Outputs:
        D -> numpy array containing distances from each x to all points in the training set
             (one row per x, one column per training point).
        Raises:
        ValueError -> if self.metric was changed to an unsupported value after construction
        """
        # cover distance calculation
        if self.metric == 'minkowski':
            D = np.apply_along_axis(self.__minkowski,1,X)
        elif self.metric == 'cosine':
            D = np.apply_along_axis(self.__cosine,1,X)
        else:
            # without this branch D would be unbound and the return would raise UnboundLocalError
            raise ValueError("Unsupported metric: {}".format(self.metric))

        # return computed distances
        return D

    @abstractmethod
    def _generate_predictions(self, idx_neighbours: np.ndarray) -> np.ndarray:
        """
        Protected function to compute predictions from the K nearest neighbours
        Inputs:
        idx_neighbours -> indices into the training set of the nearest neighbours, one row per query point
        Outputs:
        np.ndarray -> numpy array of predictions
        """
        pass

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """
        Public training function for the class. It is assumed input X has been normalised.
        Inputs:
        X -> numpy array containing the predictor features
        y -> numpy array containing the labels associated with each value in X
        Raises:
        ValueError -> if X and y do not contain the same number of samples
        """
        # copy first so the stored data is independent of the caller's arrays
        X_copy = np.copy(X)
        y_copy = np.copy(y)

        # every training point needs exactly one label; validate before touching the instance state
        if X_copy.shape[:1] != y_copy.shape[:1]:
            msg = "X and y must contain the same number of samples, got {} and {}".format(
                X_copy.shape[:1], y_copy.shape[:1])
            raise ValueError(msg)

        # store training data
        self.X_train = X_copy
        self.y_train = y_copy

    def predict(self, X: np.ndarray) -> np.ndarray:
        """
        Public prediction function for the class.
        It is assumed input X has been normalised in the same fashion as the input to the training function
        Inputs:
        X -> numpy array containing the predictor features, one sample per row
             (a single 1-D sample is treated as one row)
        Outputs:
        y_pred -> numpy array containing the predicted labels
        Raises:
        Exception -> if fit has not been called with non-empty data
        """
        # ensure we have already trained the instance
        if (self.X_train.size == 0) or (self.y_train.size == 0):
            raise Exception('Model is not trained. Call fit before calling predict.')

        # the distance computation iterates over rows, so promote a single 1-D sample to one row
        X = np.atleast_2d(np.asarray(X))

        # compute distances
        D = self.__distances(X)

        # obtain indices for the K nearest neighbours (sort each row, keep the first K columns)
        idx_neighbours = D.argsort()[:,:self.K]

        # compute predictions
        y_pred = self._generate_predictions(idx_neighbours)

        # return results
        return y_pred

    def get_params(self, deep: bool = False) -> Dict:
        """
        Public function to return model parameters
        Inputs:
        deep -> boolean input parameter (accepted but not used)
        Outputs:
        Dict -> dictionary of stored class input parameters
        """
        return {'K':self.K,
                'metric':self.metric,
                'p':self.p}

# kNN classifier built on the KNN base class
class KNNClassifier(KNN):
    """
    KNN classifier: each prediction is the mode of the labels of the K nearest neighbours.
    """
    def __init__(self, K: int = 3, metric: str = 'minkowski', p: int = 2) -> None:
        """
        Initializer function. All arguments are forwarded to the base class initializer.
        Inputs:
        K -> integer specifying number of neighbours to consider
        metric -> string to indicate the distance metric to use (valid entries are 'minkowski' or 'cosine')
        p -> order of the minkowski metric (valid only when distance == 'minkowski')
        """
        # delegate parameter checking and storage to the base class
        super().__init__(K=K, metric=metric, p=p)

    def _generate_predictions(self, idx_neighbours: np.array) -> np.array:
        """
        Protected function to compute predictions from the K nearest neighbours
        Inputs:
        idx_neighbours -> indices of nearest neighbours, one row per submitted sample
        Outputs:
        y_pred -> numpy array of prediction results
        """
        # look up the training labels of every neighbour
        neighbour_labels = self.y_train[idx_neighbours]

        # take the mode along each row, i.e. per submitted sample
        row_modes = stats.mode(neighbour_labels, axis=1)

        # flatten to one label per sample
        return row_modes.mode.flatten()

In [32]:
# Fit a KNN classifier with its default settings (K=3, minkowski metric, p=2)
# on the training arrays, then predict a class code for every row of `test`.
knn = KNNClassifier(K=3, metric='minkowski', p=2)
knn.fit(X=X_train, y=y_train)
y_pred = knn.predict(X=test)
y_pred

array([ 1,  0, -1, -1,  0,  0, -1, -1,  0,  0,  0, -1,  1,  0,  1,  0,  0,
        1,  1,  0,  0,  1, -1,  0, -1, -1,  1,  1, -1, -1,  1,  1,  0,  0,
        0,  1,  1, -1,  1, -1,  0,  0, -1,  0,  0, -1,  0, -1,  1,  0, -1,
       -1, -1,  0, -1, -1,  1, -1,  0,  1,  1, -1,  1,  1,  1, -1,  1,  0,
        0,  1, -1,  0,  0,  1,  0, -1,  0,  1,  1,  0,  1,  1,  1, -1, -1,
        0,  0,  1,  0, -1,  0,  1,  0,  0,  0,  1, -1,  1, -1, -1],
      dtype=int64)

In [33]:
# Build the submission frame: a 1-based row ID next to the predicted class code.
# Constructing the frame from named columns avoids shadowing the builtin `id`, leaves
# y_pred untouched (so the cell can be re-run safely) and keeps each column's own dtype
# instead of forcing both through a single concatenated array.
predicted_codes = np.asarray(y_pred).ravel()  # ravel: accept both (n,) and (n, 1) input
df = pd.DataFrame({
    'ID': np.arange(1, len(predicted_codes) + 1),
    'label': predicted_codes,
})
df

Unnamed: 0,ID,label
0,1,1
1,2,0
2,3,-1
3,4,-1
4,5,0
...,...,...
95,96,1
96,97,-1
97,98,1
98,99,-1


In [34]:
# Decode the integer class codes back to the activity names (downstairs, jogging, upstairs).
code_to_class = {-1: 'downstairs', 0: 'jogging', 1: 'upstairs'}
# Only decode while the column still holds the numeric codes: re-running this cell on the
# already-decoded names would look them up in a dict keyed by integers and turn every
# label into NaN.
if pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].map(code_to_class)
df

Unnamed: 0,ID,label
0,1,upstairs
1,2,jogging
2,3,downstairs
3,4,downstairs
4,5,jogging
...,...,...
95,96,upstairs
96,97,downstairs
97,98,upstairs
98,99,downstairs


In [35]:
# Write the submission table to disk; index=False leaves the DataFrame index out of the file.
df.to_csv(
    'submission_2.csv',
    index=False,
)