In [1]:
#Import Library
import numpy as np
import pandas as pd
from scipy import stats
from typing import Dict, Any
from abc import ABC,abstractmethod
from sklearn.datasets import load_diabetes, load_breast_cancer
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import mean_squared_error,\
                            mean_absolute_error,\
                            accuracy_score,\
                            precision_score,\
                            recall_score,\
                            f1_score,\
                            make_scorer
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
#Generating kNN Class
class KNN(ABC):
    """
    Base class for KNN implementations
    """
    
    def __init__(self, K : int = 3, metric : str = 'minkowski', p : int = 2) -> None:
        """
        Initializer function. Ensure that input parameters are compatiable.
        Inputs:
            K      -> integer specifying number of neighbours to consider
            metric -> string to indicate the distance metric to use (valid entries are 'minkowski' or 'cosine')
            p      -> order of the minkowski metric (valid only when distance == 'minkowski')
        """
        # check distance is a valid entry
        valid_distance = ['minkowski','cosine']
        if metric not in valid_distance:
            msg = "Entered value for metric is not valid. Pick one of {}".format(valid_distance)
            raise ValueError(msg)
        # check minkowski p parameter
        if (metric == 'minkowski') and (p <= 0):
            msg = "Entered value for p is not valid. For metric = 'minkowski', p >= 1"
            raise ValueError(msg)
        # store/initialise input parameters
        self.K       = K
        self.metric  = metric
        self.p       = p
        self.X_train = np.array([])
        self.y_train = np.array([])
        
    def __del__(self) -> None:
        """
        Destructor function. 
        """
        del self.K
        del self.metric
        del self.p
        del self.X_train
        del self.y_train
      
    def __minkowski(self, x : np.array) -> np.array:
        """
        Private function to compute the minkowski distance between point x and the training data X
        Inputs:
            x -> numpy data point of predictors to consider
        Outputs:
            np.array -> numpy array of the computed distances
        """
        return np.power(np.sum(np.power(np.abs(self.X_train - x),self.p),axis=1),1/self.p)
    
    def __cosine(self, x : np.array) -> np.array:
        """
        Private function to compute the cosine distance between point x and the training data X
        Inputs:
            x -> numpy data point of predictors to consider
        Outputs:
            np.array -> numpy array of the computed distances
        """
        return (1 - (np.dot(self.X_train,x)/(np.linalg.norm(x)*np.linalg.norm(self.X_train,axis=1))))
    
    def __distances(self, X : np.array) -> np.array:
        """
        Private function to compute distances to each point x in X[x,:]
        Inputs:
            X -> numpy array of points [x]
        Outputs:
            D -> numpy array containing distances from x to all points in the training set.
        """
        # cover distance calculation
        if self.metric == 'minkowski':
            D = np.apply_along_axis(self.__minkowski,1,X)
        elif self.metric == 'cosine':
            D = np.apply_along_axis(self.__cosine,1,X)
        # return computed distances
        return D
    
    @abstractmethod
    def _generate_predictions(self, idx_neighbours : np.array) -> np.array:
        """
        Protected function to compute predictions from the K nearest neighbours
        """
        pass
        
    def fit(self, X : np.array, y : np.array) -> None:
        """
        Public training function for the class. It is assummed input X has been normalised.
        Inputs:
            X -> numpy array containing the predictor features
            y -> numpy array containing the labels associated with each value in X
        """
        # store training data
        self.X_train = np.copy(X)
        self.y_train = np.copy(y)
        
    def predict(self, X : np.array) -> np.array:
        """
        Public prediction function for the class. 
        It is assummed input X has been normalised in the same fashion as the input to the training function
        Inputs:
            X -> numpy array containing the predictor features
        Outputs:
           y_pred -> numpy array containing the predicted labels
        """
        # ensure we have already trained the instance
        if (self.X_train.size == 0) or (self.y_train.size == 0):
            raise Exception('Model is not trained. Call fit before calling predict.')
        # compute distances
        D = self.__distances(X)
        # obtain indices for the K nearest neighbours
        idx_neighbours = D.argsort()[:,:self.K]
        # compute predictions
        y_pred = self._generate_predictions(idx_neighbours)
        # return results
        return y_pred
    
    def get_params(self, deep : bool = False) -> Dict:
        """
        Public function to return model parameters
        Inputs:
            deep -> boolean input parameter
        Outputs:
            Dict -> dictionary of stored class input parameters
        """
        return {'K':self.K,
                'metric':self.metric,
                'p':self.p}

In [3]:
#Load Dataset
Home_data = pd.read_csv('HomeC.csv')
nan_removed = pd.DataFrame.dropna(pd.DataFrame(Home_data))
nan_removed.value_counts()

  Home_data = pd.read_csv('HomeC.csv')


time        use [kW]  gen [kW]  House overall [kW]  Dishwasher [kW]  Furnace 1 [kW]  Furnace 2 [kW]  Home office [kW]  Fridge [kW]  Wine cellar [kW]  Garage door [kW]  Kitchen 12 [kW]  Kitchen 14 [kW]  Kitchen 38 [kW]  Barn [kW]  Well [kW]  Microwave [kW]  Living room [kW]  Solar [kW]  temperature  icon         humidity  visibility  summary     apparentTemperature  pressure  windSpeed  cloudCover  windBearing  precipIntensity  dewPoint  precipProbability
1451624400  0.932833  0.003483  0.932833            0.000033         0.020700        0.061917        0.442633          0.124150     0.006983          0.013083          0.000417         0.000150         0.000000         0.031350   0.001017   0.004067        0.001517          0.003483    36.14        clear-night  0.62      10.00       Clear       29.26                1016.91   9.18       cloudCover  282.0        0.0000           24.40     0.00                 1
1451960350  0.498450  0.054750  0.498450            0.000000         0.020100

In [4]:
# remove duplicates values for temperature, icon, humidity, visibility, summary, apparentTemperature, pressure, windSpeed, cloudCover, windBearing, precipIntensity, dewPoint, and precipProbability
duplicate_removed = nan_removed.drop_duplicates(subset=['temperature', 'icon', 'humidity', 'visibility', 'summary', 'apparentTemperature', 'pressure', 'windSpeed', 'cloudCover', 'windBearing', 'precipIntensity', 'dewPoint', 'precipProbability'], keep='first')
duplicate_removed['summary'].value_counts()

summary
Clear                       6549
Partly Cloudy               1083
Light Rain                   476
Drizzle                      180
Overcast                     105
Rain                          90
Mostly Cloudy                 79
Light Snow                    76
Flurries                      31
Breezy                        27
Snow                          20
Breezy and Partly Cloudy      18
Foggy                         17
Rain and Breezy                3
Heavy Snow                     3
Flurries and Breezy            2
Breezy and Mostly Cloudy       1
Dry                            1
Name: count, dtype: int64

In [5]:
# split the dataframe into 3 type of summary Clear, Cloudy, and other
clear = duplicate_removed[duplicate_removed['summary'] == 'Clear']
cloudy = duplicate_removed[duplicate_removed['summary'] == 'Overcast']
other = duplicate_removed[(duplicate_removed['summary'] != 'Clear') & (duplicate_removed['summary'] != 'Overcast')]
other['summary'] = 'Other'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  other['summary'] = 'Other'


In [6]:
# combine the 3 dataframe into 1
frames = [clear, cloudy, other]
result = pd.concat(frames)

# change the summary into numerical value
result['summary'] = result['summary'].map({'Clear': 0, 'Overcast': -1, 'Other': 1})
result = result.reset_index(drop=True)

In [7]:
# select the features and summary as target
X = result[['temperature', 'humidity', 'visibility', 'apparentTemperature', 'windSpeed', 'windBearing', 'dewPoint']].values
y = result['summary'].values

In [8]:
#Generating kNN Classifier
class KNNClassifier(KNN):
    """
    Class for KNN classifiction implementation
    """
    
    def __init__(self, K : int = 3, metric : str = 'minkowski', p : int = 2) -> None:
        """
        Initializer function. Ensure that input parameters are compatiable.
        Inputs:
            K       -> integer specifying number of neighbours to consider
            metric  -> string to indicate the distance metric to use (valid entries are 'minkowski' or 'cosine')
            p       -> order of the minkowski metric (valid only when distance == 'minkowski')
        """
        # call base class initialiser
        super().__init__(K,metric,p)
        
    def _generate_predictions(self, idx_neighbours : np.array) -> np.array:
        """
        Protected function to compute predictions from the K nearest neighbours
        Inputs:
            idx_neighbours -> indices of nearest neighbours
        Outputs:
            y_pred -> numpy array of prediction results
        """        
        # compute the mode label for each submitted sample
        y_pred = stats.mode(self.y_train[idx_neighbours],axis=1).mode.flatten()   
        # return result
        return y_pred

In [9]:
#properly format labels
y = np.where(y==0,-1,1)

In [10]:
#define the scoring metrics
scoring_metrics = {'accuracy' : make_scorer(accuracy_score), 
                   'precision': make_scorer(precision_score),
                   'recall'   : make_scorer(recall_score),
                   'f1'       : make_scorer(f1_score)}

In [11]:
## define a helper function for our analysis ##
def cv_classifier_analysis(pipe : Any, 
                           X : np.array, 
                           y : np.array, 
                           k : int, 
                           scoring_metrics : Dict,
                           metric : str) -> None:
    """
    Function to carry out cross-validation analysis for input KNN classifier
    Inputs:
        pipe            -> input pipeline containing preprocessing and KNN classifier
        X               -> numpy array of predictors
        y               -> numpy array of labels
        k               -> integer value for number of nearest neighbours to consider
        scoring_metrics -> dictionary of scoring metrics to consider 
        metric          -> string indicating distance metric used
    """
    # print hyperparameter configuration
    print('RESULTS FOR K = {0}, {1}'.format(k,metric))
    # run cross validation
    dcScores = cross_validate(pipe,X,y,cv=StratifiedKFold(10),scoring=scoring_metrics)
    # report results
    print('Mean Accuracy: %.2f' % np.mean(dcScores['test_accuracy']))
    print('Mean Precision: %.2f' % np.mean(dcScores['test_precision']))
    print('Mean Recall: %.2f' % np.mean(dcScores['test_recall']))
    print('Mean F1: %.2f' % np.mean(dcScores['test_f1']))

In [12]:
#perform cross-validation for a range of model hyperparameters for the Custom model
K = [3,6,9]
for k in K:
    # define the pipeline for manhatten distance
    p_manhat = Pipeline([('scaler', StandardScaler()), ('knn', KNNClassifier(k, metric = 'minkowski', p = 1))])
    # define the pipeline for euclidean distance
    p_euclid = Pipeline([('scaler', StandardScaler()), ('knn', KNNClassifier(k, metric = 'minkowski', p = 2))])
    # define the pipeline for cosine distance
    p_cosine = Pipeline([('scaler', StandardScaler()), ('knn', KNNClassifier(k, metric = 'cosine'))])
    # cross validate for p_manhat
    cv_classifier_analysis(p_manhat, X, y, k, scoring_metrics, 'MANHATTEN DISTANCE')
    # cross validate for p_euclid
    cv_classifier_analysis(p_euclid, X, y, k, scoring_metrics, 'EUCLIDEAN DISTANCE')
    # cross validate for p_cosine
    cv_classifier_analysis(p_cosine, X, y, k, scoring_metrics, 'COSINE DISTANCE')

RESULTS FOR K = 3, MANHATTEN DISTANCE


Mean Accuracy: 0.72
Mean Precision: 0.42
Mean Recall: 0.34
Mean F1: 0.38
RESULTS FOR K = 3, EUCLIDEAN DISTANCE
Mean Accuracy: 0.72
Mean Precision: 0.43
Mean Recall: 0.36
Mean F1: 0.39
RESULTS FOR K = 3, COSINE DISTANCE
Mean Accuracy: 0.72
Mean Precision: 0.43
Mean Recall: 0.35
Mean F1: 0.38
RESULTS FOR K = 6, MANHATTEN DISTANCE
Mean Accuracy: 0.75
Mean Precision: 0.51
Mean Recall: 0.26
Mean F1: 0.34
RESULTS FOR K = 6, EUCLIDEAN DISTANCE
Mean Accuracy: 0.75
Mean Precision: 0.52
Mean Recall: 0.27
Mean F1: 0.35
RESULTS FOR K = 6, COSINE DISTANCE
Mean Accuracy: 0.76
Mean Precision: 0.53
Mean Recall: 0.28
Mean F1: 0.36
RESULTS FOR K = 9, MANHATTEN DISTANCE
Mean Accuracy: 0.75
Mean Precision: 0.51
Mean Recall: 0.31
Mean F1: 0.38
RESULTS FOR K = 9, EUCLIDEAN DISTANCE
Mean Accuracy: 0.75
Mean Precision: 0.51
Mean Recall: 0.32
Mean F1: 0.39
RESULTS FOR K = 9, COSINE DISTANCE
Mean Accuracy: 0.75
Mean Precision: 0.51
Mean Recall: 0.32
Mean F1: 0.39
