In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.metrics import accuracy_score , confusion_matrix , roc_auc_score , roc_curve, classification_report
from pandas_profiling import ProfileReport
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import pickle
import os
import csv
import logging
from pandas_profiling import ProfileReport

In [2]:
class Human_activity_recognition_knn(object):
    """
    KNN classification pipeline for the activity-recognition readings found under ``dir_path``.

    Each step stores its result on the instance for the next one. Expected call order:
    prepare_datset -> load_dataset -> handle_outlier -> standard_scaling ->
    train_test_split -> hyperparam_tuning_fit -> calc_classification_score /
    predict / save_knn_model.
    """

    def __init__(self,dir_path):
        """
        Remember the raw-data directory and configure logging to 'knn.log'.


        Parameters
        ----------
        dir_path: Directory whose sub-folders contain the raw comma-separated data files.

        Returns:
        ----------
        None
        """
        self.dir_path = dir_path
        # basicConfig only takes effect the first time the root logger is configured.
        logging.basicConfig(filename='knn.log', level=logging.DEBUG,
                    format='%(asctime)s:%(levelname)s:%(message)s')
        logging.info('Human_activity_recognition_knn class object is created.')

    def prepare_datset(self):
        """
        Create a final csv 'merged.csv' (in the current working directory) from the
        directory folder, to be used as dataframe for later stage.

        Every sub-folder of ``self.dir_path`` is scanned; each file in it except
        'README.txt' is parsed as comma-separated text. Columns 1, 2, 3 and 8 of every
        file are kept and written out under the names used by the rest of the class.
        Any exception is logged and swallowed, in which case 'merged.csv' is not written.


        Parameters
        ----------
        None

        Returns:
        ----------
        None
        """
        logging.info('Dataset preparation started from the raw data.')
        try:
            directory = self.dir_path

            # Map every sub-folder name to the data files it contains.
            # Listings are sorted so the merged row order does not depend on the filesystem.
            folder_file_dict = {}
            for filename in sorted(os.listdir(directory)):
                f = os.path.join(directory, filename)
                # only sub-folders hold data; plain files at the top level are skipped
                if not os.path.isfile(f):
                    folder_file_dict[filename] = [
                        os.path.join(f, sub_filename)
                        for sub_filename in sorted(os.listdir(f))
                        if sub_filename != "README.txt"
                    ]

            df_list = []
            for file_list in folder_file_dict.values():
                for file in file_list:
                    # Default (strict) decoding errors: a malformed file aborts the merge
                    # and is reported through the handler below.
                    with open(file, "r", encoding="shift_jis", newline="") as fh:
                        df = pd.DataFrame(list(csv.reader(fh, delimiter=",")))
                    # keep positional columns 1, 2, 3 (axis readings) and 8 (activity label)
                    df_list.append(df.drop(df.columns[[0,4,5,6,7]], axis=1))

            if not df_list:
                logging.error("No data files found under {}; merged.csv was not written.".format(directory))
                return

            # Concatenate and write once, after all folders have been read.
            merged_df = pd.concat(df_list, ignore_index=True)
            merged_df.columns = ["frontal_axis_reading(g)","vertical_axis_reading(g)","lateral_axis_reading(g)","activity"]
            merged_df.to_csv('merged.csv', index=False, header=True)
        except Exception as e:
            logging.error("{} occured while creating datasets from the raw data.".format(str(e)))

    def load_dataset(self):
        """
        Load 'merged.csv' from the current working directory into ``self.df``.

        Columns named 'time' and 'Unnamed: 8' are dropped when present. The file written
        by prepare_datset does not contain them, so their absence is not an error.
        Failures are logged and swallowed; ``self.df`` is then left unset/unchanged.


        Parameters
        ----------
        None

        Returns:
        ----------
        None
        """
        logging.info('Dataset is getting loaded as pandas dataframe.')
        try:
            loaded = pd.read_csv("merged.csv")
            self.df = loaded.drop(columns=['time','Unnamed: 8'], errors='ignore')
        except FileNotFoundError:
            logging.error("File not found: exception occured while loading csv as pandas dataframe.")
        except pd.errors.EmptyDataError:
            logging.error("No data: exception occured while loading csv as pandas dataframe.")
        except pd.errors.ParserError:
            logging.error("Parse error: exception occured while loading csv as pandas dataframe.")
        except Exception as e:
            logging.error("{} occured while loading csv as pandas dataframe.".format(str(e)))

    def create_profile_report(self,inp_df):
        """
        Create pandas profile report for the input data frame.


        Parameters
        ----------
        inp_df: Input data frame.

        Returns:
        ----------
        The ProfileReport object built from ``inp_df``.
        """
        logging.info('Profile reporting started for dataframe.')
        return ProfileReport(inp_df)

    def handle_outlier(self, quantile=.90):
        """
        Remove outliers for the impacted feature column.

        Keeps the rows of ``self.df`` whose 'lateral_axis_reading(g)' value is strictly
        below the given quantile of that column and stores them, as an independent copy,
        in ``self.df_new``. ``self.df`` itself is not modified.


        Parameters
        ----------
        quantile: Quantile (0..1) of the lateral-axis column used as the upper cut-off.
                  Defaults to 0.90.

        Returns:
        ----------
        None
        """
        logging.info('Outliers are getting removed.')
        column = 'lateral_axis_reading(g)'
        q = self.df[column].quantile(quantile)
        # .copy() so later changes to df_new can never write through to (or warn about) self.df
        self.df_new = self.df[self.df[column] < q].copy()

    def standard_scaling(self):
        """
        Perform standard scaling on the outlier-filtered dataframe.

        Sets ``self.y`` (the 'activity' column), ``self.x`` (all other columns),
        ``self.x_scaled`` (scaled features), ``self.scaler`` (the scaler fitted on the
        features only) and ``self.df_new_scalar`` (the whole frame, label included,
        scaled with its own separate scaler).

        NOTE(review): the scaler is fitted on all rows before train_test_split is called,
        so test rows influence the scaling statistics.


        Parameters
        ----------
        None

        Returns:
        ----------
        None
        """
        logging.info('Standard scalling started for feature columsn.')
        self.y = self.df_new['activity']
        self.x = self.df_new.drop(columns=['activity'])
        self.scaler = StandardScaler()
        self.x_scaled = self.scaler.fit_transform(self.x)
        # Use a second scaler here: refitting self.scaler on the label-inclusive frame
        # would overwrite the feature-only statistics stored above.
        self.df_new_scalar = pd.DataFrame(StandardScaler().fit_transform(self.df_new))

    def train_test_split(self, test_size, random_state):
        """
        Split the scaled features and the labels into train and test parts.

        The results are stored in ``self.x_train``, ``self.x_test``, ``self.y_train``
        and ``self.y_test``.

        Parameters
        ----------
        test_size: Passed through to sklearn's train_test_split (share/size of the test part).
        random_state: Passed through to sklearn's train_test_split (seed for the shuffle).

        Returns:
        ----------
        None
        """
        logging.info('train and test split for dataframe started.')
        # Inside the method body this name resolves to the module-level sklearn function,
        # not to this method.
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x_scaled , self.y , test_size = test_size , random_state = random_state)

    def hyperparam_tuning_fit(self):
        """
        Grid-search n_neighbors over 1..30 for a KNeighborsClassifier and fit on the train split.

        Stores the search object in ``self.grid`` and the value returned by its ``fit``
        call in ``self.knn_model``, then prints the best parameters and best estimator.

        Parameters
        ----------
        None

        Returns:
        ----------
        None
        """
        logging.info('Hyper-parameter tuning started for knn model.')
        knn = KNeighborsClassifier()
        k_range = list(range(1, 31))
        param = dict(n_neighbors=k_range)

        # defining parameter range
        self.grid = GridSearchCV(knn, param_grid = param, verbose = 3) # default: refit = True, cv = 5

        # fitting the model for grid search
        self.knn_model = self.grid.fit(self.x_train, self.y_train)

        # print best parameter after tuning
        print(self.grid.best_params_)

        # print how our model looks after hyper-parameter tuning
        print(self.grid.best_estimator_)

    def calc_classification_score(self):
        """
        Print the classification report for the test split.

        Parameters
        ----------
        None.

        Returns:
        ----------
        None.
        """
        logging.info('Classification score calculation started.')
        grid_predictions = self.grid.predict(self.x_test)

        # print classification report
        print(classification_report(self.y_test, grid_predictions))

    def predict(self,test_case):
        """
        Predict the dependent feature based on the input test case.

        Parameters
        ----------
        test_case: 2-D list/array of feature rows. The model was fitted on the scaled
                   features, so rows must be scaled the same way (e.g. rows of self.x_test).

        Returns:
        ----------
        The predicted labels, or None if prediction raised an exception (which is logged).
        """
        logging.info('Prediction will be done for the testcase {}.'.format(test_case))
        try:
            return self.grid.predict(test_case)
        except Exception as e:
            logging.error("{} occured while predicting dependent feature.".format(str(e)))
            return None

    def save_knn_model(self,file_name):
        """
        Save the knn model based on the input file name.

        Parameters
        ----------
        file_name: knn model will be pickled into a file with this name.

        Returns:
        ----------
        None. Failures are logged and swallowed.
        """
        logging.info('Save knn model into file: {}.'.format(file_name))
        try:
            # Look the model up before opening the file so that a missing model does not
            # leave an empty file behind.
            model = self.knn_model
            with open(file_name,'wb') as model_file:
                pickle.dump(model, model_file)
        except Exception as e:
            logging.error("{} occured while saving knn model.".format(str(e)))

In [3]:
# Create the pipeline object for the raw data directory (path is relative to the
# notebook's working directory). The constructor also sets up logging to 'knn.log'.
knn_obj = Human_activity_recognition_knn('../Datasets_Healthy_Older_People')

In [4]:
# Merge the raw files from every sub-folder into 'merged.csv' in the working directory.
# Errors are only logged, so check 'knn.log' if the file does not appear.
knn_obj.prepare_datset()

In [5]:
# Read 'merged.csv' into knn_obj.df.
knn_obj.load_dataset()

In [6]:
# Build a pandas-profiling report for the loaded frame, show it as notebook widgets,
# and also export it to 'har_profiling.html' in the working directory.
inp_df = knn_obj.df
pf = knn_obj.create_profile_report(inp_df)
pf.to_widgets()
pf.to_file("har_profiling.html")

Summarize dataset:   0%|          | 0/17 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
# Keep only the rows whose lateral-axis reading is below that column's 90th percentile;
# the filtered frame is stored in knn_obj.df_new.
knn_obj.handle_outlier()

In [13]:
# Split off the 'activity' label (knn_obj.y) and standard-scale the remaining
# feature columns (knn_obj.x_scaled).
knn_obj.standard_scaling()

In [14]:
# Arguments are (test_size, random_state): hold out 20% of the rows for testing,
# with a fixed seed of 144 for the split.
knn_obj.train_test_split(0.2,144)

In [15]:
# Grid search over n_neighbors = 1..30 on the train split (5 folds x 30 candidates = 150 fits,
# as the verbose output below shows), then print the best parameters and estimator.
knn_obj.hyperparam_tuning_fit()

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END .....................n_neighbors=1;, score=0.978 total time=   1.8s
[CV 2/5] END .....................n_neighbors=1;, score=0.982 total time=   1.6s
[CV 3/5] END .....................n_neighbors=1;, score=0.981 total time=   1.6s
[CV 4/5] END .....................n_neighbors=1;, score=0.979 total time=   1.6s
[CV 5/5] END .....................n_neighbors=1;, score=0.979 total time=   1.6s
[CV 1/5] END .....................n_neighbors=2;, score=0.975 total time=   1.6s
[CV 2/5] END .....................n_neighbors=2;, score=0.978 total time=   1.6s
[CV 3/5] END .....................n_neighbors=2;, score=0.978 total time=   1.6s
[CV 4/5] END .....................n_neighbors=2;, score=0.976 total time=   1.6s
[CV 5/5] END .....................n_neighbors=2;, score=0.974 total time=   1.6s
[CV 1/5] END .....................n_neighbors=3;, score=0.975 total time=   1.6s
[CV 2/5] END .....................n_neighbors=3

[CV 2/5] END ....................n_neighbors=21;, score=0.956 total time=   1.8s
[CV 3/5] END ....................n_neighbors=21;, score=0.957 total time=   1.8s
[CV 4/5] END ....................n_neighbors=21;, score=0.955 total time=   1.8s
[CV 5/5] END ....................n_neighbors=21;, score=0.954 total time=   1.8s
[CV 1/5] END ....................n_neighbors=22;, score=0.955 total time=   1.7s
[CV 2/5] END ....................n_neighbors=22;, score=0.955 total time=   1.8s
[CV 3/5] END ....................n_neighbors=22;, score=0.956 total time=   1.8s
[CV 4/5] END ....................n_neighbors=22;, score=0.954 total time=   1.7s
[CV 5/5] END ....................n_neighbors=22;, score=0.953 total time=   1.8s
[CV 1/5] END ....................n_neighbors=23;, score=0.955 total time=   1.8s
[CV 2/5] END ....................n_neighbors=23;, score=0.955 total time=   1.8s
[CV 3/5] END ....................n_neighbors=23;, score=0.955 total time=   1.7s
[CV 4/5] END ...............

In [16]:
# Print the per-class precision / recall / F1 report for the held-out test split.
knn_obj.calc_classification_score()

              precision    recall  f1-score   support

           1       0.96      0.97      0.96      2637
           2       0.90      0.93      0.91       691
           3       1.00      1.00      1.00      9809
           4       0.87      0.77      0.82       382

    accuracy                           0.98     13519
   macro avg       0.93      0.92      0.92     13519
weighted avg       0.98      0.98      0.98     13519



In [21]:
# Sanity check on one sample: take the first row of the already-scaled test split and
# wrap it in a list so the model receives a single-row, 2-D input.
testcase = [knn_obj.x_test[0]]
print(testcase)
print(knn_obj.predict(testcase))

[array([-0.47399757,  1.41783671,  0.83561075])]
[1]


In [22]:
# Pickle the fitted search object (knn_obj.knn_model) to a file named 'knn_model'
# (no extension) in the working directory.
knn_obj.save_knn_model('knn_model')