In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, matthews_corrcoef, f1_score
from pandas_profiling import ProfileReport
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
import pickle
import os
import csv
import logging
from sklearn.preprocessing import StandardScaler

In [2]:
class Human_activity_recognition_stacking(object):
    def __init__(self,dir_path):
        self.dir_path = dir_path
        logging.basicConfig(filename='stacking.log', level=logging.DEBUG,
                    format='%(asctime)s:%(levelname)s:%(message)s')
        logging.info('Human_activity_recognition_stacking class object is created.')
        
    def prepare_datset(self):
        """
        Create a final csv-'merge.csv'from the directory folder to be used as dataframe for later stage.
        
        
        Parameters
        ----------
        None
        
        Returns:
        ----------
        None        
        """   
        logging.info('Dataset preparation started from the raw data.') 
        try:
            # assign directory
            directory = self.dir_path

            # iterate over files in
            # that directory
            folder_file_dict = {}
            for filename in os.listdir(directory):
                f = os.path.join(directory, filename)
                # checking if it is not a file
                if not os.path.isfile(f):
                    file_list = [os.path.join(f, sub_filename) for sub_filename in os.listdir(f) if sub_filename != "README.txt"]
                    folder_file_dict[filename] = file_list      
            header = []
            df_list = []
            for key in folder_file_dict:
                for file in folder_file_dict[key]:
                    with open(file, "r", encoding="shift_jis", errors="", newline="" ) as f:
                        lst = csv.reader(f, delimiter=",")
                        df = pd.DataFrame(lst)
                        df.drop(df.columns[[0,4,5,6,7]], axis=1, inplace =True)
                        df_list.append(df)
                merged_df = pd.concat(df_list)
                merged_df.columns = ["frontal_axis_reading(g)","vertical_axis_reading(g)","lateral_axis_reading(g)","activity"]
                merged_df.to_csv('merged.csv', index=None, header=True)
        except Exception as e:
            logging.error("{} occured while creating datasets from the raw data.".format(str(e)))               
            
    def load_dataset(self):
        """
        Load csv file as pandas dataframe.
        
        
        Parameters
        ----------
        None
        
        Returns:
        ----------
        None        
        """
        logging.info('Dataset is getting loaded as pandas dataframe.')
        try:        
            self.df = pd.read_csv("merged.csv") 
            self.df.drop(['time','Unnamed: 8'], axis=1, inplace=True)
        except FileNotFoundError:
            logging.error("File not found: exception occured while loading csv as pandas dataframe.")
        except pd.errors.EmptyDataError:
            logging.error("No data: exception occured while loading csv as pandas dataframe.")
        except pd.errors.ParserError:
            logging.errornt("Parse error: exception occured while loading csv as pandas dataframe.")
        except Exception as e:
            logging.error("{} occured while loading csv as pandas dataframe.".format(str(e)))
            
    def create_profile_report(self,inp_df):
        """
        Create pandas profile report for the input data frame.
        
        
        Parameters
        ----------
        inp_df: Input data frame.
        
        Returns:
        ----------
        None        
        """    
        logging.info('Profile reporting started for dataframe.')
        return ProfileReport(inp_df)
    
    def handle_outlier(self):
        """
        remove outliers for the impacted feature columns.


        Parameters
        ----------
        None

        Returns:
        ----------
        None        
        """
        logging.info('Outliers are getting removed.')
        q = self.df['lateral_axis_reading(g)'].quantile(.90)
        self.df_new = self.df[self.df['lateral_axis_reading(g)'] < q]
        
    def standard_scaling(self):
        """
        Perform standard scaling on input dataframe.
        
        
        Parameters
        ----------
        None
        
        Returns:
        ----------
        None        
        """      
        logging.info('Standard scalling started for feature columsn.')
        self.y = self.df_new['activity']
        self.x = self.df_new.drop(columns=['activity'])
        scalar = StandardScaler()
        self.x_scaled = scalar.fit_transform(self.x)
        self.df_new_scalar = pd.DataFrame(scalar.fit_transform(self.df_new))
        
    def train_test_split(self, test_size, random_state):
        """
        Split data frame into train and test.
         
        Parameters
        ----------
        None
        
        Returns:
        ----------
        Train and test data for feature and predicted columns.        
        """
        logging.info('train and test split for dataframe started.')
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x_scaled , self.y , test_size = test_size , random_state = random_state)
        
    def base_model_fit(self):
        
        self.knn = KNeighborsClassifier(n_neighbors=1) # Define classifier
        self.knn.fit(self.x_train, self.y_train) # Train model           
        
        self.svc = SVC(C=100, gamma=1, kernel="rbf")
        self.svc.fit(self.x_train, self.y_train)       
        
        self.dt = DecisionTreeClassifier(criterion = 'gini', max_depth = 30, min_samples_leaf = 1, min_samples_split = 2, splitter = 'random') # Define classifier
        self.dt.fit(self.x_train, self.y_train) # Train model
        
        self.rf = RandomForestClassifier(criterion = 'gini', max_depth=9, min_samples_leaf=2, n_estimators=160) # Define classifier
        self.rf.fit(self.x_train, self.y_train) # Train model      
        

    def base_evaluate_model(self):
        """
        Calculate the classification score.
        
        Parameters
        ----------
        None. 
        
        Returns:
        ----------
        None. 
        """        
        # Make predictions
        y_train_pred = self.knn.predict(self.x_train)
        y_test_pred = self.knn.predict(self.x_test)

        # Training set performance
        self.knn_train_accuracy = accuracy_score(self.y_train, y_train_pred) # Calculate Accuracy
        self.knn_train_mcc = matthews_corrcoef(self.y_train, y_train_pred) # Calculate MCC
        self.knn_train_f1 = f1_score(self.y_train, y_train_pred, average='weighted') # Calculate F1-score

        # Test set performance
        self.knn_test_accuracy = accuracy_score(self.y_test, y_test_pred) # Calculate Accuracy
        self.knn_test_mcc = matthews_corrcoef(self.y_test, y_test_pred) # Calculate MCC
        self.knn_test_f1 = f1_score(self.y_test, y_test_pred, average='weighted') # Calculate F1-score

        print('Model performance for KNN Training set')
        print('- Accuracy: %s' % self.knn_train_accuracy)
        print('- MCC: %s' % self.knn_train_mcc)
        print('- F1 score: %s' % self.knn_train_f1)
        print('----------------------------------')
        print('Model performance for KNN Test set')
        print('- Accuracy: %s' % self.knn_test_accuracy)
        print('- MCC: %s' % self.knn_test_mcc)
        print('- F1 score: %s' % self.knn_test_f1)
        
        # Make predictions
        y_train_pred = self.svc.predict(self.x_train)
        y_test_pred = self.svc.predict(self.x_test)

        # Training set performance
        self.svc_train_accuracy = accuracy_score(self.y_train, y_train_pred) # Calculate Accuracy
        self.svc_train_mcc = matthews_corrcoef(self.y_train, y_train_pred) # Calculate MCC
        self.svc_train_f1 = f1_score(self.y_train, y_train_pred, average='weighted') # Calculate F1-score

        # Test set performance
        self.svc_test_accuracy = accuracy_score(self.y_test, y_test_pred) # Calculate Accuracy
        self.svc_test_mcc = matthews_corrcoef(self.y_test, y_test_pred) # Calculate MCC
        self.svc_test_f1 = f1_score(self.y_test, y_test_pred, average='weighted') # Calculate F1-score

        print('Model performance for SVC Training set')
        print('- Accuracy: %s' % self.svc_train_accuracy)
        print('- MCC: %s' % self.svc_train_mcc)
        print('- F1 score: %s' % self.svc_train_f1)
        print('----------------------------------')
        print('Model performance for SVC Test set')
        print('- Accuracy: %s' % self.svc_test_accuracy)
        print('- MCC: %s' % self.svc_test_mcc)
        print('- F1 score: %s' % self.svc_test_f1)   
        
        # Make predictions
        y_train_pred = self.dt.predict(self.x_train)
        y_test_pred = self.dt.predict(self.x_test)

        # Training set performance
        self.dt_train_accuracy = accuracy_score(self.y_train, y_train_pred) # Calculate Accuracy
        self.dt_train_mcc = matthews_corrcoef(self.y_train, y_train_pred) # Calculate MCC
        self.dt_train_f1 = f1_score(self.y_train, y_train_pred, average='weighted') # Calculate F1-score

        # Test set performance
        self.dt_test_accuracy = accuracy_score(self.y_test, y_test_pred) # Calculate Accuracy
        self.dt_test_mcc = matthews_corrcoef(self.y_test, y_test_pred) # Calculate MCC
        self.dt_test_f1 = f1_score(self.y_test, y_test_pred, average='weighted') # Calculate F1-score

        print('Model performance for Decision Tree Training set')
        print('- Accuracy: %s' % self.dt_train_accuracy)
        print('- MCC: %s' % self.dt_train_mcc)
        print('- F1 score: %s' % self.dt_train_f1)
        print('----------------------------------')
        print('Model performance for Decision Tree Test set')
        print('- Accuracy: %s' % self.dt_test_accuracy)
        print('- MCC: %s' % self.dt_test_mcc)
        print('- F1 score: %s' % self.dt_test_f1) 
        
        # Make predictions
        y_train_pred = self.rf.predict(self.x_train)
        y_test_pred = self.rf.predict(self.x_test)

        # Training set performance
        self.rf_train_accuracy = accuracy_score(self.y_train, y_train_pred) # Calculate Accuracy
        self.rf_train_mcc = matthews_corrcoef(self.y_train, y_train_pred) # Calculate MCC
        self.rf_train_f1 = f1_score(self.y_train, y_train_pred, average='weighted') # Calculate F1-score

        # Test set performance
        self.rf_test_accuracy = accuracy_score(self.y_test, y_test_pred) # Calculate Accuracy
        self.rf_test_mcc = matthews_corrcoef(self.y_test, y_test_pred) # Calculate MCC
        self.rf_test_f1 = f1_score(self.y_test, y_test_pred, average='weighted') # Calculate F1-score

        print('Model performance for Random Forest Training set')
        print('- Accuracy: %s' % self.rf_train_accuracy)
        print('- MCC: %s' % self.rf_train_mcc)
        print('- F1 score: %s' % self.rf_train_f1)
        print('----------------------------------')
        print('Model performance for Random Forest Test set')
        print('- Accuracy: %s' % self.rf_test_accuracy)
        print('- MCC: %s' % self.rf_test_mcc)
        print('- F1 score: %s' % self.rf_test_f1)         
        
    def build_stack_model(self):
        estimator_list = [
            ('knn',self.knn),
            ('svm_rbf',self.svc),
            ('dt',self.dt),
            ('rf',self.rf)]
        
        # Build stack model
        self.stack_model = StackingClassifier(
            estimators=estimator_list, final_estimator=LogisticRegression()
        )

        # Train stacked model
        self.stack_model.fit(self.x_train, self.y_train) 
        
        # Make predictions
        y_train_pred = self.stack_model.predict(self.x_train)
        y_test_pred = self.stack_model.predict(self.x_test)

        # Training set model performance
        self.stack_model_train_accuracy = accuracy_score(self.y_train, y_train_pred) # Calculate Accuracy
        self.stack_model_train_mcc = matthews_corrcoef(self.y_train, y_train_pred) # Calculate MCC
        self.stack_model_train_f1 = f1_score(self.y_train, y_train_pred, average='weighted') # Calculate F1-score

        # Test set model performance
        self.stack_model_test_accuracy = accuracy_score(self.y_test, y_test_pred) # Calculate Accuracy
        self.stack_model_test_mcc = matthews_corrcoef(self.y_test, y_test_pred) # Calculate MCC
        self.stack_model_test_f1 = f1_score(self.y_test, y_test_pred, average='weighted') # Calculate F1-score

        print('Model performance for Stack Model Training set')
        print('- Accuracy: %s' % self.stack_model_train_accuracy)
        print('- MCC: %s' % self.stack_model_train_mcc)
        print('- F1 score: %s' % self.stack_model_train_f1)
        print('----------------------------------')
        print('Model performance for Stack Model Test set')
        print('- Accuracy: %s' % self.stack_model_test_accuracy)
        print('- MCC: %s' % self.stack_model_test_mcc)
        print('- F1 score: %s' % self.stack_model_test_f1)
        
    def stacking_result(self):
        acc_train_list = {'knn':self.knn_train_accuracy,
        'svm_rbf': self.svc_train_accuracy,
        'dt': self.dt_train_accuracy,
        'rf': self.rf_train_accuracy,
        'stack': self.stack_model_train_accuracy}

        mcc_train_list = {'knn':self.knn_train_mcc,
        'svm_rbf': self.svc_train_mcc,
        'dt': self.dt_train_mcc,
        'rf': self.rf_train_mcc,
        'stack': self.stack_model_train_mcc}

        f1_train_list = {'knn':self.knn_train_f1,
        'svm_rbf': self.svc_train_f1,
        'dt': self.dt_train_f1,
        'rf': self.rf_train_f1,
        'stack': self.stack_model_train_f1}
        
        print("acc_train_list: ", acc_train_list)
        print("mcc_train_list: ", mcc_train_list)
        print("f1_train_list: ", f1_train_list)
        
    def predict(self,test_case):
        """
        Predict the dependent feature based on the input test case.
        
        Parameters
        ----------
        test_case: It is the independent variable list value. 
        
        Returns:
        ----------
        Returns the predicted feature. 
        """               
        logging.info('Prediction will be done for the testcase {}.'.format(test_case))
        try:
            return self.stack_model.predict(test_case)
        except Exception as e:
            logging.error("{} occured while predicting dependent feature.".format(str(e)))
            return None
        
    def save_stacking_model(self,file_name):
        """
        Save the stacking model based on the input file name.
        
        Parameters
        ----------
        file_name: stacking model will be saved with this file name. 
        
        Returns:
        ----------
        None. 
        """  
        logging.info('Save stacking model into file: {}.'.format(file_name))
        try:
            pickle.dump(self.stack_model,open(file_name,'wb'))
        except Exception as e:
            logging.error("{} occured while saving stacking model.".format(str(e)))        

In [3]:
stacking_obj = Human_activity_recognition_stacking('../Datasets_Healthy_Older_People')

In [4]:
stacking_obj.prepare_datset()

In [5]:
stacking_obj.load_dataset()

In [6]:
inp_df = stacking_obj.df
pf = stacking_obj.create_profile_report(inp_df)
pf.to_widgets()
pf.to_file("har_profiling.html")

Summarize dataset:   0%|          | 0/17 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
stacking_obj.handle_outlier()

In [8]:
stacking_obj.standard_scaling()

In [9]:
stacking_obj.train_test_split(0.2,144)

In [10]:
stacking_obj.base_model_fit()

In [11]:
stacking_obj.base_evaluate_model()

Model performance for KNN Training set
- Accuracy: 0.991715057143914
- MCC: 0.9810695983862158
- F1 score: 0.9917202798951545
----------------------------------
Model performance for KNN Test set
- Accuracy: 0.9824691175382795
- MCC: 0.9594049104448268
- F1 score: 0.9822741623484657
Model performance for SVC Training set
- Accuracy: 0.9420054000073973
- MCC: 0.8677342385213735
- F1 score: 0.9351528048199231
----------------------------------
Model performance for SVC Test set
- Accuracy: 0.9411938752866337
- MCC: 0.8642839486455386
- F1 score: 0.9342244931186673
Model performance for Decision Tree Training set
- Accuracy: 0.9937862928579354
- MCC: 0.985793836328655
- F1 score: 0.9937551438705274
----------------------------------
Model performance for Decision Tree Test set
- Accuracy: 0.9834307271247873
- MCC: 0.9616227001706041
- F1 score: 0.983168345929421
Model performance for Random Forest Training set
- Accuracy: 0.9551170618041942
- MCC: 0.8973757551296827
- F1 score: 0.95149827

In [12]:
stacking_obj.build_stack_model()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Model performance for Stack Model Training set
- Accuracy: 0.9934904020416466
- MCC: 0.9851159000644004
- F1 score: 0.9934617602359758
----------------------------------
Model performance for Stack Model Test set
- Accuracy: 0.9838745469339448
- MCC: 0.9626287075766038
- F1 score: 0.983582261970428


In [13]:
stacking_obj.stacking_result()

acc_train_list:  {'knn': 0.991715057143914, 'svm_rbf': 0.9420054000073973, 'dt': 0.9937862928579354, 'rf': 0.9551170618041942, 'stack': 0.9934904020416466}
mcc_train_list:  {'knn': 0.9810695983862158, 'svm_rbf': 0.8677342385213735, 'dt': 0.985793836328655, 'rf': 0.8973757551296827, 'stack': 0.9851159000644004}
f1_train_list:  {'knn': 0.9917202798951545, 'svm_rbf': 0.9351528048199231, 'dt': 0.9937551438705274, 'rf': 0.951498272857258, 'stack': 0.9934617602359758}


In [14]:
testcase = [stacking_obj.x_test[700]]
print(testcase)
print(stacking_obj.predict(testcase))

[array([-0.61766687,  1.38984107,  0.80791472])]
[4]


In [15]:
stacking_obj.save_stacking_model('stack_model')