In [4]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.metrics import accuracy_score , confusion_matrix , roc_auc_score , roc_curve, classification_report
from pandas_profiling import ProfileReport
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
import pickle
import os
import csv
import logging
from pandas_profiling import ProfileReport

In [5]:
class Human_activity_recognition_dt(object):
    def __init__(self,dir_path):
        self.dir_path = dir_path
        logging.basicConfig(filename='dt.log', level=logging.DEBUG,
                    format='%(asctime)s:%(levelname)s:%(message)s')
        logging.info('Human_activity_recognition_dt class object is created.')
        
    def prepare_datset(self):
        """
        Create a final csv-'merge.csv'from the directory folder to be used as dataframe for later stage.
        
        
        Parameters
        ----------
        None
        
        Returns:
        ----------
        None        
        """   
        logging.info('Dataset preparation started from the raw data.') 
        try:
            # assign directory
            directory = self.dir_path

            # iterate over files in
            # that directory
            folder_file_dict = {}
            for filename in os.listdir(directory):
                f = os.path.join(directory, filename)
                # checking if it is not a file
                if not os.path.isfile(f):
                    file_list = [os.path.join(f, sub_filename) for sub_filename in os.listdir(f) if sub_filename != "README.txt"]
                    folder_file_dict[filename] = file_list      
            header = []
            df_list = []
            for key in folder_file_dict:
                for file in folder_file_dict[key]:
                    with open(file, "r", encoding="shift_jis", errors="", newline="" ) as f:
                        lst = csv.reader(f, delimiter=",")
                        df = pd.DataFrame(lst)
                        df.drop(df.columns[[0,4,5,6,7]], axis=1, inplace =True)
                        df_list.append(df)
                merged_df = pd.concat(df_list)
                merged_df.columns = ["frontal_axis_reading(g)","vertical_axis_reading(g)","lateral_axis_reading(g)","activity"]
                merged_df.to_csv('merged.csv', index=None, header=True)
        except Exception as e:
            logging.error("{} occured while creating datasets from the raw data.".format(str(e)))               
            
    def load_dataset(self):
        """
        Load csv file as pandas dataframe.
        
        
        Parameters
        ----------
        None
        
        Returns:
        ----------
        None        
        """
        logging.info('Dataset is getting loaded as pandas dataframe.')
        try:        
            self.df = pd.read_csv("merged.csv") 
            self.df.drop(['time','Unnamed: 8'], axis=1, inplace=True)
        except FileNotFoundError:
            logging.error("File not found: exception occured while loading csv as pandas dataframe.")
        except pd.errors.EmptyDataError:
            logging.error("No data: exception occured while loading csv as pandas dataframe.")
        except pd.errors.ParserError:
            logging.errornt("Parse error: exception occured while loading csv as pandas dataframe.")
        except Exception as e:
            logging.error("{} occured while loading csv as pandas dataframe.".format(str(e)))
            
    def create_profile_report(self,inp_df):
        """
        Create pandas profile report for the input data frame.
        
        
        Parameters
        ----------
        inp_df: Input data frame.
        
        Returns:
        ----------
        None        
        """    
        logging.info('Profile reporting started for dataframe.')
        return ProfileReport(inp_df)
    
    def handle_outlier(self):
        """
        remove outliers for the impacted feature columns.


        Parameters
        ----------
        None

        Returns:
        ----------
        None        
        """
        logging.info('Outliers are getting removed.')
        q = self.df['lateral_axis_reading(g)'].quantile(.90)
        self.df_new = self.df[self.df['lateral_axis_reading(g)'] < q]
        
    def standard_scaling(self):
        """
        Perform standard scaling on input dataframe.
        
        
        Parameters
        ----------
        None
        
        Returns:
        ----------
        None        
        """      
        logging.info('Standard scalling started for feature columsn.')
        self.y = self.df_new['activity']
        self.x = self.df_new.drop(columns=['activity'])
        scalar = StandardScaler()
        self.x_scaled = scalar.fit_transform(self.x)
        self.df_new_scalar = pd.DataFrame(scalar.fit_transform(self.df_new))
        
    def train_test_split(self, test_size, random_state):
        """
        Split data frame into train and test.
         
        Parameters
        ----------
        None
        
        Returns:
        ----------
        Train and test data for feature and predicted columns.        
        """
        logging.info('train and test split for dataframe started.')
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x_scaled , self.y , test_size = test_size , random_state = random_state)
        
    def hyperparam_tuning_fit(self):        
        dt = DecisionTreeClassifier()
        grid_param = {"criterion" : ['gini','entropy'], 
                      "splitter" : ['best','random'],
                      "max_depth" : range(2,40,1),
                      "min_samples_split" : range(2,10,1),
                      "min_samples_leaf" : range(1,10,1),
                      "ccp_alpha" : np.random.rand(20)
                     }

        # defining parameter range
        self.grid = GridSearchCV(estimator = dt, param_grid = grid_param, n_jobs = -1, cv = 5, refit = True, verbose = 3)
        
        # fitting the model for grid search
        self.dt_model = self.grid.fit(self.x_train, self.y_train)

        # print best parameter after tuning
        print(self.grid.best_params_)

        # print how our model looks after hyper-parameter tuning
        print(self.grid.best_estimator_)
        
    def evaluate_model(self):
        """
        Calculate the classification score.
        
        Parameters
        ----------
        None. 
        
        Returns:
        ----------
        None. 
        """        
        grid_predictions = self.grid.predict(self.x_test)
        
        accuracy = accuracy_score(self.y_test, grid_predictions)
        report = classification_report(self.y_test, grid_predictions)
        cm = confusion_matrix(self.y_test, grid_predictions)

        print("Classification report:")
        print("Accuracy: ", accuracy)
        print(report)
        print("Confusion matrix:")
        print(cm)        
        
    def predict(self,test_case):
        """
        Predict the dependent feature based on the input test case.
        
        Parameters
        ----------
        test_case: It is the independent variable list value. 
        
        Returns:
        ----------
        Returns the predicted feature. 
        """               
        logging.info('Prediction will be done for the testcase {}.'.format(test_case))
        try:
            return self.grid.predict(test_case)
        except Exception as e:
            logging.error("{} occured while predicting dependent feature.".format(str(e)))
            return None
        
    def save_dt_model(self,file_name):
        """
        Save the dt model based on the input file name.
        
        Parameters
        ----------
        file_name: dt model will be saved with this file name. 
        
        Returns:
        ----------
        None. 
        """  
        logging.info('Save dt model into file: {}.'.format(file_name))
        try:
            pickle.dump(self.dt_model,open(file_name,'wb'))
        except Exception as e:
            logging.error("{} occured while saving dt model.".format(str(e)))        

In [6]:
dt_obj = Human_activity_recognition_dt('../Datasets_Healthy_Older_People')

In [7]:
dt_obj.prepare_datset()

In [8]:
dt_obj.load_dataset()

In [9]:
inp_df = dt_obj.df
dt_obj.create_profile_report(inp_df)

AttributeError: 'Human_activity_recognition_dt' object has no attribute 'df'

In [None]:
dt_obj.handle_outlier()

In [None]:
dt_obj.standard_scaling()

In [None]:
dt_obj.train_test_split(0.2,144)

In [None]:
dt_obj.hyperparam_tuning_fit()

In [None]:
dt_obj.evaluate_model()

In [None]:
testcase = [dt_obj.x_test[0]]
print(testcase)
print(dt_obj.predict(testcase))

In [None]:
dt_obj.save_dt_model('dt_model')