In [1]:
import pandas as pd 
import numpy as np
from numpy import mean
from numpy import std
from sklearn.preprocessing import StandardScaler, LabelEncoder 
from sklearn.linear_model  import Ridge,Lasso,RidgeCV, LassoCV, ElasticNet, ElasticNetCV, LogisticRegression
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from statsmodels.stats.outliers_influence import variance_inflation_factor 
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
import seaborn as sns
import pickle
import os
import csv
import logging
from matplotlib import pyplot

In [2]:
class Logisticregression(object):
    def __init__(self,dir_path):
        self.dir_path = dir_path
        logging.basicConfig(filename='logistic_regression.log', level=logging.DEBUG,
                    format='%(asctime)s:%(levelname)s:%(message)s')
        logging.info('Logisticregression class object is created.')
        
    def prepare_datset(self):
        """
        Create a final csv-'merge.csv'from the directory folder to be used as dataframe for later stage.
        
        
        Parameters
        ----------
        None
        
        Returns:
        ----------
        None        
        """   
        logging.info('Dataset preparation started from the raw data.')
        try:
            # assign directory
            directory = self.dir_path

            # iterate over files in
            # that directory
            folder_file_dict = {}
            for filename in os.listdir(directory):
                f = os.path.join(directory, filename)
                # checking if it is not a file
                if not os.path.isfile(f):
                    file_list = [os.path.join(f, sub_filename) for sub_filename in os.listdir(f)]
                    folder_file_dict[filename] = file_list

            for key in folder_file_dict:
                for file in folder_file_dict[key]:
                    if file == r"AReM\bending2\dataset4.csv":
                        with open(file, "r", encoding="shift_jis", errors="", newline="" ) as f:
                            lst = csv.reader(f)
                            new_rows_list = []
                            for row in lst:
                                new_row = row[0].replace(" ",",")
                                print(new_row)
                                new_rows_list.append(new_row)
                        with open(file, "w", encoding="shift_jis", errors="", newline="" ) as f:
                            writer = csv.writer(f)
                            for row in new_rows_list:
                                columns = [c.strip() for c in row.strip(', ').split(',')]
                                writer.writerow(columns)                
                        break

            header = []
            df_list = []
            for key in folder_file_dict:
                for file in folder_file_dict[key]:
                    with open(file, "r", encoding="shift_jis", errors="", newline="" ) as f:
                        lst = csv.reader(f, delimiter=",")
                        for i,row in enumerate(lst):
                            if i==4:
                                if header == []:
                                    temp_col = row[0].replace("# Columns: ","")
                                    row[0] = temp_col
                                    header = row
                                break
                        df = pd.DataFrame(lst)
                        new_row = pd.DataFrame(header)
                        df = pd.concat([new_row.T, df])
                        new_file = file.replace('.csv','_new.csv')
                        new_header = df.iloc[0] #grab the first row for the header
                        df = df[1:] #take the data less the header row
                        df.columns = new_header #set the header row as the df header
                        df['lable']=key
            #             df.to_csv(new_file, sep=",", header = True,index=False)
                        df_list.append(df)
            merged = pd.concat(df_list)
            merged.to_csv('merged.csv', index=None, header=True) 
        except Exception as e:
            logging.error("{} occured while creating datasets from the raw data.".format(str(e)))            
        
    def load_dataset(self):
        """
        Load csv file as pandas dataframe.
        
        
        Parameters
        ----------
        None
        
        Returns:
        ----------
        None        
        """
        logging.info('Dataset is getting loaded as pandas dataframe.')
        try:        
            self.df = pd.read_csv("merged.csv") 
            self.df.drop(['time','Unnamed: 8'], axis=1, inplace=True)
        except FileNotFoundError:
            logging.error("File not found: exception occured while loading csv as pandas dataframe.")
        except pd.errors.EmptyDataError:
            logging.error("No data: exception occured while loading csv as pandas dataframe.")
        except pd.errors.ParserError:
            logging.errornt("Parse error: exception occured while loading csv as pandas dataframe.")
        except Exception as e:
            logging.error("{} occured while loading csv as pandas dataframe.".format(str(e)))
            
    def labelencode_y(self):
        """
        Replace the values of the 'lable' column of ``self.df`` with integer ids.

        Parameters
        ----------
        None

        Returns:
        ----------
        None. ``self.df['lable']`` is overwritten with the encoded values; any
        exception is logged and swallowed.
        """
        logging.info('Label encoding started for categorical features.')
        try:
            encoder = LabelEncoder()
            self.df['lable'] = encoder.fit_transform(self.df['lable'])
        except Exception as err:
            logging.error("{} occured while label encoding categorical column.".format(str(err)))
        
    def create_profile_report(self,inp_df):
        """
        Build a pandas-profiling report for the given data frame.

        Parameters
        ----------
        inp_df: Input data frame.

        Returns:
        ----------
        The ProfileReport object created from inp_df.
        """
        logging.info('Profile reporting started for dataframe.')
        report = ProfileReport(inp_df)
        return report
        
    def handle_outlier(self):
        """
        remove outliers for the impacted feature columns.
        
        
        Parameters
        ----------
        None
        
        Returns:
        ----------
        None        
        """
        logging.info('Outliers are getting removed.')
        q = self.df['var_rss12'].quantile(.99)
        self.df_new = self.df[self.df['var_rss12'] < q]

        q = self.df_new['avg_rss13'].quantile(.99)
        self.df_new = self.df_new[self.df_new['avg_rss13'] < q]   

        q = self.df_new['var_rss13'].quantile(.95)
        self.df_new = self.df_new[self.df_new['var_rss13'] < q]

        q = self.df_new['avg_rss23'].quantile(.95)
        self.df_new = self.df_new[self.df_new['avg_rss23'] < q]

        q = self.df_new['var_rss23'].quantile(.95)
        self.df_new = self.df_new[self.df_new['var_rss23'] < q]

        q = self.df_new['avg_rss12'].quantile(.60)
        self.df_new = self.df_new[self.df_new['avg_rss12'] < q] 
        
    def standard_scaling(self):
        """
        Split ``self.df_new`` into target and features and standard-scale it.

        Sets ``self.y`` (the 'lable' column), ``self.x`` (all other columns),
        ``self.x_scaled`` (scaled features) and ``self.df_new_scalar`` (the whole
        frame, 'lable' included, scaled and wrapped in a DataFrame).

        Parameters
        ----------
        None

        Returns:
        ----------
        None
        """
        logging.info('Standard scalling started for feature columsn.')
        frame = self.df_new
        self.y = frame['lable']
        self.x = frame.drop(columns=['lable'])
        scaler = StandardScaler()
        self.x_scaled = scaler.fit_transform(self.x)
        # The same scaler object is re-fitted here, this time on the full frame.
        whole_frame_scaled = scaler.fit_transform(frame)
        self.df_new_scalar = pd.DataFrame(whole_frame_scaled)
        
    def vif_score(self):
        """
        Compute the variance inflation factor of every scaled feature column.

        Parameters
        ----------
        None

        Returns:
        ----------
        Dataframe with one row per feature and the columns "FEATURE" and
        "VIF_SCORE". None if an exception occurred (it is logged).
        """
        logging.info('Vif score is calculation is in progress.')
        try:
            matrix = self.x_scaled
            rows = []
            for position in range(matrix.shape[1]):
                # Column names come from the unscaled frame; positions line up with x_scaled.
                rows.append([self.x.columns[position], variance_inflation_factor(matrix, position)])
            return pd.DataFrame(rows, columns=["FEATURE", "VIF_SCORE"])
        except Exception as err:
            logging.error("{} occured while vif score calculation.".format(str(err)))
    
    def drop_multicolinearity_by_vif(self, vif_thresh):
        """
        Repeatedly drop the scaled feature with the highest VIF until no
        remaining feature exceeds ``vif_thresh`` (or only one feature is left).

        Parameters
        ----------
        vif_thresh: This is the threshold VIF value above which dataset column will be dropped.

        Returns:
        ----------
        None. The surviving scaled features are stored in ``self.final_df``;
        any exception is logged and swallowed.
        """
        logging.info('All features with VIF more than {} will be dropped from the dataset.'.format(vif_thresh))
        try:
            # standard_scaling() stores the raw fit_transform output in x_scaled, which
            # normally carries no column labels, so re-attach those of self.x.
            if isinstance(self.x_scaled, pd.DataFrame):
                X = self.x_scaled
            else:
                X = pd.DataFrame(self.x_scaled, columns=self.x.columns, index=self.x.index)
            variables = list(X.columns)
            # predicted_col is never set in this class; honour it only if a caller set it.
            protected_col = getattr(self, 'predicted_col', None)

            dropped = True
            # A VIF needs at least one other regressor, so stop at a single feature.
            while dropped and len(variables) > 1:
                dropped = False
                values = X[variables].values
                vif = [variance_inflation_factor(values, ix) for ix in range(len(variables))]

                max_vif = max(vif)
                maxloc = vif.index(max_vif)
                if max_vif > vif_thresh and variables[maxloc] != protected_col:
                    # No explicit timestamp: the logging format already includes asctime.
                    logging.info("dropping '{}' at index: {}".format(variables[maxloc], maxloc))
                    variables.pop(maxloc)
                    dropped = True

            logging.info('Remaining variables:')
            logging.info([variables])
            self.final_df = X[variables]
        except Exception as e:
            logging.error("{} occured while droping some of the feature from dataset based on vif threshold.".format(str(e)))
            
    
    def train_test_split(self, test_size, random_state):
        """
        Split the scaled features and the target into train and test parts.

        Parameters
        ----------
        test_size: Forwarded as ``test_size`` to the module-level train_test_split.
        random_state: Forwarded as ``random_state`` to the module-level train_test_split.

        Returns:
        ----------
        None. The parts are stored in ``self.x_train``, ``self.x_test``,
        ``self.y_train`` and ``self.y_test``.
        """
        logging.info('train and test split for dataframe started.')
        # Inside the method body the bare name resolves to the imported function, not to this method.
        parts = train_test_split(self.x_scaled, self.y, test_size=test_size, random_state=random_state)
        self.x_train, self.x_test, self.y_train, self.y_test = parts
        
    # get a list of models to evaluate
    def get_models(self):
        """
        Create one logistic regression instance per (C, penalty, solver) combination.

        Only combinations the solver supports are created: 'lbfgs', 'newton-cg'
        and 'sag' get the 'l2' penalty, 'saga' gets 'l1', 'l2' and 'elasticnet'.
        A C of 0.0 stands for "no penalty" and yields one un-penalised model per
        solver.

        Parameters
        ----------
        None

        Returns:
        ----------
        Dictionary mapping "<C>_<penalty>_<solver>" to the model instance, or
        None if an exception occurred (it is logged).
        """
        logging.info('various logitic regression model instances are getting created based on hyper parameters.')
        try:
            models = dict()
            p = [0.0, 0.0001, 0.001, 0.01, 0.1, 1.0]
            l1_ratio = 0.5  # L1 weight in the Elastic-Net regularization
            # Explicit support table. The previous test `k == "lbfgs" or "newton-cg"` was
            # always true, so the l1 and elasticnet models were never created.
            supported_penalties = {
                "lbfgs": ['l2'],
                "newton-cg": ['l2'],
                "sag": ['l2'],
                "saga": ['l1', 'l2', 'elasticnet'],
            }

            for i in p:
                for k, penalties in supported_penalties.items():
                    if i == 0.0:
                        # no penalty in this case
                        # NOTE(review): newer scikit-learn releases spell this penalty=None;
                        # the string form used by this file is kept - confirm the installed version.
                        key = str(i) + '_' + 'none' + '_' + k
                        models[key] = LogisticRegression(multi_class='multinomial', solver=k, penalty='none')
                        continue
                    for j in penalties:
                        # create name for model
                        key = str(i) + '_' + j + '_' + k
                        if j == 'elasticnet':
                            models[key] = LogisticRegression(multi_class='multinomial', solver=k, penalty=j, C=i, l1_ratio=l1_ratio, tol=0.01)
                        else:
                            models[key] = LogisticRegression(multi_class='multinomial', solver=k, penalty=j, C=i)

            return models
        except Exception as e:
            logging.error("{} occured while creating logistics regression model instances".format(str(e)))
        
    
    def fit_evaluate_model(self):
        """
        Fit every model from get_models() on the train split and score it on the test split.

        Parameters
        ----------
        None

        Returns:
        ----------
        Tuple of five dictionaries: accuracy, roc_auc, precision, recall and f1.
        The accuracy dictionary is keyed by the model name; the other four are
        keyed by the model name immediately followed by the averaging mode
        ('macro' or 'weighted'). None if an exception occurred (it is logged).
        """
        logging.info('Fitting all the models with train and test data and then evaluate with various parameters are in progress.')
        try:
            averages = ['macro', 'weighted']  # 'micro' is not allowed for multiclass problem
            # get the models to evaluate
            self.models = self.get_models()
            # evaluate the models and store results
            roc_auc_score_dict = dict()
            precision_score_dict = dict()
            recall_score_dict = dict()
            accuracy_score_dict = dict()
            f1_score_dict = dict()

            # Iterate the models stored on the instance. The previous code read a
            # notebook-level variable named `models`, which only existed if another
            # cell had been run first.
            for name, model in self.models.items():
                model.fit(self.x_train, self.y_train)
                y_pred = model.predict(self.x_test)
                y_pred_proba = model.predict_proba(self.x_test)

                accuracy_score_dict[name] = accuracy_score(self.y_test, y_pred)

                for j in averages:
                    roc_auc_score_dict[name+j] = roc_auc_score(self.y_test, y_pred_proba, multi_class='ovr', average=j)
                    precision_score_dict[name+j] = precision_score(self.y_test, y_pred, average=j)
                    recall_score_dict[name+j] = recall_score(self.y_test, y_pred, average=j)
                    f1_score_dict[name+j] = f1_score(self.y_test, y_pred, average=j)

            return accuracy_score_dict, roc_auc_score_dict, precision_score_dict, recall_score_dict, f1_score_dict
        except Exception as e:
            logging.error("{} occured while fitting and evaluating models".format(str(e)))
           
    # evaluate a give model using cross-validation
    def evaluate_model_using_cv(self, model, X, y):
        """
        Score a model with repeated stratified k-fold cross validation.

        Parameters
        ----------
        model: Estimator passed to cross_val_score.
        X: Feature data passed to cross_val_score.
        y: Target data passed to cross_val_score.

        Returns:
        ----------
        The accuracy scores returned by cross_val_score (10 splits, 3 repeats),
        or None if an exception occurred (it is logged).
        """
        logging.info('Evaluating model using cross validation started.')
        try:
            # Fixed random_state so the folds are the same on every call.
            folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
            return cross_val_score(model, X, y, scoring='accuracy', cv=folds, n_jobs=-1)
        except Exception as err:
            logging.error("{} occured while evaluating model using cross validation".format(str(err)))

In [3]:
# 'AReM' is the folder (relative to the working directory) holding one sub-folder per label.
logi_regr = Logisticregression(dir_path='AReM')

In [4]:
# Merge the raw csv files found under the data directory into merged.csv.
logi_regr.prepare_datset()

#,Task:,bending2
#,Frequency,(Hz):,20
#,Clock,(millisecond):,250
#,Duration,(seconds):,120
#,Columns:,time
0,32.50,0.50,0.00,0.00,19.00,1.00,
250,32.50,0.50,0.00,0.00,18.50,0.50,
500,32.75,0.43,1.00,0.00,18.00,0.00,
750,32.50,0.50,0.00,0.00,17.50,0.50,
1000,32.50,0.50,7.50,0.50,17.50,0.87,
1250,32.67,0.47,11.00,1.00,16.75,0.83,
1500,32.50,0.50,6.25,0.83,18.00,0.00,
1750,32.50,0.50,3.50,0.87,18.00,0.00,
2000,32.33,0.47,6.00,0.00,18.33,0.47,
2250,32.67,0.47,8.00,0.82,18.00,0.00,
2500,32.50,0.50,8.33,3.30,16.67,0.47,
2750,32.50,0.50,10.33,1.25,16.00,1.22,
3000,32.25,0.83,9.00,3.08,16.00,0.71,
3250,32.00,0.00,2.67,0.47,15.75,0.83,
3500,31.75,0.43,2.75,1.09,16.50,1.12,
3750,32.00,0.00,1.00,0.00,17.50,0.87,
4000,32.25,0.83,0.00,0.00,17.50,0.50,
4250,32.25,0.43,0.00,0.00,17.00,1.00,
4500,31.75,0.43,5.33,3.30,17.75,0.43,
4750,31.50,0.50,9.50,0.50,17.75,0.43,
5000,31.50,0.50,5.00,2.16,17.33,0.47,
5250,31.50,0.50,4.50,0.50,17.50,0.50,
5500,31.67,0.47,4.50,1.12,18.00,0.00,
5750,31.50,0.50,1.67,0.

In [5]:
# Read merged.csv into logi_regr.df.
logi_regr.load_dataset()

In [6]:
# Replace the values of the 'lable' column with integer ids.
logi_regr.labelencode_y()

In [None]:
# Profile the loaded, label-encoded frame (before outlier removal and scaling).
inp_df = logi_regr.df
logi_regr.create_profile_report(inp_df)

Summarize dataset:   0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
# Keep only rows below the per-column quantile cut-offs; the result is logi_regr.df_new.
logi_regr.handle_outlier()

In [None]:
# Standard-scale the feature columns of df_new; sets logi_regr.x, .y and .x_scaled.
logi_regr.standard_scaling()

In [None]:
# Display the variance inflation factor of every scaled feature.
logi_regr.vif_score()

In [None]:
# 10 is the VIF threshold above which a feature is dropped.
logi_regr.drop_multicolinearity_by_vif(10)

In [None]:
# Split the scaled data into train and test parts, with a fixed seed.
logi_regr.train_test_split(test_size=0.2, random_state=144)

In [None]:
# get the models to evaluate: a dict keyed by "<C>_<penalty>_<solver>"
models = logi_regr.get_models()

In [None]:
# Fit and score every model on the train/test split.
# NOTE: fit_evaluate_model() returns None when it hits an error (see logistic_regression.log),
# in which case this unpacking raises TypeError.
accuracy_score_dict, roc_auc_score_dict, precision_score_dict, recall_score_dict, f1_score_dict = logi_regr.fit_evaluate_model()

In [None]:
# Accuracy keys look like "<C>_<penalty>_<solver>".
for name, accuracy in accuracy_score_dict.items():
    hyper_param = name.split('_')
    print('\nc={}, penalty={}, solver={}-->Accuracy: {:.3f}\n'.format(*hyper_param[:3], accuracy))

In [None]:
# Keys are "<C>_<penalty>_<solver><average>": the averaging mode is glued to the
# solver name, so split it off instead of printing e.g. "solver=lbfgsmacro".
for name, roc_auc in roc_auc_score_dict.items():
    hyper_param = name.split('_')
    solver = hyper_param[2]
    average = next((avg for avg in ('macro', 'weighted') if solver.endswith(avg)), '')
    if average:
        solver = solver[:-len(average)]
    print('\nc={}, penalty={}, solver={}, average={}-->roc_auc_score: {:.3f}\n'.format(hyper_param[0], hyper_param[1], solver, average, roc_auc))

In [None]:
# Keys are "<C>_<penalty>_<solver><average>": the averaging mode is glued to the
# solver name, so split it off instead of printing e.g. "solver=lbfgsmacro".
for name, precision in precision_score_dict.items():
    hyper_param = name.split('_')
    solver = hyper_param[2]
    average = next((avg for avg in ('macro', 'weighted') if solver.endswith(avg)), '')
    if average:
        solver = solver[:-len(average)]
    print('\nc={}, penalty={}, solver={}, average={}-->precision_score: {:.3f}\n'.format(hyper_param[0], hyper_param[1], solver, average, precision))

In [None]:
# Keys are "<C>_<penalty>_<solver><average>": the averaging mode is glued to the
# solver name, so split it off instead of printing e.g. "solver=lbfgsmacro".
for name, recall in recall_score_dict.items():
    hyper_param = name.split('_')
    solver = hyper_param[2]
    average = next((avg for avg in ('macro', 'weighted') if solver.endswith(avg)), '')
    if average:
        solver = solver[:-len(average)]
    print('\nc={}, penalty={}, solver={}, average={}-->recall_score: {:.3f}\n'.format(hyper_param[0], hyper_param[1], solver, average, recall))

In [None]:
# Keys are "<C>_<penalty>_<solver><average>": the averaging mode is glued to the
# solver name, so split it off instead of printing e.g. "solver=lbfgsmacro".
for name, f1 in f1_score_dict.items():
    hyper_param = name.split('_')
    solver = hyper_param[2]
    average = next((avg for avg in ('macro', 'weighted') if solver.endswith(avg)), '')
    if average:
        solver = solver[:-len(average)]
    print('\nc={}, penalty={}, solver={}, average={}-->f1_score: {:.3f}\n'.format(hyper_param[0], hyper_param[1], solver, average, f1))

In [None]:
# Cross-validate every candidate model and compare the accuracy distributions.
# The default figure size must be set BEFORE the figure is created; setting it
# after plotting (as before) had no effect on this figure.
pyplot.rcParams["figure.figsize"] = (10,10)

results, names = list(), list()
for name, model in models.items():
    # evaluate the model and collect the scores
    scores = logi_regr.evaluate_model_using_cv(model, logi_regr.x_scaled , logi_regr.y)
    if scores is None:
        # evaluate_model_using_cv logs the error and returns None on failure
        print('>%s skipped (cross validation failed, see log)' % name)
        continue
    # store the results
    results.append(scores)
    names.append(name)
    # summarize progress along the way
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

# plot model performance for comparison
if results:
    fig, ax = pyplot.subplots()
    ax.boxplot(results, labels=names, showmeans=True)
    ax.set_ylabel('accuracy')
    fig.autofmt_xdate()
    fig.tight_layout()
    pyplot.show()