# Clone Repository & Install Packages, just for google colab

## Clone

In [51]:
!git clone https://github.com/Ruhallah93/Driver-Identification.git

fatal: destination path 'Driver-Identification' already exists and is not an empty directory.


## Prepare

In [52]:
!pip install tsfel



In [53]:
!pip install skfeature-chappers



In [54]:
!pip install data-complexity



In [55]:
!pip install pingouin



# Libraries

In [56]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import pywt
import tsfel
%matplotlib inline
import os
import seaborn as sns
from scipy import stats
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import mutual_info_score
from skfeature.function.similarity_based import fisher_score
from dcm import dcm
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import LeaveOneOut
from scipy.sparse.csgraph import minimum_spanning_tree
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

from scipy.spatial import distance

import warnings
warnings.filterwarnings('ignore')

# Dataset

In [76]:
import os
from datetime import datetime, timedelta
import re
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sn
import math
from sklearn import preprocessing
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report


class Utils:

    def __init__(self, sample_rate):
        self.all_features = ['x-accelerometer', 'y-accelerometer', 'z-accelerometer',
                             'GRAVITY X (m/s²)', 'GRAVITY Y (m/s²)', 'GRAVITY Z (m/s²)',
                             'LINEAR ACCELERATION X (m/s²)', 'LINEAR ACCELERATION Y (m/s²)',
                             'LINEAR ACCELERATION Z (m/s²)',
                             'x-gyroscope', 'y-gyroscope', 'z-gyroscope',
                             'LIGHT (lux)',
                             'MAGNETIC FIELD X (μT)', 'MAGNETIC FIELD Y (μT)', 'MAGNETIC FIELD Z (μT)',
                             'ORIENTATION Z (azimuth °)', 'ORIENTATION X (pitch °)', 'ORIENTATION Y (roll °)',
                             'LOCATION Latitude : ',
                             'LOCATION Longitude : ',
                             'LOCATION Altitude ( m)',
                             'LOCATION Altitude-google ( m)',
                             'LOCATION Speed ( Kmh)',
                             'LOCATION Accuracy ( m)',
                             'LOCATION ORIENTATION (°)',
                             'Satellites in range',
                             'Time since start in ms',
                             'timestamp']
        self.show_toolbar = True
        self.sample_rate = sample_rate
        self.data_split_ratio = 0.7

    def read_data(self, db_path_prefix, file_name, features, driver_i):
        db_path = db_path_prefix + file_name
        db_new_path = db_path_prefix + "cleaned_by_acc/"

        if os.path.exists(db_new_path + file_name):
            data = pd.read_csv(db_new_path + file_name, low_memory=False)
            original_size = pd.read_csv(db_path, low_memory=False).shape[0]
            driving_size = data.shape[0]
            stay_size = original_size - driving_size
        else:
            data = pd.read_csv(db_path, low_memory=False)
            data.columns = self.all_features

            original_size = data.shape[0]

            def aggregate(point1, point2):
                return math.sqrt(math.pow(point1[0] - point2[0], 2) +
                                 math.pow(point1[1] - point2[1], 2) +
                                 math.pow(point1[2] - point2[2], 2))

            staypoints = [0]
            points_acc = list(zip(data['x-accelerometer'], data['y-accelerometer'], data['z-accelerometer']))
            for j in range(0, len(points_acc) - 12):
                node = points_acc[j]
                add = True
                for j2 in range(j + 1, j + 12):
                    if aggregate(node, points_acc[j2]) > 0.5:
                        add = False
                if add:
                    staypoints.append(j)

            if not staypoints.__contains__(len(points_acc) - 1):
                staypoints.append(len(points_acc) - 1)

            stay_size = len(staypoints)

            data = data.drop(index=data['x-accelerometer'][staypoints].index)

            driving_size = data.shape[0]

            if not os.path.exists(db_new_path):
                os.makedirs(db_new_path)
            data.to_csv(db_new_path + file_name, index=False)

        data = data[features]

        # data = data.fillna(data.mean())
        data = data.dropna()

        clean_driving_size = data.shape[0]

        template = "{0:20}{1:20}{2:20}{3:20}{4:20}"
        if self.show_toolbar:
            self.show_toolbar = False
            print(template.format("driver_id: ", "original_size: ", "stay_size: ", "driving_size: ",
                                  "cleaned_driving_size: "))
        print(template.format(str(driver_i),
                              str(timedelta(seconds=int(original_size / self.sample_rate))),
                              str(timedelta(seconds=int(stay_size / self.sample_rate))),
                              str(timedelta(seconds=int(driving_size / self.sample_rate))),
                              str(timedelta(seconds=int(clean_driving_size / self.sample_rate)))))

        return self.split_to_train_test(data)

    def split_to_train_test(self, data):
        return data[:int(len(data) * self.data_split_ratio)], data[int(len(data) * self.data_split_ratio):]

    def save_result(self, saving_path, result, data, running_time):
        if not os.path.exists(saving_path):
            os.makedirs(saving_path)
        
        # Save to file
        with open(saving_path + 'statistics.txt', 'a') as f:
            f.write('\n==========***==========\n' +
                    datetime.now().strftime("%Y:%m:%d %H:%M:%S") +
                    '\n' +
                    'running time :' + str(running_time.seconds) + " seconds" +
                    '\n')
            f.write(str(data))
            f.write('\n')
            f.write(str(result))
            f.write('\n')

# Segmentation & Feature Extraction

In [58]:
class TransferToHistogram:

    def __init__(self, window_size=90, window_ovrlap_size=45, num_bins=100):
        self.window_size = window_size
        self.overlapping = window_ovrlap_size
        self.num_bins = num_bins

    def transfer(self, dataset, features):
        print("segmenting data with " + str(len(dataset)) + " points")
        segments, labels = self.segment_signal(dataset, features)
        new_dataset = []
        print("making " + str(len(segments)) + " segments")
        win = 0
        for segment, label in zip(segments, labels):
            row = []
            win += 1
            for feature_i in range(len(segment[1])):
                segment_f = segment[0:, [feature_i]]
                mu = np.mean(dataset[features[feature_i]])
                sigma = np.std(dataset[features[feature_i]])
                r = (mu - sigma * 2, mu + sigma * 2)
                count, bins = np.histogram(segment_f, bins=self.num_bins, range=r)
                his = count
                row = np.append(row, his, axis=0)
            row = np.append(row, [label], axis=0)
            new_dataset.append(row)

        columns = []
        for feature in features:
            for i in range(self.num_bins):
                columns.append(feature + '-0_Histogram_' + str(i))
        columns.append('id')
        df = pd.DataFrame(new_dataset, columns=columns)
        return df

    def windows(self, data):
        start = 0
        while start < data.count():
            yield int(start), int(start + self.window_size)
            start += (self.window_size - self.overlapping)

    def segment_signal(self, dataset, features):
        segments = np.empty((0, self.window_size, len(features)))
        labels = np.empty((0))
        for class_i in np.unique(dataset["id"]):
            subset = dataset[dataset["id"] == class_i]
            subset = subset.reset_index(drop=True)
            for (start, end) in self.windows(subset["id"]):
                feature_slices = []
                for feature in features:
                    feature_slices.append(subset[feature][start:end])
                if len(feature_slices[0]) == self.window_size:
                    segments = np.vstack([segments, np.dstack(
                        [feature_slices[k] for k in range(len(feature_slices))])])
                    labels = np.append(labels, stats.mode(subset["id"][start:end])[0][0])
        return segments, labels

In [59]:
import os
import seaborn as sns
import numpy as np
import pandas as pd
from scipy import stats
import tsfel
import matplotlib.pyplot as plt


class TsfelFeatureExtractor:

    def __init__(self, window_size=90, window_ovrlap_size=45, num_bins=100, sampling=2, approach='all'):
        self.window_size = window_size
        self.overlapping = window_ovrlap_size
        self.num_bins = num_bins
        self.sampling = sampling
        self.approach = approach

        if self.approach != 'all':
            self.cfg = tsfel.get_features_by_domain(domain=self.approach)
        else:
            self.cfg = tsfel.get_features_by_domain()

        # # @title Feature Extraction
        # googleSheet_name = "/home/ruhiii/PycharmProjects/DriverIdentification/identification/Features_Tsfel.xlsx"
        # # Extract excel info
        # self.cfg = tsfel.extract_sheet(googleSheet_name)
        # self.cfg = {'statistical': {
        #     'Histogram': {'complexity': 'constant', 'description': 'Computes histogram of the signal.',
        #                   'parameters': {'nbins': 10, 'r': 1}, 'function': 'tsfel.hist', 'use': 'yes'}}}
        if self.approach == 'statistical' or self.approach == 'all':
            self.cfg['statistical']['Histogram']['parameters']['nbins'] = self.num_bins

    def transfer(self, dataset, features, using_2std=True):
        print("segmenting data with " + str(len(dataset)) + " rows")
        segments, labels = self.segment_signal(dataset, features)
        new_dataset = []
        print("making " + str(len(segments)) + " segments")
        win = 0
        for segment, label in zip(segments, labels):
            row = []
            win += 1
            for feature_i in range(len(segment[1])):
                segment_f = segment[0:, [feature_i]]

                mu = np.mean(dataset[features[feature_i]])
                sigma = np.std(dataset[features[feature_i]])
                if using_2std:
                    r = mu + sigma * 2
                else:
                    r = dataset[features[feature_i]].max()
                if self.approach == 'statistical':
                    self.cfg['statistical']['Histogram']['parameters']['r'] = r
                extracted = tsfel.time_series_features_extractor(self.cfg, segment_f.reshape(1, -1)[0],
                                                                 fs=self.sampling, verbose=0)
                row = np.append(row, extracted.iloc[0, :], axis=0)

            row = np.append(row, [label], axis=0)
            new_dataset.append(row)

        columns = []
        for feature in features:
            for i in extracted:
                columns.append(feature + '-' + i)
        columns.append('id')
        df = pd.DataFrame(new_dataset, columns=columns)
        return df

    def feature_normalize(self, feature):
        mu = np.mean(feature, axis=0)
        sigma = np.std(feature, axis=0)
        return (feature - mu) / sigma

    def windows(self, data):
        start = 0
        while start < data.count():
            yield int(start), int(start + self.window_size)
            start += (self.window_size - self.overlapping)

    def segment_signal(self, dataset, features):
        segments = np.empty((0, self.window_size, len(features)))
        labels = np.empty((0))
        for class_i in np.unique(dataset["id"]):
            subset = dataset[dataset["id"] == class_i]
            for (start, end) in self.windows(subset["id"]):
                feature_slices = []
                for feature in features:
                    feature_slices.append(subset[feature][start:end])
                if len(feature_slices[0]) == self.window_size:
                    segments = np.vstack([segments, np.dstack(
                        [feature_slices[k] for k in range(len(feature_slices))])])
                    labels = np.append(labels, stats.mode(subset["id"][start:end])[0][0])
        return segments, labels


# Augmenter

In [60]:
import pywt
import numpy as np
import pandas as pd
from keras.models import model_from_json


class Augmenter():
    def __init__(self, filepath, input_labels):
        self.input_labels = input_labels
        self.latent_dim = 100

        # load json and create model
        json_file = open(filepath + '.json', 'r')
        loaded_model_json = json_file.read()
        json_file.close()
        self.generator = model_from_json(loaded_model_json)
        # load weights into new model
        self.generator.load_weights(filepath + '.h5')
        print("Loaded model from disk")

    def generate(self, num_per_driver, features):
        generateds = self.generate_images(num_per_driver)

        Gdata = generateds[0].reshape(generateds[0].shape[0], generateds[0].shape[1], generateds[0].shape[2])
        X_generated = []
        # each generate windows
        for i in range(Gdata.shape[0]):
            cell = []
            # each axis
            for j in range(0, Gdata.shape[1], 2):
                cell.append(pywt.idwt(Gdata[i, j, :], Gdata[i, j + 1, :], 'db2', 'smooth').tolist())
            X_generated.append(cell)

        print(np.shape(X_generated))
        X_generated1 = np.array(X_generated)
        X_generated2 = X_generated1.swapaxes(1, 2)
        X_generated3 = X_generated2.reshape(X_generated2.shape[0] * X_generated2.shape[1], X_generated2.shape[2])
        y_generated = generateds[1].repeat(X_generated2.shape[1])
        Data_generated = pd.DataFrame(X_generated3, columns=features)
        Data_generated['id'] = y_generated.tolist()
        Data_generated['id'] = Data_generated['id'] + 201
        return Data_generated

    def generate_images(self, num):
        labels = np.array(self.input_labels)
        sampled_labels = labels.reshape(-1, 1)
        for i in range(num):
            noise = np.random.normal(0, 1, (len(labels), self.latent_dim))
            if i == 0:
                sampled_images = self.generator.predict([noise, sampled_labels])
                final_labels = labels
            else:
                sampled_images = np.concatenate((sampled_images, self.generator.predict([noise, sampled_labels])),
                                                axis=0)
                final_labels = np.concatenate((final_labels, labels), axis=0)
        return (sampled_images, final_labels)

# VARIABLES

In [77]:
def initialization(n_driver):
  global features
  features = ['x-accelerometer','y-accelerometer','z-accelerometer','x-gyroscope','y-gyroscope','z-gyroscope']
  global db_path_prefix
  db_path_prefix = 'Driver-Identification/dataset/'
  global sample_rate
  sample_rate = 2

  global window_size
  window_size = sample_rate * 60 * 15
  global overlapping
  overlapping = int(window_size * 0.75)

  global utils
  utils = Utils(sample_rate=sample_rate)

#SEGMENTATION

In [62]:
def read_data(drivers):
  global train_dataset
  train_dataset = pd.DataFrame()
  global test_dataset
  test_dataset = pd.DataFrame()
  global labels
  labels = []
  for i in drivers:
    labels.append(i-201)
    train_temp_dataset, test_temp_dataset = utils.read_data(db_path_prefix, str(i) + '.1.csv', features, i)
    train_temp_dataset['id'] = i
    test_temp_dataset['id'] = i
    train_dataset = pd.concat([train_dataset, train_temp_dataset])
    test_dataset = pd.concat([test_dataset, test_temp_dataset])

In [63]:
def Standardization(X_train, X_test):
  CX_train = X_train.copy()
  CX_test = X_test.copy()

  C = pd.concat([CX_train, CX_test])
  NC = preprocessing.scale(C)
  NCX_train = NC[: len(CX_train)]
  NCX_test = NC[len(CX_train):]

  return NCX_train, NCX_test

In [64]:
def data_standardization():
  global n_train_dataset
  global n_test_dataset
  n_train_dataset, n_test_dataset = Standardization(train_dataset.iloc[:, :-1], test_dataset.iloc[:, :-1])
  n_train_dataset = pd.DataFrame(n_train_dataset, columns=features)
  n_test_dataset = pd.DataFrame(n_test_dataset, columns=features)
  n_train_dataset['id'] = train_dataset.iloc[:, -1].tolist()
  n_test_dataset['id'] = test_dataset.iloc[:, -1].tolist()

In [65]:
def data_augmentation(b:bool):
  global an_train_dataset
  if b:
    augment_size_per_driver = 6 * 15
    generator = Augmenter("Driver-Identification/model/GAN-g-(4)20", labels)
    generated = generator.generate(augment_size_per_driver, features)
    n_generated = preprocessing.scale(generated.iloc[:, :-1])
    n_generated = pd.DataFrame(n_generated, columns=features)
    n_generated['id'] = generated.iloc[:, -1].tolist()
    an_train_dataset = pd.concat([n_train_dataset, n_generated])
  else:
    an_train_dataset = n_train_dataset

In [66]:
def feature_extraction(approach):
    global S_train
    global S_test
    if approach == 'histogram':
      featureExtractor = TransferToHistogram(window_size=window_size, 
                                             window_ovrlap_size=overlapping, 
                                             num_bins=100)
    else:
      featureExtractor = TsfelFeatureExtractor(window_size=window_size,
                                               window_ovrlap_size=overlapping,
                                               num_bins=100,
                                               approach=approach)
    
    S_train = featureExtractor.transfer(an_train_dataset, features)
    S_test = featureExtractor.transfer(n_test_dataset, features)

    global SX_train
    SX_train = S_train.iloc[:, :-1]
    global y_train
    y_train = S_train.iloc[:, -1]
    global SX_test
    SX_test = S_test.iloc[:, :-1]
    global y_test
    y_test = S_test.iloc[:, -1]

    replace_y_numbers = {k: v for v, k in enumerate(sorted(set(y_train)))}
    y_train = y_train.replace(replace_y_numbers)
    y_test = y_test.replace(replace_y_numbers)

In [67]:
def correlation_report(features, threshold=0.95):
    features = pd.DataFrame(features)
    corr_matrix = features.corr()

    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
    # Find index and column name of features with correlation greater than 0.95
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    # if len(to_drop) == 0:
    #     print('No features to remove')
    # for rej in to_drop:
    #     print('Removing ' + str(rej))

    return to_drop

In [86]:
def feature_selection():
  to_drop = correlation_report(SX_train.copy(), threshold=0.95)
  global SSX_train, SSX_test
  SSX_train = SX_train.drop(to_drop, axis=1)
  SSX_test = SX_test.drop(to_drop, axis=1)

In [69]:
def Normalization(X_train, X_test):
  CX_train = X_train.copy()
  CX_test = X_test.copy()

  scaler = preprocessing.Normalizer()
  scaler = scaler.fit(CX_train)
  NCX_train = scaler.transform(CX_train)
  NCX_test = scaler.transform(CX_test)
  
  return NCX_train, NCX_test

In [70]:
def feature_normalization():
  global NSX_train
  global NSX_test
  NSX_train, NSX_test = Normalization(SSX_train, SSX_test)
  NSX_train = pd.DataFrame(NSX_train)
  NSX_test = pd.DataFrame(NSX_test)

# TRAINING

In [71]:
def train_model():
  svc = LinearSVC()
  mlp = MLPClassifier()
  knn = KNeighborsClassifier()
  rf4 = RandomForestClassifier(n_jobs=70, bootstrap=False, max_depth=2000, max_features=0.01, n_estimators=1500)
  mlp3 = MLPClassifier(hidden_layer_sizes=600, max_iter=1000, random_state=1, solver='adam', alpha=0.01, momentum=0.9)
  algms = [('svc', svc), ('mlp', mlp), ('knn', knn), ('rf1', rf4)]
  model = StackingClassifier(estimators=algms, final_estimator=LogisticRegression())
  model_train = model.fit(NSX_train, y_train)
  tmp = model.predict(NSX_test)
  accuracy = accuracy_score(y_test, tmp)
  precision = precision_score(y_test, tmp, average='macro')
  recall = recall_score(y_test, tmp, average='macro')
  f1 = f1_score(y_test, tmp, average='macro')
  return accuracy, precision, recall, f1

In [72]:
def save_result(approach, accuracy_list, recall_list, precision_list, f1_list, running_time):
  saving_path = 'log/'
  data = {'window size':window_size / (60*sample_rate),'overlap':overlapping / window_size, 
          'approach':approach, 'dataset':'eftekhari', 'drivers':n_driver, 'features':features}
  (accuracy_mean,accuracy_std) = (np.average(accuracy_list),np.std(accuracy_list))
  (recall_mean,recall_std) = (np.average(recall_list),np.std(recall_list))
  (precision_mean,precision_std) = (np.average(precision_list),np.std(precision_list))
  (f1_mean,f1_std) = (np.average(f1_list),np.std(f1_list))
  result = {
      'accuracy_mean':accuracy_mean,'accuracy_std':accuracy_std,
      'recall_mean':recall_mean,'recall_std':recall_std,
      'precision_mean':precision_mean,'precision_mean':precision_mean,
      'f1_mean':f1_mean,'f1_std':f1_std,
  }
  print('\nMean Accuracy:{:.4f}({:.4f}) Mean Recall:{:.4f}({:.4f}) Mean Precision:{:.4f}({:.4f}) Mean F1:{:.4f}({:.4f})\n'.format(
      accuracy_mean,accuracy_std, recall_mean,recall_std, precision_mean,precision_std, f1_mean,f1_std))
  utils.save_result(saving_path=saving_path, result=result, data=data, running_time=running_time)

# RUN

In [74]:
for n_driver in range(10,11):
  drivers = random.sample([i for i in range(201,211)], n_driver)
  for approach in ['histogram', 'statistical', 'all', 'temporal', 'spectral']:
    for data_aug_condition in [True]:
        accuracy_list = []
        recall_list = []
        precision_list = []
        f1_list = []
        start = datetime.now()
        for i in range(10):
          initialization(n_driver)
          read_data(drivers)
          data_standardization()
          data_augmentation(data_aug_condition)
          feature_extraction(approach)
          feature_selection()
          feature_normalization()

          accuracy, precision, recall, f1 = train_model()
          accuracy_list.append(accuracy)
          precision_list.append(precision)
          recall_list.append(recall)
          f1_list.append(f1)
          print('Feature Set: %s \t Augmentation: %s' % (approach, data_aug_condition))
          print('Accuracy:{:.4f}, Precision:{:.4f}, Recall:{:.4f}, F1:{:.4f}'.format(accuracy, precision, recall, f1))
        end = datetime.now()
        running_time = end - start
        save_result(approach, accuracy_list, recall_list, precision_list, f1_list, running_time)

driver_id:          original_size:      stay_size:          driving_size:       cleaned_driving_size: 
207                 12:13:19            9:10:35             3:02:44             3:02:39             
208                 8:00:58             5:54:09             2:06:49             2:06:44             
201                 4:19:09             2:38:41             1:40:28             1:40:28             
202                 6:53:36             4:59:01             1:54:35             1:54:35             
205                 9:38:07             8:10:24             1:27:43             1:27:38             
204                 8:39:25             6:50:00             1:49:25             1:49:19             
206                 6:44:40             4:46:42             1:57:58             1:57:52             
209                 6:34:13             4:20:44             2:13:29             2:13:24             
203                 8:24:28             5:41:16             2:43:11             2:43:06  

#**analysing**

##MRMR

In [100]:
import pandas as pd
from sklearn.feature_selection import f_regression

# inputs:
#    X: pandas.DataFrame, features
#    y: pandas.Series, target variable
#    K: number of features to select

def _mrmr_base(X, y, K, approach):
  # compute F-statistics and initialize correlation matrix
  X = X.reset_index(drop=True)
  
  F = pd.Series(f_regression(X, y)[0], index = X.columns)
  # rf = RandomForestClassifier(max_depth=2, random_state=0)
  # rf.fit(X, y)
  # F = pd.Series(rf.feature_importances_, index = X.columns)

  corr = pd.DataFrame(.00001, index = X.columns, columns = X.columns)

  # initialize list of selected features and list of excluded features
  selected = []
  scores = []
  rels = []
  reds = []
  combination = []
  not_selected = X.columns.to_list()

  # repeat K times
  for i in range(K):
    
      # compute (absolute) correlations between the last selected feature and all the (currently) excluded features
      if i > 0:
          last_selected = selected[-1]
          corr.loc[not_selected, last_selected] = X[not_selected].corrwith(X[last_selected]).abs().clip(.00001).fillna(.00001)

      # compute FCQ score for all the (currently) excluded features (this is Formula 2)
      rel = F.loc[not_selected].fillna(.00001)
      red = corr.loc[not_selected, selected].mean(axis = 1).round(5).fillna(.00001).replace(1.0, float('Inf'))
      score = rel / red

      # find best feature, add it to selected and remove it from not_selected
      if len(score) > 0:
        # print(score.argmax())
        best = score.index[score.argmax()]
        selected.append(best)
        not_selected.remove(best)
        rels.append(rel.iloc[score.argmax()])
        reds.append(red.iloc[score.argmax()])
        scores.append(score.max())
        combination.append([score.max(),rel.iloc[score.argmax()],red.iloc[score.argmax()],approach])
        # print(best, score.max())
  return scores, rels, reds, combination, selected

In [122]:
columns =['scores','rels','reds','method']
df = pd.DataFrame(columns = columns)
for n_driver in range(10,11):
  drivers = random.sample([i for i in range(201,211)], n_driver)
  for approach in ['histogram']:
    for data_aug_condition in [True, False]:
        initialization(n_driver)
        read_data(drivers)
        data_standardization()
        data_augmentation(data_aug_condition)
        feature_extraction(approach)
        # feature_selection()
        # feature_normalization()
        Xsubset = SX_train
        Ysubset = y_train
        
        method = approach + " " + str(data_aug_condition)
        scores, rels, reds, combination, selected = _mrmr_base(Xsubset, Ysubset, Xsubset.shape[1], method)
        df = pd.concat([df, pd.DataFrame(combination, columns = columns)])

        print(approach, data_aug_condition, Xsubset.shape)

driver_id:          original_size:      stay_size:          driving_size:       cleaned_driving_size: 
202                 6:53:36             4:59:01             1:54:35             1:54:35             
204                 8:39:25             6:50:00             1:49:25             1:49:19             
209                 6:34:13             4:20:44             2:13:29             2:13:24             
206                 6:44:40             4:46:42             1:57:58             1:57:52             
208                 8:00:58             5:54:09             2:06:49             2:06:44             
205                 9:38:07             8:10:24             1:27:43             1:27:38             
203                 8:24:28             5:41:16             2:43:11             2:43:06             
201                 4:19:09             2:38:41             1:40:28             1:40:28             
207                 12:13:19            9:10:35             3:02:44             3:02:39  

In [123]:
df.groupby("method").mean()

Unnamed: 0_level_0,scores,rels,reds
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
histogram False,14016.789598,13.721216,0.227846
histogram True,14694.131199,13.967997,0.217602


## Random Forest 

In [138]:
columns =['scores', 'method']
df = pd.DataFrame(columns = columns)
for n_driver in range(10,11):
  drivers = random.sample([i for i in range(201,211)], n_driver)
  for approach in ['all', 'histogram', 'statistical', 'temporal', 'spectral']:
    for data_aug_condition in [True]:
        initialization(n_driver)
        read_data(drivers)
        data_standardization()
        data_augmentation(data_aug_condition)
        feature_extraction('all')
        Xset = SX_train
        Yset = y_train

        initialization(n_driver)
        read_data(drivers)
        data_standardization()
        data_augmentation(data_aug_condition)
        feature_extraction(approach)
        feature_selection()
        Xsubset = SSX_train
        
        rf = RandomForestClassifier(max_depth=2, random_state=0)
        rf.fit(Xset, Yset)

        sam = pd.DataFrame()
        sam['score'] = rf.feature_importances_
        sam.index = Xset.columns.to_list()
        score = []
        for label in Xsubset.columns.to_list():
            score.append([sam.loc[label].score, approach])
        df = pd.concat([df, pd.DataFrame(score, columns = columns)])

driver_id:          original_size:      stay_size:          driving_size:       cleaned_driving_size: 
206                 6:44:40             4:46:42             1:57:58             1:57:52             
210                 6:05:37             3:14:38             2:50:59             2:50:54             
207                 12:13:19            9:10:35             3:02:44             3:02:39             
205                 9:38:07             8:10:24             1:27:43             1:27:38             
209                 6:34:13             4:20:44             2:13:29             2:13:24             
204                 8:39:25             6:50:00             1:49:25             1:49:19             
208                 8:00:58             5:54:09             2:06:49             2:06:44             
201                 4:19:09             2:38:41             1:40:28             1:40:28             
203                 8:24:28             5:41:16             2:43:11             2:43:06  

In [139]:
df.groupby("method").mean()

Unnamed: 0_level_0,scores
method,Unnamed: 1_level_1
all,0.000376
histogram,0.000792
spectral,0.000138
statistical,0.000738
temporal,0.000502


##Mutual Information

In [142]:
columns =['scores', 'method']
df = pd.DataFrame(columns = columns)
for n_driver in range(10,11):
  drivers = random.sample([i for i in range(201,211)], n_driver)
  for approach in ['histogram', 'statistical', 'all', 'temporal', 'spectral']:
    for data_aug_condition in [True]:
        initialization(n_driver)
        read_data(drivers)
        data_standardization()
        data_augmentation(data_aug_condition)
        feature_extraction('all')
        Xset = SX_train
        Yset = y_train

        initialization(n_driver)
        read_data(drivers)
        data_standardization()
        data_augmentation(data_aug_condition)
        feature_extraction(approach)
        feature_selection()
        Xsubset = SSX_train
        
        sam = pd.DataFrame()
        sam['score'] = mutual_info_classif(Xset, Yset, n_neighbors=2)
        sam.index = Xset.columns.to_list()
        score = []
        for label in Xsubset.columns.to_list():
            score.append([sam.loc[label].score, approach])
        df = pd.concat([df, pd.DataFrame(score, columns = columns)])
        # counter = 0
        # score = 0
        # for label in Xsubset.columns.to_list():
        #   counter += 1
        #   score += sam.loc[label].score
        
        # print("mutual_info_classif", score/counter, approach, data_aug_condition)

driver_id:          original_size:      stay_size:          driving_size:       cleaned_driving_size: 
209                 6:34:13             4:20:44             2:13:29             2:13:24             
204                 8:39:25             6:50:00             1:49:25             1:49:19             
202                 6:53:36             4:59:01             1:54:35             1:54:35             
205                 9:38:07             8:10:24             1:27:43             1:27:38             
206                 6:44:40             4:46:42             1:57:58             1:57:52             
210                 6:05:37             3:14:38             2:50:59             2:50:54             
207                 12:13:19            9:10:35             3:02:44             3:02:39             
201                 4:19:09             2:38:41             1:40:28             1:40:28             
203                 8:24:28             5:41:16             2:43:11             2:43:06  

In [143]:
df.groupby("method").mean()

Unnamed: 0_level_0,scores
method,Unnamed: 1_level_1
all,0.26708
histogram,0.344685
spectral,0.21993
statistical,0.3397
temporal,0.321033
