# Driver Identification based on Virojboonkiate et al. 2019
Virojboonkiate, N., Chanakitkarnchok, A., Vateekul, P. and Rojviboonchai, K., 2019. Public transport driver identification system using histogram of acceleration data. Journal of Advanced Transportation, 2019.

https://www.hindawi.com/journals/jat/2019/6372597/

# Prepare

In [None]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import os
import seaborn as sns
from scipy import stats
from sklearn.neural_network import MLPClassifier
from sklearn.metrics.classification import accuracy_score, recall_score, f1_score, precision_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn import preprocessing
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that
# paths under that directory can be read and written by later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#################### DOWNLOAD AND UNZIP FILE SAVED IN DRIVE ####################

!pip install -U -q PyDrive


# HERE YOUR FILE ID ( GET IT WITH THE SHARING URL: https://drive.google.com/open?id=1MWZpk6SRmqBUGjwcdaEzoVDm7mUKOnci )

zip_id = '1MWZpk6SRmqBUGjwcdaEzoVDm7mUKOnci'



from pydrive.auth import GoogleAuth

from pydrive.drive import GoogleDrive

from google.colab import auth

from oauth2client.client import GoogleCredentials

import zipfile, os


# 1. Authenticate the Colab user and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# 2. Download the archive identified by `zip_id` into the working directory.
print("Downloading zip file")
myzip = drive.CreateFile({'id': zip_id})
myzip.GetContentFile('DrEftekhari.zip')

# 3. Extract the archive into the working directory.  The context manager
#    closes the file handle even if extraction fails part-way through.
print("Uncompressing zip file")
with zipfile.ZipFile('DrEftekhari.zip', 'r') as zip_ref:
    zip_ref.extractall()

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/googleapiclient/discovery_cache/__init__.py", line 36, in autodetect
    from google.appengine.api import memcache
ModuleNotFoundError: No module named 'google.appengine'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/googleapiclient/discovery_cache/file_cache.py", line 33, in <module>
    from oauth2client.contrib.locked_file import LockedFile
ModuleNotFoundError: No module named 'oauth2client.contrib.locked_file'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/googleapiclient/discovery_cache/file_cache.py", line 37, in <module>
    from oauth2client.locked_file import LockedFile
ModuleNotFoundError: No module named 'oauth2client.locked_file'

During handling of the above exception, another exceptio

Downloading zip file
Uncompressing zip file


# Dataset

In [None]:
import os
from datetime import datetime, timedelta
import re
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sn
import math
from sklearn import preprocessing
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report


class Utils:
    """Load, clean and split per-driver sensor CSV files, and log run results.

    Attributes:
        all_features: Column names assigned to every raw CSV that is read.
        show_toolbar: True until the size-report header has been printed once.
        sample_rate: Divisor used to turn row counts into seconds for the
            printed size report.
        data_split_ratio: Fraction of the rows that goes to the training part.
    """

    # A row is a "stay point" when the following (_STAY_LOOKAHEAD - 1) rows
    # all lie within _STAY_THRESHOLD of it in accelerometer space.
    _STAY_LOOKAHEAD = 12
    _STAY_THRESHOLD = 0.5

    def __init__(self, sample_rate, data_split_ratio=0.7):
        self.all_features = ['x-accelerometer', 'y-accelerometer', 'z-accelerometer',
                             'GRAVITY X (m/s²)', 'GRAVITY Y (m/s²)', 'GRAVITY Z (m/s²)',
                             'LINEAR ACCELERATION X (m/s²)', 'LINEAR ACCELERATION Y (m/s²)',
                             'LINEAR ACCELERATION Z (m/s²)',
                             'x-gyroscope', 'y-gyroscope', 'z-gyroscope',
                             'LIGHT (lux)',
                             'MAGNETIC FIELD X (μT)', 'MAGNETIC FIELD Y (μT)', 'MAGNETIC FIELD Z (μT)',
                             'ORIENTATION Z (azimuth °)', 'ORIENTATION X (pitch °)', 'ORIENTATION Y (roll °)',
                             'LOCATION Latitude : ',
                             'LOCATION Longitude : ',
                             'LOCATION Altitude ( m)',
                             'LOCATION Altitude-google ( m)',
                             'LOCATION Speed ( Kmh)',
                             'LOCATION Accuracy ( m)',
                             'LOCATION ORIENTATION (°)',
                             'Satellites in range',
                             'Time since start in ms',
                             'timestamp']
        self.show_toolbar = True
        self.sample_rate = sample_rate
        self.data_split_ratio = data_split_ratio

    @staticmethod
    def _acc_distance(point1, point2):
        """Return the Euclidean distance between two (x, y, z) tuples."""
        return math.sqrt(math.pow(point1[0] - point2[0], 2) +
                         math.pow(point1[1] - point2[1], 2) +
                         math.pow(point1[2] - point2[2], 2))

    @classmethod
    def _find_staypoints(cls, data):
        """Return the sorted, de-duplicated row positions to discard.

        The first and the last row are always included.  Returns an empty
        list for an empty frame.
        """
        points_acc = list(zip(data['x-accelerometer'],
                              data['y-accelerometer'],
                              data['z-accelerometer']))
        n_points = len(points_acc)
        if n_points == 0:
            return []

        staypoints = {0, n_points - 1}
        for j in range(n_points - cls._STAY_LOOKAHEAD):
            node = points_acc[j]
            # `not any(... > threshold)` keeps the original comparison, so a
            # NaN distance still does not disqualify a row.
            moved = any(cls._acc_distance(node, points_acc[j2]) > cls._STAY_THRESHOLD
                        for j2 in range(j + 1, j + cls._STAY_LOOKAHEAD))
            if not moved:
                staypoints.add(j)
        return sorted(staypoints)

    def read_data(self, db_path_prefix, file_name, features, driver_i):
        """Read one driver file, drop stay points, and split it.

        A cleaned copy is cached under ``db_path_prefix + "cleaned_by_acc/"``
        and reused on later calls.  The cached copy is read as-is: it was
        written after the columns had been renamed to ``all_features``.

        Args:
            db_path_prefix: String prepended to ``file_name`` to build paths.
            file_name: Name of the CSV file to read.
            features: Column names (from ``all_features``) to keep.
            driver_i: Identifier shown in the printed size report.

        Returns:
            ``(train, test)`` as produced by ``split_to_train_test``.

        Raises:
            ValueError: If ``features`` contains a name that is not listed
                in ``all_features``.
        """
        db_path = db_path_prefix + file_name
        db_new_path = db_path_prefix + "cleaned_by_acc/"

        if os.path.exists(db_new_path + file_name):
            data = pd.read_csv(db_new_path + file_name, low_memory=False)
            # The raw file is read only to report its size.
            original_size = pd.read_csv(db_path, low_memory=False).shape[0]
        else:
            data = pd.read_csv(db_path, low_memory=False)
            data.columns = self.all_features
            original_size = data.shape[0]

            staypoints = self._find_staypoints(data)
            data = data.drop(index=data.index[staypoints])

            os.makedirs(db_new_path, exist_ok=True)
            data.to_csv(db_new_path + file_name, index=False)

        driving_size = data.shape[0]
        # Computed the same way for cached and freshly cleaned files, so the
        # report does not depend on whether the cache was hit.
        stay_size = original_size - driving_size

        unknown = [feature for feature in features if feature not in self.all_features]
        if unknown:
            raise ValueError("unknown feature(s): " + ", ".join(map(str, unknown)))
        missing_features = [name for name in self.all_features if name not in features]
        data = data.drop(columns=missing_features)

        # numeric_only avoids a TypeError on non-numeric columns in recent
        # pandas releases; such columns are left unfilled.
        data = data.fillna(data.mean(numeric_only=True))

        clean_driving_size = data.shape[0]

        template = "{0:20}{1:20}{2:20}{3:20}{4:20}"
        if self.show_toolbar:
            self.show_toolbar = False
            print(template.format("driver_id: ", "original_size: ", "stay_size: ", "driving_size: ",
                                  "cleaned_driving_size: "))
        print(template.format(str(driver_i),
                              str(timedelta(seconds=int(original_size / self.sample_rate))),
                              str(timedelta(seconds=int(stay_size / self.sample_rate))),
                              str(timedelta(seconds=int(driving_size / self.sample_rate))),
                              str(timedelta(seconds=int(clean_driving_size / self.sample_rate)))))

        return self.split_to_train_test(data)

    def split_to_train_test(self, data):
        """Split ``data`` in order: the first ``data_split_ratio`` share, then the rest."""
        split_at = int(len(data) * self.data_split_ratio)
        return data[:split_at], data[split_at:]

    def save_result(self, saving_path, result, data, running_time):
        """Append one run's settings and scores to ``statistics.txt``.

        Args:
            saving_path: Directory prefix; ``'statistics.txt'`` is appended
                to it directly, so it should end with a path separator.
            result: Object whose ``str()`` is written as the scores line.
            data: Object whose ``str()`` is written as the settings line.
            running_time: ``datetime.timedelta`` of the whole run.
        """
        os.makedirs(saving_path, exist_ok=True)

        # total_seconds() includes whole days; `.seconds` alone would wrap
        # around for runs longer than 24 hours.
        elapsed_seconds = int(running_time.total_seconds())

        # Save to file
        with open(saving_path + 'statistics.txt', 'a') as f:
            f.write('\n==========***==========\n' +
                    datetime.now().strftime("%Y:%m:%d %H:%M:%S") +
                    '\n' +
                    'running time :' + str(elapsed_seconds) + " seconds" +
                    '\n')
            f.write(str(data))
            f.write('\n')
            f.write(str(result))
            f.write('\n')

# Segmentation & Feature Extraction

In [None]:
class TransferToHistogram:
    """Turn per-sample sensor rows into per-window histogram feature rows.

    Attributes:
        window_size: Number of consecutive rows in one segment.
        overlapping: Number of rows shared by two consecutive segments.
        num_bins: Number of histogram bins computed per feature.
    """

    def __init__(self, window_size=90, window_ovrlap_size=45, num_bins=8):
        self.window_size = window_size
        self.overlapping = window_ovrlap_size
        self.num_bins = num_bins

    @staticmethod
    def _histogram_range(values):
        """Return the (low, high) histogram range for one feature column.

        The bounds are ``p10 - 2 * p90`` and ``p10 + 2 * p90``.  They are
        ordered so the range stays valid when the 90th percentile is
        negative (np.histogram rejects a range whose max is below its min).
        """
        p10, p90 = np.percentile(values, [10, 90])
        first, second = p10 - p90 * 2, p10 + p90 * 2
        return min(first, second), max(first, second)

    def transfer(self, dataset, features, saving_histogram_charts=False):
        """Segment ``dataset`` and build one histogram row per segment.

        Args:
            dataset: DataFrame holding the ``features`` columns and an
                ``"id"`` column.
            features: List of column names to histogram.
            saving_histogram_charts: Unused; kept for interface compatibility.

        Returns:
            DataFrame with ``num_bins`` count columns per feature (named
            ``"<feature>-<bin>"``) followed by an ``"id"`` column.
        """
        print("segmenting data with " + str(len(dataset)) + " points")
        segments, labels = self.segment_signal(dataset, features)
        print("making " + str(len(segments)) + " segments")

        # The range depends only on the whole dataset, so compute it once per
        # feature instead of once per segment.  Skipped when there are no
        # segments, so an empty dataset is never passed to np.percentile.
        ranges = []
        if len(segments):
            ranges = [self._histogram_range(dataset[feature]) for feature in features]

        new_dataset = []
        for segment, label in zip(segments, labels):
            row = []
            for feature_i in range(segment.shape[1]):
                count, _ = np.histogram(segment[:, feature_i], bins=self.num_bins,
                                        range=ranges[feature_i])
                row = np.append(row, count, axis=0)
            row = np.append(row, [label], axis=0)
            new_dataset.append(row)

        columns = [feature + '-' + str(i) for feature in features for i in range(self.num_bins)]
        columns.append('id')
        return pd.DataFrame(new_dataset, columns=columns)

    def windows(self, data):
        """Yield ``(start, end)`` row positions of successive windows.

        Windows advance by ``window_size - overlapping`` rows and may run
        past the end of ``data``; callers are expected to discard short ones.

        Raises:
            ValueError: If the step is not positive, which would otherwise
                yield the same window forever.
        """
        step = self.window_size - self.overlapping
        if step <= 0:
            raise ValueError("overlap size must be smaller than window size")
        total = data.count()
        start = 0
        while start < total:
            yield int(start), int(start + self.window_size)
            start += step

    def segment_signal(self, dataset, features):
        """Cut each id's rows into full-length windows.

        Returns:
            ``(segments, labels)`` where ``segments`` has shape
            ``(n_segments, window_size, len(features))`` and ``labels`` holds
            the id of each segment.
        """
        # Seeding with empty float arrays keeps the original result shape and
        # dtype promotion, including when no window qualifies.
        chunks = [np.empty((0, self.window_size, len(features)))]
        segment_labels = []
        for class_i in np.unique(dataset["id"]):
            subset = dataset[dataset["id"] == class_i]
            values = subset[list(features)].to_numpy()
            for (start, end) in self.windows(subset["id"]):
                window = values[start:end]
                if len(window) == self.window_size:
                    chunks.append(window[np.newaxis])
                    # Every row of `subset` carries class_i, so it is the
                    # window's label; no mode computation is needed.
                    segment_labels.append(class_i)
        # Stack once at the end instead of re-copying the array per window.
        segments = np.vstack(chunks)
        labels = np.append(np.empty((0)), segment_labels)
        return segments, labels


# VARIABLES

In [None]:
def initialization(n_driver):
    """Set the module-level experiment settings and build the shared Utils.

    Assigns the globals ``features``, ``db_path_prefix``, ``sample_rate``,
    ``window_size``, ``overlapping`` and ``utils``, and prints the window and
    overlap durations.  ``n_driver`` is accepted but not used here.
    """
    global features, db_path_prefix, sample_rate, window_size, overlapping, utils

    features = ['x-accelerometer', 'y-accelerometer']
    db_path_prefix = ''
    sample_rate = 2

    # Window length in samples; the overlap is three quarters of it.
    window_size = sample_rate * 60 * 15
    overlapping = int(window_size * 0.75)

    # Durations are printed as sample count divided by sample_rate, in seconds.
    for caption, n_samples in (("window size :", window_size),
                               ("overlap size :", overlapping)):
        print(caption, str(timedelta(seconds=n_samples / sample_rate)))

    utils = Utils(sample_rate=sample_rate)

# SEGMENTATION

In [None]:
def read_data():
    """Load a random selection of drivers into the global train/test frames.

    Draws ``n_driver`` distinct ids from 201..210, reads ``"<id>.1.csv"`` for
    each through ``utils.read_data``, tags the rows with an ``id`` column and
    stores the concatenated results in the globals ``train_dataset`` and
    ``test_dataset``.
    """
    global train_dataset, test_dataset

    train_parts = []
    test_parts = []
    for driver_id in random.sample(range(201, 211), n_driver):
        train_part, test_part = utils.read_data(
            db_path_prefix, str(driver_id) + '.1.csv', features, driver_id)
        # assign() returns a new frame, so the slices handed back by the
        # splitter are not modified in place (no chained-assignment warning).
        train_parts.append(train_part.assign(id=driver_id))
        test_parts.append(test_part.assign(id=driver_id))

    # Concatenate once rather than growing a frame on every iteration;
    # pd.concat rejects an empty list, so fall back to an empty frame.
    train_dataset = pd.concat(train_parts) if train_parts else pd.DataFrame()
    test_dataset = pd.concat(test_parts) if test_parts else pd.DataFrame()

In [None]:
def replace_ids():
    """Replace the ids in the last column with consecutive integers from 0.

    The mapping is built from the sorted distinct values in the last column
    of the global ``train_dataset`` and applied to the last column of both
    ``train_dataset`` and ``test_dataset``.
    """
    distinct_ids = sorted(set(train_dataset.iloc[:, -1]))
    id_to_index = dict(zip(distinct_ids, range(len(distinct_ids))))
    for frame in (train_dataset, test_dataset):
        frame.iloc[:, -1] = frame.iloc[:, -1].replace(id_to_index)

In [None]:
def feature_extraction():
    """Build histogram features for the global train and test frames.

    Stores the full histogram frames in the globals ``S_train``/``S_test``,
    all columns but the last in ``SX_train``/``SX_test``, and the last
    column in ``y_train``/``y_test``.
    """
    global S_train, S_test, SX_train, y_train, SX_test, y_test

    extractor = TransferToHistogram(window_size=window_size,
                                    window_ovrlap_size=overlapping,
                                    num_bins=100)

    S_train = extractor.transfer(train_dataset, features)
    S_test = extractor.transfer(test_dataset, features)

    # The last column is the label; everything before it is a feature.
    SX_train, y_train = S_train.iloc[:, :-1], S_train.iloc[:, -1]
    SX_test, y_test = S_test.iloc[:, :-1], S_test.iloc[:, -1]

In [None]:
def Normalization(X_train, X_test):
    """Fit a ``preprocessing.Normalizer`` on the train set and apply it to both.

    Works on copies of the inputs.

    Returns:
        ``(train, test)`` as returned by the normalizer's ``transform``.
    """
    train_copy, test_copy = X_train.copy(), X_test.copy()
    normalizer = preprocessing.Normalizer().fit(train_copy)
    return normalizer.transform(train_copy), normalizer.transform(test_copy)

In [None]:
def feature_normalization():
    """Normalize the global feature frames into ``NSX_train`` / ``NSX_test``.

    Passes ``SX_train`` and ``SX_test`` through ``Normalization`` and wraps
    each result in a DataFrame.
    """
    global NSX_train, NSX_test

    train_scaled, test_scaled = Normalization(SX_train, SX_test)
    NSX_train = pd.DataFrame(train_scaled)
    NSX_test = pd.DataFrame(test_scaled)

# TRAINING

In [None]:
def hyperparameter_tuning():
    """Run a randomized search over MLP settings and print the results.

    Fits on the globals ``NSX_train`` / ``y_train`` with 10-fold
    cross-validation, prints the mean test score and twice its standard
    deviation for every sampled candidate, then the best parameters and the
    best score.
    """
    search_space = dict(
        hidden_layer_sizes=[100],
        activation=['tanh', 'relu', 'logistic'],
        solver=['sgd', 'adam', 'lbfgs'],
        alpha=10.0 ** -np.arange(1, 4),
        learning_rate=['constant', "invscaling", 'adaptive'],
        momentum=[0.2, 0.5, 0.7, 0.8],
        random_state=list(range(1, 10)),
    )
    search = RandomizedSearchCV(MLPClassifier(), search_space, random_state=0, cv=10,
                                return_train_score=True, n_jobs=5)
    search.fit(NSX_train, y_train)

    cv_results = search.cv_results_
    for mean, std, params in zip(cv_results['mean_test_score'],
                                 cv_results['std_test_score'],
                                 cv_results['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print(search.best_params_)
    print("score:", search.best_score_)

In [None]:
def train_model():
    """Train the MLP on the global train set and score it on the test set.

    Returns:
        ``(accuracy, precision, recall, f1)``; precision, recall and F1 use
        macro averaging.
    """
    classifier = MLPClassifier(
        activation='logistic',
        alpha=0.001,
        hidden_layer_sizes=100,
        learning_rate='invscaling',
        momentum=0.5,
        random_state=2,
        solver='lbfgs',
    )
    predicted = classifier.fit(NSX_train, y_train).predict(NSX_test)
    return (
        accuracy_score(y_test, predicted),
        precision_score(y_test, predicted, average='macro'),
        recall_score(y_test, predicted, average='macro'),
        f1_score(y_test, predicted, average='macro'),
    )

In [None]:
def save_result(accuracy_list, recall_list, precision_list, f1_list, running_time):
    """Summarize the per-run scores, print them and append them to the log.

    Args:
        accuracy_list: Accuracy of each run.
        recall_list: Recall of each run.
        precision_list: Precision of each run.
        f1_list: F1 score of each run.
        running_time: Elapsed time of all runs, passed on to
            ``utils.save_result``.
    """
    saving_path = '/content/drive/MyDrive/DriverIdentification/Driver Identification GSM+GAN/Log/Virojboonkiate/'
    data = {'window size': window_size / (60 * sample_rate), 'overlap': overlapping / window_size,
            'algoritm': 'MLP', 'dataset': 'eftekhari', 'drivers': n_driver, 'features': features}

    accuracy_mean, accuracy_std = np.average(accuracy_list), np.std(accuracy_list)
    recall_mean, recall_std = np.average(recall_list), np.std(recall_list)
    precision_mean, precision_std = np.average(precision_list), np.std(precision_list)
    f1_mean, f1_std = np.average(f1_list), np.std(f1_list)

    result = {
        'accuracy_mean': accuracy_mean, 'accuracy_std': accuracy_std,
        'recall_mean': recall_mean, 'recall_std': recall_std,
        # The second key used to repeat 'precision_mean', so the standard
        # deviation of precision never reached the log.
        'precision_mean': precision_mean, 'precision_std': precision_std,
        'f1_mean': f1_mean, 'f1_std': f1_std,
    }
    print('Mean Accuracy:{:.4f}({:.4f}) Mean Recall:{:.4f}({:.4f}) Mean Precision:{:.4f}({:.4f}) Mean F1:{:.4f}({:.4f})'.format(
        accuracy_mean, accuracy_std, recall_mean, recall_std, precision_mean, precision_std, f1_mean, f1_std))
    utils.save_result(saving_path=saving_path, result=result, data=data, running_time=running_time)

# RUN

In [None]:
# For every driver count from 4 to 10, repeat the whole pipeline ten times,
# collect the four scores of each repetition, then log their summary together
# with the elapsed time.
for n_driver in range(4, 11):
    accuracy_list, recall_list, precision_list, f1_list = [], [], [], []
    start = datetime.now()
    for i in range(10):
        initialization(n_driver)
        for step in (read_data, replace_ids, feature_extraction, feature_normalization):
            step()
        accuracy, precision, recall, f1 = train_model()
        for collected, score in ((accuracy_list, accuracy),
                                 (precision_list, precision),
                                 (recall_list, recall),
                                 (f1_list, f1)):
            collected.append(score)
        print('Accuracy:{:.4f} Precision:{:.4f} Recall:{:.4f} F1:{:.4f}'.format(accuracy, precision, recall, f1))
    end = datetime.now()
    running_time = end - start
    save_result(accuracy_list, recall_list, precision_list, f1_list, running_time)

window size : 0:15:00
overlap size : 0:11:15
driver_id:          original_size:      stay_size:          driving_size:       cleaned_driving_size: 
203                 8:24:28             5:41:16             2:43:11             2:43:11             
210                 6:05:37             3:14:38             2:50:59             2:50:59             
202                 6:53:36             4:59:01             1:54:35             1:54:35             
207                 12:13:19            9:10:35             3:02:44             3:02:44             
209                 6:34:13             4:20:44             2:13:29             2:13:29             
206                 6:44:40             4:46:42             1:57:58             1:57:58             
201                 4:19:09             2:38:41             1:40:28             1:40:28             
204                 8:39:25             6:50:00             1:49:25             1:49:25             
208                 8:00:58             5:54