## Problem Statement:

Given the provided dataset containing telematics data from over 50,000 anonymized driver trips, the objective is to leverage this data to develop useful applications in the automobile domain. Specifically, we aim to explore potential use cases such as driver recommendations for trips and driver profiling based on their behavior.

Objective:

1. Driver Recommendations for Trips: Develop an algorithm that recommends suitable drivers for specific types of trips based on their driving behavior and preferences. If customer-specific information (or some other context we decide to use) can be obtained, the problem can be framed as a contextual bandit: multiple algorithms can be fit into the framework, letting the framework pick the best model.

2. Driver Profiling: Create profiles for individual drivers based on their driving patterns, habits, and risk factors to better understand their driving style and preferences.


In [1]:
# Mount Google Drive at /content/drive so the project CSV files read below are
# reachable; force_remount=True requests a fresh mount even if one already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Importing packages

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

In [3]:
# Reading cleaned hvcan data from csv file.
# NOTE(review): the path is hard-coded to one Google Drive folder layout and
# depends on the Drive mount performed in the first cell.

hvcan = pd.read_csv('/content/drive/MyDrive/Honda Capstone Project- MOVE/cleaned_hvcan.csv')

# Preview of the first rows (rendered as the cell output)
hvcan.head()

Unnamed: 0.1,Unnamed: 0,partitionDate,device,filedate,filetimeutc,engineTorque,engineCoolantTemp,engineAtmosphericPressure,steeringSpeed,engineSpeed,steeringTorque,epsLoad,accLsfSubseg,fuelUsed,cabinTemp,compCurrent,latAccel,accelPedalPos,wheelVelLF,wheelVelRF,wheelVelLR,wheelVelRR,accStatus,odometerMiles,driverSeatbelt,accSetSpeed,cruiseSetSpeed,tirePressureLF,tirePressureRF,tirePressureLR,tirePressureRR,outsideAirTemp,tmDisplayedGear,absWheelSpeedFL,absWheelSpeedFR,absWheelSpeedRL,absWheelSpeedRR
0,0,2022-01-01,3077,20220101,130718,6553.4,-41,91.4,0,0,0.0,0.0,0,0.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,False,0.0,0.0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0
1,1,2022-01-01,3077,20220101,130718,6553.4,-41,91.4,0,0,0.0,0.0,0,0.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,False,0.0,0.0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0
2,2,2022-01-01,3077,20220101,130718,6553.4,-41,91.4,0,0,0.0,0.0,0,0.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,False,0.0,0.0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0
3,3,2022-01-01,3077,20220101,130718,6553.4,-41,91.4,0,0,0.0,0.0,0,0.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,False,0.0,0.0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0
4,4,2022-01-01,3077,20220101,130718,6553.4,-41,91.4,0,0,0.0,0.0,0,0.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,False,0.0,0.0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0


In [4]:
# Removing the 'Unnamed: 0' index artefact from the hvcan df.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed.
# NOTE(review): the preview of hvcan also lists an 'Unnamed: 0.1' column that
# this cell leaves in place - confirm whether it should be dropped as well.

hvcan = hvcan.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Eliminating columns that do not contribute to the problem statement:
# tire pressures, outside air temperature, displayed gear and the per-wheel
# speed / velocity signals. Each label is listed exactly once.

hvcan = hvcan.drop(columns=['tirePressureLF',
                            'tirePressureRF',
                            'tirePressureLR',
                            'tirePressureRR',
                            'outsideAirTemp',
                            'tmDisplayedGear',
                            'absWheelSpeedFL',
                            'absWheelSpeedFR',
                            'absWheelSpeedRL',
                            'absWheelSpeedRR',
                            'wheelVelLF',
                            'wheelVelRF',
                            'wheelVelLR',
                            'wheelVelRR'])

In [6]:
# Checking dimension of cleaned hvcan data after the column drops: (rows, columns)
hvcan.shape

(1358196, 22)

In [7]:
# Checking column names, non-null counts and data types of hvcan
hvcan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1358196 entries, 0 to 1358195
Data columns (total 22 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   partitionDate              1358196 non-null  object 
 1   device                     1358196 non-null  int64  
 2   filedate                   1358196 non-null  int64  
 3   filetimeutc                1358196 non-null  int64  
 4   engineTorque               1358196 non-null  float64
 5   engineCoolantTemp          1358196 non-null  int64  
 6   engineAtmosphericPressure  1358196 non-null  float64
 7   steeringSpeed              1358196 non-null  int64  
 8   engineSpeed                1358196 non-null  int64  
 9   steeringTorque             1358196 non-null  float64
 10  epsLoad                    1358196 non-null  float64
 11  accLsfSubseg               1358196 non-null  int64  
 12  fuelUsed                   1358196 non-null  float64
 13  cabinTemp   

In [8]:
# Group filter: keeps a group only when it is completely free of NaN values
def get_first_240_rows(group):
    """Return ``group`` unchanged when it has no NaN, otherwise ``None``.

    NOTE(review): despite the name, no truncation to 240 rows happens here;
    the whole group is returned. Confirm whether ``head(240)`` was intended.
    """
    has_missing = group.isna().any().any()
    if has_missing:
        # Returning None marks the group as rejected
        return None
    return group

# Trip-level cleaning: every ('device', 'filetimeutc') group is passed through
# get_first_240_rows, and the surviving rows are renumbered
def preprocess_df(filter_data):
    """Apply ``get_first_240_rows`` per ('device', 'filetimeutc') group.

    The combined result gets a fresh 0..n-1 index, and that numbering is then
    exposed as a regular ``index`` column by the final ``reset_index()``.
    """
    per_trip = filter_data.groupby(['device', 'filetimeutc'])
    kept_rows = per_trip.apply(get_first_240_rows)
    # Discard the group-key index produced by apply and renumber the rows
    renumbered = kept_rows.reset_index(drop=True)
    # Second reset moves the new numbering into an 'index' column
    return renumbered.reset_index()

In [9]:
# Applying preprocess_df function to hvcan dataframe: groups of
# ('device', 'filetimeutc') are filtered through get_first_240_rows and the
# result gains an 'index' column
filter_data = preprocess_df(hvcan)

In [10]:
# Function to generate lag features
def generate_lag(df, n_lags=7):
    """Return a new frame holding ``df`` plus lagged copies of every column.

    A ``datetime_utc`` column is built from ``filedate`` (parsed as
    ``%Y%m%d``) plus ``filetimeutc`` interpreted as a millisecond offset.
    Rows are sorted by ``device`` and ``datetime_utc``; then, for every lag
    ``i`` in ``1..n_lags``, all columns are shifted by ``i`` rows inside
    their ('device', 'datetime_utc') group and suffixed with ``_lag{i}``.

    The input frame is left untouched (the derived column is added to a new
    frame rather than written into the caller's object).

    NOTE(review): the sample value 130718 for ``filetimeutc`` looks like an
    HHMMSS clock time rather than milliseconds - confirm the unit.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``device``, ``filedate`` and ``filetimeutc`` columns.
    n_lags : int, default 7
        Number of lag steps to generate.

    Returns
    -------
    pandas.DataFrame
        Sorted rows with the original columns, ``datetime_utc`` and the
        ``*_lag1 .. *_lag{n_lags}`` columns; the first rows of each group
        hold NaN/NaT in the lag columns.
    """
    # Combining 'filedate' and 'filetimeutc' columns to create a datetime column
    trip_start = (pd.to_datetime(df['filedate'], format='%Y%m%d')
                  + pd.to_timedelta(df['filetimeutc'], unit='ms'))

    # assign() builds a new frame, so the caller's DataFrame is not mutated
    df = df.assign(datetime_utc=trip_start).sort_values(by=['device', 'datetime_utc'])

    # Lags are computed within each ('device', 'datetime_utc') group
    grouped = df.groupby(['device', 'datetime_utc'])
    columns = df.columns.to_list()

    # One shifted, suffixed copy of all columns per lag step
    lag_features = [grouped[columns].shift(i).add_suffix(f'_lag{i}')
                    for i in range(1, n_lags + 1)]

    # Concatenating the sorted frame and the lag features along the columns axis
    return pd.concat([df] + lag_features, axis=1)

In [11]:
# Applying generate_lag function to filter_data; the result carries the
# original columns, 'datetime_utc' and their '_lag1' ... '_lag7' copies
df_with_lag = generate_lag(filter_data)

In [12]:
# Keep only rows without any missing value; rows that lack a complete set of
# lag values are removed by this step
df_with_lag = df_with_lag.dropna()

# Generate the unique ('device', 'datetime_utc') combinations that represent individual trips
unique_combinations = df_with_lag.loc[:, ['device', 'datetime_utc']].drop_duplicates()

# Split the unique combinations (not the individual rows) into train and test
# sets, so that all rows of a trip land in the same set
train_combinations, test_combinations = train_test_split(
    unique_combinations,
    test_size=0.2,
    random_state=42,
)


In [13]:
# Checking train_combinations dataframe
#train_combinations.head()

In [14]:
# Checking dimensions of train_combinations
#train_combinations.shape

In [15]:
# Checking test_combinations dataframe
#test_combinations.head()

In [16]:
# Checking dimensions of test_combinations
#test_combinations.shape

In [17]:

# Build the training set: all rows of every trip listed in train_combinations.
# The frame is grouped once by trip key and each trip is then looked up
# directly, instead of re-scanning the whole DataFrame once per trip.
rows_by_trip = {trip: rows
                for trip, rows in df_with_lag.groupby(['device', 'datetime_utc'], sort=False)}

# Collect the trips in train_combinations order; each group keeps its original
# row order and index labels
train = [rows_by_trip[trip]
         for trip in zip(train_combinations['device'], train_combinations['datetime_utc'])]

# Concatenating the per-trip dataframes to create the training dataset
df_train = pd.concat(train)

In [18]:
# Path to save the training data
#path_to_train = 'train2.csv'

# Saving the training dataset DataFrame to a CSV file at the specified path
#df_train.to_csv(path_to_train)

In [19]:
#from google.colab import files
#files.download('train2.csv')

In [20]:
# Checking df_train dataframe
#df_train.head()

In [21]:
# Build the test set: all rows of every trip listed in test_combinations.
# The frame is grouped once by trip key and each trip is then looked up
# directly, instead of re-scanning the whole DataFrame once per trip.
rows_by_trip = {trip: rows
                for trip, rows in df_with_lag.groupby(['device', 'datetime_utc'], sort=False)}

# Collect the trips in test_combinations order; each group keeps its original
# row order and index labels
test = [rows_by_trip[trip]
        for trip in zip(test_combinations['device'], test_combinations['datetime_utc'])]

# Concatenating the per-trip dataframes to create the testing dataset
df_test = pd.concat(test)

In [22]:
# Path to save the testing data
#path_to_test = 'test2.csv'

# Saving the testing dataset DataFrame to a CSV file at the specified path
#df_test.to_csv(path_to_test)

In [23]:
#from google.colab import files
#files.download('test2.csv')

In [24]:
# Checking df_test dataframe
#df_test.head()

## Training the isolation forest

In [25]:
# Remove the row counter, the trip identifiers and the date/time columns from
# the training frame, together with the lag1..lag7 copies of 'partitionDate'
# and 'datetime_utc'

df_train = df_train.drop(
    columns=['index', 'device', 'filetimeutc', 'filedate', 'datetime_utc', 'partitionDate']
    + [f'{name}_lag{step}'
       for step in range(1, 8)
       for name in ('partitionDate', 'datetime_utc')]
)



In [26]:
# Checking dataframe df_train
#df_train.head()

In [27]:
# Initialize and fit an IsolationForest model to the training data.
# random_state=0 fixes the seed; every other hyperparameter keeps its default.
clf = IsolationForest(random_state=0).fit(df_train)

In [28]:
# Predicting outliers in the training dataset using the IsolationForest model.
# A 'pred' column left behind by an earlier run of this cell is excluded first:
# the model was fitted without it, so passing it to predict would not match
# the fitted feature set.
train_features = df_train.drop(columns='pred', errors='ignore')
df_train['pred'] = clf.predict(train_features)

# Defining Scoring Model


In [29]:
def get_trip_data(df, device, filetimeutc):
    """Return the rows of ``df`` whose 'device' and 'filetimeutc' both match."""
    same_device = df['device'] == device
    same_start = df['filetimeutc'] == filetimeutc
    # Both conditions must hold for a row to belong to the requested trip
    return df[same_device & same_start]

In [30]:
def calculate_trip_duration(sample_count, seconds_per_row=5):
    """Return the duration in minutes covered by ``sample_count`` rows.

    Each row represents ``seconds_per_row`` seconds (5 by default), so the
    duration is ``sample_count * seconds_per_row`` seconds, converted to
    minutes by dividing by 60.

    Parameters
    ----------
    sample_count : int
        Number of rows in the trip (or in a subset of it).
    seconds_per_row : int or float, default 5
        Time span represented by one row, in seconds.

    Returns
    -------
    float
        Duration in minutes.
    """
    return sample_count * seconds_per_row / 60

In [31]:
def calculate_score_change(df_sample, window_len=30, seconds_per_row=5):
    """Return the rating penalty (zero or negative) for one trip.

    The rows of ``df_sample`` are walked in order and grouped into
    consecutive windows of ``window_len / seconds_per_row`` rows (6 with the
    defaults). Within a window, the largest value of
    ``abs(predscore) - 0.5`` among rows predicted as outliers
    (``pred == -1``) is tracked, floored at 0. Each window then subtracts
    ``5 * (worst * 10) ** 2`` from the score.

    The trailing window is scored as well, whether it is complete or
    partial; previously its penalty was never applied, so short trips and
    the end of every trip went unpenalised.

    Parameters
    ----------
    df_sample : pandas.DataFrame
        Rows of one trip with 'pred' and 'predscore' columns.
    window_len : int, default 30
        Window length in seconds.
    seconds_per_row : int, default 5
        Seconds represented by one row.

    Returns
    -------
    int or float
        0 when no window contains a penalised outlier, otherwise a negative
        number.
    """
    # Number of rows per window; at least one so a window can always close
    rows_per_window = max(1, int(window_len // seconds_per_row))

    curr_score = 0
    max_neg = 0          # worst outlier excess seen in the current window
    rows_in_window = 0   # rows consumed by the current window so far

    for pred, pred_score in zip(df_sample['pred'], df_sample['predscore']):
        # Only rows flagged as outliers contribute to the window's worst value
        if pred == -1:
            max_neg = max(max_neg, abs(pred_score) - 0.5)
        rows_in_window += 1

        # Window complete: apply its penalty and start a fresh window
        if rows_in_window == rows_per_window:
            curr_score -= 5 * (max_neg * 10) ** 2
            max_neg = 0
            rows_in_window = 0

    # Score the trailing partial window so the end of the trip is not ignored
    if rows_in_window:
        curr_score -= 5 * (max_neg * 10) ** 2

    return curr_score

In [32]:
def score_trip(df, keys, d_curr_ratings, trip_list=None):
    """Score each trip based on predictions and update the rating dictionary.

    For every key the matching rows are fetched with ``get_trip_data``; the
    device's rating is increased by the duration of the rows predicted as
    inliers (``pred == 1``) and changed by ``calculate_score_change``.
    Ratings accumulate per device in the order in which ``keys`` is iterated.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with 'device', 'filetimeutc', 'pred' and 'predscore' columns.
    keys : iterable
        Sequences whose element 0 is the device and element 1 the
        filetimeutc of a trip.
    d_curr_ratings : mapping
        Current rating per device; updated in place. It is read with
        ``d_curr_ratings[device]``, so it must supply a value for devices not
        seen before (e.g. a ``defaultdict``).
    trip_list : list, optional
        List to append results to. A new list is created on every call when
        omitted, so results are no longer shared between separate calls.

    Returns
    -------
    list
        ``trip_list`` extended by one ``(device, filetimeutc, rating)`` tuple
        per key.
    """
    # A fresh list per call; a literal [] default would persist across calls
    if trip_list is None:
        trip_list = []

    # Iterate over each key in the keys list
    for key in keys:
        device, filetimeutc = key[0], key[1]

        # Filter DataFrame to get samples for the current key
        df_sample = get_trip_data(df, device, filetimeutc)

        # Duration covered by rows predicted as inliers
        duration_pos_len = calculate_trip_duration(len(df_sample[df_sample['pred'] == 1]))

        # Penalty derived from the outlier predictions and their scores
        score_change = calculate_score_change(df_sample)

        # Update the current score for the device
        curr_score = d_curr_ratings[device] + duration_pos_len + score_change
        d_curr_ratings[device] = curr_score

        # Append device, filetimeutc, and score to trip_list
        trip_list.append((device, filetimeutc, curr_score))

    return trip_list

In [None]:
# Reading the CSV file into a DataFrame.
# NOTE(review): this rebinds df_train, replacing the training frame built in
# memory earlier. The only code that writes train2.csv is the commented-out
# export cell, so the file must already exist at this Drive path.
df_train = pd.read_csv('/content/drive/MyDrive/Honda Capstone Project- MOVE/train2.csv')

# Displaying the first few rows of the DataFrame
df_train.head()

In [None]:
# Build 'df_req' from 'df_train' by removing the row counter, the trip
# identifiers and the date/time columns, together with the lag1..lag7 copies
# of 'partitionDate' and 'datetime_utc'; 'df_train' itself is left unchanged

df_req = df_train.drop(
    columns=['index', 'device', 'filetimeutc', 'filedate', 'datetime_utc', 'partitionDate']
    + [f'{name}_lag{step}'
       for step in range(1, 8)
       for name in ('partitionDate', 'datetime_utc')]
)


In [None]:
# Removing the 'Unnamed: 0' column from df_req.
# The frame is rebound instead of modified in place, and errors='ignore' lets
# the cell run again without a KeyError once the column is already gone.

df_req = df_req.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Predicting outliers in the DataFrame 'df_req' using the IsolationForest model 'clf'
# and assign the predictions to a new column 'pred' in 'df_train'.
# 'df_req' was derived from 'df_train' by dropping columns only, so the rows line up.
# NOTE(review): 'clf' must still be in memory from the fitting cell.

df_train['pred'] = clf.predict(df_req)

In [None]:
# Calculating anomaly scores for each sample in the DataFrame 'df_req' using the IsolationForest model 'clf'
# and assign the scores to a new column 'predscore' in 'df_train'.
# 'pred' and 'predscore' are the two columns read by calculate_score_change.

df_train['predscore'] = clf.score_samples(df_req)

In [None]:
# Getting the unique keys (combinations of 'device' and 'filetimeutc') for the trips in 'df_train',
# each with the number of rows that share that combination

df_keys = df_train[['device','filetimeutc']].value_counts()

In [None]:
# Resetting the index of 'df_keys' so that 'device' and 'filetimeutc' become regular columns
df_keys = df_keys.reset_index()

In [None]:
# Create a list of (device, filetimeutc) tuples from the DataFrame 'df_keys';
# score_trip processes the trips in the order of this list
keys = list(zip(df_keys['device'], df_keys['filetimeutc']))

In [None]:
# defaultdict provides a starting rating for any device looked up for the first time
from collections import defaultdict

# Current rating per driver; unseen drivers start at 1500
driver_curr_ratings = defaultdict(lambda: 1500)

# Container that collects one (device, filetimeutc, rating) tuple per trip
tp_list = []

# Score every trip in 'keys', updating the driver ratings along the way
tp_list = score_trip(df_train, keys, driver_curr_ratings, trip_list=tp_list)
