# About

* **Author**: Adil Rashitov
* **Creation Date**: 19.02.2020
* **Goal**: This notebook is created for experiments with modeling of vehicle fast movement for clustering needs
* **Deliverable**: Model detecting if car moving fast or not

In [1]:
# Imports
import os
import numpy as np
import pandas as pd
import logging
import plotly.express as px
import geopandas as gpd
import plotly.express as px
from multiprocessing import Pool
from sklearn.model_selection import train_test_split
from GPSOdyssey import Polaris, Kepler, Void, Vega
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

%load_ext autoreload
%autoreload 2

# Data

In [2]:
HISTORY = []

## Train GPS records 

In [3]:
I_OUTPUT_GPS_METRICS_ANALYSIS = '/Data/Intermediate/FastMovementDetection/TrainTestFastMovementDetection.csv'
gps = pd.read_csv(I_OUTPUT_GPS_METRICS_ANALYSIS, parse_dates=['datetime'])

gps['trip_id'] = gps['vehicle_id'] + ' ' + gps['datetime'].dt.date.astype('str')

gps.head()

Unnamed: 0,truck_id,lat_match,lon_match,lon,lat,datetime,velocity,is_speed_large,vehicle_id,trip_id
0,XE 5565T,1.334048,103.643252,103.643272,1.334069,2020-10-01 05:20:47+00:00,3.700296,True,XE-5565T,XE-5565T 2020-10-01
1,XE 5565T,1.334188,103.643109,103.643047,1.334273,2020-10-01 05:21:04+00:00,4.88102,True,XE-5565T,XE-5565T 2020-10-01
2,XE 5565T,1.33434,103.64295,103.642965,1.334356,2020-10-01 05:21:09+00:00,5.130609,True,XE-5565T,XE-5565T 2020-10-01
3,XE 5565T,1.334502,103.642785,103.642804,1.334521,2020-10-01 05:21:14+00:00,7.08742,True,XE-5565T,XE-5565T 2020-10-01
4,XE 5565T,1.334802,103.642673,103.642714,1.334771,2020-10-01 05:21:19+00:00,6.801627,True,XE-5565T,XE-5565T 2020-10-01


## Deliverable model

In [4]:
OUTPUT_MODEL = '/models/fast_movement_detector.sav'
OUTPUT_MODEL

'/models/fast_movement_detector.sav'

In [5]:
!ls /models

README.md  fast_movement_detector.sav


# Preprocessing

## Feature engineering

In [6]:
unique_trips = gps['trip_id'].unique()
gps_trips = list(map(lambda gps, trip: gps[gps['trip_id'] == trip], [gps]*len(unique_trips), unique_trips))


def preprocess_trips(gps):
    gps = gps.reset_index(drop=True)
    gps = Polaris(gps) \
        .add_lag_of_column('velocity', lag_shifts=[1, 2, 3, 4, 5]) \
        .pandas_df_operation(func_name='fillna', columns=['velocity_lag_1', 'velocity_lag_2', 'velocity_lag_3',
                                                          'velocity_lag_4', 'velocity_lag_5'], arguments={'method': 'bfill'}) \
        .select_columns(['velocity', 'velocity_lag_1', 'velocity_lag_2', 'velocity_lag_3',
                         'velocity_lag_4', 'velocity_lag_5', 'is_speed_large']) \
        .df
    return gps

dataset = pd.concat(list(map(preprocess_trips, gps_trips)))
dataset

Unnamed: 0,velocity,velocity_lag_1,velocity_lag_2,velocity_lag_3,velocity_lag_4,velocity_lag_5,is_speed_large
0,3.700296,3.700296,3.700296,3.700296,3.700296,3.700296,True
1,4.881020,3.700296,3.700296,3.700296,3.700296,3.700296,True
2,5.130609,4.881020,3.700296,3.700296,3.700296,3.700296,True
3,7.087420,5.130609,4.881020,3.700296,3.700296,3.700296,True
4,6.801627,7.087420,5.130609,4.881020,3.700296,3.700296,True
...,...,...,...,...,...,...,...
379,13.608232,12.913178,10.618835,7.221887,7.056251,9.075022,True
380,8.219281,13.608232,12.913178,10.618835,7.221887,7.056251,True
381,2.685028,8.219281,13.608232,12.913178,10.618835,7.221887,True
382,2.460051,2.685028,8.219281,13.608232,12.913178,10.618835,True


## X & y split

In [7]:
dataset.columns
X = dataset[['velocity', 'velocity_lag_1', 'velocity_lag_2', 'velocity_lag_3', 'velocity_lag_4', 'velocity_lag_5']]
y = dataset['is_speed_large']

Index(['velocity', 'velocity_lag_1', 'velocity_lag_2', 'velocity_lag_3',
       'velocity_lag_4', 'velocity_lag_5', 'is_speed_large'],
      dtype='object')

## Train & test split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Modeling

In [9]:
from sklearn.linear_model import LogisticRegression

parameters = {
}

classifier = LogisticRegression(**parameters)
classifier.fit(X_train, y_train)


LogisticRegression()

# Performance estimation

In [10]:
from sklearn.metrics import roc_auc_score


y_pred = classifier.predict(X_test)
HISTORY.append((parameters, roc_auc_score(y_test, y_pred)))

In [11]:
HISTORY

[({}, 0.9772228448678435)]

# MODEL EXPORT

In [12]:
import pickle


with open(OUTPUT_MODEL, 'wb') as file:
    pickle.dump(classifier, file)

In [13]:
classifier

LogisticRegression()