# Grab Data Safety Challenge 

Objective: Determine whether a given drive (booking) is a safe drive or a reckless drive.
How to tell:
    1. From the acceleration recorded at each point of the drive
    2. From the location and any other geolocation behavior

In [30]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
import pprint

import os, math

# os.listdir() returns entries in arbitrary order, so sort the names to make the
# order in which the CSV parts are later stitched (and the choice of
# labels_list_files[0]) reproducible across machines and runs.
# Hidden entries (names starting with '.') are skipped.
features_list_files = sorted(f for f in os.listdir('./features') if not f.startswith('.'))
labels_list_files = sorted(f for f in os.listdir('./labels') if not f.startswith('.'))

In [31]:
# Show the discovered feature and label file names, one list per line.
print(features_list_files, labels_list_files, sep='\n')

['part-00000-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv', 'part-00001-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv', 'part-00002-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv', 'part-00003-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv', 'part-00004-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv', 'part-00005-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv', 'part-00006-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv', 'part-00007-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv', 'part-00008-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv', 'part-00009-e6120af0-10c2-4248-97c4-81baf4304e5c-c000.csv']
['part-00000-e9445087-aa0a-433b-a7f6-7f4c19d78ad6-c000.csv']


# Extracting the CSV files and stitching them together

In [41]:
def generateDf(filename,folder):
    """Read a single CSV file located in a sub-folder of the working directory.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside ``folder``.
    folder : str
        Folder name, relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed content of ``./<folder>/<filename>``.
    """
    # Builds exactly './<folder>/<filename>'.
    csv_location = '/'.join(('.', folder, filename))
    return pd.read_csv(csv_location)

In [52]:
def generateSafetyDf(filenames):
    """Stitch all feature CSV parts into one DataFrame.

    Each name in ``filenames`` is read from the ``features`` folder via
    ``generateDf`` and the parts are stacked row-wise in the given order.

    Parameters
    ----------
    filenames : iterable of str
        CSV file names inside the ``features`` folder.

    Returns
    -------
    pandas.DataFrame
        All rows of all parts. With no file names, an empty frame carrying the
        expected telematics columns is returned. The row index of each part is
        kept as read (it is not reset), so index labels repeat across parts.
    """
    # Empty template listing the expected columns; kept as the first element so
    # the column set (and the empty-input result) matches the previous
    # implementation.
    empty_template = pd.DataFrame(columns=['bookingID', 'Accuracy', 'Bearing', 'acceleration_x', 'acceleration_y',
       'acceleration_z', 'gyro_x', 'gyro_y', 'gyro_z', 'second', 'Speed'])
    # Read every part first and concatenate once: calling pd.concat inside the
    # loop re-copies the whole accumulated frame for every file.
    parts = [generateDf(filename,'features') for filename in filenames]
    return pd.concat([empty_template] + parts)
        

In [55]:
# Load the first (only listed) labels CSV and stitch every feature part
# into a single telematics frame.
df_label = generateDf(filename=labels_list_files[0], folder='labels')
df_safety = generateSafetyDf(filenames=features_list_files)

In [79]:
# Store the label key as object dtype.
# NOTE(review): presumably done so the key dtype matches df_safety['bookingID']
# for the merge on 'bookingID' further down - confirm.
booking_ids_as_object = df_label['bookingID'].astype(object)
df_label['bookingID'] = booking_ids_as_object

In [80]:
# DataFrame.size is the total number of cells (rows x columns), not the row count.
print(df_safety.size, df_label.size, sep='\n')

177491171
40036


In [81]:
# Preview the first five telematics rows.
df_safety.head(n=5)

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed
0,1202590843006,3.0,353.0,1.228867,8.9001,3.986968,0.008221,0.002269,-0.009966,1362.0,0.0
1,274877907034,9.293,17.0,0.032775,8.659933,4.7373,0.024629,0.004028,-0.010858,257.0,0.19
2,884763263056,3.0,189.0,1.139675,9.545974,1.951334,-0.006899,-0.01508,0.001122,973.0,0.667059
3,1073741824054,3.9,126.0,3.871543,10.386364,-0.136474,0.001344,-0.339601,-0.017956,902.0,7.913285
4,1056561954943,3.9,50.0,-0.112882,10.55096,-1.56011,0.130568,-0.061697,0.16153,820.0,20.419409


In [82]:
# Preview the first five label rows.
df_label.head(n=5)

Unnamed: 0,bookingID,label
0,111669149733,0
1,335007449205,1
2,171798691856,0
3,1520418422900,0
4,798863917116,0


In [83]:
# Attach the label to every telematics row; default inner join on 'bookingID'.
# NOTE(review): the shapes printed later in this notebook show more rows in
# df_merge (16154418) than in df_safety (16135561), which implies some
# bookingID values occur more than once in df_label and their telematics rows
# get duplicated here - confirm and decide how to resolve repeated labels.
df_merge = df_safety.merge(df_label, on='bookingID')

In [84]:
# Preview the first five merged rows (telematics columns plus 'label').
df_merge.head(n=5)

Unnamed: 0,bookingID,Accuracy,Bearing,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z,second,Speed,label
0,1202590843006,3.0,353.0,1.228867,8.9001,3.986968,0.008221,0.002269,-0.009966,1362.0,0.0,1
1,1202590843006,3.033,77.0,1.137217,8.975278,3.98105,0.021118,-0.022515,0.024522,541.0,0.0,1
2,1202590843006,6.173,283.0,1.522818,7.947064,5.075056,0.137567,-0.297055,-0.017418,154.0,2.863286,1
3,1202590843006,3.014,353.0,1.134775,8.932489,3.929421,-0.007959,-0.020543,-0.022986,1384.0,0.0,1
4,1202590843006,5.209,340.0,0.858197,9.078037,2.944571,0.061924,0.015219,0.025517,1142.0,9.750134,1


## Generate pickles so df_merge, df_safety and df_label can be reloaded quickly

In [85]:
import pickle

# Persist each frame to its own pickle file in the working directory so later
# sessions can reload them instead of re-reading and re-merging the CSVs.
for pickle_path, frame in (
    ('df_merge.pickle', df_merge),
    ('df_safety.pickle', df_safety),
    ('df_label.pickle', df_label),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(frame, handle)

## Loading the pickles to restore the dataframes quickly

In [87]:
# Restore the three frames from the pickle files written by this notebook.
# Only unpickle files you created yourself: unpickling can execute arbitrary code.
df_merge, df_safety, df_label = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('df_merge.pickle', 'df_safety.pickle', 'df_label.pickle')
)

In [89]:
# Report (rows, columns) for the merged, telematics and label frames, in that order.
for frame in (df_merge, df_safety, df_label):
    print(frame.shape)

(16154418, 12)
(16135561, 11)
(20018, 2)


# Let's do an exploratory analysis

1. Group the merged data by booking ID.
2. Compute aggregations of each variable for every group.