### Feature engineering

- Remove unnecessary columns to reduce dataframe size
- Filter train records according to min/max langtitude and longitude in grid info
- Generate grid info and aggregate records accroding to VIN and grid id

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

# loading training files
ecar_training_data_path = 'data/training/ecar/'
rcar_training_data_path = 'data/training/rcar/'

ecar_filenames = [f for f in os.listdir(ecar_training_data_path)]
rcar_filenames = [f for f in os.listdir(rcar_training_data_path)]

# define drop columns
ecar_drop_columns = ['work_mode','mileage','speed','avg_fuel_consumption','system_mode']
rcar_drop_columns = ['power_mode','mileage','speed','fuel_consumption']
common_drop_columns = ['lat','lon']

# loading grid min/max lantitude and longitude
grids = pd.read_csv('data/grid_info.csv')
grid_max_lantitude = grids['latitude_to'].max()
grid_min_lantitude = grids['latitude_from'].min()
grid_max_longitude = grids['longitude_to'].max()
grid_min_longitude = grids['longitude_from'].min()

In [8]:
def preprocess_ecar_data():
    for ecar_filename in ecar_filenames:
        # loading data
        df = pd.read_csv(ecar_training_data_path + ecar_filename)
        # drop uncessary columns
        df = df.drop(columns=ecar_drop_columns)
        # filter data by lat and lon
        df = filter_data_by_lan_lon(df)
        # generate new column grid_id
        df['grid_id'] = df.apply(lambda row: retrieve_grid_id(row), axis=1)
        # filter data by grid_id
        df = df[df.grid_id != -1]
        # drop lat and lon
        df = df.drop(columns=common_drop_columns)
        # format data_time column 
        df['date_time'] = df['date_time'].apply(lambda x : format_date_time(x))
        # aggregate data by date_time,vin and grid_id
        df = df.drop_duplicates()
        # generating new feature file
        df.to_csv(ecar_training_data_path + "_" + ecar_filename)
        print("Finish to process file ", ecar_filename)
        
def preprocess_rcar_data():
    df_ = pd.DataFrame(index=[0], columns=['car_id', 'date_time', 'grid_id'])
    df_ = df_.fillna(-1)
    for rcar_filename in rcar_filenames:
        # loading data
        print("Start to loading file ", rcar_filename)
        df = pd.read_csv(rcar_training_data_path + rcar_filename)
        # drop uncessary columns
        df = df.drop(columns=rcar_drop_columns)
        # filter data by lat and lon
        df = filter_data_by_lan_lon(df)
        # generate new column grid_id
        df['grid_id'] = df.apply(lambda row: retrieve_grid_id(row), axis=1)
        # filter data by grid_id
        df = df[df.grid_id != -1]
        # drop lat and lon
        df = df.drop(columns=common_drop_columns)
        # format data_time column 
        df['date_time'] = df['date_time'].apply(lambda x : format_date_time(x))
        # aggregate data by date_time,vin and grid_id
        df = df.drop_duplicates()
        # appending df
        df_ = df_.append(df)
    # write final file
    df_.to_csv(rcar_training_data_path + 'rcar_training.csv')
        
def filter_data_by_lan_lon(df):
    df = df[(df.lat >= grid_min_lantitude ) & (df.lat <= grid_max_lantitude)]
    df = df[(df.lon >= grid_min_longitude ) & (df.lon <= grid_max_longitude)]
    return df

def retrieve_grid_id(row):
    for i in range(len(grids)):
        if (row.lat >= grids['latitude_from'][i] and row.lat <= grids['latitude_to'][i] and 
            row.lon >= grids['longitude_from'][i] and row.lon <= grids['longitude_to'][i]):
            return grids['grid_id'][i]
    return -1

def format_date_time(date_time):
    return date_time[:-6].replace('-','').replace(' ', '')

In [88]:
preprocess_rcar_data()

Start to loading file  BOT_data_rcar_20170102_20170102_part0.csv
Start to loading file  BOT_data_rcar_20170102_20170102_part1.csv
Start to loading file  BOT_data_rcar_20170102_20170102_part2.csv
Start to loading file  BOT_data_rcar_20170103_20170103_part0.csv
Start to loading file  BOT_data_rcar_20170103_20170103_part1.csv
Start to loading file  BOT_data_rcar_20170103_20170103_part2.csv
Start to loading file  BOT_data_rcar_20170104_20170104_part0.csv
Start to loading file  BOT_data_rcar_20170104_20170104_part1.csv
Start to loading file  BOT_data_rcar_20170104_20170104_part2.csv
Start to loading file  BOT_data_rcar_20170105_20170105_part0.csv
Start to loading file  BOT_data_rcar_20170105_20170105_part1.csv
Start to loading file  BOT_data_rcar_20170105_20170105_part2.csv
Start to loading file  BOT_data_rcar_20170106_20170106_part0.csv
Start to loading file  BOT_data_rcar_20170106_20170106_part1.csv
Start to loading file  BOT_data_rcar_20170106_20170106_part2.csv
Start to loading file  BO

Start to loading file  BOT_data_rcar_20170213_20170213_part1.csv
Start to loading file  BOT_data_rcar_20170213_20170213_part2.csv
Start to loading file  BOT_data_rcar_20170214_20170214_part0.csv
Start to loading file  BOT_data_rcar_20170214_20170214_part1.csv
Start to loading file  BOT_data_rcar_20170214_20170214_part2.csv
Start to loading file  BOT_data_rcar_20170215_20170215_part0.csv
Start to loading file  BOT_data_rcar_20170215_20170215_part1.csv
Start to loading file  BOT_data_rcar_20170215_20170215_part2.csv
Start to loading file  BOT_data_rcar_20170216_20170216_part0.csv
Start to loading file  BOT_data_rcar_20170216_20170216_part1.csv
Start to loading file  BOT_data_rcar_20170216_20170216_part2.csv
Start to loading file  BOT_data_rcar_20170217_20170217_part0.csv
Start to loading file  BOT_data_rcar_20170217_20170217_part1.csv
Start to loading file  BOT_data_rcar_20170217_20170217_part2.csv
Start to loading file  BOT_data_rcar_20170218_20170218_part0.csv
Start to loading file  BO

In [None]:
preprocess_ecar_data()

  if self.run_code(code, result):


Finish to process file  BOT_data_ecar_20170105_20170105_part2.csv
Finish to process file  BOT_data_ecar_20170106_20170106_part0.csv
Finish to process file  BOT_data_ecar_20170106_20170106_part1.csv
Finish to process file  BOT_data_ecar_20170106_20170106_part2.csv
Finish to process file  BOT_data_ecar_20170107_20170107_part0.csv
Finish to process file  BOT_data_ecar_20170107_20170107_part1.csv
Finish to process file  BOT_data_ecar_20170107_20170107_part2.csv
Finish to process file  BOT_data_ecar_20170108_20170108_part0.csv
Finish to process file  BOT_data_ecar_20170108_20170108_part1.csv
Finish to process file  BOT_data_ecar_20170108_20170108_part2.csv
Finish to process file  BOT_data_ecar_20170109_20170109_part0.csv
Finish to process file  BOT_data_ecar_20170109_20170109_part1.csv
Finish to process file  BOT_data_ecar_20170109_20170109_part2.csv
Finish to process file  BOT_data_ecar_20170110_20170110_part0.csv
Finish to process file  BOT_data_ecar_20170110_20170110_part1.csv
Finish to 

Finish to process file  BOT_data_ecar_20170216_20170216_part1.csv
Finish to process file  BOT_data_ecar_20170216_20170216_part2.csv
Finish to process file  BOT_data_ecar_20170217_20170217_part0.csv
Finish to process file  BOT_data_ecar_20170217_20170217_part1.csv
Finish to process file  BOT_data_ecar_20170217_20170217_part2.csv
Finish to process file  BOT_data_ecar_20170218_20170218_part0.csv
Finish to process file  BOT_data_ecar_20170218_20170218_part1.csv
Finish to process file  BOT_data_ecar_20170218_20170218_part2.csv
Finish to process file  BOT_data_ecar_20170219_20170219_part0.csv


In [11]:
import timeit

# Have a view to see which steps consume most of time
# Retrieve grid id conusme 99.7% of the total run time 
# But there is no room to improve this step as grid_id is dependent
# on other columns like latitude and longitude

# loading data
init_start = timeit.default_timer()
df = pd.read_csv('data/training/rcar/BOT_data_rcar_20170102_20170102_part0.csv')
print('Loading csv time: ', timeit.default_timer() - init_start)

# drop uncessary columns
start = timeit.default_timer()
df = df.drop(columns=rcar_drop_columns)
print('drop rcar column time: ', timeit.default_timer() - start)

# filter data by lat and lon
start = timeit.default_timer()
df = filter_data_by_lan_lon(df)
print('filter by lat and lon time: ', timeit.default_timer() - start)

# generate new column grid_id
start = timeit.default_timer()
df['grid_id'] = df.apply(lambda row: retrieve_grid_id(row), axis=1)
print('retrieve grid id time: ', timeit.default_timer() - start)

# filter data by grid_id
start = timeit.default_timer()
df = df[df.grid_id != -1]
print('filter grid id time: ', timeit.default_timer() - start)

# drop lat and lon
start = timeit.default_timer()
df = df.drop(columns=common_drop_columns)
print('drop lat and lon time: ', timeit.default_timer() - start)

# format data_time column 
start = timeit.default_timer()
df['date_time'] = df['date_time'].apply(lambda x : format_date_time(x))
print('Format date time: ', timeit.default_timer() - start)

# aggregate data by date_time,vin and grid_id
start = timeit.default_timer()
df = df.drop_duplicates()
print('Drop duplicates time: ', timeit.default_timer() - start)

# generating new feature file
start = timeit.default_timer()
df.to_csv(rcar_training_data_path + "_test.csv")
print('Write csv time: ', timeit.default_timer() - start)
print('Total consume time: ', timeit.default_timer() - init_start)

Loading csv time:  0.2476450320459594
drop rcar column time:  0.007401510347551626
filter by lat and lon time:  0.01283933382830027
retrieve grid id time:  90.22524556305386
filter grid id time:  0.0031992306721377872
drop lat and lon time:  0.0006683586837823441
Format date time:  0.0020962855506212463
Drop duplicates time:  0.0021369082687954233
Write csv time:  0.00611129194039961
Total consume time:  90.50847226388109
