### Feature engineering

- Remove unnecessary columns to reduce dataframe size
- Filter training records according to the min/max latitude and longitude in the grid info
- Generate grid info and aggregate records according to VIN and grid id

In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import timeit
%matplotlib inline

# Directories that hold the raw training CSV files for each car type
ecar_training_data_path = 'data/training/ecar/'
rcar_training_data_path = 'data/training/rcar/'

# Names of every entry (files and sub-directories) in each training directory
ecar_filenames = list(os.listdir(ecar_training_data_path))
rcar_filenames = list(os.listdir(rcar_training_data_path))

# Columns removed from the raw records: one list per car type, plus the
# columns dropped for both types once grid ids have been assigned
ecar_drop_columns = ['work_mode', 'mileage', 'avg_fuel_consumption', 'system_mode']
rcar_drop_columns = ['power_mode', 'mileage', 'fuel_consumption']
common_drop_columns = ['lat', 'lon', 'date_time']

# Grid definitions and the bounding box that encloses every grid cell
grids = pd.read_csv('data/grid_info.csv')
grid_min_lantitude = grids['latitude_from'].min()
grid_max_lantitude = grids['latitude_to'].max()
grid_min_longitude = grids['longitude_from'].min()
grid_max_longitude = grids['longitude_to'].max()

In [80]:
def round_down_lat_lon(df):
    """Round the ``lat`` and ``lon`` columns of *df* to 3 decimal places.

    Despite the name, values are rounded with ``Series.round`` rather than
    truncated. The frame is modified in place and also returned.
    """
    for coordinate in ('lat', 'lon'):
        df[coordinate] = df[coordinate].round(3)
    return df
        
def filter_data_by_lan_lon(df):
    """Return the rows of *df* that fall inside the overall grid bounding box.

    A row is kept when its ``lat`` and ``lon`` both lie within the module-level
    min/max grid bounds; the bounds themselves are inclusive.
    """
    lat_in_range = (df.lat >= grid_min_lantitude) & (df.lat <= grid_max_lantitude)
    lon_in_range = (df.lon >= grid_min_longitude) & (df.lon <= grid_max_longitude)
    return df[lat_in_range & lon_in_range]

def add_grid_id(df, grid_info=None):
    """Tag every record with the id of each grid cell that contains it.

    Args:
        df: records with at least ``lat`` and ``lon`` columns.
        grid_info: frame with ``latitude_from``, ``latitude_to``,
            ``longitude_from``, ``longitude_to`` and ``grid_id`` columns.
            Defaults to the module-level ``grids`` frame.

    Returns:
        A new frame made of one placeholder row followed by the matching
        records of each grid cell, in grid order, with a ``grid_id`` column.
        The placeholder row holds -1 in ``car_id``, ``date_time``, ``speed``,
        ``lat``, ``lon`` and ``grid_id``; it is kept for backward
        compatibility and callers remove it with ``grid_id != -1``.
        Cell bounds are inclusive, so a record on a boundary shared by several
        cells appears once per matching cell. Records outside every cell are
        absent from the result. *df* itself is not modified.
    """
    if grid_info is None:
        grid_info = grids

    placeholder_columns = ['car_id', 'date_time', 'speed', 'lat', 'lon', 'grid_id']
    # Collect the pieces and concatenate once: DataFrame.append was removed in
    # pandas 2.0, and growing a frame inside the loop copies it every time.
    pieces = [pd.DataFrame({column: [-1] for column in placeholder_columns})]

    # zip walks the columns positionally, so this does not depend on
    # grid_info having a default 0..n-1 index.
    cell_bounds = zip(
        grid_info['latitude_from'],
        grid_info['latitude_to'],
        grid_info['longitude_from'],
        grid_info['longitude_to'],
        grid_info['grid_id'],
    )
    for min_lat, max_lat, min_lon, max_lon, grid_id in cell_bounds:
        in_cell = (
            (df.lat >= min_lat) & (df.lat <= max_lat)
            & (df.lon >= min_lon) & (df.lon <= max_lon)
        )
        # assign() builds a new frame, so nothing is written into a slice of df.
        pieces.append(df[in_cell].assign(grid_id=grid_id))

    return pd.concat(pieces)

def format_date(date_time):
    """Return *date_time* without its last 9 characters and without hyphens.

    For a 'YYYY-MM-DD HH:MM:SS' string this yields 'YYYYMMDD'.
    """
    date_part = date_time[:-9]
    return ''.join(date_part.split('-'))

def retrieve_hour(date_time):
    """Return the two characters located 8 to 6 positions from the end.

    For a 'YYYY-MM-DD HH:MM:SS' string this is the 'HH' part.
    """
    hour_position = slice(-8, -6)
    return date_time[hour_position]

def preprocess_data(dataType='rcar'):
    """Aggregate every raw training file into average speeds per car/date/hour/grid.

    Each regular file in the selected training directory is handled on its own:
    the type-specific columns are dropped, records outside the overall grid
    bounding box are filtered out, ``hour`` and ``date`` are derived from
    ``date_time`` (via ``retrieve_hour`` / ``format_date``), grid ids are
    attached with ``add_grid_id``, and the remaining columns are averaged per
    ``car_id``/``date``/``hour``/``grid_id``. The result is written to
    ``<training dir>/processed/<filename>`` and the elapsed time is printed.

    Args:
        dataType: ``'ecar'`` selects the ecar files and drop columns; any
            other value uses the rcar settings.
    """
    if dataType == 'ecar':
        filenames = ecar_filenames
        filepath = ecar_training_data_path
        drop_columns = ecar_drop_columns
    else:
        filenames = rcar_filenames
        filepath = rcar_training_data_path
        drop_columns = rcar_drop_columns

    # to_csv does not create missing directories, so make sure the target exists.
    output_dir = os.path.join(filepath, 'processed')
    os.makedirs(output_dir, exist_ok=True)

    for filename in filenames:
        source_path = os.path.join(filepath, filename)
        # Sub-directories (e.g. the output folder itself) are not input files.
        if os.path.isdir(source_path):
            continue
        # init run time
        start = timeit.default_timer()
        # loading data
        df = pd.read_csv(source_path)
        # drop unnecessary columns
        df = df.drop(columns=drop_columns)
        # filter data by lat and lon
        df = filter_data_by_lan_lon(df)
        # derive hour and date; assign() returns a new frame, which avoids
        # writing into the filtered slice (SettingWithCopyWarning)
        df = df.assign(
            hour=df['date_time'].apply(retrieve_hour),
            date=df['date_time'].apply(format_date),
        )
        # generate new column grid_id
        df = add_grid_id(df)
        # remove the placeholder row (grid_id == -1) returned by add_grid_id
        df = df[df.grid_id != -1]
        # drop lat, lon and date_time
        df = df.drop(columns=common_drop_columns)
        # Average per file only. The original cross-file accumulator was
        # overwritten by this groupby before use, so it never reached the
        # output and has been removed.
        averaged = df.groupby(['car_id', 'date', 'hour', 'grid_id'], as_index=False).mean()
        # round average speed
        averaged['speed'] = averaged['speed'].round(3)
        # write final file
        averaged.to_csv(os.path.join(output_dir, filename))
        # log run time
        print("Finish preprocess file:[", filename, "] total cost:[", timeit.default_timer() - start ,"]")

In [81]:
# Preprocess every rcar training file; one timing line is printed per file.
preprocess_data('rcar')

Finish preprocess file:[ BOT_data_rcar_20170102_20170102_part0.csv ] total cost:[ 1.3908099659024629 ]
Finish preprocess file:[ BOT_data_rcar_20170102_20170102_part1.csv ] total cost:[ 1.476288340564679 ]
Finish preprocess file:[ BOT_data_rcar_20170102_20170102_part2.csv ] total cost:[ 1.3097060703626084 ]
Finish preprocess file:[ BOT_data_rcar_20170103_20170103_part0.csv ] total cost:[ 1.6841139112111705 ]
Finish preprocess file:[ BOT_data_rcar_20170103_20170103_part1.csv ] total cost:[ 1.6587571594345718 ]
Finish preprocess file:[ BOT_data_rcar_20170103_20170103_part2.csv ] total cost:[ 1.7155589609374147 ]
Finish preprocess file:[ BOT_data_rcar_20170104_20170104_part0.csv ] total cost:[ 1.9067264067598444 ]
Finish preprocess file:[ BOT_data_rcar_20170104_20170104_part1.csv ] total cost:[ 1.7699001498699545 ]
Finish preprocess file:[ BOT_data_rcar_20170104_20170104_part2.csv ] total cost:[ 1.6372950788263552 ]
Finish preprocess file:[ BOT_data_rcar_20170105_20170105_part0.csv ] total

Finish preprocess file:[ BOT_data_rcar_20170128_20170128_part2.csv ] total cost:[ 1.4961704784741414 ]
Finish preprocess file:[ BOT_data_rcar_20170129_20170129_part0.csv ] total cost:[ 1.4149881478747375 ]
Finish preprocess file:[ BOT_data_rcar_20170129_20170129_part1.csv ] total cost:[ 1.4755900896921048 ]
Finish preprocess file:[ BOT_data_rcar_20170129_20170129_part2.csv ] total cost:[ 1.4677491386191832 ]
Finish preprocess file:[ BOT_data_rcar_20170130_20170130_part0.csv ] total cost:[ 1.4760272310177243 ]
Finish preprocess file:[ BOT_data_rcar_20170130_20170130_part1.csv ] total cost:[ 1.4450903485021627 ]
Finish preprocess file:[ BOT_data_rcar_20170130_20170130_part2.csv ] total cost:[ 1.3500418746134528 ]
Finish preprocess file:[ BOT_data_rcar_20170131_20170131_part0.csv ] total cost:[ 1.5407139276649104 ]
Finish preprocess file:[ BOT_data_rcar_20170131_20170131_part1.csv ] total cost:[ 1.5966958656294992 ]
Finish preprocess file:[ BOT_data_rcar_20170131_20170131_part2.csv ] tota

Finish preprocess file:[ BOT_data_rcar_20170224_20170224_part1.csv ] total cost:[ 1.6947315144916502 ]
Finish preprocess file:[ BOT_data_rcar_20170224_20170224_part2.csv ] total cost:[ 1.6779004237537265 ]
Finish preprocess file:[ BOT_data_rcar_20170225_20170225_part0.csv ] total cost:[ 1.690533833614154 ]
Finish preprocess file:[ BOT_data_rcar_20170225_20170225_part1.csv ] total cost:[ 1.6517396487438418 ]
Finish preprocess file:[ BOT_data_rcar_20170225_20170225_part2.csv ] total cost:[ 1.5428395944268232 ]
Finish preprocess file:[ BOT_data_rcar_20170226_20170226_part0.csv ] total cost:[ 1.5440973657573522 ]
Finish preprocess file:[ BOT_data_rcar_20170226_20170226_part1.csv ] total cost:[ 1.6091328046077251 ]
Finish preprocess file:[ BOT_data_rcar_20170226_20170226_part2.csv ] total cost:[ 1.4838183617753202 ]
Finish preprocess file:[ BOT_data_rcar_20170227_20170227_part0.csv ] total cost:[ 1.75773837232191 ]
Finish preprocess file:[ BOT_data_rcar_20170227_20170227_part1.csv ] total c

In [None]:
# Preprocess every ecar training file; one timing line is printed per file.
preprocess_data('ecar')