### Feature engineering phase 1

- Remove unnecessary columns to reduce dataframe size
- Filter training records to the min/max latitude and longitude found in the grid info
- Assign a grid id to each record and aggregate records according to VIN and grid id

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import timeit
%matplotlib inline

# Folders holding the raw per-file training data for each car type
ecar_training_data_path = 'data/training/ecar/'
rcar_training_data_path = 'data/training/rcar/'

# Aggregated phase-1 output, one CSV per car type
ecar_output_filepath = 'data/training/ecar/processed/ecar_training.csv'
rcar_output_filepath = 'data/training/rcar/processed/rcar_training.csv'

# Weather input and the feature / submission files produced by later cells
weather_filepath = 'data/weather.csv'
training_features_filepath = 'data/training.csv'
test_features_filepath = 'data/test.csv'
submission_filepath = 'data/submit_samples.csv'

# Every directory entry is listed here; sub-directories are skipped later in preprocess_data
ecar_filenames = list(os.listdir(ecar_training_data_path))
rcar_filenames = list(os.listdir(rcar_training_data_path))

# Raw columns discarded right after loading, per car type
ecar_drop_columns = ['work_mode', 'mileage', 'avg_fuel_consumption', 'system_mode']
rcar_drop_columns = ['power_mode', 'mileage', 'fuel_consumption']
# Columns discarded once the grid id, date and hour have been derived
common_drop_columns = ['lat', 'lon', 'date_time']

# Column layout of the feature files (car_number is kept last)
feature_columns = [
    'year', 'month', 'day', 'hour', 'weekday', 'grid_id',
    'temperture', 'rainy', 'holiday',
    'car_number',
]

# Same layout with the average speed inserted before car_number
feature_columns_with_speed = [
    'year', 'month', 'day', 'hour', 'weekday', 'grid_id',
    'temperture', 'rainy', 'holiday',
    'speed', 'car_number',
]

# Columns removed before averaging car_number per hour / weekday / grid
feature_drop_to_calc_avg_car_number = ['year', 'month', 'day', 'temperture', 'rainy', 'holiday']


# Grid definitions and the bounding box that encloses all grid cells
grids = pd.read_csv('data/grid_info_b.csv')
grid_min_lantitude, grid_max_lantitude = grids['latitude_from'].min(), grids['latitude_to'].max()
grid_min_longitude, grid_max_longitude = grids['longitude_from'].min(), grids['longitude_to'].max()

In [2]:
def round_down_lat_lon(df):
    """Round the ``lat`` and ``lon`` columns of ``df`` to three decimals.

    Despite the name, ``Series.round`` is used, so values are rounded to the
    nearest thousandth rather than truncated. The columns are overwritten on
    the frame that was passed in, and that same frame is returned.
    """
    for coord in ('lat', 'lon'):
        df[coord] = df[coord].round(3)
    return df
        
def filter_data_by_lan_lon(df):
    """Keep only the rows whose position lies inside the overall grid bounding box.

    Both bounds are inclusive; the limits come from the module-level
    ``grid_min_*`` / ``grid_max_*`` values.
    """
    lat_inside = df['lat'].between(grid_min_lantitude, grid_max_lantitude)
    lon_inside = df['lon'].between(grid_min_longitude, grid_max_longitude)
    return df[lat_inside & lon_inside]

def add_grid_id(df):
    """Return the rows of ``df`` that fall inside a grid cell, tagged with ``grid_id``.

    Each row of the module-level ``grids`` frame describes one cell through
    ``latitude_from`` / ``latitude_to`` / ``longitude_from`` / ``longitude_to``
    and ``grid_id``. Rows of ``df`` that match no cell are not returned.
    ``df`` itself is left unmodified.

    The per-cell pieces are collected in a list and concatenated once:
    ``DataFrame.append`` no longer exists in pandas 2.x, and appending inside
    the loop copied the accumulated frame on every iteration. No ``-1``
    placeholder row is emitted any more, so callers that still filter
    ``grid_id != -1`` simply find nothing to remove.

    NOTE(review): the cell bounds are inclusive on both sides, so a point that
    lies exactly on an edge shared by two cells is returned once per cell.
    """
    pieces = []
    for cell in grids.itertuples(index=False):
        inside_cell = (
            df['lat'].between(cell.latitude_from, cell.latitude_to)
            & df['lon'].between(cell.longitude_from, cell.longitude_to)
        )
        # assign() builds a new frame, so the filtered slice is never written to
        pieces.append(df[inside_cell].assign(grid_id=cell.grid_id))
    if not pieces:
        # no grid cells defined: empty result that still carries a grid_id column
        return df.iloc[:0].assign(grid_id=[])
    return pd.concat(pieces)

def format_date(date_time):
    """Strip the last nine characters of ``date_time`` and remove every dash.

    For a ``'YYYY-MM-DD HH:MM:SS'`` string this yields ``'YYYYMMDD'``.
    """
    date_part = date_time[:-9]
    return ''.join(date_part.split('-'))

def retrieve_hour(date_time):
    """Return the two characters located eight to six positions from the end.

    For a ``'YYYY-MM-DD HH:MM:SS'`` string this is the ``'HH'`` part.
    """
    hour_start, hour_stop = -8, -6
    return date_time[hour_start:hour_stop]

def filter_data_by_date(df, filename):
    """Keep the rows whose ``date`` equals the date embedded in ``filename``.

    The date is taken as the fourth underscore-separated token of the file
    name; an ``IndexError`` is raised when the name has fewer tokens.
    """
    name_parts = filename.split("_")
    file_date = name_parts[3]
    on_file_date = df['date'] == file_date
    return df.loc[on_file_date]

def preprocess_data(dataType='rcar'):
    """Aggregate the raw files of one car type into a single training CSV.

    Parameters
    ----------
    dataType : str
        ``'ecar'`` selects the ecar inputs and output path; any other value
        uses the rcar ones.

    For every regular file in the input folder the records are restricted to
    the grid bounding box, reduced to the date named in the file name, tagged
    with a grid id and averaged per (car_id, date, hour, grid_id). The combined
    result is written to the output path. Nothing is done (apart from a log
    line) when the output file already exists.

    The per-file results are gathered in a list and concatenated once, because
    ``DataFrame.append`` is not available in pandas 2.x and re-copied the
    accumulated frame on each iteration. The ``-1`` placeholder row used to
    seed the accumulator is gone, so an empty input folder now produces a
    header-only CSV instead of a row of ``-1`` values.
    """
    started = timeit.default_timer()

    # pick the inputs for the requested car type (rcar is the fallback)
    if dataType == 'ecar':
        filenames, filepath = ecar_filenames, ecar_training_data_path
        drop_columns, output_filepath = ecar_drop_columns, ecar_output_filepath
    else:
        filenames, filepath = rcar_filenames, rcar_training_data_path
        drop_columns, output_filepath = rcar_drop_columns, rcar_output_filepath

    # if file is already exist return
    if os.path.exists(output_filepath):
        print(output_filepath + " already existed, will return directly")
        return

    group_keys = ['car_id', 'date', 'hour', 'grid_id']
    output_columns = ['car_id', 'date', 'hour', 'speed', 'grid_id']
    per_file_frames = []
    for filename in filenames:
        source_path = filepath + filename
        # sub-directories (e.g. the output folder) are not data files
        if os.path.isdir(source_path):
            continue
        df = pd.read_csv(source_path, low_memory=False)
        df = df.drop(columns=drop_columns)
        df = filter_data_by_lan_lon(df)
        # assign() returns a new frame, so the filtered slice is never written to
        df = df.assign(
            hour=df['date_time'].apply(retrieve_hour),
            date=df['date_time'].apply(format_date),
        )
        # keep only the records belonging to the day named in the file name
        df = filter_data_by_date(df, filename)
        df = add_grid_id(df)
        # defensive: discard any placeholder rows carrying grid_id == -1
        df = df[df.grid_id != -1]
        df = df.drop(columns=common_drop_columns)
        # one row per car / date / hour / grid with the mean of the remaining columns
        df = df.groupby(group_keys, as_index=False).mean()
        df['speed'] = df['speed'].round(3)
        per_file_frames.append(df)

    if per_file_frames:
        combined = pd.concat(per_file_frames, ignore_index=True)
    else:
        combined = pd.DataFrame(columns=output_columns)
    # keep the historical column order of the output file; any extra columns follow
    extra_columns = [col for col in combined.columns if col not in output_columns]
    combined = combined.reindex(columns=output_columns + extra_columns)

    print("Finish process all the files, total cost:[", timeit.default_timer() - started, "] seconds.")

    # make sure the target folder exists before writing
    output_dir = os.path.dirname(output_filepath)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    combined.to_csv(output_filepath, index=False)

In [3]:
# Build the aggregated rcar training CSV; the call returns early when that file already exists.
preprocess_data('rcar')

Finish process all the files, total cost:[ 414.39815879351994 ] seconds.


In [4]:
# Build the aggregated ecar training CSV; the call returns early when that file already exists.
preprocess_data('ecar')

Finish process all the files, total cost:[ 1168.559825286454 ] seconds.


### Feature engineering phase 2
- Merge ecar and rcar training data
- Drop the speed and car_id columns first (these two columns may be used in the future, but let's try a simple solution first)
- Aggregate training data and calculate total number of cars per day/per hour/per grid
- Merge training data with weather data

In [3]:
def preprocess_weather_data():
    """Load the weather CSV and derive calendar columns from its ``date`` column.

    Adds ``year``, ``month``, ``day`` and ``weekday`` and rewrites ``date`` as
    an ``int64`` in ``yyyymmdd`` form so it can be joined with the car data.
    """
    weather = pd.read_csv(weather_filepath)
    stamps = pd.to_datetime(weather['date'])
    weather['date'] = stamps
    # calendar parts, added in this order so the column layout stays the same
    for part in ('year', 'month', 'day', 'weekday'):
        weather[part] = getattr(stamps.dt, part)
    # yyyy-mm-dd -> yyyymmdd, stored as an integer
    compact = weather['date'].astype(str).str.replace('-', '', regex=False)
    weather['date'] = compact.astype(np.int64)
    return weather

def preprocess_phase1_data(use_full_data=True, use_avg_car_number=False):
    """Build the training feature file from the phase-1 car data and the weather data.

    Parameters
    ----------
    use_full_data : bool
        Passed to ``handle_na_car_number``: keep every generated slot and fill
        missing values with 0 (True) or drop rows with missing values (False).
    use_avg_car_number : bool
        Passed to ``handle_na_car_number``: when filling, replace zero counts
        by the per-group average.

    The ecar and rcar frames are combined with ``pd.concat`` because
    ``DataFrame.append`` is not available in pandas 2.x. The result is written
    to ``training_features_filepath`` restricted to ``feature_columns``.
    """
    # loading files
    ecar = pd.read_csv(ecar_output_filepath)
    rcar = pd.read_csv(rcar_output_filepath)
    weather = preprocess_weather_data()
    full_data = generate_full_data_set()
    # attach the weather columns to every generated (day, hour, grid) slot
    f_weather = pd.merge(full_data, weather, on=['year', 'month', 'day', 'hour'], how='left')
    # stack both car types; each remaining row stands for one car in one slot
    cars = pd.concat([ecar, rcar], ignore_index=True)
    cars = cars.drop(columns=['car_id', 'speed'])
    cars['car_number'] = 1
    # count cars per date, hour and grid id
    cars = cars.groupby(['date', 'hour', 'grid_id'], as_index=False).sum()
    # left join: slots without any car get a missing car_number
    df = pd.merge(f_weather, cars, on=['date', 'hour', 'grid_id'], how='left')
    # remove 23:00 ~ 8:00 data
    df = remove_other_hours_data(df)
    # remove CNY
    df = remove_chinese_new_year_data(df)
    # handle N/A car number
    df = handle_na_car_number(df, use_full_data, use_avg_car_number)
    # write to disk
    df.to_csv(training_features_filepath, columns=feature_columns, index=False)
    
def generate_average_speed_per_grid_weekday_hour():
    """Return the mean speed per weekday, hour and grid id as integers.

    Reads the ecar and rcar phase-1 files, discards the excluded dates listed
    below and all records with speed 0, and averages the remaining speeds.
    The returned frame has the columns ``weekday``, ``hour``, ``grid_id`` and
    ``speed`` (rounded, ``int``).

    ``pd.concat`` replaces ``DataFrame.append``, which is not available in
    pandas 2.x, and the weekday column is added with ``assign`` so no filtered
    slice is written to.
    """
    excluded_dates = [
        # remove outlier
        20161230, 20170101,
        # remove CNY
        20170127, 20170128, 20170129, 20170130, 20170131, 20170201, 20170202,
    ]
    df = pd.concat(
        [pd.read_csv(ecar_output_filepath), pd.read_csv(rcar_output_filepath)],
        ignore_index=True,
    )
    keep = ~df['date'].isin(excluded_dates) & (df['speed'] != 0)
    df = df[keep]
    # derive the weekday from the yyyymmdd date, then drop what is not averaged
    weekday = pd.to_datetime(df['date'], format='%Y%m%d').dt.weekday
    df = df.assign(weekday=weekday).drop(columns=['date', 'car_id'])
    # get average speed per grid, per weekday, per hour
    df = df.groupby(['weekday', 'hour', 'grid_id'], as_index=False).mean()
    df['speed'] = df['speed'].round().astype(int)
    return df

# train data from 2017-01-02 to 2017-03-12
def generate_full_data_set():
    """Return one row per day, hour (0-23) and grid id (51-100) of the training period.

    The three monthly frames are stacked with a single ``pd.concat`` call;
    ``DataFrame.append`` is not available in pandas 2.x. As before, the row
    index of each monthly frame is kept as is.
    """
    monthly_frames = [
        generate_data_by_range(2017, 1, 2, 31, 0, 23, 51, 100),
        generate_data_by_range(2017, 2, 1, 28, 0, 23, 51, 100),
        generate_data_by_range(2017, 3, 1, 12, 0, 23, 51, 100),
    ]
    return pd.concat(monthly_frames)
    
def generate_data_by_range(year, month, start_date, end_date, start_time, end_time, start_gid, end_gid):
    """Build one row for every day / hour / grid id combination of a month.

    Parameters
    ----------
    year, month : int
        Stored unchanged in every row.
    start_date, end_date : int
        First and last day of the month to generate (both inclusive).
    start_time, end_time : int
        First and last hour to generate (both inclusive).
    start_gid, end_gid : int
        First and last grid id to generate (both inclusive).

    Returns
    -------
    pandas.DataFrame
        Columns ``year, month, day, hour, grid_id, flag`` with ``flag`` set
        to 0. Days vary slowest and grid ids fastest.
    """
    # a single comprehension replaces five parallel lists, two of which were never used
    rows = [
        (year, month, day, hour, gid)
        for day in range(start_date, end_date + 1)
        for hour in range(start_time, end_time + 1)
        for gid in range(start_gid, end_gid + 1)
    ]
    df = pd.DataFrame(rows, columns=['year', 'month', 'day', 'hour', 'grid_id'])
    df['flag'] = 0
    return df

def append_avg_speed_to_train_and_test():
    """Add the average speed per grid / weekday / hour to the train and test files.

    Both feature files are read first. If the training file already has a
    ``speed`` column nothing else happens. Otherwise both files are rewritten
    with exactly the columns of ``feature_columns_with_speed``; any other
    column present in the files is not written back.
    """
    train_features = pd.read_csv(training_features_filepath)
    test_features = pd.read_csv(test_features_filepath)
    if 'speed' not in train_features.columns:
        speed_lookup = generate_average_speed_per_grid_weekday_hour()
        join_keys = ['grid_id', 'weekday', 'hour']
        # both merges are done before anything is written
        merged_frames = [
            pd.merge(features, speed_lookup, on=join_keys, how='left')
            for features in (train_features, test_features)
        ]
        destinations = (training_features_filepath, test_features_filepath)
        for merged, destination in zip(merged_frames, destinations):
            merged.to_csv(destination, columns=feature_columns_with_speed, index=False)
    
def remove_chinese_new_year_data(df):
    """Drop the rows dated January 27-31 and February 1-2.

    Only the ``month`` and ``day`` columns are inspected; the year is ignored.
    """
    late_january = (df.month == 1) & df.day.isin([27, 28, 29, 30, 31])
    early_february = (df.month == 2) & df.day.isin([1, 2])
    return df[~(late_january | early_february)]

def remove_other_hours_data(df):
    """Drop the rows whose ``hour`` is 0 through 8 or 23, keeping hours 9-22."""
    excluded_hours = list(range(0, 9)) + [23]
    return df[~df.hour.isin(excluded_hours)]

def handle_na_car_number(df, use_full_data, use_avg_car_number):
    """Resolve missing values left by the join with the car counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain missing values; it is not modified.
    use_full_data : bool
        True: keep every row and replace missing values (in all columns)
        with 0. False: drop every row that has a missing value.
    use_avg_car_number : bool
        Only used when ``use_full_data`` is True: zero counts are replaced by
        the per-group average via ``update_car_number_to_avg_if_zero``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the previous in-place ``fillna`` silently changed the
        caller's frame as well.
    """
    if not use_full_data:
        return df.dropna()
    df = df.fillna(0)
    if use_avg_car_number:
        df = update_car_number_to_avg_if_zero(df)
    return df

# update car number to average if zero (per grid/per weekday/per hour)
def update_car_number_to_avg_if_zero(df):
    """Replace zero ``car_number`` values by the rounded-up group average.

    The average is taken per (hour, weekday, grid_id) over all rows of ``df``,
    zeros included, and rounded up with ``np.ceil``. Only ``car_number`` is
    aggregated: averaging every remaining column made the merge below emit
    ``_x`` / ``_y`` suffixed duplicates for each extra column of ``df``.

    Returns a new frame with the same columns as ``df``.
    """
    group_keys = ['hour', 'weekday', 'grid_id']
    avg_car = (
        df.groupby(group_keys, as_index=False)['car_number']
        .mean()
        .rename(columns={'car_number': 'avg_car_number'})
    )
    avg_car['avg_car_number'] = np.ceil(avg_car['avg_car_number'])
    # bring the group average next to every row
    df = pd.merge(df, avg_car, on=['grid_id', 'weekday', 'hour'], how='left')
    # overwrite only the rows whose count is exactly zero
    is_zero = df['car_number'] == 0
    df.loc[is_zero, 'car_number'] = df.loc[is_zero, 'avg_car_number']
    return df.drop(columns=['avg_car_number'])

In [4]:
# Build the training feature file: every generated slot is kept and missing values become 0.
preprocess_phase1_data(use_full_data=True, use_avg_car_number=False)
# Reload the file that was just written and preview its first rows.
train = pd.read_csv(training_features_filepath)
train.head()

Unnamed: 0,year,month,day,hour,weekday,grid_id,temperture,rainy,holiday,car_number
0,2017,1,2,9,0,51,13,0,1,14.0
1,2017,1,2,9,0,52,13,0,1,0.0
2,2017,1,2,9,0,53,13,0,1,4.0
3,2017,1,2,9,0,54,13,0,1,4.0
4,2017,1,2,9,0,55,13,0,1,6.0


### Generating test features for submission

- Two weeks of data, from 20170313 to 20170326
- Only hours 9 through 22 (9 <= hour <= 22)

In [9]:
def generate_sample_submission_file(start_date, end_date, start_time, end_time, start_gid, end_gid,
                                    year_month='201703'):
    """Write the submission skeleton with one row per day, hour and grid id.

    Parameters
    ----------
    start_date, end_date : int
        First and last day of the month (both inclusive).
    start_time, end_time : int
        First and last hour (both inclusive).
    start_gid, end_gid : int
        First and last grid id (both inclusive).
    year_month : str
        ``yyyymm`` prefix of the generated dates; defaults to ``'201703'``,
        the value that used to be hard-coded.

    The file at ``submission_filepath`` gets the columns ``grid_id, date,
    hour, car_number`` with ``car_number`` set to 0. Days are zero-padded to
    two digits, so day 9 becomes ``...09`` instead of a seven-character date.
    """
    rows = [
        (gid, year_month + str(day).zfill(2), hour, 0)
        for day in range(start_date, end_date + 1)
        for hour in range(start_time, end_time + 1)
        for gid in range(start_gid, end_gid + 1)
    ]
    df = pd.DataFrame(rows, columns=['grid_id', 'date', 'hour', 'car_number'])
    # write to csv
    df.to_csv(submission_filepath, index=False)


def preprocess_test_data():
    """Join the submission skeleton with the weather data and write the test feature file.

    The join is a left join on ``date`` and ``hour``; only the columns listed
    in ``feature_columns`` are written to ``test_features_filepath``.
    """
    submission_rows = pd.read_csv(submission_filepath)
    weather_by_hour = preprocess_weather_data()
    test_features = submission_rows.merge(weather_by_hour, on=['date', 'hour'], how='left')
    test_features.to_csv(test_features_filepath, columns=feature_columns, index=False)

In [10]:
# Write the submission skeleton: days 13-26, hours 9-22, grid ids 51-100, car_number set to 0.
generate_sample_submission_file(start_date=13, end_date=26, start_time=9, end_time=22, start_gid=51, end_gid=100)

# Join the skeleton with the weather data and write the test feature file.
preprocess_test_data()
# Reload the file that was just written and preview its first rows.
test = pd.read_csv(test_features_filepath)
test.head()

Unnamed: 0,year,month,day,hour,weekday,grid_id,temperture,rainy,holiday,car_number
0,2017,3,13,9,0,51,9,1,0,0
1,2017,3,13,9,0,52,9,1,0,0
2,2017,3,13,9,0,53,9,1,0,0
3,2017,3,13,9,0,54,9,1,0,0
4,2017,3,13,9,0,55,9,1,0,0


### Remove outliers from training

In [29]:
def remove_outliers(train):
    """Drop rows whose ``car_number`` lies outside the 1.5 * IQR fences of its group.

    Quartiles are computed per (hour, weekday, grid_id). A row is kept when
    ``Q1 - 1.5 * (Q3 - Q1) <= car_number <= Q3 + 1.5 * (Q3 - Q1)``. The
    filtered frame is written to ``training_features_filepath`` and returned.
    """
    join_keys = ['grid_id', 'weekday', 'hour']
    per_group = train.drop(columns=['year', 'month', 'day', 'temperture', 'rainy', 'holiday'])

    def quartile(q, column_name):
        # quantile of every remaining column per group, with car_number renamed
        stats_frame = per_group.groupby(['hour', 'weekday', 'grid_id'], as_index=False).quantile(q)
        return stats_frame.rename(columns={'car_number': column_name})

    lower = quartile(0.25, 'car_number_q1')
    upper = quartile(0.75, 'car_number_q3')
    fences = pd.merge(lower, upper, on=join_keys, how='left')
    # 1.5 times the inter-quartile range
    fences['step'] = 1.5 * (fences.car_number_q3 - fences.car_number_q1)
    fences['q1_boundary'] = fences.car_number_q1 - fences.step
    fences['q3_boundary'] = fences.car_number_q3 + fences.step
    fences = fences.drop(columns=['car_number_q1', 'car_number_q3', 'step'])
    # attach the fences to every row, filter, then remove the helper columns again
    train = pd.merge(train, fences, on=join_keys, how='left')
    within_fences = (train.car_number >= train.q1_boundary) & (train.car_number <= train.q3_boundary)
    train = train[within_fences].drop(columns=['q1_boundary', 'q3_boundary'])
    # write to csv
    train.to_csv(training_features_filepath, index=False)
    return train

In [30]:
# Drop rows outside the per-group 1.5 * IQR fences; the filtered frame is also written to the training CSV.
train = remove_outliers(train)
train.head()

Unnamed: 0,year,month,day,hour,weekday,grid_id,temperture,rainy,holiday,car_number
0,2017,1,2,9,0,1,13,0,1,20.0
1,2017,1,2,9,0,2,13,0,1,7.0
2,2017,1,2,9,0,3,13,0,1,4.0
3,2017,1,2,9,0,4,13,0,1,3.0
4,2017,1,2,9,0,6,13,0,1,9.0


### Append average, median and standard deviation of car number to the train and test datasets

In [87]:
from scipy import stats

def calc_avg_car_number_per_grid_weekday_hour(df):
    """Return the rounded-up mean per (hour, weekday, grid_id).

    The calendar and weather columns are dropped first; the mean of
    ``car_number`` is rounded up with ``np.ceil`` and returned in a column
    named ``avg_car_number``.
    """
    calendar_and_weather = ['year', 'month', 'day', 'temperture', 'rainy', 'holiday']
    group_means = (
        df.drop(columns=calendar_and_weather)
        .groupby(['hour', 'weekday', 'grid_id'], as_index=False)
        .mean()
    )
    group_means['car_number'] = np.ceil(group_means['car_number'])
    return group_means.rename(columns={'car_number': 'avg_car_number'})

def calc_median_car_number_per_grid_weekday_hour(df):
    """Return the rounded-up median per (hour, weekday, grid_id).

    The calendar and weather columns are dropped first; the median of
    ``car_number`` is rounded up with ``np.ceil`` and returned in a column
    named ``median_car_number``.
    """
    calendar_and_weather = ['year', 'month', 'day', 'temperture', 'rainy', 'holiday']
    group_medians = (
        df.drop(columns=calendar_and_weather)
        .groupby(['hour', 'weekday', 'grid_id'], as_index=False)
        .median()
    )
    group_medians['car_number'] = np.ceil(group_medians['car_number'])
    return group_medians.rename(columns={'car_number': 'median_car_number'})

def calc_std_car_number_per_grid_weekday_hour(df):
    """Return the rounded-up standard deviation of ``car_number`` per (hour, weekday, grid_id).

    The population standard deviation (``ddof=0``, the ``np.std`` default) is
    used and rounded up with ``np.ceil``. The result has the columns ``hour``,
    ``weekday``, ``grid_id`` and ``std_car_number``.

    The value is computed directly from the grouped column. The previous
    round trip through ``data/temp/std_temp.csv`` only served to flatten the
    group index, required that folder to exist, and depended on how the
    installed pandas version shapes the ``apply`` result and writes its header.
    """
    std_car_number = (
        df.groupby(['hour', 'weekday', 'grid_id'])['car_number']
        .std(ddof=0)
        .reset_index(name='std_car_number')
    )
    std_car_number['std_car_number'] = np.ceil(std_car_number['std_car_number'])
    return std_car_number

def calc_hmean_car_number_per_grid_weekday_hour(df):
    """Return the rounded-up harmonic mean of ``car_number`` per (hour, weekday, grid_id).

    ``scipy.stats.hmean`` is applied to each group and the result rounded up
    with ``np.ceil``. The result has the columns ``hour``, ``weekday``,
    ``grid_id`` and ``hmean_car_number``.

    The value is computed directly from the grouped column instead of being
    written to ``data/temp/hmean_temp.csv`` and read back, which required that
    folder to exist and depended on version-specific pandas CSV defaults.

    NOTE(review): how ``hmean`` treats groups containing a zero count appears
    to depend on the installed scipy version - confirm before using this
    feature on zero-filled data.
    """
    hmean_car_number = (
        df.groupby(['hour', 'weekday', 'grid_id'])['car_number']
        .agg(lambda counts: stats.hmean(counts.to_numpy()))
        .reset_index(name='hmean_car_number')
    )
    hmean_car_number['hmean_car_number'] = np.ceil(hmean_car_number['hmean_car_number'])
    return hmean_car_number

def append_avg_median_car_number_to_train_and_test(train, test):
    """Add the per-group average, median and standard deviation columns to both frames.

    All three statistics are computed from ``train`` per (grid_id, weekday,
    hour) and left-joined onto ``train`` and ``test``. Both enriched frames
    are written to their feature files and, new in this version, returned as
    ``(train, test)`` so callers can keep working with them in memory; the
    frames passed in are not modified.

    The harmonic mean is no longer computed here: its result was never merged
    and the call only produced a temporary file.
    """
    join_keys = ['grid_id', 'weekday', 'hour']
    group_stats = [
        calc_avg_car_number_per_grid_weekday_hour(train),
        calc_median_car_number_per_grid_weekday_hour(train),
        calc_std_car_number_per_grid_weekday_hour(train),
    ]
    # every statistic comes from the original training rows; join them one by one
    for stat_frame in group_stats:
        train = pd.merge(train, stat_frame, on=join_keys, how='left')
        test = pd.merge(test, stat_frame, on=join_keys, how='left')
    # write to file
    train.to_csv(training_features_filepath, index=False)
    test.to_csv(test_features_filepath, index=False)
    return train, test
    
# Writes the enriched train/test CSVs; the in-memory `train` and `test` frames are not rebound by this call.
append_avg_median_car_number_to_train_and_test(train, test)

### Append average speed to train and test datasets

In [18]:
# Merge the average speed into the train/test CSVs (nothing is rewritten when the training file already has a speed column).
append_avg_speed_to_train_and_test()
# show train data after appending the average speed
train = pd.read_csv(training_features_filepath)
train.head()

Unnamed: 0,year,month,day,hour,weekday,grid_id,temperture,rainy,holiday,speed,car_number
0,2017,1,2,9,0,1,13,0,1,21,20.0
1,2017,1,2,9,0,2,13,0,1,16,7.0
2,2017,1,2,9,0,3,13,0,1,22,4.0
3,2017,1,2,9,0,4,13,0,1,72,3.0
4,2017,1,2,9,0,6,13,0,1,41,9.0


In [19]:
# show test data after appending the average speed
test = pd.read_csv(test_features_filepath)
test.head()

Unnamed: 0,year,month,day,hour,weekday,grid_id,temperture,rainy,holiday,speed,car_number
0,2017,3,13,9,0,1,9,1,0,21,0
1,2017,3,13,9,0,2,9,1,0,16,0
2,2017,3,13,9,0,3,9,1,0,22,0
3,2017,3,13,9,0,4,9,1,0,72,0
4,2017,3,13,9,0,5,9,1,0,72,0


### Append cluster category to the train and test datasets

In [20]:
from sklearn.cluster import MiniBatchKMeans
try:
    from sklearn.metrics import calinski_harabaz_score
except ImportError:
    # newer scikit-learn only ships the correctly spelled name; keep the old alias importable
    from sklearn.metrics import calinski_harabasz_score as calinski_harabaz_score
import time

# Average car_number per (hour, weekday, grid_id): one row per group to cluster
train_for_clustering = train.drop(columns=['year','month','day','temperture','rainy','holiday','speed'])
train_for_clustering = train_for_clustering.groupby(['hour', 'weekday','grid_id'], as_index=False).mean()
train_for_clustering['car_number'] = train_for_clustering.car_number.round()

# Cluster on everything except grid_id; the feature frame is built once and reused.
# random_state pins the centroid initialisation so a re-run yields the same categories.
cluster_features = train_for_clustering.drop(columns=['grid_id'])
mbk = MiniBatchKMeans(init='k-means++', n_clusters=100, batch_size=128, n_init=10,
                      max_no_improvement=10, verbose=0, random_state=42)
mbk.fit(cluster_features)
pred = mbk.predict(cluster_features)

# Keep only the group keys plus the assigned cluster as a lookup table
train_for_clustering['cluster_category'] = pred
train_for_clustering = train_for_clustering.drop(columns=['car_number'])

# join with train and to csv
train = pd.merge(train, train_for_clustering, on=['grid_id','weekday','hour'], how='left')
train.to_csv(training_features_filepath, index=False)

# join with test and to csv
test = pd.merge(test, train_for_clustering, on=['grid_id','weekday','hour'], how='left')
test.to_csv(test_features_filepath, index=False)

In [21]:
# Preview the training frame, which now carries the cluster_category column.
train.head()

Unnamed: 0,year,month,day,hour,weekday,grid_id,temperture,rainy,holiday,speed,car_number,cluster_category
0,2017,1,2,9,0,1,13,0,1,21,20.0,98
1,2017,1,2,9,0,2,13,0,1,16,7.0,49
2,2017,1,2,9,0,3,13,0,1,22,4.0,98
3,2017,1,2,9,0,4,13,0,1,72,3.0,35
4,2017,1,2,9,0,6,13,0,1,41,9.0,49


In [25]:
# Preview the test frame, which now carries the cluster_category column.
test.head()

Unnamed: 0,year,month,day,hour,weekday,grid_id,temperture,rainy,holiday,speed,car_number,cluster_category
0,2017,3,13,9,0,1,9,1,0,21,0,98
1,2017,3,13,9,0,2,9,1,0,16,0,49
2,2017,3,13,9,0,3,9,1,0,22,0,98
3,2017,3,13,9,0,4,9,1,0,72,0,35
4,2017,3,13,9,0,5,9,1,0,72,0,18
