In [1]:
import pandas as pd
import datetime
# import re
import ast
import numpy as np
import time

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Feature Vectorize func def

## WeekDay One-hot

In [3]:
def week_onehot(train_x,test_x):
    """One-hot encode the ``weekday`` column of the train and test frames.

    The two ``weekday`` columns are stacked before calling ``pd.get_dummies``
    so both outputs share the same set of ``weekday_*`` columns. The dummy
    columns are placed in front of the remaining features.

    The stacked dummy matrix is split back at ``len(train_x)``; the earlier
    version assumed the test frame always had exactly 320 rows.

    Parameters
    ----------
    train_x, test_x : pandas.DataFrame
        Frames containing a ``weekday`` column. They are not modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded train and test frames without the original ``weekday`` column.
    """
    n_train = len(train_x)
    stacked = pd.concat([train_x['weekday'], test_x['weekday']])
    dummies = pd.get_dummies(stacked, prefix='weekday')
    # Rows come back in stacking order: train rows first, test rows after.
    train_dummies = dummies.iloc[:n_train, :]
    test_dummies = dummies.iloc[n_train:, :]
    train_x = pd.concat([train_dummies, train_x.drop(columns=['weekday'])], axis=1)
    test_x = pd.concat([test_dummies, test_x.drop(columns=['weekday'])], axis=1)
    return train_x,test_x

In [384]:
# train_x,test_x = week_onehot(train_x,test_x)

## Month One-hot

In [4]:
def month_onehot(train_x,test_x):
    """One-hot encode the ``month`` column of the train and test frames.

    The two ``month`` columns are stacked before calling ``pd.get_dummies``
    so both outputs share the same set of ``month_*`` columns. The dummy
    columns are placed in front of the remaining features.

    The stacked dummy matrix is split back at ``len(train_x)``; the earlier
    version assumed the test frame always had exactly 320 rows.

    Parameters
    ----------
    train_x, test_x : pandas.DataFrame
        Frames containing a ``month`` column. They are not modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded train and test frames without the original ``month`` column.
    """
    n_train = len(train_x)
    stacked = pd.concat([train_x['month'], test_x['month']])
    dummies = pd.get_dummies(stacked, prefix='month')
    # Rows come back in stacking order: train rows first, test rows after.
    train_dummies = dummies.iloc[:n_train, :]
    test_dummies = dummies.iloc[n_train:, :]
    train_x = pd.concat([train_dummies, train_x.drop(columns=['month'])], axis=1)
    test_x = pd.concat([test_dummies, test_x.drop(columns=['month'])], axis=1)
    return train_x,test_x

In [389]:
# train_x,test_x = month_onehot(train_x,test_x)

## Hour One-hot

In [5]:
def hour_onehot(train_x,test_x):
    """One-hot encode the ``hour`` column of the train and test frames.

    The two ``hour`` columns are stacked before calling ``pd.get_dummies``
    so both outputs share the same set of ``hour_*`` columns. The dummy
    columns are placed in front of the remaining features.

    The stacked dummy matrix is split back at ``len(train_x)``; the earlier
    version assumed the test frame always had exactly 320 rows.

    Parameters
    ----------
    train_x, test_x : pandas.DataFrame
        Frames containing an ``hour`` column. They are not modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded train and test frames without the original ``hour`` column.
    """
    n_train = len(train_x)
    stacked = pd.concat([train_x['hour'], test_x['hour']])
    dummies = pd.get_dummies(stacked, prefix='hour')
    # Rows come back in stacking order: train rows first, test rows after.
    train_dummies = dummies.iloc[:n_train, :]
    test_dummies = dummies.iloc[n_train:, :]
    train_x = pd.concat([train_dummies, train_x.drop(columns=['hour'])], axis=1)
    test_x = pd.concat([test_dummies, test_x.drop(columns=['hour'])], axis=1)
    return train_x,test_x

In [391]:
# train_x,test_x = hour_onehot(train_x,test_x)

## CallType&OgStand 

In [6]:
def Vectorize_CallType(train_x,test_x):
    """One-hot encode the ``CALL_TYPE_STAND`` column of the train and test frames.

    The two columns are stacked before calling ``pd.get_dummies`` so both
    outputs share the same dummy columns. No prefix is used, so each dummy
    column is named after the category value itself. The dummy columns are
    placed in front of the remaining features.

    The stacked dummy matrix is split back at ``len(train_x)``; the earlier
    version assumed the test frame always had exactly 320 rows.

    Parameters
    ----------
    train_x, test_x : pandas.DataFrame
        Frames containing a ``CALL_TYPE_STAND`` column. They are not modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded train and test frames without the original column.
    """
    n_train = len(train_x)
    stacked = pd.concat([train_x['CALL_TYPE_STAND'], test_x['CALL_TYPE_STAND']])
    dummies = pd.get_dummies(stacked)
    # Rows come back in stacking order: train rows first, test rows after.
    train_dummies = dummies.iloc[:n_train, :]
    test_dummies = dummies.iloc[n_train:, :]
    train_x = pd.concat([train_dummies, train_x.drop(columns=['CALL_TYPE_STAND'])], axis=1)
    test_x = pd.concat([test_dummies, test_x.drop(columns=['CALL_TYPE_STAND'])], axis=1)
    return train_x,test_x

In [393]:
# train_x,test_x = Vectorize_CallType(train_x,test_x)

## Cluster (One_hot)

In [7]:
def Vectorize_Cluster_onehot(train_x,test_x,cluster_name):
    """One-hot encode the column named ``cluster_name`` in the train and test frames.

    The two columns are stacked before calling ``pd.get_dummies`` so both
    outputs share the same ``<cluster_name>_*`` columns. The dummy columns
    are placed in front of the remaining features.

    The stacked dummy matrix is split back at ``len(train_x)``; the earlier
    version assumed the test frame always had exactly 320 rows.

    Parameters
    ----------
    train_x, test_x : pandas.DataFrame
        Frames containing the ``cluster_name`` column. They are not modified.
    cluster_name : str
        Name of the column to encode; also used as the dummy-column prefix.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded train and test frames without the original column.
    """
    n_train = len(train_x)
    stacked = pd.concat([train_x[cluster_name], test_x[cluster_name]])
    dummies = pd.get_dummies(stacked, prefix=cluster_name)
    # Rows come back in stacking order: train rows first, test rows after.
    train_dummies = dummies.iloc[:n_train, :]
    test_dummies = dummies.iloc[n_train:, :]
    train_x = pd.concat([train_dummies, train_x.drop(columns=[cluster_name])], axis=1)
    test_x = pd.concat([test_dummies, test_x.drop(columns=[cluster_name])], axis=1)
    return train_x,test_x

## Cluster (Freq) + Standard

In [8]:
def Vectorize_Cluster_freq(train_x,test_x):
    """Replace StartCluster / EndCluster ids with their relative training frequency.

    For each of the two columns, every cluster id is counted in ``train_x``
    and the count is divided by the count of the most frequent id, so the
    most common cluster maps to 1.0. The table is built from the training
    frame only and then applied to both frames.

    Differences from the earlier version:

    * the input frames are left untouched; encoded copies are returned
      (previously the caller's frames were overwritten in place)
    * the per-column maximum is computed once rather than once per cluster id
    * ids with no training count (e.g. a cluster that only occurs in
      ``test_x``) are encoded as 0.0 instead of NaN

    Parameters
    ----------
    train_x, test_x : pandas.DataFrame
        Frames containing ``StartCluster`` and ``EndCluster`` columns.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Copies of the inputs with the two cluster columns frequency-encoded.
    """
    encoded_train = train_x.copy()
    encoded_test = test_x.copy()

    for column in ('StartCluster', 'EndCluster'):
        counts = train_x[column].value_counts()
        # count / largest count -> value in (0, 1], 1.0 for the most frequent id
        relative_freq = (counts / counts.max()).to_dict()
        encoded_train[column] = train_x[column].map(relative_freq).fillna(0.0)
        encoded_test[column] = test_x[column].map(relative_freq).fillna(0.0)

    return encoded_train,encoded_test

In [263]:
# train_x,test_x = Vectorize_Cluster_onehot(train_x,test_x,'StartCluster')
# train_x,test_x = Vectorize_Cluster_onehot(train_x,test_x,'EndCluster')
# train_x,test_x = Vectorize_Cluster_freq(train_x,test_x)

# func def.

## init_train_test_data

In [9]:
def init_train_test_data(distance_threshold = 1,
                         train_path = 'train_preprocessing.csv',
                         test_path = 'test_preprocessing.csv'):
    """Load the preprocessed CSVs and return model features and target.

    Training rows whose ``distance`` is below ``distance_threshold`` are
    dropped; the test data is never filtered. The surviving training rows
    keep their original row labels (the index is not reset).

    Parameters
    ----------
    distance_threshold : number, default 1
        Minimum ``distance`` a training row must have to be kept.
    train_path, test_path : str
        CSV locations. The defaults are the file names this function
        previously hard-coded.

    Returns
    -------
    train_x : pandas.DataFrame
        Feature columns of the filtered training data.
    train_y : pandas.Series
        The ``period`` column of the filtered training data.
    test_x : pandas.DataFrame
        Feature columns of the test data.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    train_data = train_data[train_data.distance >= distance_threshold]

    feature_list = ['CALL_TYPE_STAND','month','hour','weekday','StartCluster', 'EndCluster','distance']
    target = 'period'

    # Return independent copies: callers assign into these frames later, and
    # assigning into a slice of another frame raises SettingWithCopyWarning.
    train_x = train_data[feature_list].copy()
    train_y = train_data[target].copy()
    test_x = test_data[feature_list].copy()

    return train_x, train_y, test_x

## my_train_validation_split

In [10]:
def my_train_validation_split(train_x,train_y,test_x):
    """Split the training data 70/30 into train/validation and cast everything to float32.

    The split uses ``train_test_split`` with ``test_size=0.3`` and
    ``random_state=1``; ``train_y`` is converted to a NumPy array first.
    The shape of each resulting piece is printed.

    Returns
    -------
    tuple
        ``(train_x, train_y, val_x, val_y, test_x)``, all cast to ``np.float32``.
    """
    train_x, val_x, train_y, val_y = train_test_split(
        train_x,
        np.array(train_y),
        test_size=0.3,
        random_state=1,
    )

    # Cast every piece (including the untouched test features) in one pass.
    train_x, train_y, val_x, val_y, test_x = (
        part.astype(np.float32)
        for part in (train_x, train_y, val_x, val_y, test_x)
    )

    labelled_parts = (
        ('train_x= ', train_x),
        ('train_y= ', train_y),
        ('val_x= ', val_x),
        ('val_y= ', val_y),
        ('test_x= ', test_x),
    )
    for label, part in labelled_parts:
        print(label, part.shape)

    return train_x,train_y,val_x,val_y,test_x

## pred_generate_result_csv

In [11]:
def pred_generate_result_csv(model, test_x, result_file_name, submission_template='submission.csv'):
    """Predict on ``test_x`` and write the predictions into a submission CSV.

    The template CSV is loaded, its ``TRAVEL_TIME`` column is set to the
    model's predictions, and the result is written to ``result_file_name``
    without the index.

    Differences from the earlier version:

    * predictions are flattened to 1-D before assignment (``model.predict``
      may return an ``(n, 1)`` array)
    * the column is set with ``[...]`` so it is created if the template
      lacks it; attribute assignment would silently set a Python attribute
    * the output directory is created if it does not exist, so the write
      cannot fail on a missing folder after training has finished

    Parameters
    ----------
    model : object
        Anything exposing ``predict(test_x)``.
    test_x : array-like
        Features passed to ``model.predict``.
    result_file_name : str
        Destination CSV path.
    submission_template : str, default 'submission.csv'
        Path of the template CSV (the previously hard-coded file name).

    Returns
    -------
    pandas.DataFrame
        The submission frame that was written.

    Raises
    ------
    ValueError
        From pandas, if the number of predictions does not match the number
        of template rows.
    """
    import os

    result = np.asarray(model.predict(test_x)).reshape(-1)
    result_csv = pd.read_csv(submission_template)
    result_csv['TRAVEL_TIME'] = result

    out_dir = os.path.dirname(result_file_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    result_csv.to_csv(result_file_name, index=False)
    return result_csv

## init_ANN_model

In [13]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers import Flatten
from keras import backend as K
def root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared difference between ``y_pred`` and ``y_true``.

    Built from Keras backend ops so it can be passed to ``model.compile``
    as a loss or a metric.
    """
    squared_error = K.square(y_pred - y_true)
    mean_squared_error = K.mean(squared_error)
    return K.sqrt(mean_squared_error)

def init_ANN_model(input_size):
    """Build, summarise and compile the fully connected regression network.

    Architecture: ReLU Dense layers of 256, 512, 128 and 64 units followed by
    a single linear output unit. The model is compiled with the Adam
    optimizer, using ``root_mean_squared_error`` as both loss and metric.

    Parameters
    ----------
    input_size : int
        Number of input features (``input_dim`` of the first layer).

    Returns
    -------
    The compiled Keras ``Sequential`` model.
    """
    later_hidden_widths = (512, 128, 64)

    model = Sequential()
    # Only the first layer needs to be told the input width.
    model.add(Dense(units=256, input_dim=input_size, activation='relu'))
    for width in later_hidden_widths:
        model.add(Dense(units=width, activation='relu'))
    model.add(Dense(units=1, activation='linear'))
    model.summary()

    model.compile(
        optimizer="adam",
        loss=root_mean_squared_error,
        metrics=[root_mean_squared_error],
    )
    return model

# 1 set

|TAXI_ID|CALL_TYPE_STAND|month|hour|weekday|start_lon|start_lat|end_lon|end_lat|StartCluster|EndCluster|distance|
|-------|---------------|-----|----|-------|---------|---------|-------|-------|------------|----------|--------|
|x|one-hot|one-hot|one-hot|one-hot|x|x|x|x|freq+scaler|freq+scaler|num|

## Read Data & Vectorize & Split

In [17]:
# Load features/target (training rows filtered with the default distance threshold).
train_x, train_y, test_x = init_train_test_data()

# Run each encoder in order; every one takes and returns the (train, test) pair.
# The cluster columns are frequency-encoded in this set.
for encode in (Vectorize_CallType, month_onehot, hour_onehot, week_onehot, Vectorize_Cluster_freq):
    train_x, test_x = encode(train_x, test_x)

# Hold out a validation part of the training data and cast everything to float32.
train_x, train_y, val_x, val_y, test_x = my_train_validation_split(train_x, train_y, test_x)

In [31]:
# Show the dimensions of the training and test feature matrices.
print(f'train_x.shape: {train_x.shape}')
print(f'test_x.shape: {test_x.shape}')

train_x.shape: (1021106, 112)
test_x.shape: (320, 112)


## modeling

### init model

In [28]:
# Build and compile a new network whose input width is the number of feature columns.
model = init_ANN_model(train_x.shape[1])

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            (None, 256)               28928     
                                                                 
 dense_16 (Dense)            (None, 512)               131584    
                                                                 
 dense_17 (Dense)            (None, 128)               65664     
                                                                 
 dense_18 (Dense)            (None, 64)                8256      
                                                                 
 dense_19 (Dense)            (None, 1)                 65        
                                                                 
Total params: 234,497
Trainable params: 234,497
Non-trainable params: 0
_________________________________________________________________


### training & testing(for kaggle submission)

In [29]:
# Train for 30 epochs on the training part, then score the held-out validation part.
model.fit(x=train_x, y=train_y, epochs=30)
result = model.evaluate(x=val_x, y=val_y)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


### generate submission csv file

In [30]:
# Predict on the test features and write the submission file for this set.
pred_csv = pred_generate_result_csv(
    model=model,
    test_x=test_x,
    result_file_name='ANN_result/ANN_result_1.csv',
)



# 2 set

|TAXI_ID|CALL_TYPE_STAND|month|hour|weekday|start_lon|start_lat|end_lon|end_lat|StartCluster|EndCluster|distance|
|-------|---------------|-----|----|-------|---------|---------|-------|-------|------------|----------|--------|
|x|one-hot|one-hot|one-hot|one-hot|x|x|x|x|one-hot|one-hot|num|

## Read Data & Vectorize & Split

In [34]:
# Load features/target (training rows filtered with the default distance threshold).
train_x, train_y, test_x = init_train_test_data()

# Run each single-column encoder in order; every one returns the updated (train, test) pair.
for encode in (Vectorize_CallType, month_onehot, hour_onehot, week_onehot):
    train_x, test_x = encode(train_x, test_x)

# The cluster columns are one-hot encoded in this set.
for cluster_col in ('StartCluster', 'EndCluster'):
    train_x, test_x = Vectorize_Cluster_onehot(train_x, test_x, cluster_col)

# Hold out a validation part of the training data and cast everything to float32.
train_x, train_y, val_x, val_y, test_x = my_train_validation_split(train_x, train_y, test_x)

train_x=  (1021106, 207)
train_y=  (1021106,)
val_x=  (437617, 207)
val_y=  (437617,)
test_x=  (320, 207)


## modeling

### init model

In [36]:
# Build and compile a new network whose input width is the number of feature columns.
model = init_ANN_model(train_x.shape[1])

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_20 (Dense)            (None, 256)               53248     
                                                                 
 dense_21 (Dense)            (None, 512)               131584    
                                                                 
 dense_22 (Dense)            (None, 128)               65664     
                                                                 
 dense_23 (Dense)            (None, 64)                8256      
                                                                 
 dense_24 (Dense)            (None, 1)                 65        
                                                                 
Total params: 258,817
Trainable params: 258,817
Non-trainable params: 0
_________________________________________________________________


### training & testing(for kaggle submission)

In [37]:
# Train for 30 epochs on the training part, then score the held-out validation part.
model.fit(x=train_x, y=train_y, epochs=30)
result = model.evaluate(x=val_x, y=val_y)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


### generate submission csv file

In [39]:
# Predict on the test features and write the submission file for this set.
pred_csv = pred_generate_result_csv(
    model=model,
    test_x=test_x,
    result_file_name='ANN_result/ANN_result_2.csv',
)



# 3 set

|TAXI_ID|CALL_TYPE_STAND|month|hour|weekday|start_lon|start_lat|end_lon|end_lat|StartCluster|EndCluster|distance|
|-------|---------------|-----|----|-------|---------|---------|-------|-------|------------|----------|--------|
|x|one-hot|one-hot|one-hot|one-hot|x|x|x|x|freq+scaler&one-hot|freq+scaler&one-hot|num|

## Read Data & Vectorize & Split

In [50]:
# Load features/target (training rows filtered with the default distance threshold).
train_x, train_y, test_x = init_train_test_data()

# Run each single-column encoder in order; every one returns the updated (train, test) pair.
for encode in (Vectorize_CallType, month_onehot, hour_onehot, week_onehot):
    train_x, test_x = encode(train_x, test_x)

# One-hot encode the cluster columns (this removes the raw id columns).
for cluster_col in ('EndCluster', 'StartCluster'):
    train_x, test_x = Vectorize_Cluster_onehot(train_x, test_x, cluster_name=cluster_col)

# Load the data a second time to get the raw cluster ids back, frequency-encode
# them, and add them alongside the one-hot columns. The assignment aligns rows
# by index label.
train_tmp, test_y_tmp, test_tmp = init_train_test_data()
train_tmp, test_tmp = Vectorize_Cluster_freq(train_tmp, test_tmp)
freq_cols = ['StartCluster', 'EndCluster']
train_x[freq_cols] = train_tmp[freq_cols]
test_x[freq_cols] = test_tmp[freq_cols]

# Hold out a validation part of the training data and cast everything to float32.
train_x, train_y, val_x, val_y, test_x = my_train_validation_split(train_x, train_y, test_x)

train_x=  (1021106, 209)
train_y=  (1021106,)
val_x=  (437617, 209)
val_y=  (437617,)
test_x=  (320, 209)


## modeling

### init model

In [52]:
# Build and compile a new network whose input width is the number of feature columns.
model = init_ANN_model(train_x.shape[1])

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_25 (Dense)            (None, 256)               53760     
                                                                 
 dense_26 (Dense)            (None, 512)               131584    
                                                                 
 dense_27 (Dense)            (None, 128)               65664     
                                                                 
 dense_28 (Dense)            (None, 64)                8256      
                                                                 
 dense_29 (Dense)            (None, 1)                 65        
                                                                 
Total params: 259,329
Trainable params: 259,329
Non-trainable params: 0
_________________________________________________________________


### training & testing(for kaggle submission)

In [53]:
# Train for 30 epochs on the training part, then score the held-out validation part.
model.fit(x=train_x, y=train_y, epochs=30)
result = model.evaluate(x=val_x, y=val_y)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


### generate submission csv file

In [54]:
# Predict on the test features and write the submission file for this set.
pred_csv = pred_generate_result_csv(
    model=model,
    test_x=test_x,
    result_file_name='ANN_result/ANN_result_3.csv',
)



# 1a set

* training set -> drop distance < 3

|TAXI_ID|CALL_TYPE_STAND|month|hour|weekday|start_lon|start_lat|end_lon|end_lat|StartCluster|EndCluster|distance|
|-------|---------------|-----|----|-------|---------|---------|-------|-------|------------|----------|--------|
|x|one-hot|one-hot|one-hot|one-hot|x|x|x|x|freq+scaler|freq+scaler|num|

## Read Data & Vectorize & Split

In [57]:
# Load features/target, keeping only training rows with distance >= 3.
train_x, train_y, test_x = init_train_test_data(distance_threshold=3)

# Run each encoder in order; every one takes and returns the (train, test) pair.
# The cluster columns are frequency-encoded in this set.
for encode in (Vectorize_CallType, month_onehot, hour_onehot, week_onehot, Vectorize_Cluster_freq):
    train_x, test_x = encode(train_x, test_x)

# Hold out a validation part of the training data and cast everything to float32.
train_x, train_y, val_x, val_y, test_x = my_train_validation_split(train_x, train_y, test_x)

train_x=  (500153, 112)
train_y=  (500153,)
val_x=  (214352, 112)
val_y=  (214352,)
test_x=  (320, 112)


## modeling

### init model

In [64]:
# Build and compile a new network whose input width is the number of feature columns.
model = init_ANN_model(train_x.shape[1])

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_40 (Dense)            (None, 256)               28928     
                                                                 
 dense_41 (Dense)            (None, 512)               131584    
                                                                 
 dense_42 (Dense)            (None, 128)               65664     
                                                                 
 dense_43 (Dense)            (None, 64)                8256      
                                                                 
 dense_44 (Dense)            (None, 1)                 65        
                                                                 
Total params: 234,497
Trainable params: 234,497
Non-trainable params: 0
_________________________________________________________________


### training & testing(for kaggle submission)

In [65]:
# Train for 30 epochs on the training part, then score the held-out validation part.
model.fit(x=train_x, y=train_y, epochs=30)
result = model.evaluate(x=val_x, y=val_y)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


### generate submission csv file

In [67]:
# Predict on the test features and write the submission file for this set.
pred_csv = pred_generate_result_csv(
    model=model,
    test_x=test_x,
    result_file_name='ANN_result/ANN_result_1a.csv',
)



# 2a set

* training set -> drop distance < 3


|TAXI_ID|CALL_TYPE_STAND|month|hour|weekday|start_lon|start_lat|end_lon|end_lat|StartCluster|EndCluster|distance|
|-------|---------------|-----|----|-------|---------|---------|-------|-------|------------|----------|--------|
|x|one-hot|one-hot|one-hot|one-hot|x|x|x|x|one-hot|one-hot|num|

## Read Data & Vectorize & Split

In [69]:
# Load features/target, keeping only training rows with distance >= 3.
train_x, train_y, test_x = init_train_test_data(distance_threshold=3)

# Run each single-column encoder in order; every one returns the updated (train, test) pair.
for encode in (Vectorize_CallType, month_onehot, hour_onehot, week_onehot):
    train_x, test_x = encode(train_x, test_x)

# The cluster columns are one-hot encoded in this set.
for cluster_col in ('StartCluster', 'EndCluster'):
    train_x, test_x = Vectorize_Cluster_onehot(train_x, test_x, cluster_col)

# Hold out a validation part of the training data and cast everything to float32.
train_x, train_y, val_x, val_y, test_x = my_train_validation_split(train_x, train_y, test_x)

train_x=  (500153, 207)
train_y=  (500153,)
val_x=  (214352, 207)
val_y=  (214352,)
test_x=  (320, 207)


## modeling

### init model

In [70]:
# Build and compile a new network whose input width is the number of feature columns.
model = init_ANN_model(train_x.shape[1])

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_45 (Dense)            (None, 256)               53248     
                                                                 
 dense_46 (Dense)            (None, 512)               131584    
                                                                 
 dense_47 (Dense)            (None, 128)               65664     
                                                                 
 dense_48 (Dense)            (None, 64)                8256      
                                                                 
 dense_49 (Dense)            (None, 1)                 65        
                                                                 
Total params: 258,817
Trainable params: 258,817
Non-trainable params: 0
_________________________________________________________________


### training & testing(for kaggle submission)

In [71]:
# Train for 30 epochs on the training part, then score the held-out validation part.
model.fit(x=train_x, y=train_y, epochs=30)
result = model.evaluate(x=val_x, y=val_y)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


### generate submission csv file

In [72]:
# Predict on the test features and write the submission file for this set.
pred_csv = pred_generate_result_csv(
    model=model,
    test_x=test_x,
    result_file_name='ANN_result/ANN_result_2a.csv',
)



# 3a set

* training set -> drop distance < 3

|TAXI_ID|CALL_TYPE_STAND|month|hour|weekday|start_lon|start_lat|end_lon|end_lat|StartCluster|EndCluster|distance|
|-------|---------------|-----|----|-------|---------|---------|-------|-------|------------|----------|--------|
|x|one-hot|one-hot|one-hot|one-hot|x|x|x|x|freq+scaler&one-hot|freq+scaler&one-hot|num|

## Read Data & Vectorize & Split

In [73]:
# Load features/target, keeping only training rows with distance >= 3.
train_x, train_y, test_x = init_train_test_data(distance_threshold=3)

# vectorize
train_x,test_x = Vectorize_CallType(train_x, test_x)
train_x,test_x = month_onehot(train_x,test_x)
train_x,test_x = hour_onehot(train_x,test_x)
train_x,test_x = week_onehot(train_x,test_x)
train_x,test_x = Vectorize_Cluster_onehot(train_x,test_x,cluster_name='EndCluster')
train_x,test_x = Vectorize_Cluster_onehot(train_x,test_x,cluster_name='StartCluster')

# Load the data a second time to get the raw cluster ids back and frequency-encode
# them. The same distance threshold as above is used so the frequency table is
# built from exactly the rows this set trains on (previously the reload used the
# default threshold of 1, unlike the frequency encoding in set 1a).
train_tmp,test_y_tmp, test_tmp = init_train_test_data(distance_threshold=3)
train_tmp,test_tmp = Vectorize_Cluster_freq(train_tmp,test_tmp)
# Column assignment aligns rows by index label.
train_x[['StartCluster','EndCluster']] = train_tmp[['StartCluster','EndCluster']]
test_x[['StartCluster','EndCluster']] = test_tmp[['StartCluster','EndCluster']]

# split
train_x,train_y,val_x,val_y,test_x = my_train_validation_split(train_x, train_y, test_x)

train_x=  (500153, 209)
train_y=  (500153,)
val_x=  (214352, 209)
val_y=  (214352,)
test_x=  (320, 209)


## modeling

### init model

In [74]:
# Build and compile a new network whose input width is the number of feature columns.
model = init_ANN_model(train_x.shape[1])

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_50 (Dense)            (None, 256)               53760     
                                                                 
 dense_51 (Dense)            (None, 512)               131584    
                                                                 
 dense_52 (Dense)            (None, 128)               65664     
                                                                 
 dense_53 (Dense)            (None, 64)                8256      
                                                                 
 dense_54 (Dense)            (None, 1)                 65        
                                                                 
Total params: 259,329
Trainable params: 259,329
Non-trainable params: 0
_________________________________________________________________


### training & testing(for kaggle submission)

In [75]:
# Train for 30 epochs on the training part, then score the held-out validation part.
model.fit(x=train_x, y=train_y, epochs=30)
result = model.evaluate(x=val_x, y=val_y)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


### generate submission csv file

In [76]:
# Predict on the test features and write the submission file for this set.
pred_csv = pred_generate_result_csv(
    model=model,
    test_x=test_x,
    result_file_name='ANN_result/ANN_result_3a.csv',
)



# 1b set

* training set -> drop distance < 5

|TAXI_ID|CALL_TYPE_STAND|month|hour|weekday|start_lon|start_lat|end_lon|end_lat|StartCluster|EndCluster|distance|
|-------|---------------|-----|----|-------|---------|---------|-------|-------|------------|----------|--------|
|x|one-hot|one-hot|one-hot|one-hot|x|x|x|x|freq+scaler|freq+scaler|num|

## Read Data & Vectorize & Split

In [77]:
# Load features/target, keeping only training rows with distance >= 5.
train_x, train_y, test_x = init_train_test_data(distance_threshold=5)

# Run each encoder in order; every one takes and returns the (train, test) pair.
# The cluster columns are frequency-encoded in this set.
for encode in (Vectorize_CallType, month_onehot, hour_onehot, week_onehot, Vectorize_Cluster_freq):
    train_x, test_x = encode(train_x, test_x)

# Hold out a validation part of the training data and cast everything to float32.
train_x, train_y, val_x, val_y, test_x = my_train_validation_split(train_x, train_y, test_x)

train_x=  (207519, 112)
train_y=  (207519,)
val_x=  (88937, 112)
val_y=  (88937,)
test_x=  (320, 112)


## modeling

### init model

In [78]:
# Build and compile a new network whose input width is the number of feature columns.
model = init_ANN_model(train_x.shape[1])

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_55 (Dense)            (None, 256)               28928     
                                                                 
 dense_56 (Dense)            (None, 512)               131584    
                                                                 
 dense_57 (Dense)            (None, 128)               65664     
                                                                 
 dense_58 (Dense)            (None, 64)                8256      
                                                                 
 dense_59 (Dense)            (None, 1)                 65        
                                                                 
Total params: 234,497
Trainable params: 234,497
Non-trainable params: 0
_________________________________________________________________


### training & testing(for kaggle submission)

In [79]:
# Train for 30 epochs on the training part, then score the held-out validation part.
model.fit(x=train_x, y=train_y, epochs=30)
result = model.evaluate(x=val_x, y=val_y)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


### generate submission csv file

In [80]:
# Predict on the test features and write the submission file for this set.
pred_csv = pred_generate_result_csv(
    model=model,
    test_x=test_x,
    result_file_name='ANN_result/ANN_result_1b.csv',
)



# 2b set

* training set -> drop distance < 5


|TAXI_ID|CALL_TYPE_STAND|month|hour|weekday|start_lon|start_lat|end_lon|end_lat|StartCluster|EndCluster|distance|
|-------|---------------|-----|----|-------|---------|---------|-------|-------|------------|----------|--------|
|x|one-hot|one-hot|one-hot|one-hot|x|x|x|x|one-hot|one-hot|num|

## Read Data & Vectorize & Split

In [14]:
# Load features/target, keeping only training rows with distance >= 5.
train_x, train_y, test_x = init_train_test_data(distance_threshold=5)

# Run each single-column encoder in order; every one returns the updated (train, test) pair.
for encode in (Vectorize_CallType, month_onehot, hour_onehot, week_onehot):
    train_x, test_x = encode(train_x, test_x)

# The cluster columns are one-hot encoded in this set.
for cluster_col in ('StartCluster', 'EndCluster'):
    train_x, test_x = Vectorize_Cluster_onehot(train_x, test_x, cluster_col)

# Hold out a validation part of the training data and cast everything to float32.
train_x, train_y, val_x, val_y, test_x = my_train_validation_split(train_x, train_y, test_x)

train_x=  (207519, 207)
train_y=  (207519,)
val_x=  (88937, 207)
val_y=  (88937,)
test_x=  (320, 207)


## modeling

### init model

In [16]:
# Build and compile a new network whose input width is the number of feature columns.
model = init_ANN_model(train_x.shape[1])

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 256)               53248     
                                                                 
 dense_6 (Dense)             (None, 512)               131584    
                                                                 
 dense_7 (Dense)             (None, 128)               65664     
                                                                 
 dense_8 (Dense)             (None, 64)                8256      
                                                                 
 dense_9 (Dense)             (None, 1)                 65        
                                                                 
Total params: 258,817
Trainable params: 258,817
Non-trainable params: 0
_________________________________________________________________


### training & testing(for kaggle submission)

In [17]:
# Train for 30 epochs on the training part, then score the held-out validation part.
model.fit(x=train_x, y=train_y, epochs=30)
result = model.evaluate(x=val_x, y=val_y)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


### generate submission csv file

In [18]:
# Predict on the test features and write the submission file for this set.
pred_csv = pred_generate_result_csv(
    model=model,
    test_x=test_x,
    result_file_name='ANN_result/ANN_result_2b.csv',
)

