In [1]:
import numpy as np
import pandas as pd
import warnings
import geopy.distance
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error
import math

In [2]:
# Read the raw train / test tables from the local data directory.
train_df, test_df = map(pd.read_csv, ("./data/train.csv", "./data/test.csv"))

In [3]:
# Hold out the rows of the last seven distinct dates (in order of first
# appearance in train_df) for validation; everything else is training data.
training_dates = train_df.date.unique()[:-7]
idx_trn = train_df.date.isin(training_dates)
trn_df = train_df.loc[idx_trn]
val_df = train_df.loc[~idx_trn]

# Feature-engineering function

In [4]:
def data_prepare(df):
    """Build model features from a raw ridership frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``date``, ``in_out``, ``latitude`` and
        ``longitude``. The target column ``18~20_ride`` is optional.
        The frame passed in is not modified.

    Returns
    -------
    tuple
        ``(features, y)`` where ``features`` is a copy of ``df`` with
        ``date`` parsed to datetime, ``in_out`` mapped to 0/1 (any other
        value becomes NaN), one-hot columns ``weekday_0`` .. ``weekday_6``
        and the distances in km ``dis_jejusi`` / ``dis_seoquipo``.
        ``y`` is the ``18~20_ride`` column of ``df``, or ``None`` when that
        column is absent.
    """
    # Explicit membership test instead of a bare ``except`` so that genuine
    # errors are not silently turned into "no target".
    y = df['18~20_ride'] if '18~20_ride' in df.columns else None

    # Work on a copy: callers pass slices of a larger frame and we add columns.
    df = df.copy()

    df['date'] = pd.to_datetime(df['date'])
    # A categorical with all seven categories guarantees weekday_0..weekday_6
    # exist even when some weekday never occurs in this frame.
    df['weekday'] = pd.Categorical(df['date'].dt.weekday, categories=list(range(7)))
    # Fixed integer dtype so the dummies do not depend on the pandas version.
    df = pd.get_dummies(df, columns=['weekday'], dtype='uint8')
    df['in_out'] = df['in_out'].map({'시내': 0, '시외': 1})

    coords_jejusi = (33.500770, 126.522761)    # latitude / longitude of Jeju City
    coords_seoquipo = (33.259429, 126.558217)  # latitude / longitude of Seogwipo City

    # ``vincenty`` was removed from geopy; ``geodesic`` is its replacement.
    # Each distinct coordinate pair is measured once and then looked up,
    # rather than recomputed for every row.
    points = list(zip(df['latitude'], df['longitude']))
    for column, city in (('dis_jejusi', coords_jejusi),
                         ('dis_seoquipo', coords_seoquipo)):
        km_by_point = {point: geopy.distance.geodesic(point, city).km
                       for point in set(points)}
        df[column] = [km_by_point[point] for point in points]

    return df, y

In [5]:
# Run the feature pipeline on the training split first, then the validation split.
(trn, y_trn), (val, y_val) = (data_prepare(part) for part in (trn_df, val_df))

In [6]:
# Features for the test set. The second return value (the target) is discarded;
# it is bound to a throwaway name rather than `_`, because assigning to `_`
# in IPython stops the shell from updating its last-output variable.
te, unused_target = data_prepare(test_df)

In [7]:
# Feature columns, grouped by origin; the concatenation order below is the
# column order of the design matrices.
location_cols = ['in_out', 'latitude', 'longitude']
ride_cols = ['6~7_ride', '7~8_ride', '8~9_ride', '9~10_ride', '10~11_ride', '11~12_ride']
takeoff_cols = ['6~7_takeoff', '7~8_takeoff', '8~9_takeoff',
                '9~10_takeoff', '10~11_takeoff', '11~12_takeoff']
weekday_cols = ['weekday_0', 'weekday_1', 'weekday_2', 'weekday_3',
                'weekday_4', 'weekday_5', 'weekday_6']
distance_cols = ['dis_jejusi', 'dis_seoquipo']
input_var = location_cols + ride_cols + takeoff_cols + weekday_cols + distance_cols

# Name of the regression target column.
target = ['18~20_ride']

In [8]:
# Select the model input columns from each prepared frame.
x_trn, x_val, x_te = (frame[input_var] for frame in (trn, val, te))

In [88]:
# Drop any Keras state left over from earlier runs of the model cells, then
# fix the NumPy and TensorFlow seeds so weight initialisation and batch
# shuffling are as repeatable as possible when the notebook is re-run.
keras.backend.clear_session()
SEED = 1991
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [89]:
# Size the input layer from the feature list instead of a hard-coded 24, so
# the model keeps matching the design matrices if features are added or removed.
inputs = keras.Input(shape=(len(input_var),))

In [90]:
# First hidden layer: 128 ReLU units applied to the model input tensor.
first_hidden = keras.layers.Dense(units=128, activation='relu')
x = first_hidden(inputs)

In [91]:
# Two further hidden layers: 256 ReLU units followed by 32 tanh units.
for width, activation in ((256, 'relu'), (32, 'tanh')):
    x = keras.layers.Dense(width, activation=activation)(x)

In [92]:
# Regression head: a single unit with linear activation.
regression_head = keras.layers.Dense(units=1, activation='linear')
outputs = regression_head(x)

In [93]:
# Wrap the layer graph from `inputs` to `outputs` in a named functional model.
model = keras.Model(
    name='dacon_c13_model',
    inputs=inputs,
    outputs=outputs,
)

In [94]:
# Train against mean squared error. No optimizer is passed, so Keras uses
# whatever its default optimizer is.
model.compile(
    loss='mse',
)

In [95]:
# Fit for 100 epochs in mini-batches of 512 rows, reporting the loss on the
# held-out validation split after every epoch. The call is the cell's last
# expression, so its return value is displayed.
model.fit(
    x=x_trn,
    y=y_trn,
    validation_data=(x_val, y_val),
    epochs=100,
    batch_size=512,
)

Train on 310687 samples, validate on 104736 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74

<tensorflow.python.keras.callbacks.History at 0x23e4ae51148>

In [None]:
# emb_week = keras.layers.Embedding(7, 3)

In [42]:
def esitimate_model(mdl, x_trn, y_trn, x_val, y_val):
    """Print the RMSE of ``mdl`` on the training data, then on the validation data.

    ``mdl`` only needs a ``predict`` method. Both prediction passes run
    before any score is computed or printed. Nothing is returned.
    """
    scored_splits = [
        (y_trn, mdl.predict(x_trn)),
        (y_val, mdl.predict(x_val)),
    ]

    for truth, predicted in scored_splits:
        # RMSE is the square root of the mean squared error.
        print(math.sqrt(mean_squared_error(y_true=truth, y_pred=predicted)))

In [40]:
# The cell instantiates ExtraTreesRegressor, so it has to be imported as well
# (previously only RandomForestRegressor was, and the next line raised NameError).
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
rf = ExtraTreesRegressor(max_depth=20, random_state=1991)

In [51]:
import lightgbm as lgb

In [68]:
# Gradient-boosted baseline: 2000 trees with up to 31 leaves each; every
# other hyper-parameter is left at the library default.
lgb_params = {'n_estimators': 2000, 'num_leaves': 31}
m = lgb.LGBMRegressor(**lgb_params)

In [69]:
# Fit the LightGBM regressor on the training split only. The call is the
# cell's last expression, so its return value is displayed.
m.fit(X=x_trn, y=y_trn)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=2000, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [70]:
# Print train RMSE followed by validation RMSE for the LightGBM model.
esitimate_model(mdl=m, x_trn=x_trn, y_trn=y_trn, x_val=x_val, y_val=y_val)

1.6630472310665352
2.734588194272103


# Submission

In [72]:
# Store the LightGBM predictions on the test frame, then write the two
# submission columns to CSV without the index.
test_predictions = m.predict(x_te)
test_df['18~20_ride'] = test_predictions
submission_cols = ['id', '18~20_ride']
test_df[submission_cols].to_csv("lgb_base.csv", index=False)