In [46]:
import pandas as pd
import numpy as np

import Jeju_ as jj
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import random

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

from datetime import datetime
import time 

import warnings
# Suppress every warning for the rest of the kernel session (this also hides
# deprecation and convergence warnings raised by the modelling libraries).
warnings.filterwarnings(action='ignore')

# **Data**

In [5]:
# Load the preprocessed train / test tables from parquet files under ../DAT.
Train = pd.read_parquet('../DAT/pro_train2.parquet')
Test = pd.read_parquet('../DAT/pro_test2.parquet')

# Sanity check: dimensions of both frames, then a preview of the first training rows.
print(Train.shape, Test.shape)
Train.head(3)

(4701217, 46) (291241, 45)


Unnamed: 0,base_hour,lane_count,maximum_speed_limit,start_latitude,start_longitude,end_latitude,end_longitude,target,dist,dist_jeju_start,...,road_name__3,road_name__4,road_name__5,road_name__6,start_turn_restricted_없음,start_turn_restricted_있음,end_turn_restricted_없음,end_turn_restricted_있음,mpl_lc,dist_lc
0,17,1,60.0,33.427747,126.662612,33.427749,126.662335,52.0,0.025766,15.317086,...,0,1,0,0,1,0,1,0,60.0,0.025766
1,21,2,60.0,33.50073,126.529107,33.504811,126.52624,30.0,0.525188,0.589676,...,0,0,0,0,0,1,1,0,30.0,0.262594
2,7,2,80.0,33.279145,126.368598,33.280072,126.362147,61.0,0.609613,28.459505,...,0,0,0,0,1,0,1,0,40.0,0.304806


## **_데이터 추가 전처리**

- 최대속도제한 변수(`maximum_speed_limit`): One Hot Encoding
- 최대속도제한/차선수 변수(`mpl_lc`): One Hot Encoding
- 차선수 변수(`lane_count`): One Hot Encoding

In [11]:
# Show the distinct levels of the two speed-limit columns before encoding them.
for col_ in ("mpl_lc", "maximum_speed_limit"):
    print(Train[col_].unique())

[60.         30.         40.         25.         50.         23.33333333
 35.         70.         16.66666667 20.         15.         80.
 10.        ]
[60. 80. 50. 70. 30. 40.]


In [37]:
# One-hot encode the same three variables in both frames with the project helper.
# NOTE(review): train and test are encoded independently. If a level seen in Train
# is absent from Test (or vice versa) the dummy columns may not line up — confirm
# how jj.cat_onehot handles this before predicting on Test2.
Train2 = jj.cat_onehot(Train, var = ["mpl_lc", "maximum_speed_limit", "lane_count"])
Test2 = jj.cat_onehot(Test, var = ["mpl_lc", "maximum_speed_limit", "lane_count"])

Cat_Onehot


  0%|          | 0/3 [00:00<?, ?it/s]

Cat_Onehot


  0%|          | 0/3 [00:00<?, ?it/s]

In [38]:
# Check the widened shape after encoding and preview the first rows.
print(Train2.shape)
Train2.head(3)

(4701217, 65)


Unnamed: 0,base_hour,start_latitude,start_longitude,end_latitude,end_longitude,target,dist,dist_jeju_start,dist_jeju_end,dist_seoqui_start,...,mpl_lc_80.0,maximum_speed_limit_30.0,maximum_speed_limit_40.0,maximum_speed_limit_50.0,maximum_speed_limit_60.0,maximum_speed_limit_70.0,maximum_speed_limit_80.0,lane_count_1,lane_count_2,lane_count_3
0,17,33.427747,126.662612,33.427749,126.662335,52.0,0.025766,15.317086,15.295145,21.046263,...,0,0,0,0,1,0,0,1,0,0
1,21,33.50073,126.529107,33.504811,126.52624,30.0,0.525188,0.589676,0.552649,26.899768,...,0,0,0,0,1,0,0,0,1,0
2,7,33.279145,126.368598,33.280072,126.362147,61.0,0.609613,28.459505,28.678786,17.801324,...,0,0,0,0,0,0,1,0,1,0


# **Train and Validation**

## **_Split 7:3**

In [39]:
# 70/30 shuffled hold-out split of the encoded training data; every returned piece
# gets a fresh 0..n-1 index so later positional operations line up.
train_X, valid_X, train_y, valid_y = (
    part.reset_index(drop = True)
    for part in train_test_split(
        Train2.drop(columns = ["target"]),
        Train2["target"],
        test_size = 0.3,
        shuffle = True,
        random_state = 717,
    )
)

print(train_X.shape, valid_X.shape, train_y.shape, valid_y.shape)

(3290851, 64) (1410366, 64) (3290851,) (1410366,)


## **_표준화**

In [40]:
# Columns passed to the standardization helper (order preserved).
num_var = [
    "base_hour",
    "start_latitude",
    "start_longitude",
    "end_latitude",
    "end_longitude",
    "dist",
    "dist_jeju_start",
    "dist_jeju_end",
    "dist_seoqui_start",
    "dist_seoqui_end",
    "dist_airport_start",
    "dist_airport_end",
    "temp",
    "rain",
    "wind",
    "humidity",
    "sunny",
    "snow",
    "land_temp",
    "covid",
    "dist_lc",
]

# num_var

In [41]:
# Standardize the num_var columns of the train / validation features with the project helper.
# NOTE(review): presumably the scaler is fit on the first frame and applied to the second — verify in jj.std_.
st_train_X, st_valid_X = jj.std_(train_X, valid_X, var = num_var)

## **_Modeling**

### **__Models**

In [47]:
# Shared seed so every candidate model is reproducible.
SEED = 717

# Candidate regressors keyed by a short name; insertion order is the fitting order.
model_dict = {
    "RF": RandomForestRegressor(random_state = SEED),
    "ET": ExtraTreesRegressor(random_state = SEED),
    "LGBM": LGBMRegressor(random_state = SEED),
    "XGB": XGBRegressor(random_state = SEED),
    "GB": GradientBoostingRegressor(random_state = SEED),
    "CAT": CatBoostRegressor(random_state = SEED, silent = True, loss_function = "MAE"),
}

### **__Fit and Predict**

In [None]:
# Fit every candidate model on the standardized training split and score it on the
# standardized validation split. The standardized frames (st_train_X / st_valid_X)
# are used so that validation sees the same feature scaling as the final refit,
# which is trained on standardized data as well.
# pred_dict maps model name -> validation predictions and feeds the ensemble search.
pred_dict = {}
for name_, model_ in tqdm(model_dict.items()):
    start_time = time.time()
    model_fit = model_.fit(st_train_X, train_y)
    pred = model_fit.predict(st_valid_X)
    pred_dict[name_] = pred
    # The printed label says "NMAE", but the value is sklearn's mean_absolute_error.
    print("%s NMAE: %.4f| 소요시간: %.4fsec"% (name_, mean_absolute_error(np.ravel(valid_y), np.ravel(pred)), time.time()-start_time))

  0%|          | 0/6 [00:00<?, ?it/s]

### **__Ensemble**

In [None]:
def ensemble_result(pred_dict, true_y = None, ascending = True, metric = None):
    """Score every equal-weight average of the supplied model predictions.

    All non-empty combinations of the models in ``pred_dict`` are formed; for each
    combination the member predictions are averaged element-wise and scored
    against ``true_y``.

    Parameters
    ----------
    pred_dict : dict
        Maps a model name to its array-like of predictions. All arrays must have
        the same length as ``true_y``.
    true_y : array-like, optional
        Ground-truth targets. When omitted, the notebook-level ``valid_y`` is
        looked up at call time (not at definition time), so re-running the split
        cell never leaves this function scoring against stale labels.
    ascending : bool, default True
        Sort order of the returned scores (True puts the lowest error first).
    metric : callable, optional
        ``metric(y_true, y_pred) -> float``. Defaults to mean absolute error,
        the same quantity the per-model fit loop reports.

    Returns
    -------
    pandas.DataFrame
        One row per combination, indexed by the tuple of model names, with a
        single ``"result"`` column holding the score rounded to 4 decimals and
        sorted according to ``ascending``.

    Raises
    ------
    ValueError
        If ``pred_dict`` is empty.
    """
    import itertools

    if not pred_dict:
        raise ValueError("pred_dict must contain at least one model's predictions")
    if true_y is None:
        true_y = valid_y
    if metric is None:
        def metric(y_true, y_pred):
            return np.mean(np.abs(y_true - y_pred))

    y_true = np.ravel(true_y)
    names = list(pred_dict)
    # Flatten once so (n, 1)-shaped predictions cannot broadcast against y_true.
    flat_preds = {name: np.ravel(pred_dict[name]) for name in names}

    combos, scores = [], []
    for size in range(1, len(names) + 1):
        for combo in itertools.combinations(names, size):
            pred_ens = np.mean([flat_preds[name] for name in combo], axis = 0)
            combos.append(combo)
            scores.append(np.round(metric(y_true, pred_ens), 4))

    # tupleize_cols=False keeps each combination as a plain tuple label instead of
    # letting pandas try to build a MultiIndex from tuples of different lengths.
    ensemble_df = pd.DataFrame({"result": scores},
                               index = pd.Index(combos, tupleize_cols = False))
    return ensemble_df.sort_values(by = "result", ascending = ascending)

In [None]:
# Score all model combinations on the validation predictions and show the top three rows.
ensemble_df = ensemble_result(pred_dict)
ensemble_df.head(3)

# **Test and Submission**

## **_Test**

In [None]:
# Standardize the num_var columns of the full encoded train / test frames.
# NOTE(review): presumably the scaler is fit on Train2 and applied to Test2 — verify in jj.std_.
st_Train, st_Test = jj.std_(Train2, Test2, var = num_var)
# Display the target column to confirm it is still present in the returned frame.
st_Train["target"]

In [None]:
# Separate the label column from the standardized full training frame.
st_Train_y = st_Train["target"]
st_Train_X = st_Train.drop(columns = ["target"])
print(st_Train_X.shape, st_Train_y.shape)

In [None]:
# Refit every member of the best-scoring ensemble (first row of ensemble_df, whose
# index label is a tuple of model names) on the full standardized training data and
# collect its test-set predictions.
# Selecting the test columns by the training column list keeps the feature order
# identical to what the models were fitted on and fails loudly (KeyError) if the
# test frame is missing a feature.
test_features = st_Test[st_Train_X.columns]

Pred_list = []
for model_name in ensemble_df.index[0]:
    model_fit = model_dict[model_name].fit(st_Train_X, st_Train_y)
    pred = model_fit.predict(test_features)
    Pred_list.append(pred)

In [None]:
# Equal-weight average of the member predictions, element by element.
Final_pred = np.asarray(Pred_list).mean(axis = 0)
print(ensemble_df.index[0], Final_pred)

## **_Submission**

In [18]:
# Load the submission template that the predictions will be written into.
sample_submission = pd.read_csv('../DAT/sample_submission.csv')

In [19]:
# Write the averaged predictions into the template's target column.
# NOTE(review): assumes Final_pred is in the same row order as sample_submission — confirm.
sample_submission['target'] = Final_pred
sample_submission.head()

Unnamed: 0,id,target
0,TEST_000000,27.0
1,TEST_000001,43.0
2,TEST_000002,68.0
3,TEST_000003,38.0
4,TEST_000004,44.0
...,...,...
291236,TEST_291236,47.0
291237,TEST_291237,52.0
291238,TEST_291238,22.0
291239,TEST_291239,22.0


In [20]:
# Save the submission under ../OUT, prefixed with today's date (YYYY-MM-DD).
today_str = datetime.today().strftime("%Y-%m-%d")
sample_submission.to_csv("../OUT/" + today_str + " final.csv", index = False)