In [None]:
import pandas as pd
import numpy as np
import holidays
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [3]:
# NOTE: Custome Module !!
from module.dataframe import df_query_by_type, series_string_filter

---

In [4]:
kr_holidays = holidays.KR()

tourism_road = ["관광단지", "관광도로"]
general_road = ["지방도", "일반국도", "길", "로"]
bridge_road = ["교", "천교"]
nontaged_road = ["-"]


def mapping_road(road_name):
    if sum([target in road_name for target in tourism_road]) > 0:
        return "tourism_road"
    elif sum([target in road_name for target in general_road]) > 0:
        return "general_road"
    elif sum([target in road_name for target in bridge_road]) > 0:
        return "bridge"
    elif sum([target in road_name for target in nontaged_road]) > 0:
        return "-"
    else:
        return "-"


def to_date_str(int_date: int):
    day = int_date % 100
    year = (int_date - day) // 10000
    month = (int_date - year * 10000) // 100
    return f"{year}-{month:02}-{day:02}"


# NOTE: 모든 값이 같은 열을 제거한다.
def do_preprocessing(train, test):
    # non-unique 제거하기
    is_unique = train.apply(pd.Series.unique, axis=0).apply(len)
    is_unique[is_unique == len(train)].index, is_unique[is_unique == 1].index
    train = train.drop(columns=is_unique[is_unique == 1].index)
    test = test.drop(columns=is_unique[is_unique == 1].index)

    # weight_restricted - label추가하기
    train["weight_restricted"] = (train["weight_restricted"] != 0).astype(int)
    test["weight_restricted"] = (test["weight_restricted"] != 0).astype(int)

    # is_holiday - label추가하기
    train["is_holiday"] = train["base_date"].apply(to_date_str)
    test["is_holiday"] = test["base_date"].apply(to_date_str)
    train["is_holiday"] = train["is_holiday"].apply(lambda _X: _X in kr_holidays).astype(int)
    test["is_holiday"] = test["is_holiday"].apply(lambda _X: _X in kr_holidays).astype(int)

    # road_type - label추가하기
    train["road_type"] = train["road_name"].apply(mapping_road)
    test["road_type"] = test["road_name"].apply(mapping_road)

    # label encode.
    str_col = [
        "day_of_week",
        "start_turn_restricted",
        "end_turn_restricted",
        "road_type",
        "is_holiday",
        "weight_restricted",
    ]
    for col in str_col:
        label_enc = LabelEncoder()
        label_enc = label_enc.fit(train[col])
        train[col] = label_enc.transform(train[col])

        for label in np.unique(test[col]):
            if label not in label_enc.classes_:
                label_enc.classes_ = np.append(label_enc.classes_, label)
        test[col] = label_enc.transform(test[col])

    # One-Hot Enc
    train = pd.get_dummies(
        train,
        columns=str_col,
        dtype=int,
    )
    test = pd.get_dummies(
        test,
        columns=str_col,
        dtype=int,
    )

    # 날짜 삽입입
    train["base_date_year"] = train["base_date"] // 10000
    train["base_date_month"] = train["base_date"] // 100 % 100
    train["base_date_day"] = train["base_date"] % 100
    train = train.drop(columns="base_date")

    test["base_date_year"] = test["base_date"] // 10000
    test["base_date_month"] = test["base_date"] // 100 % 100
    test["base_date_day"] = test["base_date"] % 100
    test = test.drop(columns="base_date")

    # 문자형 데이터 제거하기.
    train = pd.concat([df_query_by_type(train, float), df_query_by_type(train, int)], axis=1)
    test = pd.concat([df_query_by_type(test, float), df_query_by_type(test, int)], axis=1)

    # 좌표 정보 제거하기.
    train = train.drop(columns=["start_latitude", "start_longitude", "end_latitude", "end_longitude"])
    test = test.drop(columns=["start_latitude", "start_longitude", "end_latitude", "end_longitude"])

    return train, test

In [5]:
train = pd.read_parquet("./database/train.parquet")
test = pd.read_parquet("./database/test.parquet")

In [6]:
train, test = do_preprocessing(train, test)

---

In [7]:
train_x, train_y = train.drop(columns="target"), train["target"]
test_x = test

---

In [8]:
import numpy as np
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor

from lightgbm import LGBMRegressor

In [None]:
x_train, x_valid, y_train, y_valid = train_test_split(train_x, train_y, test_size=0.01, random_state=42)

models = []
for i in tqdm(range(10)):
    x_train_part, x_eval, y_train_part, y_eval = train_test_split(train_x, train_y, test_size=0.3)
    lgb_params = {
        "objective": "regression",
        "boosting_type": "gbdt",
        "metric": "rmse",
        "n_estimators": 750,
        "learning_rate": 0.0882,
        "feature_fraction": 0.8,
        "reg_lambda": 0.001,
        "reg_alpha": 0.0001,
        "subsample": 0.85,
        "verbose": 1,
        "num_leaves": 190,
    }
    lgb_regress = LGBMRegressor(
        **lgb_params,
    )
    eval_set = [(x_valid, y_valid), (x_eval, y_eval)]
    lgb_regress.fit(x_train_part, y_train_part, eval_metric="rmse", eval_set=eval_set)
    models.append(lgb_regress)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.072980 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 3290851, number of used features: 28
[LightGBM] [Info] Start training from score 42.779781
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.058638 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 127
[LightGBM] [Info] Number of data points in the train set: 3290851, number of used features: 28
[LightGBM] [Info] Start training from score 42.788018
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.066238 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not e

In [None]:
# lgb_params = {
#     "objective": "regression",
#     "boosting_type": "gbdt",
#     "metric": "rmse",
#     "n_estimators": 750,
#     "learning_rate": 0.0882,
#     "feature_fraction": 0.95,
#     "reg_lambda": 0.001,
#     "reg_alpha": 0.0001,
#     "subsample": 0.85,
#     "verbose": 2,
#     'num_leaves': 190
# }
# lgb_regress = LGBMRegressor(
#     **lgb_params,
# )

# lgb_regress.fit(x_train, y_train, eval_metric="rmse", eval_set=eval_set)

In [None]:
# # Eval
# from sklearn.metrics import root_mean_squared_error as RMSE
# from sklearn.metrics import median_absolute_error as MAE
# from sklearn.metrics import r2_score as R2

# [
#     MAE(lgb_regress.predict(x_valid), y_valid),
#     R2(lgb_regress.predict(x_valid), y_valid),
# ]




[5.672243434386836, 0.4447738314754437]

## Submission
---

In [None]:
sample_submission = pd.read_csv('./database/sample_submission.csv')
final_submission = pd.read_csv('./database/sample_submission.csv')


In [None]:
# lgb_params = {
#     "objective": "regression",
#     "boosting_type": "gbdt",
#     "metric": "rmse",
#     "n_estimators": 1500,
#     "learning_rate": 0.09,
#     "feature_fraction": 0.95,
#     "reg_lambda": 0.001,
#     "reg_alpha": 0.0001,
#     "subsample": 0.85,
#     "verbose": 2,
#     'num_leaves': 90
# }
# [5.746946134178014, 0.43311401006703243]

In [None]:
# lgb_params = {
#     "objective": "regression",
#     "boosting_type": "gbdt",
#     "metric": "rmse",
#     "n_estimators": 1500,
#     "learning_rate": 0.0885,
#     "feature_fraction": 0.95,
#     "reg_lambda": 0.001,
#     "reg_alpha": 0.0001,
#     "subsample": 0.85,
#     "verbose": 2,
#     'num_leaves': 90
# }
# [5.747548193187486, 0.43289453456149374]

In [None]:
# NOTE:1
# [5.761627551595902, 0.43105196246630095]
# -> 'num_leaves': 256
# [5.751544901162482, 0.4324136364045469]
# -> 'num_leaves': 190
# [5.752340975253492, 0.4324743850641275]
# -> 'num_leaves': 180
# lgb_params = {
#     "objective": "regression",
#     "boosting_type": "gbdt",
#     "metric": "rmse",
#     "n_estimators": 1500,
#     "learning_rate": 0.0882,
#     "feature_fraction": 0.95,
#     "reg_lambda": 0.001,
#     "reg_alpha": 0.0001,
#     "subsample": 0.85,
#     "verbose": 2,
#     'num_leaves': 90
# }
# [5.79309681117978, 0.42550610451006965]
# -> 'num_leaves': 48
# [5.7543164050935545, 0.4317961056989281]
# -> 'num_leaves': 10
# [5.838684291356461, 0.4139329591331222]

In [None]:
# NOTE:2
# lgb_params = {
#     "objective": "regression",
#     "boosting_type": "gbdt",
#     "metric": "rmse",
#     "n_estimators": 2000,
#     "learning_rate": 0.0882,
#     "feature_fraction": 0.95,
#     "reg_lambda": 0.001,
#     "reg_alpha": 0.0001,
#     "subsample": 0.85,
#     "verbose": 2,
#     'num_leaves': 190
# }
# [5.757305149277347, 0.43135064392503075]
# n_estimators -> 1500
# [5.751544901162482, 0.4324136364045469]
# n_estimators -> 900
# [5.748957234746737, 0.432866128090696]
# n_estimators -> 850
# [5.748207896096723, 0.4329333047085594]
# n_estimators -> 800
# [5.7471590588239, 0.4329834431038184]
# n_estimators -> 750
# [5.74673096535663, 0.4329565497995935]
# n_estimators -> 700
# [5.748021316320747, 0.43296032209695046]
# n_estimators -> 600
# [5.74997947849212, 0.4327542009939448]
# n_estimators -> 400
# [5.753948085151837, 0.43145049845013994]
# n_estimators -> 200
# [5.769750016732333, 0.42777351387225127]

In [None]:
# NOTE:3
# -> 1
# [5.747376345440024, 0.43301548771802834]
# lgb_params = { ############## 이거씀씀
#     "objective": "regression",
#     "boosting_type": "gbdt",
#     "metric": "rmse",
#     "n_estimators": 750,
#     "learning_rate": 0.0882,
#     "feature_fraction": 0.95,
#     "reg_lambda": 0.001,
#     "reg_alpha": 0.0001,
#     "subsample": 0.85,
#     "verbose": 2,
#     'num_leaves': 190
# }
# [5.74673096535663, 0.4329565497995935]
# feature_fraction -> 0.7
# [5.751887897278465, 0.4317971745459652]
# feature_fraction -> 0.3
# [5.902017632629661, 0.4022951157263006]


In [None]:
# lgb_params = {
#     "objective": "regression",
#     "boosting_type": "gbdt",
#     "metric": "rmse",
#     "n_estimators": 1500,
#     "learning_rate": 0.088,
#     "feature_fraction": 0.95,
#     "reg_lambda": 0.001,
#     "reg_alpha": 0.0001,
#     "subsample": 0.85,
#     "verbose": 2,
#     'num_leaves': 90
# }
# [5.745848408718423, 0.43343802094811135]

In [None]:
# lgb_params = {
#     "objective": "regression",
#     "boosting_type": "gbdt",
#     "metric": "rmse",
#     "n_estimators": 1500,
#     "learning_rate": 0.085,
#     "feature_fraction": 0.95,
#     "reg_lambda": 0.001,
#     "reg_alpha": 0.0001,
#     "subsample": 0.85,
#     "verbose": 2,
#     'num_leaves': 90
# }
# [5.746014257796521, 0.4331837299248704]

In [None]:

lgb_params = {
    "objective": "regression",
    "boosting_type": "gbdt",
    "metric": "rmse",
    "n_estimators": 1500,
    "learning_rate": 0.08,
    "feature_fraction": 0.95,
    "reg_lambda": 0.001,
    "reg_alpha": 0.0001,
    "subsample": 0.85,
    "verbose": 2,
    'num_leaves': 90
}
[5.746019162681328, 0.43304242874413157]

import pickle
with open("./model/lgb_regress-ens-6.pkl","wb") as f:
    pickle.dump(models,f)




In [None]:
sample_submission["target"] = models[0].predict(test_x)
for idx,lgb_model in enumerate(models[1:]):
    sample_submission[idx] = lgb_model.predict(test_x)



In [None]:
final_submission["target"]=sample_submission.drop(columns="id").mean(axis=1)
final_submission.to_csv("./lgb_regress-ens-6.csv", index = False)
final_submission

Unnamed: 0,id,target
0,TEST_000000,23.372196
1,TEST_000001,47.294446
2,TEST_000002,47.929473
3,TEST_000003,37.162168
4,TEST_000004,39.868535
...,...,...
291236,TEST_291236,48.519542
291237,TEST_291237,37.438202
291238,TEST_291238,22.898877
291239,TEST_291239,26.570180


In [None]:
# lgb_params = {
#     "objective": "regression",
#     "boosting_type": "gbdt",
#     "metric": "rmse",
#     "n_estimators": 1500,
#     "learning_rate": 0.07,
#     "feature_fraction": 0.95,
#     "reg_lambda": 0.001,
#     "reg_alpha": 0.0001,
#     "subsample": 0.85,
#     "verbose": 2,
#     'num_leaves': 90
# }
# [5.747496090135723, 0.4327742262599782]
# 6.095245684
# import pickle
# with open("./model/lgb_regress-3.pkl","wb") as f:
#     pickle.dump(lgb_regress,f)
# sample_submission["target"] = lgb_regress.predict(test_x)
# sample_submission.to_csv("./lgb_regress-3.csv", index = False)
# sample_submission

![decisiontree](./0004.png)
![linear](./0004.png)