In [30]:
import pandas as pd
import numpy as np
import holidays
from sklearn.preprocessing import LabelEncoder

In [31]:
# NOTE: Custome Module !!
from module.dataframe import df_query_by_type, series_string_filter

---

In [32]:
kr_holidays = holidays.KR()

tourism_road = ["관광단지", "관광도로"]
general_road = ["지방도", "일반국도", "길", "로"]
bridge_road = ["교", "천교"]
nontaged_road = ["-"]


def mapping_road(road_name):
    if sum([target in road_name for target in tourism_road]) > 0:
        return "tourism_road"
    elif sum([target in road_name for target in general_road]) > 0:
        return "general_road"
    elif sum([target in road_name for target in bridge_road]) > 0:
        return "bridge"
    elif sum([target in road_name for target in nontaged_road]) > 0:
        return "-"
    else:
        return "-"


def to_date_str(int_date: int):
    day = int_date % 100
    year = (int_date - day) // 10000
    month = (int_date - year * 10000) // 100
    return f"{year}-{month:02}-{day:02}"


def do_preprocessing(train, test) -> None:
    # non-unique 제거하기
    is_unique = train.apply(pd.Series.unique, axis=0).apply(len)
    is_unique[is_unique == len(train)].index, is_unique[is_unique == 1].index
    train = train.drop(columns=is_unique[is_unique == 1].index)
    test = test.drop(columns=is_unique[is_unique == 1].index)

    # weight_restricted - label추가하기
    train["weight_restricted"] = (train["weight_restricted"] != 0).astype(int)
    test["weight_restricted"] = (test["weight_restricted"] != 0).astype(int)

    # is_holiday - label추가하기
    train["is_holiday"] = train["base_date"].apply(to_date_str)
    test["is_holiday"] = test["base_date"].apply(to_date_str)
    train["is_holiday"] = train["is_holiday"].apply(lambda _X: _X in kr_holidays).astype(int)
    test["is_holiday"] = test["is_holiday"].apply(lambda _X: _X in kr_holidays).astype(int)

    # road_type - label추가하기
    train["road_type"] = train["road_name"].apply(mapping_road)
    test["road_type"] = test["road_name"].apply(mapping_road)

    # label encode.
    str_col = ["day_of_week", "start_turn_restricted", "end_turn_restricted", "road_type"]
    for i in str_col:
        le = LabelEncoder()
        le = le.fit(train[i])
        train[i] = le.transform(train[i])

        for label in np.unique(test[i]):
            if label not in le.classes_:
                le.classes_ = np.append(le.classes_, label)
        test[i] = le.transform(test[i])

    # 문자형 데이터 제거하기.
    train = pd.concat([df_query_by_type(train, float), df_query_by_type(train, int)], axis=1)
    test = pd.concat([df_query_by_type(test, float), df_query_by_type(test, int)], axis=1)

    # 날짜처리
    train["base_date_year"] = train["base_date"] // 10000
    train["base_date_month"] = train["base_date"] // 100 % 100
    train["base_date_day"] = train["base_date"] % 100
    train = train.drop(columns="base_date")
    
    test["base_date_year"] = test["base_date"] // 10000
    test["base_date_month"] = test["base_date"] // 100 % 100
    test["base_date_day"] = test["base_date"] % 100
    test = test.drop(columns="base_date")



    # 좌표 정보 제거하기.
    train = train.drop(columns=["start_latitude", "start_longitude", "end_latitude", "end_longitude"])
    test = test.drop(columns=["start_latitude", "start_longitude", "end_latitude", "end_longitude"])

    return train, test

In [33]:
train = pd.read_parquet("./database/train.parquet")
test = pd.read_parquet("./database/test.parquet")

In [34]:
train, test = do_preprocessing(train, test)

---

In [35]:
train_x, train_y = train.drop(columns="target"), train["target"]
test_x = test

---

In [36]:
import numpy as np
from lightgbm import LGBMRegressor
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
x_train, x_valid, y_train, y_valid = train_test_split(train_x, train_y, test_size=0.2, random_state=42)
eval_set = [(x_valid, y_valid)]

lgb_params = {
    "objective": "regression",
    "boosting_type": "gbdt",
    "metric": "rmse",
    "n_estimators": 800,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "reg_alpha": 0.1,
    "reg_lambda": 0.2,
    "subsample": 0.7,
    "colsample_bytree": 0.6,
    "verbose": 1,
}

lgb_regress = LGBMRegressor(
    **lgb_params,
)
lgb_regress.fit(x_train, y_train, eval_metric="rmse", eval_set=eval_set)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064729 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 108
[LightGBM] [Info] Number of data points in the train set: 3760973, number of used features: 15
[LightGBM] [Info] Start training from score 42.786869


In [54]:
# Eval
from sklearn.metrics import r2_score as R2
from sklearn.metrics import root_mean_squared_error as RMSE
from sklearn.metrics import median_absolute_error as MAE

MAE(lgb_regress.predict(x_valid), y_valid), R2(lgb_regress.predict(x_valid), y_valid),RMSE(lgb_regress.predict(x_valid), y_valid)



(5.84272399431044, 0.4141501399320764, 9.644646323047251)

## Submission
---

In [57]:
sample_submission = pd.read_csv('./database/sample_submission.csv')

In [58]:
sample_submission["target"] = lgb_regress.predict(test_x)
sample_submission.to_csv("./submit.csv", index = False)
sample_submission



Unnamed: 0,id,target
0,TEST_000000,23.721783
1,TEST_000001,46.525728
2,TEST_000002,49.950288
3,TEST_000003,37.079919
4,TEST_000004,39.270461
...,...,...
291236,TEST_291236,48.995964
291237,TEST_291237,38.162050
291238,TEST_291238,23.069232
291239,TEST_291239,26.379595


![firstsubmission](./0002.png)