#데이터 전처리 

## Import // 데이터 생성

In [32]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import random
import os
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
%cd /content/drive/MyDrive/데이콘/[데이콘]제주도 도로 교통량 예측/data

# !unzip -qq "/content/drive/MyDrive/데이콘/[데이콘]제주도 도로 교통량 예측/data/open.zip"

/content/drive/MyDrive/데이콘/[데이콘]제주도 도로 교통량 예측/data


In [34]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import gc

## csv to parquet
#### -> 메모리에 효율적인 데이터 유형을 사용하여 용량을 크게 줄이고 빠른 작업이 가능합니다.

In [35]:
def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

## 데이터 불러오기

In [36]:
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')

# 라벨 인코딩

In [37]:
# 카테고리컬 피쳐
str_col = ["road_rating","road_name","connect_code","weight_restricted","road_type","start_node_name","start_turn_restricted","end_node_name","end_turn_restricted"]
for i in str_col:
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])

#피쳐 생성
- 년도 (완)
- 달 (완)
- 날짜 (완)
- 계절 (완)
- 공휴일 여부

- 날씨 ( 화창 / 비) - 외부데이터

- start/end node에 train 데이터 셋 count를 하여 비율 구함 (완)

In [38]:
train["year"] = train["base_date"].apply(lambda x : int(str(x)[:4]))
train["month"] = train["base_date"].apply(lambda x : int(str(x)[4:6]))
train["day"] = train["base_date"].apply(lambda x : int(str(x)[6:]))

test["year"] = test["base_date"].apply(lambda x : int(str(x)[:4]))
test["month"] = test["base_date"].apply(lambda x : int(str(x)[4:6]))
test["day"] = test["base_date"].apply(lambda x : int(str(x)[6:]))

In [39]:
#  양력 > 3~5월이 봄, 6~8월이 여름, 9~11월이 가을, 12~2월이 겨울이다. 
train["season_plus"] = train["month"].apply(lambda x :  0 if 3<= x <=5 else 1 if 6<= x <=8 else 2 if 9<= x <=10 else 3)

test["season_plus"] = test["month"].apply(lambda x :  0 if 3<= x <=5 else 1 if 6<= x <=8 else 2 if 9<= x <=10 else 3)

In [40]:
start_node_ratio_set = pd.DataFrame((train["start_node_name"].value_counts() * 100) / len(train) ).sort_index()
end_node_ratio_set = pd.DataFrame((train["end_node_name"].value_counts() * 100) / len(train) ).sort_index()

In [41]:
train["startNodeRatio"] = train["start_node_name"].apply(lambda x : start_node_ratio_set.iloc[x].values[0])
train["endNodeRatio"] = train["end_node_name"].apply(lambda x : end_node_ratio_set.iloc[x].values[0])

In [42]:
test["startNodeRatio"] = test["start_node_name"].apply(lambda x : start_node_ratio_set.iloc[x].values[0])
test["endNodeRatio"] = test["end_node_name"].apply(lambda x : end_node_ratio_set.iloc[x].values[0])

#위/경도로 거리 구함

In [43]:
!pip install haversine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting haversine
  Downloading haversine-2.7.0-py2.py3-none-any.whl (6.9 kB)
Installing collected packages: haversine
Successfully installed haversine-2.7.0


In [44]:
from haversine import haversine
from tqdm import tqdm
km_result = []
mile_result = []
nmi_result = []
for i in tqdm(range(len(train))):
  A = (train["start_latitude"].iloc[i], train["start_longitude"].iloc[i]) # (lat, lon)
  B = (train["end_latitude"].iloc[i], train["end_longitude"].iloc[i])

  km_result.append(haversine(A, B)) # km

100%|██████████| 4701217/4701217 [03:29<00:00, 22419.40it/s]


In [45]:
train = pd.concat([train,pd.DataFrame(km_result,columns = ["km"])],axis = 1)

In [46]:
km_result = []
mile_result = []
nmi_result = []
for i in tqdm(range(len(test))):
  A = (test["start_latitude"].iloc[i], test["start_longitude"].iloc[i]) # (lat, lon)
  B = (test["end_latitude"].iloc[i], test["end_longitude"].iloc[i])

  km_result.append(haversine(A, B)) # km

100%|██████████| 291241/291241 [00:14<00:00, 20364.25it/s]


In [47]:
test = pd.concat([test,pd.DataFrame(km_result,columns = ["km"])],axis = 1)


# 파생 피쳐 생성 & 추가 피쳐 생성
- 경,위도를 사용하여 거리 (완)
- 거 / 속 > 시 - 대략 걸리는 시간 (완)

-휴일

In [48]:
train["distance"] = ((train["start_latitude"] - train["end_latitude"])**2 + (train["start_longitude"] - train["end_longitude"])**2 ) ** (1/2)
test["distance"] = ((test["start_latitude"] - test["end_latitude"])**2 + (test["start_longitude"] - test["end_longitude"])**2 ) ** (1/2)

In [49]:
train["expect_time"] = (train["distance"] * 10000) / train["maximum_speed_limit"]
test["expect_time"] = (test["distance"] * 10000) / test["maximum_speed_limit"]

In [50]:
# 휴일 생성
!pip install pytimekr

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytimekr
  Downloading pytimekr-0.1.0.tar.gz (7.3 kB)
Collecting lunardate>=0.1.5
  Downloading lunardate-0.2.0-py3-none-any.whl (5.6 kB)
Building wheels for collected packages: pytimekr
  Building wheel for pytimekr (setup.py) ... [?25l[?25hdone
  Created wheel for pytimekr: filename=pytimekr-0.1.0-py3-none-any.whl size=7941 sha256=6cca4d317f1f50c773b5e29aa923c1800254b0e91406265640659fc1bb7d1d71
  Stored in directory: /root/.cache/pip/wheels/7c/54/31/d099bce210ce51c45eeb97e149699dedc66c78eb84c01be9c2
Successfully built pytimekr
Installing collected packages: lunardate, pytimekr
Successfully installed lunardate-0.2.0 pytimekr-0.1.0


In [51]:
from pytimekr import pytimekr

holiday_list = []
for j in range(2021,2023):
  for i in pytimekr.holidays(year=j):
    holiday_list.append(int(i.strftime("%Y%m%d")))

In [52]:
# 휴일 추가 ( 공휴일)
train["holiday"] = 0
test["holiday"] = 0
train["holiday"] = train["base_date"].apply(lambda x: 1 if x in holiday_list else 0)
test["holiday"] = test["base_date"].apply(lambda x: 1 if x in holiday_list else 0)

In [53]:
# 휴일 - 주말
train["holiday"] = train["day_of_week"].apply(lambda x : 1 if x in ["토","일"] else 0)
test["holiday"] = test["day_of_week"].apply(lambda x : 1 if x in ["토","일"] else 0)

In [54]:
# 요일 변경 0 >월 ~6 일
train["day_of_week"] = train["day_of_week"].apply(lambda x : 0 if x == "월" else 1 if x == "화" else 2 if x == "수" else 3 if x == "목" else 4 if x == "금" else 5 if x == "토" else 6)
test["day_of_week"] = test["day_of_week"].apply(lambda x : 0 if x == "월" else 1 if x == "화" else 2 if x == "수" else 3 if x == "목" else 4 if x == "금" else 5 if x == "토" else 6)

In [55]:
result = []
for i in tqdm(range(len(train))):
  result.append(str(train["start_node_name"].iloc[i]) + "_" +str(train["road_name"].iloc[i]) +"_"+str(train["end_node_name"].iloc[i]))
train["s_r_e"] = result

100%|██████████| 4701217/4701217 [02:29<00:00, 31377.58it/s]


In [56]:
result = []
for i in tqdm(range(len(test))):
  result.append(str(test["start_node_name"].iloc[i]) + "_" +str(test["road_name"].iloc[i]) +"_"+str(test["end_node_name"].iloc[i]))
test["s_r_e"] = result

100%|██████████| 291241/291241 [00:09<00:00, 30985.13it/s]


In [57]:
# 카테고리컬 피쳐
str_col = ["s_r_e"]

for i in str_col:
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
            print(label)
    test[i]=le.transform(test[i])

In [58]:
train["return"] = train["start_turn_restricted"] + train["end_turn_restricted"] 
test["return"] = test["start_turn_restricted"] + test["end_turn_restricted"] 

In [59]:
train["plusRatio"] = train["startNodeRatio"] + train["endNodeRatio"] 
test["plusRatio"] = test["startNodeRatio"] + test["endNodeRatio"] 

In [60]:
train["geniality"] = train["lane_count"] * (train["road_rating"] + 0.00001) * (train["road_type"] + 0.00001)
test["geniality"] = test["lane_count"] * (test["road_rating"] + 0.00001) * (test["road_type"] + 0.00001)

In [61]:
train["rename"] = (train["start_node_name"]+0.00001) / (train["end_node_name"]+0.00001)
test["rename"] = (test["start_node_name"]+0.00001) / (test["end_node_name"]+0.00001)

In [62]:
train['sin_24_1'] = np.sin(2 * np.pi * train['base_hour']/23.0) * np.cos(2 * np.pi * train['base_hour']/23.0)
test['sin_24_1'] = np.sin(2 * np.pi * test['base_hour']/23.0) * np.cos(2 * np.pi * test['base_hour']/23.0)

In [63]:
train['sin_season'] = np.sin(2 * np.pi * train['season_plus']/4) * np.cos(2 * np.pi * train['season_plus']/4)
test['sin_season'] = np.sin(2 * np.pi * test['season_plus']/4) * np.cos(2 * np.pi * test['season_plus']/4)

In [64]:
train["sin_month"] = np.sin(2 * np.pi * train['month']/12) * np.cos(2 * np.pi * train['month']/12)
test["sin_month"] = np.sin(2 * np.pi * test['month']/12) * np.cos(2 * np.pi * test['month']/12)

In [65]:
train.to_parquet(f'/content/drive/MyDrive/데이콘/[데이콘]제주도 도로 교통량 예측/data/train_middle_test2.parquet')
test.to_parquet(f'/content/drive/MyDrive/데이콘/[데이콘]제주도 도로 교통량 예측/data/test_middle_test2.parquet')

# 저장 데이터 불러오기

In [66]:
train = pd.read_parquet('/content/drive/MyDrive/데이콘/[데이콘]제주도 도로 교통량 예측/data/train_middle_test2.parquet')
test = pd.read_parquet('/content/drive/MyDrive/데이콘/[데이콘]제주도 도로 교통량 예측/data/test_middle_test2.parquet')

In [67]:
y_train = train['target'] 

x_train = train.drop(['id','base_date', 'target','vehicle_restricted',"height_restricted","year","day","multi_linked","weight_restricted"], axis=1)
x_test = test.drop(['id','base_date','vehicle_restricted',"height_restricted","year","day","multi_linked","weight_restricted"], axis=1)

# 옵튜나 적용 - rf / catboost /lgbm

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm

rf_result = 0

rf_model = RandomForestRegressor(random_state= 42 ,n_jobs = -1,verbose=1,
                                    min_samples_split= 22, min_samples_leaf= 4, n_estimators= 204).fit(x_train, y_train)

rf_result_test = rf_model.predict(x_test) # 제출해야 하는 test

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 19.2min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 83.3min
[Parallel(n_jobs=-1)]: Done 204 out of 204 | elapsed: 88.0min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   43.3s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:  3.6min
[Parallel(n_jobs=4)]: Done 204 out of 204 | elapsed:  3.8min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.9s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   10.0s
[Parallel(n_jobs=4)]: Done 204 out of 204 | elapsed:   10.6s finished


In [None]:
sample_submission = pd.read_csv('/content/drive/MyDrive/데이콘/[데이콘]제주도 도로 교통량 예측/data/sample_submission.csv')

In [None]:
sample_submission['target'] = rf_result_test
sample_submission.to_csv("/content/drive/MyDrive/데이콘/[데이콘]제주도 도로 교통량 예측/data/데이터처리code/submit/submmit_1114_rf_dayofweek_3.037_raw.csv", index = False)

#catboost - 옵튜나 적용

In [None]:
# catboost
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 110 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [None]:
import catboost
cat_model = catboost.CatBoostRegressor(iterations= 1577, learning_rate= 0.2984773243425316, reg_lambda= 6.35584439301886, subsample= 0.3109823217156622, random_strength= 16.933982779310604,
                                       depth =12, min_data_in_leaf= 14, leaf_estimation_iterations= 23, bagging_temperature= 0.06812233896860301,
                                       random_state=42,verbose= 1).fit(x_train, y_train)
val_result = cat_model.predict(x_test)


0:	learn: 12.8665005	total: 1.31s	remaining: 34m 29s
1:	learn: 10.7924999	total: 2.71s	remaining: 35m 35s
2:	learn: 9.3786352	total: 3.8s	remaining: 33m 12s
3:	learn: 8.4074514	total: 5.12s	remaining: 33m 31s
4:	learn: 7.8268586	total: 6.73s	remaining: 35m 17s
5:	learn: 7.4129384	total: 8.25s	remaining: 36m 1s
6:	learn: 7.1227560	total: 9.98s	remaining: 37m 18s
7:	learn: 6.8637292	total: 11.6s	remaining: 38m 2s
8:	learn: 6.7199905	total: 14.2s	remaining: 41m 8s
9:	learn: 6.5461880	total: 16.4s	remaining: 42m 56s
10:	learn: 6.4337858	total: 17.6s	remaining: 41m 52s
11:	learn: 6.3207245	total: 18.8s	remaining: 40m 54s
12:	learn: 6.2745214	total: 20.3s	remaining: 40m 40s
13:	learn: 6.1226423	total: 21.4s	remaining: 39m 43s
14:	learn: 6.0636830	total: 22.8s	remaining: 39m 31s
15:	learn: 5.9934201	total: 23.9s	remaining: 38m 53s
16:	learn: 5.9425024	total: 25s	remaining: 38m 12s
17:	learn: 5.8975872	total: 26.1s	remaining: 37m 36s
18:	learn: 5.8690908	total: 27.2s	remaining: 37m 10s
19:	lea

In [None]:
sample_submission = pd.read_csv('/content/drive/MyDrive/데이콘/[데이콘]제주도 도로 교통량 예측/data/sample_submission.csv')

In [None]:
sample_submission['target'] = val_result
sample_submission.to_csv("/content/drive/MyDrive/데이콘/[데이콘]제주도 도로 교통량 예측/data/데이터처리code/submit/submmit_1114_cat_dayofweek_3.103_raw.csv", index = False)

# lgbm optuna

In [None]:
import lightgbm
lgbm_model = lightgbm.LGBMRegressor( n_estimators= 2978, min_child_samples= 59, min_child_weight= 0.0013500108470101552, subsample= 0.18912918330729142, num_leaves= 100,#. Best is trial 145 with value: 3.238283474145092.
                                    random_state = 42 ).fit(x_train, y_train)
lgbm_result = lgbm_model.predict(x_test)

In [None]:
sample_submission['target'] = lgbm_result
sample_submission.to_csv("/content/drive/MyDrive/데이콘/[데이콘]제주도 도로 교통량 예측/data/데이터처리code/submit/submmit_1113_lgbm_dayofweek_3.100_raw.csv", index = False)

# 제출

In [27]:
rf = pd.read_csv('/content/drive/MyDrive/데이콘/[데이콘]제주도 도로 교통량 예측/data/데이터처리code/submit/submmit_1114_rf_dayofweek_3.037_raw.csv')["target"]
cat= pd.read_csv('/content/drive/MyDrive/데이콘/[데이콘]제주도 도로 교통량 예측/data/데이터처리code/submit/submmit_1114_cat_dayofweek_3.103_raw.csv')["target"]
lgbm = pd.read_csv('/content/drive/MyDrive/데이콘/[데이콘]제주도 도로 교통량 예측/data/데이터처리code/submit/submmit_1113_lgbm_dayofweek_3.100_raw.csv')["target"]

In [28]:
result = rf * 0.5 + cat * 0.3 + lgbm * 0.2

In [None]:
sample_submission['target'] = np.round(result)
sample_submission.to_csv("/content/drive/MyDrive/데이콘/[데이콘]제주도 도로 교통량 예측/data/데이터처리code/submit/submmit_1114_532esemblecsv.csv", index = False)