[대회명]
제주 테크노파크 제주도 도로 교통량 예측 AI 경진대회


[배경]
제주도 내 주민등록인구는 2022년 기준 약 68만 명으로, 매년 평균 1.3% 정도씩 증가하고 있습니다.

또한 외국인과 관광객까지 고려하면 전체 상주인구는 90만명을 넘을 것으로 추정되며,

제주도민 증가와 외국인의 증가로 현재 제주도의 교통체증이 심각한 문제로 떠오르고 있습니다.



[주제]
제주도 도로 교통량 예측 AI 알고리즘 개발



[설명]
제주도의 교통 정보로부터 도로 교통량 회귀 예측

In [35]:
import numpy as np
import pandas as pd
import os
import gc # garbage collector

import matplotlib.pyplot as plt
import seaborn as sns

`csv`를 `parquet`으로 변환하면 메모리 효율이 좋은 데이터 유형을 사용하게 되어 용량을 크게 줄이고 더 빠르게 작업할 수 있습니다.

In [36]:
# Directory that holds the competition files, and the CSV file names inside it.
DATA_DIR = './data/open/'
DATA_INFO = 'data_info.csv'  # table of column names and their descriptions
TRAIN_CSV = 'train.csv'
TEST_CSV = 'test.csv'

In [37]:
# Display the column-description table that ships with the dataset.
pd.read_csv(os.path.join(DATA_DIR, DATA_INFO), encoding='utf-8')

Unnamed: 0,변수명,변수 설명
0,id,아이디
1,base_date,날짜
2,day_of_week,요일
3,base_hour,시간대
4,road_in_use,도로사용여부
5,lane_count,차로수
6,road_rating,도로등급
7,multi_linked,중용구간 여부
8,connect_code,연결로 코드
9,maximum_speed_limit,최고속도제한


In [38]:
# Open the training CSV as a chunked reader (1000 rows per chunk); this yields
# a TextFileReader iterator instead of loading the whole file at once.
train_chunk = pd.read_csv(os.path.join(DATA_DIR, TRAIN_CSV), iterator=True, chunksize=1000)
train_chunk

<pandas.io.parsers.readers.TextFileReader at 0x2d5b7ba90>

In [39]:
# Exhaust the reader into a list of per-chunk DataFrames and count the chunks.
train_chunk = list(train_chunk)
len(train_chunk)

4702

In [40]:
# Stitch the chunks back together into a single DataFrame.
train_df = pd.concat(train_chunk)
train_df

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,road_type,start_node_name,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target
0,TRAIN_0000000,20220623,목,17,1,106,지방도1112호선,0,0,60.0,...,3,제3교래교,33.427747,126.662612,없음,제3교래교,33.427749,126.662335,없음,52.0
1,TRAIN_0000001,20220728,목,21,2,103,일반국도11호선,0,0,60.0,...,0,광양사거리,33.500730,126.529107,있음,KAL사거리,33.504811,126.526240,없음,30.0
2,TRAIN_0000002,20211010,일,7,2,103,일반국도16호선,0,0,80.0,...,0,창고천교,33.279145,126.368598,없음,상창육교,33.280072,126.362147,없음,61.0
3,TRAIN_0000003,20220311,금,13,2,107,태평로,0,0,50.0,...,0,남양리조트,33.246081,126.567204,없음,서현주택,33.245565,126.566228,없음,20.0
4,TRAIN_0000004,20211005,화,8,2,103,일반국도12호선,0,0,80.0,...,0,애월샷시,33.462214,126.326551,없음,애월입구,33.462677,126.330152,없음,38.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4701212,TRAIN_4701212,20211104,목,16,1,107,-,0,0,50.0,...,0,대림사거리,33.422145,126.278125,없음,금덕해운,33.420955,126.273750,없음,20.0
4701213,TRAIN_4701213,20220331,목,2,2,107,-,0,0,80.0,...,3,광삼교,33.472505,126.424368,없음,광삼교,33.472525,126.424890,없음,65.0
4701214,TRAIN_4701214,20220613,월,22,2,103,일반국도12호선,0,0,60.0,...,0,고성교차로,33.447183,126.912579,없음,성산교차로,33.444121,126.912948,없음,30.0
4701215,TRAIN_4701215,20211020,수,2,2,103,일반국도95호선,0,0,80.0,...,0,제6광령교,33.443596,126.431817,없음,관광대학입구,33.444996,126.433332,없음,73.0


전체 데이터는 4,701,217개여야 함 (아래 셀의 결과가 0이면 누락된 행이 없는 것)

In [41]:
# Sanity check: a result of 0 means every expected row was loaded.
4701217 - len(train_df)

0

메모리 효율을 위해 `parquet` 형식으로 바꿔주자

In [42]:
def csv_to_parquet(csv_path, save_name, save_dir='./data/open/', chunksize=1000):
    """Convert a CSV file to ``<save_dir>/<save_name>.parquet``.

    The CSV is read in chunks, concatenated into one DataFrame and written
    with the ``fastparquet`` engine. The chunk count, the row count and two
    completion messages are printed along the way.

    Args:
        csv_path: Path of the CSV file to read.
        save_name: Output file name without the ``.parquet`` extension.
        save_dir: Directory the parquet file is written to. The default is
            the directory this function previously had hard-coded.
        chunksize: Number of rows per chunk while reading the CSV.
    """
    # The context manager closes the underlying file handle even if reading
    # fails part-way through.
    with pd.read_csv(csv_path, chunksize=chunksize) as reader:
        chunks = list(reader)
    print(len(chunks))

    df = pd.concat(chunks)
    # The per-chunk frames are no longer needed once concatenated; dropping
    # them before the write keeps peak memory closer to a single copy.
    del chunks
    print(len(df))

    out_path = os.path.join(save_dir, '{}.parquet'.format(save_name))
    df.to_parquet(out_path, engine='fastparquet')
    print('converting process complete')

    del df
    gc.collect()  # reclaim the freed frame right away
    print(save_name, 'Done')

In [43]:
# Convert both competition CSVs; the outputs are named train.parquet and test.parquet.
csv_to_parquet(os.path.join(DATA_DIR, TRAIN_CSV), 'train')
csv_to_parquet(os.path.join(DATA_DIR, TEST_CSV), 'test')

4702
4701217
converting process complete
train Done
292
291241
converting process complete
test Done


# 데이터 로딩

In [44]:
# Load the converted training set back, using the same engine it was written with.
train = pd.read_parquet('./data/open/train.parquet', engine='fastparquet')

In [45]:
# Render the loaded frame to eyeball it against the CSV preview shown earlier.
display(train)

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,road_type,start_node_name,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target
0,TRAIN_0000000,20220623,목,17,1,106,지방도1112호선,0,0,60.0,...,3,제3교래교,33.427747,126.662612,없음,제3교래교,33.427749,126.662335,없음,52.0
1,TRAIN_0000001,20220728,목,21,2,103,일반국도11호선,0,0,60.0,...,0,광양사거리,33.500730,126.529107,있음,KAL사거리,33.504811,126.526240,없음,30.0
2,TRAIN_0000002,20211010,일,7,2,103,일반국도16호선,0,0,80.0,...,0,창고천교,33.279145,126.368598,없음,상창육교,33.280072,126.362147,없음,61.0
3,TRAIN_0000003,20220311,금,13,2,107,태평로,0,0,50.0,...,0,남양리조트,33.246081,126.567204,없음,서현주택,33.245565,126.566228,없음,20.0
4,TRAIN_0000004,20211005,화,8,2,103,일반국도12호선,0,0,80.0,...,0,애월샷시,33.462214,126.326551,없음,애월입구,33.462677,126.330152,없음,38.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4701212,TRAIN_4701212,20211104,목,16,1,107,-,0,0,50.0,...,0,대림사거리,33.422145,126.278125,없음,금덕해운,33.420955,126.273750,없음,20.0
4701213,TRAIN_4701213,20220331,목,2,2,107,-,0,0,80.0,...,3,광삼교,33.472505,126.424368,없음,광삼교,33.472525,126.424890,없음,65.0
4701214,TRAIN_4701214,20220613,월,22,2,103,일반국도12호선,0,0,60.0,...,0,고성교차로,33.447183,126.912579,없음,성산교차로,33.444121,126.912948,없음,30.0
4701215,TRAIN_4701215,20211020,수,2,2,103,일반국도95호선,0,0,80.0,...,0,제6광령교,33.443596,126.431817,없음,관광대학입구,33.444996,126.433332,없음,73.0


In [46]:
# data_info.csv lists a `road_in_use` column; report whether the loaded
# training data actually contains it.
has_road_in_use = 'road_in_use' in train.columns
print('변수 제거하고 진행해야함' if has_road_in_use else '정상 데이터셋')

정상 데이터셋


In [47]:
# Reload train and load test from the converted parquet files.
# NOTE(review): no engine is passed here, unlike the fastparquet write/read
# elsewhere in this notebook, so pandas falls back to its default engine — confirm
# this is intended.
train = pd.read_parquet('./data/open/train.parquet')
test = pd.read_parquet('./data/open/test.parquet')

# 데이터 전처리

In [48]:
from sklearn.preprocessing import LabelEncoder

In [49]:
# String-valued columns that are converted to integer codes.
str_col = ['day_of_week', 'start_turn_restricted', 'end_turn_restricted']

for col in str_col:
    # Fit on the training values only, then encode the training column in place.
    le = LabelEncoder().fit(train[col])
    train[col] = le.transform(train[col])

    # Labels that occur only in the test set are appended to the encoder's
    # class list before encoding (presumably so transform() accepts them).
    unseen = [label for label in np.unique(test[col]) if label not in le.classes_]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)
    test[col] = le.transform(test[col])

In [50]:
# Inspect the encoded columns; they should now hold integer codes.
train[str_col]

Unnamed: 0,day_of_week,start_turn_restricted,end_turn_restricted
0,1,0,0
1,1,1,0
2,4,0,0
3,0,0,0
4,6,0,0
...,...,...,...
4701212,1,0,0
4701213,1,0,0
4701214,3,0,0
4701215,2,0,0


In [51]:
# Columns removed from both frames before modelling.
unused_cols = ['id', 'base_date', 'road_name', 'start_node_name', 'end_node_name', 'vehicle_restricted']

# Regression target, and the feature matrix without the target or unused columns.
y_train = train['target']
X_train = train.drop(columns=unused_cols + ['target'])

# The test frame has no target column, so only the unused columns are dropped.
test = test.drop(columns=unused_cols)

for frame in (X_train, y_train, test):
    print(frame.shape)

(4701217, 16)
(4701217,)
(291241, 16)


In [52]:
import lightgbm as lgb

In [53]:
# Fit a LightGBM regressor on the full training set; only the seed is set explicitly.
# NOTE(review): despite the name `LR`, this is an LGBMRegressor.
LR = lgb.LGBMRegressor(random_state=42).fit(X_train, y_train)

In [54]:
# Predict the target for every row of the preprocessed test frame.
pred = LR.predict(test)

In [55]:
# Load the submission template; its `target` column is overwritten below.
sample_submission = pd.read_csv('./data/open/sample_submission.csv')

In [56]:
# Fill the template with the predictions and write the submission file
# (index=False keeps the pandas index out of the CSV).
# NOTE(review): this assumes the template rows are in the same order as the test rows — confirm.
sample_submission['target'] = pred
sample_submission.to_csv("./data/open/submit.csv", index=False)

In [57]:
# Final look at the frame that was written to submit.csv.
sample_submission

Unnamed: 0,id,target
0,TEST_000000,27.298710
1,TEST_000001,43.949243
2,TEST_000002,60.552621
3,TEST_000003,36.499322
4,TEST_000004,37.510570
...,...,...
291236,TEST_291236,45.902276
291237,TEST_291237,52.604831
291238,TEST_291238,21.964371
291239,TEST_291239,25.402628
