In [None]:
# Switch the working directory to the project folder (hard-coded absolute path);
# the relative file names used by the later read_csv / to_csv calls resolve against it.
cd  /content/drive/MyDrive/kaggle_study/06-bus

/content/drive/MyDrive/kaggle_study/06-bus


In [None]:
# Library 예시
import os

# Analysis
import pandas as pd
import numpy as np
import datetime
import random
import gc
from tqdm import tqdm_notebook as tqdm

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Prevent Overfit
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# Model
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb

SEED = 42


def seed_everything(seed=SEED):
    """Seed the global RNGs of `random` and NumPy and export the seed as PYTHONHASHSEED.

    Args:
        seed: value handed to every seeding call; defaults to the notebook-wide SEED.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

def rmse(y_true, y_pred):
    """Return the root-mean-squared error of y_pred against y_true, rounded to 5 decimals."""
    squared_error = mean_squared_error(y_true, y_pred)
    root = np.sqrt(squared_error)
    return np.round(root, 5)

########################### BASIC SETTING
# Seed the RNGs and name the column that is later used as the training label.
seed_everything(SEED)
TARGET = '18~20_ride'

########################### DATA LOAD
# File names are relative, so they are resolved against the current working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
sub = pd.read_csv('submission_sample.csv')

In [None]:
# Display the raw training frame as loaded from train.csv.
train_df

Unnamed: 0,id,date,bus_route_id,in_out,station_code,station_name,latitude,longitude,6~7_ride,7~8_ride,8~9_ride,9~10_ride,10~11_ride,11~12_ride,6~7_takeoff,7~8_takeoff,8~9_takeoff,9~10_takeoff,10~11_takeoff,11~12_takeoff,18~20_ride
0,0,2019-09-01,4270000,시외,344,제주썬호텔,33.48990,126.49373,0.0,1.0,2.0,5.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2019-09-01,4270000,시외,357,한라병원,33.48944,126.48508,1.0,4.0,4.0,2.0,5.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,2,2019-09-01,4270000,시외,432,정존마을,33.48181,126.47352,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,3,2019-09-01,4270000,시내,1579,제주국제공항(600번),33.50577,126.49252,0.0,17.0,6.0,26.0,14.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,53.0
4,4,2019-09-01,4270000,시내,1646,중문관광단지입구,33.25579,126.41260,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
415418,415418,2019-09-30,32820000,시내,1129,한림환승정류장(한림리),33.41437,126.26336,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
415419,415419,2019-09-30,32820000,시내,1564,제주시외버스터미널,33.49946,126.51479,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
415420,415420,2019-09-30,32820000,시내,2322,해병부대,33.23100,126.26273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
415421,415421,2019-09-30,32820000,시내,3291,애월환승정류장(애월리),33.46483,126.31870,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# [1] 데이터 전처리
Data Cleansing & Pre-Processing

## 1) 데이터 전처리를 위한 사용자 함수 선언

In [None]:
def df_copy(tr_df, te_df):
    """Return independent copies of the train and test frames as a (train, test) pair."""
    return tr_df.copy(), te_df.copy()

def base_preprocessing(tr_df, te_df):
    """Build the base id, station-name, calendar and cumulative-count features.

    Works on copies; the input frames are left untouched.

    Per frame:
      * bus_route_id: last four characters dropped, remainder cast to int.
      * station_name2: first two characters of the raw station name
        (taken before spaces are stripped), label-encoded.
      * station_name: spaces removed, label-encoded.
      * day / week / weekday: derived from the `date` column.
      * 6~8, 6~9, 6~10 `_ride` / `_takeoff`: row-wise sums of the hourly counts.

    Both label encoders are fitted on train and test together so the integer
    codes agree across the two frames.

    Args:
        tr_df: training frame.
        te_df: test frame with the same feature columns.

    Returns:
        (train, test) tuple of new DataFrames.
    """
    tr, te = df_copy(tr_df, te_df)

    for df in (tr, te):
        df['bus_route_id'] = df['bus_route_id'].astype(str).str[:-4].astype(int)
        # The two-character prefix is taken from the raw name, before spaces are removed.
        df['station_name2'] = df['station_name'].astype(str).str[:2]
        df['station_name'] = df['station_name'].apply(lambda x: x.replace(' ', ''))

    name_encoder = LabelEncoder().fit(pd.concat([tr['station_name'], te['station_name']]))
    prefix_encoder = LabelEncoder().fit(pd.concat([tr['station_name2'], te['station_name2']]))

    for df in (tr, te):
        dates = pd.to_datetime(df['date'])  # parse once instead of once per derived column
        df['day'] = dates.dt.day
        # `Series.dt.week` was removed in pandas 2.0; prefer the ISO calendar accessor
        # and fall back to the old attribute only on pandas versions that lack it.
        if hasattr(dates.dt, 'isocalendar'):
            df['week'] = dates.dt.isocalendar().week.astype('int64')
        else:
            df['week'] = dates.dt.week
        df['weekday'] = dates.dt.weekday
        df['station_name'] = name_encoder.transform(df['station_name'])
        df['station_name2'] = prefix_encoder.transform(df['station_name2'])

        # Cumulative counts from 06:00 up to 08:00 / 09:00 / 10:00.
        for kind in ('ride', 'takeoff'):
            hourly = ['6~7_' + kind, '7~8_' + kind, '8~9_' + kind, '9~10_' + kind]
            for end_hour, n_cols in ((8, 2), (9, 3), (10, 4)):
                df['6~' + str(end_hour) + '_' + kind] = df[hourly[:n_cols]].sum(axis=1)

    # Shift the test day-of-month by 30 so it does not collide with train day numbers.
    # NOTE(review): assumes the training data covers a 30-day month (the sample
    # output shows 2019-09-01 .. 2019-09-30) -- confirm for other data.
    te['day'] = te['day'] + 30
    return tr, te

def lat_long_create(tr_df, te_df):
    """Add two label-encoded location keys built from rounded coordinates.

    `station_lat_long` rounds latitude to 2 decimals, `station_lat_long2` to 3;
    longitude is rounded to 2 decimals in both cases. The helper column
    `lat_long` is left holding the string key of the second (finer) pass.
    Each encoder is fitted on train and test together.

    Returns:
        (train, test) tuple of new DataFrames.
    """
    tr, te = df_copy(tr_df, te_df)
    frames = (tr, te)

    for lat_decimals, out_col in ((2, 'station_lat_long'), (3, 'station_lat_long2')):
        for frame in frames:
            lat_text = np.round(frame['latitude'], lat_decimals).astype(str)
            lon_text = np.round(frame['longitude'], 2).astype(str)
            frame['lat_long'] = lat_text + lon_text
        encoder = LabelEncoder().fit(pd.concat([tr['lat_long'], te['lat_long']]))
        for frame in frames:
            frame[out_col] = encoder.transform(frame['lat_long'])
    return tr, te

def feature_combine(tr_df, te_df):
    """Add categorical cross features of the route id with station code / location key.

    The parts are joined with '_' (the same separator `sub_day_agg` uses for its
    composite key). Plain concatenation is ambiguous because both ids vary in
    length: route 427 + station 1579 and route 4271 + station 579 would both
    become '4271579'.

    Args:
        tr_df: training frame with `bus_route_id`, `station_code`, `station_lat_long`.
        te_df: test frame with the same columns.

    Returns:
        (train, test) tuple of new DataFrames with the columns
        `bus_route_id_station_code` and `bus_route_id_station_lat_long`
        (pandas 'category' dtype) added; the inputs are not modified.
    """
    tr, te = tr_df.copy(), te_df.copy()
    for df in (tr, te):
        route = df['bus_route_id'].astype(str)
        for part in ('station_code', 'station_lat_long'):
            df['bus_route_id_' + part] = (route + '_' + df[part].astype(str)).astype('category')
    return tr, te

def category_transform(tr_df, te_df, columns):
    """Cast `columns` to string-valued pandas categoricals shared by train and test.

    Values are stringified first. For every column one CategoricalDtype is built
    from the sorted union of the train and test values, so a given label has the
    same integer code in both frames (independently inferred dtypes would give
    each frame its own code table).

    Args:
        tr_df: training frame.
        te_df: test frame.
        columns: a column name or a list of column names to convert.

    Returns:
        (train, test) tuple of new DataFrames; the inputs are not modified.
    """
    tr, te = tr_df.copy(), te_df.copy()
    names = [columns] if isinstance(columns, str) else list(columns)
    for name in names:
        tr_text = tr[name].astype(str)
        te_text = te[name].astype(str)
        shared = pd.CategoricalDtype(sorted(set(tr_text) | set(te_text)))
        tr[name] = tr_text.astype(shared)
        te[name] = te_text.astype(shared)
    return tr, te

def _plain_values(series):
    """Return `series` as object dtype if it is categorical, otherwise unchanged."""
    if isinstance(series.dtype, pd.CategoricalDtype):
        return series.astype(object)
    return series


def frequency_encoding(tr_df, te_df, columns, normalize=False):
    """Add frequency-encoded copies of `columns`, counted over train and test together.

    Categorical inputs are converted to plain objects before mapping: mapping a
    categorical Series yields another categorical whenever the mapped values are
    unique, which would make the dtype of the encoded column depend on the data.

    Args:
        tr_df: training frame.
        te_df: test frame.
        columns: iterable of column names to encode.
        normalize: if falsy, store raw counts in `<col>_fq_enc`; otherwise store
            relative frequencies in `<col>_fq_enc_nor`.

    Returns:
        (train, test) tuple of new DataFrames; the inputs are not modified.
    """
    tr, te = tr_df.copy(), te_df.copy()
    normalize = bool(normalize)
    suffix = '_fq_enc_nor' if normalize else '_fq_enc'
    for col in columns:
        tr_values = _plain_values(tr[col])
        te_values = _plain_values(te[col])
        freq_encode = pd.concat([tr_values, te_values]).value_counts(normalize=normalize)
        tr[col + suffix] = tr_values.map(freq_encode)
        te[col + suffix] = te_values.map(freq_encode)
    return tr, te

def remove_outlier(tr_df, te_df, columns):
    """Replace values of `columns` that do not occur in the other frame with 0.

    Returns:
        (train, test) tuple of new DataFrames; the inputs are not modified.
    """
    tr, te = df_copy(tr_df, te_df)
    for name in columns:
        seen_in_test = tr[name].isin(te[name])
        tr[name] = np.where(seen_in_test, tr[name], 0)
        # The test column is compared against the train column as rewritten just above.
        seen_in_train = te[name].isin(tr[name])
        te[name] = np.where(seen_in_train, te[name], 0)
    return tr, te

def day_agg(tr_df, te_df, merge_columns, columns, aggs=('mean',), quantile=0.8):
    """Add per-group aggregates of `columns`, computed over train and test together.

    For every (merge_column, col, agg) combination a column named
    `<merge_column>_<agg>_<col>` is added to both frames.

    Args:
        tr_df: training frame.
        te_df: test frame.
        merge_columns: iterable of column names to group by.
        columns: iterable of column names to aggregate.
        aggs: iterable of aggregation names understood by pandas `groupby().agg`,
            or 'quantile' for the quantile selected by `quantile`.
        quantile: quantile level used when an entry of `aggs` is 'quantile'.

    Returns:
        (train, test) tuple of new DataFrames; the inputs are not modified.
    """
    tr, te = tr_df.copy(), te_df.copy()
    for merge_column in merge_columns:
        for col in columns:
            # The grouped view does not depend on the aggregation, so build it once.
            combined = pd.concat([tr[[merge_column, col]], te[[merge_column, col]]])
            grouped = combined.groupby(merge_column)[col]
            for agg in aggs:
                new_cn = merge_column + '_' + agg + '_' + col
                if agg == 'quantile':
                    stats = grouped.quantile(quantile)
                else:
                    stats = grouped.agg(agg)
                lookup = stats.to_dict()

                tr[new_cn] = tr[merge_column].map(lookup)
                te[new_cn] = te[merge_column].map(lookup)
    return tr, te

def sub_day_agg(tr_df, te_df, merge_columns, date_columns, columns, aggs=('mean',), quantile=0.8):
    """Add aggregates of `columns` per (merge_column, date_column) pair.

    The group key is the string `<merge value>_<date value>`; statistics are
    computed over train and test together. For every combination a column named
    `<merge_column>_<date>_<col>_<agg>` is added to both frames.

    Args:
        tr_df: training frame.
        te_df: test frame.
        merge_columns: iterable of entity columns (first half of the key).
        date_columns: iterable of date-like columns (second half of the key).
        columns: iterable of column names to aggregate.
        aggs: iterable of aggregation names understood by pandas `groupby().agg`,
            or 'quantile' for the quantile selected by `quantile`.
        quantile: quantile level used when an entry of `aggs` is 'quantile'.

    Returns:
        (train, test) tuple of new DataFrames without the temporary `mc_date`
        key column; the inputs are not modified.
    """
    tr, te = tr_df.copy(), te_df.copy()
    for merge_column in merge_columns:
        for date in date_columns:
            tr['mc_date'] = tr[merge_column].astype(str) + '_' + tr[date].astype(str)
            te['mc_date'] = te[merge_column].astype(str) + '_' + te[date].astype(str)
            for col in columns:
                # The grouped view does not depend on the aggregation, so build it once.
                combined = pd.concat([tr[['mc_date', col]], te[['mc_date', col]]])
                grouped = combined.groupby('mc_date')[col]
                for agg in aggs:
                    new_cn = merge_column + '_' + date + '_' + col + '_' + agg
                    if agg == 'quantile':
                        stats = grouped.quantile(quantile)
                    else:
                        stats = grouped.agg(agg)
                    lookup = stats.to_dict()

                    tr[new_cn] = tr['mc_date'].map(lookup)
                    te[new_cn] = te['mc_date'].map(lookup)
    # The key column only exists if at least one (merge, date) pair was processed.
    tr = tr.drop(columns=['mc_date'], errors='ignore')
    te = te.drop(columns=['mc_date'], errors='ignore')
    return tr, te

## Data Pre-Processing
1. base_preprocessing
  * bus_route_id : 뒷자리 0000 제거 후 정수형 변환
  * station_name2 : station_name의 앞 두 글자만 사용한 열 추가 (ex. 제주, 한라) 
  * station_name : 공백 제거
  * station_name2 & station_name -> LabelEncoder
  * date -> day, week, weekday 열 추가하여 세분화
  * 6-8, 6-9, 6-10 : 각 시간대를 더하여 탑승, 하차 인원 열 추가
2. lat_long_create
  * station_lat_long : 위도와 경도를 각각 소수점 둘째 자리까지 반올림하여 문자열로 결합
  * station_lat_long2 : 위도는 소수점 셋째 자리, 경도는 소수점 둘째 자리까지 반올림하여 문자열로 결합
  * 각각 LabelEncoder
3. feature_combine
  * bus_route_id_station_code : bus_route_id와 station_code 결합
  * bus_route_id_station_lat_long : bus_route_id와 station_lat_long 결합

In [None]:
########################### Final features list
# Columns that are excluded when the model input list is built (the target is among them).
remove_features = ['id', 'date', 'in_out', TARGET]
# Hourly ride / takeoff count columns that serve as inputs to the aggregations below.
ride_take = ['6~7_ride', '7~8_ride', '8~9_ride', '9~10_ride', '10~11_ride', '11~12_ride', '6~7_takeoff', '7~8_takeoff', '8~9_takeoff', '9~10_takeoff', '10~11_takeoff', '11~12_takeoff']

# The calendar columns and the raw lat_long string key are excluded from the model inputs as well.
remove_features += ['day', 'week', 'weekday', 'lat_long']
tr, te = base_preprocessing(train_df, test_df)
tr, te = lat_long_create(tr, te)
tr, te = feature_combine(tr, te)

# Also aggregate the cumulative count columns created by base_preprocessing.
ride_take += ['6~8_ride', '6~9_ride', '6~10_ride', '6~8_takeoff', '6~9_takeoff', '6~10_takeoff']



In [None]:
# Per-day mean of every count column.
tr, te = day_agg(tr, te, merge_columns=['day'], columns=ride_take, aggs=['mean'])
# Per-(entity, day) mean of every count column.
tr, te = sub_day_agg(tr, te, merge_columns=['bus_route_id', 'station_code', 'station_lat_long'], date_columns=['day'], columns=ride_take, aggs=['mean'])
# Per-(entity, day) 0.8 quantile; note that this call additionally groups by station_name.
tr, te = sub_day_agg(tr, te, merge_columns=['bus_route_id', 'station_code', 'station_name', 'station_lat_long'], date_columns=['day'], columns=ride_take, aggs=['quantile'])

In [None]:
# Columns treated as categorical; each one gets a `<col>_fq_enc` count column here
# and is cast to 'category' dtype in the training cell.
category_features = ['bus_route_id', 'station_code', 'station_name', 'station_name2', 'station_lat_long', 'station_lat_long2', 'bus_route_id_station_code', 'bus_route_id_station_lat_long']
tr, te = frequency_encoding(tr, te, category_features)

In [None]:
# Preview the first rows of the engineered training frame.
tr.head()

Unnamed: 0,id,date,bus_route_id,in_out,station_code,station_name,latitude,longitude,6~7_ride,7~8_ride,8~9_ride,9~10_ride,10~11_ride,11~12_ride,6~7_takeoff,7~8_takeoff,8~9_takeoff,9~10_takeoff,10~11_takeoff,11~12_takeoff,18~20_ride,station_name2,day,week,weekday,6~8_ride,6~9_ride,6~10_ride,6~8_takeoff,6~9_takeoff,6~10_takeoff,lat_long,station_lat_long,station_lat_long2,bus_route_id_station_code,bus_route_id_station_lat_long,day_mean_6~7_ride,day_mean_7~8_ride,day_mean_8~9_ride,day_mean_9~10_ride,...,station_name_day_10~11_ride_quantile,station_name_day_11~12_ride_quantile,station_name_day_6~7_takeoff_quantile,station_name_day_7~8_takeoff_quantile,station_name_day_8~9_takeoff_quantile,station_name_day_9~10_takeoff_quantile,station_name_day_10~11_takeoff_quantile,station_name_day_11~12_takeoff_quantile,station_name_day_6~8_ride_quantile,station_name_day_6~9_ride_quantile,station_name_day_6~10_ride_quantile,station_name_day_6~8_takeoff_quantile,station_name_day_6~9_takeoff_quantile,station_name_day_6~10_takeoff_quantile,station_lat_long_day_6~7_ride_quantile,station_lat_long_day_7~8_ride_quantile,station_lat_long_day_8~9_ride_quantile,station_lat_long_day_9~10_ride_quantile,station_lat_long_day_10~11_ride_quantile,station_lat_long_day_11~12_ride_quantile,station_lat_long_day_6~7_takeoff_quantile,station_lat_long_day_7~8_takeoff_quantile,station_lat_long_day_8~9_takeoff_quantile,station_lat_long_day_9~10_takeoff_quantile,station_lat_long_day_10~11_takeoff_quantile,station_lat_long_day_11~12_takeoff_quantile,station_lat_long_day_6~8_ride_quantile,station_lat_long_day_6~9_ride_quantile,station_lat_long_day_6~10_ride_quantile,station_lat_long_day_6~8_takeoff_quantile,station_lat_long_day_6~9_takeoff_quantile,station_lat_long_day_6~10_takeoff_quantile,bus_route_id_fq_enc,station_code_fq_enc,station_name_fq_enc,station_name2_fq_enc,station_lat_long_fq_enc,station_lat_long2_fq_enc,bus_route_id_station_code_fq_enc,bus_route_id_station_lat_long_fq_enc
0,0,2019-09-01,427,시외,344,1489,33.4899,126.49373,0.0,1.0,2.0,5.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,720,1,35,6,1.0,3.0,8.0,0.0,0.0,0.0,33.49126.49,601,1486,427344,427601,0.254984,0.391576,0.49246,0.543855,...,1.8,4.8,0.0,0.0,0.0,0.0,0.0,2.4,1.0,2.6,6.6,0.0,0.0,0.0,1.0,1.0,2.0,1.0,2.0,2.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,4.0,1.0,2.0,3.0,1189,46,86,73327,16812,7037,46,138
1,1,2019-09-01,427,시외,357,1831,33.48944,126.48508,1.0,4.0,4.0,2.0,5.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,840,1,35,6,5.0,9.0,11.0,0.0,0.0,0.0,33.489126.49,601,1463,427357,427601,0.254984,0.391576,0.49246,0.543855,...,2.0,2.0,0.0,1.0,2.0,2.0,1.0,1.0,2.0,5.0,7.0,3.0,4.0,5.0,1.0,1.0,2.0,1.0,2.0,2.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,4.0,1.0,2.0,3.0,1189,2303,4756,8412,16812,2303,46,138
2,2,2019-09-01,427,시외,432,1413,33.48181,126.47352,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,709,1,35,6,2.0,2.0,4.0,0.0,0.0,0.0,33.482126.47,571,1390,427432,427571,0.254984,0.391576,0.49246,0.543855,...,1.0,0.4,0.0,0.0,0.4,1.0,0.0,1.0,2.0,3.4,5.0,0.4,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.4,0.4,1.0,1.0,0.0,1.0,2.0,3.0,1.0,1.0,1.0,1189,1154,2426,2426,5376,1774,46,46
3,3,2019-09-01,427,시내,1579,1438,33.50577,126.49252,0.0,17.0,6.0,26.0,14.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,53.0,720,1,35,6,17.0,23.0,49.0,0.0,0.0,0.0,33.506126.49,627,1630,4271579,427627,0.254984,0.391576,0.49246,0.543855,...,14.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,23.0,49.0,0.0,0.0,0.0,0.0,1.4,3.0,3.0,3.4,3.0,3.4,7.4,6.8,6.4,6.4,5.0,2.0,5.4,8.0,12.0,17.4,21.0,1189,49,49,73327,3501,2378,46,50
4,4,2019-09-01,427,시내,1646,1583,33.25579,126.4126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,737,1,35,6,0.0,0.0,0.0,0.0,0.0,1.0,33.256126.41,86,223,4271646,42786,0.254984,0.391576,0.49246,0.543855,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.4,1.0,1.0,1.0,1189,386,826,8830,1290,896,39,39


# [2] 변수 선택 및 모델 구축 및 학습
Feature Engineering & Initial Modeling

In [None]:
# `retval` (the directory to switch back to) is not assigned by any earlier cell
# of this notebook, so a fresh-kernel run would stop here with a NameError.
# Fall back to the current working directory, which makes the chdir a no-op.
retval = globals().get('retval', os.getcwd())
os.chdir(retval)

In [None]:
########################### Model
def make_predictions(model, tr_df, tt_df, features_columns, target, params, category_feature=[''], NFOLDS=4, oof_save=False, clip=999, SEED=SEED, save_dir=None):
    """Train a K-fold LightGBM ensemble and return averaged test predictions.

    Each fold trains on the other folds (labels clipped to [0, clip]) and is
    validated on the held-out fold with unclipped labels. Out-of-fold and test
    predictions are floored at 0; the OOF RMSE is printed.

    Args:
        model: model family; only 'lgb' is implemented.
        tr_df: training frame containing `features_columns` and `target`.
        tt_df: test frame containing `features_columns` and `id`.
            Side effect: a `target` column holding the predictions is written into it.
        features_columns: list of model input column names.
        target: name of the label column.
        params: parameter dict passed to `lgb.train`.
        category_feature: accepted for interface compatibility; not used.
        NFOLDS: number of KFold splits.
        oof_save: if true, save `oof_<model>.npy` / `pred_<model>.npy` to `save_dir`.
        clip: upper bound applied to the training labels.
        SEED: random_state of the shuffled KFold.
        save_dir: output directory for `oof_save`; defaults to `content` under the
            current working directory. Created if missing.

    Returns:
        (predictions, importances): `tt_df[['id', target]]` and a frame with the
        per-feature importance averaged over folds, sorted ascending by `Value`.

    Raises:
        ValueError: if `model` is not 'lgb'.
    """
    if model != 'lgb':
        # Without this guard the fold loop would fail later on an undefined estimator.
        raise ValueError("unsupported model %r: only 'lgb' is implemented" % (model,))

    X, y = tr_df[features_columns], tr_df[target]
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

    oof = np.zeros(len(tr_df))
    pred = np.zeros(len(tt_df))
    fold_importances = []

    # `verbose_eval` was removed from lgb.train in LightGBM 4; use the logging
    # callback where it exists and keep the keyword only for older releases.
    if hasattr(lgb, 'log_evaluation'):
        log_kwargs = {'callbacks': [lgb.log_evaluation(period=500)]}
    else:
        log_kwargs = {'verbose_eval': 500}

    for fold_, (trn_idx, val_idx) in enumerate(kf.split(X)):
        print('Fold:', fold_)
        # KFold yields positional indices, so select rows with .iloc; .loc would
        # pick the wrong rows for any frame whose index is not 0..n-1.
        X_trn, X_val = X.iloc[trn_idx], X.iloc[val_idx]
        y_trn, y_val = y.iloc[trn_idx], y.iloc[val_idx]

        tr_data = lgb.Dataset(X_trn, label=y_trn.clip(0, clip))
        vl_data = lgb.Dataset(X_val, label=y_val)
        estimator = lgb.train(params, tr_data, valid_sets=[tr_data, vl_data], **log_kwargs)
        fold_importances.append(
            pd.DataFrame(sorted(zip(estimator.feature_importance(), features_columns)), columns=['Value', 'Feature'])
        )

        oof[val_idx] = estimator.predict(X_val)
        pred += estimator.predict(tt_df[features_columns]) / NFOLDS
        del estimator
        gc.collect()

    # Floor negative predictions at 0.
    oof = np.where(oof > 0, oof, 0)
    pred = np.where(pred > 0, pred, 0)

    if oof_save:
        out_dir = 'content' if save_dir is None else save_dir
        os.makedirs(out_dir, exist_ok=True)
        np.save(os.path.join(out_dir, 'oof_' + model + '.npy'), oof)
        np.save(os.path.join(out_dir, 'pred_' + model + '.npy'), pred)

    tt_df[target] = pred
    print('OOF RMSE:', rmse(y, oof))

    fi_df = pd.concat(fold_importances) if fold_importances else pd.DataFrame(columns=['Value', 'Feature'])
    if not fi_df.empty:
        fi_df = fi_df.groupby('Feature').mean().reset_index().sort_values('Value')

    return tt_df[['id', target]], fi_df
## -------------------

In [None]:
# Parameter dict handed unchanged to make_predictions, which passes it on to lgb.train.
lgb_params = {
        'objective':'regression',
        'boosting_type':'gbdt',
        'metric':'rmse',
        'n_jobs':-1,
        'learning_rate':0.003,
        'num_leaves': 700,
        'max_depth':-1,
        'min_child_weight':5,
        'colsample_bytree': 0.3,
        'subsample':0.7,  # NOTE(review): presumably needs a bagging frequency to take effect in LightGBM -- confirm
        'n_estimators':50000,  # large round budget at a small learning rate; the saved output below ends in KeyboardInterrupt
        'gamma':0,  # NOTE(review): looks like an XGBoost-style name -- confirm LightGBM recognises it
        'reg_lambda':0.05,
        'reg_alpha':0.05,
        'verbose':-1,
        'seed': SEED,
        'early_stopping_rounds':50
    }
    
# Zero out category values that are not present in the other frame, then cast to 'category' dtype.
tr, te = remove_outlier(tr, te, category_features)
tr, te = category_transform(tr, te, category_features)

# Every column that is not explicitly excluded becomes a model input.
features_columns = [col for col in tr.columns if col not in remove_features]

# 5-fold training; oof_save=True additionally writes the OOF / test predictions as .npy files.
test_predictions, fi = make_predictions('lgb', tr, te, features_columns, TARGET, lgb_params, category_feature=category_features, NFOLDS=5, oof_save=True)

Fold: 0




Training until validation scores don't improve for 50 rounds.


KeyboardInterrupt: ignored

# [3] 결과
Conclusion & Discussion

In [None]:
# Write the id / prediction table returned by make_predictions to CSV, without the index column.
test_predictions.to_csv('lgb_model.csv', index=False)