# Baseline 
1. 데이터 전처리 

- 범주형 데이터 label encoding 
- na 값 처리 (범주형: 최빈값 대체, 연속형: 평균값 대체)

2. 예측 모델 
- Linear Regression
- Random Forest regressor 


In [2]:
import pandas as pd 
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor 
import xgboost 
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_absolute_error

from tqdm import tqdm


# 1. Import Data 

In [3]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

data_info = pd.read_csv('./data/data_info.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

# 2. Data Preprocessing 

## 2.1 Label Encoding

In [4]:
cat = ['day_of_week','road_name','start_node_name','start_turn_restricted','end_node_name','end_turn_restricted']

In [5]:
class LabelEncoderExt(object):
    def __init__(self):
        self.label_encoder = LabelEncoder()

    def fit(self, data_list):
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        new_data_list = list(data_list)
        for unique_item in np.unique(data_list):
            if unique_item not in self.label_encoder.classes_:
                new_data_list = ['Unknown' if x==unique_item else x for x in new_data_list]

        return self.label_encoder.transform(new_data_list)

In [6]:
le = LabelEncoderExt()

for label in cat: 
    le.fit(train[label]) 
    train[label] = le.transform(train[label])
    test[label] = le.transform(test[label])

## 2.2 Fill Na 

check na 

In [11]:
train.isnull().sum()

id                       0
base_date                0
day_of_week              0
base_hour                0
lane_count               0
road_rating              0
road_name                0
multi_linked             0
connect_code             0
maximum_speed_limit      0
vehicle_restricted       0
weight_restricted        0
height_restricted        0
road_type                0
start_node_name          0
start_latitude           0
start_longitude          0
start_turn_restricted    0
end_node_name            0
end_latitude             0
end_longitude            0
end_turn_restricted      0
target                   0
dtype: int64

In [12]:
test.isnull().sum()

id                       0
base_date                0
day_of_week              0
base_hour                0
lane_count               0
road_rating              0
road_name                0
multi_linked             0
connect_code             0
maximum_speed_limit      0
vehicle_restricted       0
weight_restricted        0
height_restricted        0
road_type                0
start_node_name          0
start_latitude           0
start_longitude          0
start_turn_restricted    0
end_node_name            0
end_latitude             0
end_longitude            0
end_turn_restricted      0
dtype: int64

no na data to fill

# 3. Modeling 

split data

In [7]:
all_train_x = train.drop(['id','target'],axis=1)
all_train_y = train.target

train_x, val_x, train_y, val_y = train_test_split(all_train_x, all_train_y, test_size=0.1, shuffle=True)

test_x = test.drop(['id'],axis=1)

In [8]:
gb = GradientBoostingRegressor()
gb.fit(all_train_x, all_train_y)

GradientBoostingRegressor()

In [None]:
gb_pred = gb.predict(test_x)

gb_submission = sample_submission.copy()
gb_submission["target"] = gb_pred 
gb_submission.to_csv("./submissions/baseline_gb.csv", index = False)


train / get validataion score

In [14]:
def get_baseline_scores(models,train_x,train_y, test_x, test_y):
    scores = {}
    for model in tqdm(models): 
        model.fit(train_x, train_y)
        pred = model.predict(test_x)
        score = mean_absolute_error(test_y,pred)
        scores[str(model)] = score 

    return scores

In [15]:
lr = LinearRegression(n_jobs=-1)
rf = RandomForestRegressor(n_jobs=-1) 
xgb = xgboost.XGBRegressor(n_jobs=-1)

models = [lr,rf,xgb]

In [16]:
val_baseline_socres = get_baseline_scores(models, train_x, train_y, val_x, val_y)

100%|██████████| 3/3 [06:55<00:00, 138.60s/it]


In [17]:
val_baseline_socres

{'LinearRegression(n_jobs=-1)': 10.209791137710447,
 'RandomForestRegressor(n_jobs=-1)': 2.8345501380492735,
 "XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,\n             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,\n             early_stopping_rounds=None, enable_categorical=False,\n             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',\n             importance_type=None, interaction_constraints='',\n             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,\n             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,\n             missing=nan, monotone_constraints='()', n_estimators=100,\n             n_jobs=-1, num_parallel_tree=1, predictor='auto', random_state=0,\n             reg_alpha=0, reg_lambda=1, ...)": 3.8367852320240963}

retrain on hole training data

In [18]:
test_preds = [] 
for model in tqdm(models):
    model.fit(all_train_x,all_train_y)
    pred = model.predict(test_x)
    test_preds.append(pred)   

100%|██████████| 3/3 [07:37<00:00, 152.62s/it]


# 4. Submission 

In [40]:
baseline_lr = sample_submission.copy()
baseline_rf = sample_submission.copy()
baseline_xgb = sample_submission.copy() 

baseline_lr['target'] = test_preds[0]
baseline_rf['target'] = test_preds[1]
baseline_xgb['target'] = test_preds[2]

baseline_lr.to_csv("./submissions/baseline_lr.csv", index = False)
baseline_rf.to_csv("./submissions/baseline_rf.csv", index = False)
baseline_xgb.to_csv("./submissions/baseline_xgb.csv", index = False)
