# Data Preprocessing, Feature Engineering 
Data Preprocessing
- Scale Numeric values 

Feuture Engineering: 
- analyze seasonality 
- is_vacation feature, is_weekend feature

In [1]:
import pandas as pd 
import numpy as np
import datetime as dt
from dateutil.parser import parse

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor 
import xgboost 
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import mean_absolute_error

from tqdm import tqdm


# 1. Import data

In [9]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

data_info = pd.read_csv('./data/data_info.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

# 2. Data Preprocessing 

## 2.1 Label Encoding 

In [10]:
class LabelEncoderExt(object):
    def __init__(self):
        self.label_encoder = LabelEncoder()

    def fit(self, data_list):
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        new_data_list = list(data_list)
        for unique_item in np.unique(data_list):
            if unique_item not in self.label_encoder.classes_:
                new_data_list = ['Unknown' if x==unique_item else x for x in new_data_list]

        return self.label_encoder.transform(new_data_list)

In [11]:
train.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,road_type,start_node_name,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target
0,TRAIN_0000000,20220623,목,17,1,106,지방도1112호선,0,0,60.0,...,3,제3교래교,33.427747,126.662612,없음,제3교래교,33.427749,126.662335,없음,52.0
1,TRAIN_0000001,20220728,목,21,2,103,일반국도11호선,0,0,60.0,...,0,광양사거리,33.50073,126.529107,있음,KAL사거리,33.504811,126.52624,없음,30.0
2,TRAIN_0000002,20211010,일,7,2,103,일반국도16호선,0,0,80.0,...,0,창고천교,33.279145,126.368598,없음,상창육교,33.280072,126.362147,없음,61.0
3,TRAIN_0000003,20220311,금,13,2,107,태평로,0,0,50.0,...,0,남양리조트,33.246081,126.567204,없음,서현주택,33.245565,126.566228,없음,20.0
4,TRAIN_0000004,20211005,화,8,2,103,일반국도12호선,0,0,80.0,...,0,애월샷시,33.462214,126.326551,없음,애월입구,33.462677,126.330152,없음,38.0


In [12]:
cat = ['day_of_week','road_name','start_node_name','start_turn_restricted','end_node_name','end_turn_restricted']

In [13]:
le = LabelEncoderExt()

for label in cat: 
    le.fit(train[label]) 
    train[label] = le.transform(train[label])
    test[label] = le.transform(test[label])

## 2.2 Fill Na 

In [14]:
train.isnull().sum()

id                       0
base_date                0
day_of_week              0
base_hour                0
lane_count               0
road_rating              0
road_name                0
multi_linked             0
connect_code             0
maximum_speed_limit      0
vehicle_restricted       0
weight_restricted        0
height_restricted        0
road_type                0
start_node_name          0
start_latitude           0
start_longitude          0
start_turn_restricted    0
end_node_name            0
end_latitude             0
end_longitude            0
end_turn_restricted      0
target                   0
dtype: int64

In [15]:
test.isnull().sum()

id                       0
base_date                0
day_of_week              0
base_hour                0
lane_count               0
road_rating              0
road_name                0
multi_linked             0
connect_code             0
maximum_speed_limit      0
vehicle_restricted       0
weight_restricted        0
height_restricted        0
road_type                0
start_node_name          0
start_latitude           0
start_longitude          0
start_turn_restricted    0
end_node_name            0
end_latitude             0
end_longitude            0
end_turn_restricted      0
dtype: int64

## 2.3 Feature Scaling 

In [19]:
'''num = ['lane_count']
scaler = MinMaxScaler()
scaler.fit(train[num])
train[num] = scaler.transform(train[num])
test[num] = scaler.transform(test[num])'''

## 2.4 Feauture Engineering 

### 2.4.1 create month feautre

In [16]:
train_month = [] 
for i in range(len(train)): 
    train_month.append(int(str(train["base_date"][i])[4:6]))
train["month"] = train_month

test_month = []
for i in range(len(test)):
    test_month.append(int(str(test["base_date"][i])[4:6]))
test["month"] = test_month

In [17]:
train.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,start_node_name,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target,month
0,TRAIN_0000000,20220623,2,17,1,106,47,0,0,60.0,...,387,33.427747,126.662612,1,387,33.427749,126.662335,1,52.0,6
1,TRAIN_0000001,20220728,2,21,2,103,35,0,0,60.0,...,33,33.50073,126.529107,2,2,33.504811,126.52624,1,30.0,7
2,TRAIN_0000002,20211010,5,7,2,103,37,0,0,80.0,...,427,33.279145,126.368598,1,180,33.280072,126.362147,1,61.0,10
3,TRAIN_0000003,20220311,1,13,2,107,58,0,0,50.0,...,59,33.246081,126.567204,1,204,33.245565,126.566228,1,20.0,3
4,TRAIN_0000004,20211005,7,8,2,103,36,0,0,80.0,...,268,33.462214,126.326551,1,270,33.462677,126.330152,1,38.0,10


### 2.4.2 create is_weekend feature 

In [18]:
#day_of_week 5,6 weekend
train_is_weekend = [] 
for i in range(len(train)): 
    if train["day_of_week"][i] == 5 or train["day_of_week"][i] == 6: 
        train_is_weekend.append(1)
    else: 
        train_is_weekend.append(0)


train["is_weekend"] = train_is_weekend 

test_is_weekend = []
for i in range(len(test)):
    if test["day_of_week"][i] == 5 or test["day_of_week"][i] == 6:
        test_is_weekend.append(1)
    else: 
        test_is_weekend.append(0)

test["is_weekend"] = test_is_weekend

### 2.4.3 create is_vacation feature 

In [19]:
train_is_vacation = [] 
for i in range(len(train)):
    if train["month"][i] == 7 or train["month"][i] == 8: 
        train_is_vacation.append(1)
    else: 
        train_is_vacation.append(0)

train["is_vacation"] = train_is_vacation

test_is_vacation = [] 
for i in range(len(test)): 
    if test["month"][i] == 7 or test["month"][i] == 8:
        test_is_vacation.append(1)
    else: 
        test_is_vacation.append(0)

test["is_vacation"] = test_is_vacation

In [20]:
train.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,target,month,is_weekend,is_vacation
0,TRAIN_0000000,20220623,2,17,1,106,47,0,0,60.0,...,126.662612,1,387,33.427749,126.662335,1,52.0,6,0,0
1,TRAIN_0000001,20220728,2,21,2,103,35,0,0,60.0,...,126.529107,2,2,33.504811,126.52624,1,30.0,7,0,1
2,TRAIN_0000002,20211010,5,7,2,103,37,0,0,80.0,...,126.368598,1,180,33.280072,126.362147,1,61.0,10,1,0
3,TRAIN_0000003,20220311,1,13,2,107,58,0,0,50.0,...,126.567204,1,204,33.245565,126.566228,1,20.0,3,0,0
4,TRAIN_0000004,20211005,7,8,2,103,36,0,0,80.0,...,126.326551,1,270,33.462677,126.330152,1,38.0,10,0,0


In [21]:
test.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,start_latitude,start_longitude,start_turn_restricted,end_node_name,end_latitude,end_longitude,end_turn_restricted,month,is_weekend,is_vacation
0,TEST_000000,20220825,2,17,3,107,33,0,0,70.0,...,33.499427,126.541298,1,404,33.500772,126.543837,2,8,0,1
1,TEST_000001,20220809,7,12,2,103,36,0,0,70.0,...,33.258507,126.427003,1,23,33.258119,126.41584,1,8,0,1
2,TEST_000002,20220805,1,2,1,103,37,0,0,60.0,...,33.25896,126.476508,1,96,33.259206,126.474687,1,8,0,1
3,TEST_000003,20220818,2,23,3,103,35,0,0,70.0,...,33.473494,126.545647,1,351,33.471061,126.545467,1,8,0,1
4,TEST_000004,20220810,3,17,3,106,13,0,0,70.0,...,33.501477,126.569223,1,146,33.496863,126.58123,1,8,0,1


# 3. Modeling 

In [30]:
all_train_x = train.drop(['id','target'],axis=1)
all_train_y = train.target

train_x, val_x, train_y, val_y = train_test_split(all_train_x, all_train_y, test_size=0.1, shuffle=True)

test_x = test.drop(['id'],axis=1)

In [31]:
def get_scores(models,train_x,train_y, test_x, test_y):
    scores = {}
    for model in tqdm(models): 
        model.fit(train_x, train_y)
        pred = model.predict(test_x)
        score = mean_absolute_error(test_y,pred)
        scores[str(model)] = score 

    return scores

In [32]:
lr = LinearRegression(n_jobs=-1)
rf = RandomForestRegressor(n_jobs=-1) 
xgb = xgboost.XGBRegressor(n_jobs=-1)
mlp = MLPRegressor()

models = [lr,rf,xgb,mlp]

In [28]:
val_socres = get_scores(models, train_x, train_y, val_x, val_y)

100%|██████████| 4/4 [20:15<00:00, 303.84s/it]


In [29]:
val_socres

{'LinearRegression(n_jobs=-1)': 10.196806579738665,
 'RandomForestRegressor(n_jobs=-1)': 3.599626746293045,
 "XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,\n             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,\n             early_stopping_rounds=None, enable_categorical=False,\n             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',\n             importance_type=None, interaction_constraints='',\n             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,\n             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,\n             missing=nan, monotone_constraints='()', n_estimators=100,\n             n_jobs=-1, num_parallel_tree=1, predictor='auto', random_state=0,\n             reg_alpha=0, reg_lambda=1, ...)": 4.154639200947169,
 'MLPRegressor()': 13.227654439116133}

retrain on hole dataset 

In [None]:
test_preds = [] 
for model in tqdm(models):
    model.fit(all_train_x,all_train_y)
    pred = model.predict(test_x)
    test_preds.append(pred)   

100%|██████████| 4/4 [20:36<00:00, 309.17s/it]


# 4. Submission 

In [33]:
phase2_lr = sample_submission.copy()
phase2_rf = sample_submission.copy()
phase2_xgb = sample_submission.copy() 
phase2_mlp = sample_submission.copy()

phase2_lr['target'] = test_preds[0]
phase2_rf['target'] = test_preds[1]
phase2_xgb['target'] = test_preds[2]
phase2_mlp['target'] = test_preds[3]

phase2_lr.to_csv("./submissions/phase2_lr.csv", index = False)
phase2_rf.to_csv("./submissions/phase2_rf.csv", index = False)
phase2_xgb.to_csv("./submissions/phase2_xgb.csv", index = False)
phase2_mlp.to_csv("./submissions/phase2_mlp.csv", index = False)