In [43]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_log_error as msle
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline

import lightgbm as lgb

In [6]:
# Directory holding the competition CSVs; edit this single value to relocate the data.
DATA_DIR = Path(r"C:\Users\Ong Yi Kai\Desktop\Data\Kaggle competitions\Tabular Data Jul 2021")

# `date_parser=True` was removed: the option expects a callable, is only consulted
# together with `parse_dates`, and is deprecated in pandas 2.x, so it never had
# any effect on how these files were read.
train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")

# train layout: first column -> date_tr, last three columns -> target,
# everything in between -> feature matrix.
date_tr = train.iloc[:, 0]
data_tr = train.iloc[:, 1:-3]
target = train.iloc[:, -3:]

# test layout: first column -> date_test, all remaining columns -> features.
date_test = test.iloc[:, 0]
data_test = test.iloc[:, 1:]

feature_names = data_tr.columns

In [7]:
# Hold out 20% of the training rows as a validation set; the fixed seed keeps
# the partition identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    data_tr,
    target,
    test_size=0.2,
    random_state=2021,
)
print(
    f"Shape of Training data:{X_train.shape}, target:{y_train.shape} \n"
    f"Shape of Validation Data:{X_val.shape}, target:{y_val.shape}"
)

Shape of Training data:(5688, 8), target:(5688, 3) 
Shape of Validation Data:(1423, 8), target:(1423, 3)


# BASIC MODELS

In [8]:
# Baseline regressors, each wrapped in MultiOutputRegressor so that one
# independent model is fitted per target column.
# The two tree-based models are seeded (same seed as the train/validation split)
# so the grid-search ranking is repeatable from run to run.
lr = MultiOutputRegressor(LinearRegression())
dtr = MultiOutputRegressor(DecisionTreeRegressor(random_state=2021))
rfr = MultiOutputRegressor(RandomForestRegressor(random_state=2021))
svr = MultiOutputRegressor(SVR())
knr = MultiOutputRegressor(KNeighborsRegressor())

In [9]:
# Two-step pipeline: standardise the features, then fit an estimator.
# `lr` only fills the 'Estimators' slot here; the grid search below swaps
# other regressors into that step.
pipe = Pipeline(steps=[("Scalers", StandardScaler()), ("Estimators", lr)])

In [10]:
# The only "hyper-parameter" searched is the final pipeline step itself:
# every candidate is one of the multi-output regressors built earlier.
param_grid = dict(Estimators=[lr, dtr, rfr, svr, knr])

# No `scoring` is passed, so candidates are ranked with the estimator's own
# `score` method.
# NOTE(review): the notebook reports RMSLE elsewhere — confirm that ranking by
# the default scorer is intended.
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    return_train_score=True,
    verbose=1,
)

In [11]:
# Cross-validate every candidate on the training split only; the validation
# split stays untouched for the final check.
search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('Scalers', StandardScaler()),
                                       ('Estimators',
                                        MultiOutputRegressor(estimator=LinearRegression()))]),
             n_jobs=-1,
             param_grid={'Estimators': [MultiOutputRegressor(estimator=LinearRegression()),
                                        MultiOutputRegressor(estimator=DecisionTreeRegressor()),
                                        MultiOutputRegressor(estimator=RandomForestRegressor()),
                                        MultiOutputRegressor(estimator=SVR()),
                                        MultiOutputRegressor(estimator=KNeighborsRegressor())]},
             return_train_score=True, verbose=1)

In [12]:
# Show the winning grid entry, i.e. which estimator was selected for the
# 'Estimators' pipeline step.
search.best_params_

{'Estimators': MultiOutputRegressor(estimator=RandomForestRegressor())}

In [13]:
# Predict the held-out validation split with the search's refitted best pipeline.
y_val_pred = search.predict(X_val)
# RMSLE averaged over the targets: sqrt of each target's MSLE, then the mean.
# Computed explicitly because the `squared=False` flag was deprecated in
# scikit-learn 1.4 and later removed from mean_squared_log_error.
error = np.sqrt(msle(y_val, y_val_pred, multioutput='raw_values')).mean()
error

0.18715397856752125

# XGBOOST

In [73]:
from xgboost import XGBRegressor

In [81]:
# XGBoost settings for the model fitted on target column 0.
# `n_estimators` acts as an upper bound: the training loop stops early.
Params0 = {
    # tree growth and boosting schedule
    'max_depth': 3,
    'learning_rate': 0.06974270910763652,
    'n_estimators': 24813,
    'min_child_weight': 15,
    # split / weight penalties
    'gamma': 0.00015352935707382668,
    'alpha': 0.007300887912196733,
    'lambda': 0.00233147304187698,
    # column and row sampling fractions
    'colsample_bytree': 0.6706673656091967,
    'subsample': 0.32392556118811044,
    # run configuration
    'booster': 'gbtree',
    'random_state': 123,
    # NOTE(review): looks like a classifier-side option — confirm a regressor needs it.
    'use_label_encoder': False,
    'eval_metric': 'rmsle',
}

In [101]:
# 5-fold CV on the training split for target column 0.
# Each fold's model also predicts the hold-out validation set and the test
# set; those predictions are averaged over the folds into `pred0` / `test0`.
folds = KFold(n_splits=5, random_state=2005, shuffle=True)
pred0 = np.zeros(len(y_val))
test0 = np.zeros(len(data_test))
for trn_idx, val_idx in folds.split(X_train, y_train):
    x_tr, y_tr = X_train.iloc[trn_idx, :], y_train.iloc[trn_idx, 0]
    x_v, y_v = X_train.iloc[val_idx, :], y_train.iloc[val_idx, 0]

    # NOTE(review): newer xgboost releases expect eval_metric and
    # early_stopping_rounds on the constructor rather than fit() — confirm
    # against the installed version.
    model = XGBRegressor(**Params0)
    model.fit(x_tr, y_tr,
              eval_set=[(x_v, y_v)],
              eval_metric='rmsle',
              verbose=100,
              early_stopping_rounds=200)

    pred_v = model.predict(x_v)
    # msle raises on negative predictions, so guard with abs() exactly as the
    # target-1 loop and the final validation cell do. The root is taken
    # explicitly because `squared=False` was deprecated in scikit-learn 1.4
    # and later removed.
    rmsle = np.sqrt(msle(y_v, np.abs(pred_v)))
    print(f'RMSLE:{rmsle}')

    pred0 += model.predict(X_val) / folds.n_splits
    test0 += model.predict(data_test) / folds.n_splits

[0]	validation_0-rmsle:0.67738
[100]	validation_0-rmsle:0.13105
[200]	validation_0-rmsle:0.12957
[300]	validation_0-rmsle:0.13091
[380]	validation_0-rmsle:0.13040
RMSLE:0.1293841549482384
[0]	validation_0-rmsle:0.66867
[100]	validation_0-rmsle:0.13720
[200]	validation_0-rmsle:0.13330
[300]	validation_0-rmsle:0.13237
[400]	validation_0-rmsle:0.13125
[500]	validation_0-rmsle:0.13265
[600]	validation_0-rmsle:0.13272
[602]	validation_0-rmsle:0.13278
RMSLE:0.1310619930389098
[0]	validation_0-rmsle:0.67617
[100]	validation_0-rmsle:0.14490
[200]	validation_0-rmsle:0.14250
[300]	validation_0-rmsle:0.14214
[400]	validation_0-rmsle:0.14187
[500]	validation_0-rmsle:0.14060
[600]	validation_0-rmsle:0.14119
[700]	validation_0-rmsle:0.14174
[800]	validation_0-rmsle:0.14014
[900]	validation_0-rmsle:0.14029
[1000]	validation_0-rmsle:0.14069
[1100]	validation_0-rmsle:0.13977
[1200]	validation_0-rmsle:0.13959
[1300]	validation_0-rmsle:0.13906
[1400]	validation_0-rmsle:0.13978
[1500]	validation_0-rmsle:0

In [88]:
# XGBoost settings for the model fitted on target column 1.
# `n_estimators` acts as an upper bound: the training loop stops early.
Params1 = {
    # tree growth and boosting schedule
    'max_depth': 3,
    'learning_rate': 0.01630350395073977,
    'n_estimators': 20058,
    'min_child_weight': 11,
    # split / weight penalties
    'gamma': 0.0009762828881569192,
    'alpha': 0.001235465069634119,
    'lambda': 0.0005268383741494084,
    # column and row sampling fractions
    'colsample_bytree': 0.5100114916691317,
    'subsample': 0.31372256786444536,
    # run configuration
    'booster': 'gbtree',
    'random_state': 123,
    # NOTE(review): looks like a classifier-side option — confirm a regressor needs it.
    'use_label_encoder': False,
    'eval_metric': 'rmsle',
}

In [102]:
# 5-fold CV on the training split for target column 1.
# Each fold's model also predicts the hold-out validation set and the test
# set; those predictions are averaged over the folds into `pred1` / `test1`.
folds = KFold(n_splits=5, random_state=2005, shuffle=True)
pred1 = np.zeros(len(y_val))
test1 = np.zeros(len(data_test))
for trn_idx, val_idx in folds.split(X_train, y_train):
    x_tr, y_tr = X_train.iloc[trn_idx, :], y_train.iloc[trn_idx, 1]
    x_v, y_v = X_train.iloc[val_idx, :], y_train.iloc[val_idx, 1]

    # NOTE(review): newer xgboost releases expect eval_metric and
    # early_stopping_rounds on the constructor rather than fit() — confirm
    # against the installed version.
    model = XGBRegressor(**Params1)
    model.fit(x_tr, y_tr,
              eval_set=[(x_v, y_v)],
              eval_metric='rmsle',
              verbose=100,
              early_stopping_rounds=200)

    pred_v = model.predict(x_v)
    # abs() keeps a negative prediction from making msle raise. The root is
    # taken explicitly because `squared=False` was deprecated in
    # scikit-learn 1.4 and later removed.
    rmsle = np.sqrt(msle(y_v, np.abs(pred_v)))
    print(f'RMSLE:{rmsle}')

    pred1 += model.predict(X_val) / folds.n_splits
    test1 += model.predict(data_test) / folds.n_splits

[0]	validation_0-rmsle:1.81975
[100]	validation_0-rmsle:0.22738
[200]	validation_0-rmsle:0.13317
[300]	validation_0-rmsle:0.11620
[400]	validation_0-rmsle:0.11070
[500]	validation_0-rmsle:0.10750
[600]	validation_0-rmsle:0.10609
[700]	validation_0-rmsle:0.10450
[800]	validation_0-rmsle:0.10300
[900]	validation_0-rmsle:0.10225
[1000]	validation_0-rmsle:0.10106
[1100]	validation_0-rmsle:0.10071
[1200]	validation_0-rmsle:0.10017
[1300]	validation_0-rmsle:0.10003
[1400]	validation_0-rmsle:0.10015
[1500]	validation_0-rmsle:0.09983
[1600]	validation_0-rmsle:0.09951
[1700]	validation_0-rmsle:0.09906
[1800]	validation_0-rmsle:0.09870
[1900]	validation_0-rmsle:0.09842
[2000]	validation_0-rmsle:0.09819
[2100]	validation_0-rmsle:0.09758
[2200]	validation_0-rmsle:0.09733
[2300]	validation_0-rmsle:0.09740
[2400]	validation_0-rmsle:0.09731
[2452]	validation_0-rmsle:0.09733
RMSLE:0.0972254464768814
[0]	validation_0-rmsle:1.78089
[100]	validation_0-rmsle:0.23063
[200]	validation_0-rmsle:0.13753
[300]	

In [95]:
# XGBoost settings for the model fitted on target column 2.
# `n_estimators` acts as an upper bound: the training loop stops early.
Params2 = {
    # tree growth and boosting schedule
    'max_depth': 6,
    'learning_rate': 0.07398714527058703,
    'n_estimators': 15509,
    'min_child_weight': 1,
    # split / weight penalties
    'gamma': 0.0010264813784765508,
    'alpha': 0.002893496668661691,
    'lambda': 0.008742987610869259,
    # column and row sampling fractions
    'colsample_bytree': 0.6256322009147708,
    'subsample': 0.5955732014997671,
    # run configuration
    'booster': 'gbtree',
    'random_state': 123,
    # NOTE(review): looks like a classifier-side option — confirm a regressor needs it.
    'use_label_encoder': False,
    'eval_metric': 'rmsle',
}

In [103]:
# 5-fold CV on the training split for target column 2.
# Each fold's model also predicts the hold-out validation set and the test
# set; those predictions are averaged over the folds into `pred2` / `test2`.
folds = KFold(n_splits=5, random_state=2005, shuffle=True)
pred2 = np.zeros(len(y_val))
test2 = np.zeros(len(data_test))
for trn_idx, val_idx in folds.split(X_train, y_train):
    x_tr, y_tr = X_train.iloc[trn_idx, :], y_train.iloc[trn_idx, 2]
    x_v, y_v = X_train.iloc[val_idx, :], y_train.iloc[val_idx, 2]

    # NOTE(review): newer xgboost releases expect eval_metric and
    # early_stopping_rounds on the constructor rather than fit() — confirm
    # against the installed version.
    model = XGBRegressor(**Params2)
    model.fit(x_tr, y_tr,
              eval_set=[(x_v, y_v)],
              eval_metric='rmsle',
              verbose=100,
              early_stopping_rounds=200)

    pred_v = model.predict(x_v)
    # msle raises on negative predictions, so guard with abs() exactly as the
    # target-1 loop and the final validation cell do. The root is taken
    # explicitly because `squared=False` was deprecated in scikit-learn 1.4
    # and later removed.
    rmsle = np.sqrt(msle(y_v, np.abs(pred_v)))
    print(f'RMSLE:{rmsle}')

    pred2 += model.predict(X_val) / folds.n_splits
    test2 += model.predict(data_test) / folds.n_splits

[0]	validation_0-rmsle:2.43634
[100]	validation_0-rmsle:0.37523
[200]	validation_0-rmsle:nan
[241]	validation_0-rmsle:nan
RMSLE:0.3549932564257398
[0]	validation_0-rmsle:2.42859
[100]	validation_0-rmsle:0.37117
[200]	validation_0-rmsle:0.36593
[300]	validation_0-rmsle:0.36653
[400]	validation_0-rmsle:0.36563
[448]	validation_0-rmsle:0.36605
RMSLE:0.3652646052461032
[0]	validation_0-rmsle:2.40629
[100]	validation_0-rmsle:0.34349
[200]	validation_0-rmsle:0.33863
[300]	validation_0-rmsle:0.33970
[400]	validation_0-rmsle:0.34265
[416]	validation_0-rmsle:0.34266
RMSLE:0.33835936222044083
[0]	validation_0-rmsle:2.44066
[100]	validation_0-rmsle:0.35181
[200]	validation_0-rmsle:0.35154
[300]	validation_0-rmsle:0.35355
[336]	validation_0-rmsle:0.35357
RMSLE:0.3502201882573853
[0]	validation_0-rmsle:2.41330
[100]	validation_0-rmsle:0.35680
[200]	validation_0-rmsle:nan
[277]	validation_0-rmsle:nan
RMSLE:0.3535674483700579


In [99]:
# Combine the fold-averaged validation predictions into one column per target
# and score them against the held-out targets.
# NOTE(review): this cell's recorded execution count is lower than that of the
# CV cells that produce pred0/pred1/pred2, so the displayed value may predate
# their last run — re-run the notebook top to bottom.
y_pred_val = np.column_stack((pred0, pred1, pred2))
# Mean over targets of the per-target RMSLE. The root is taken explicitly
# because `squared=False` was deprecated in scikit-learn 1.4 and later removed.
val_error = np.sqrt(msle(y_val, np.abs(y_pred_val), multioutput='raw_values')).mean()
val_error

0.19097871442633182

In [107]:
# Build the submission: the test file's first column followed by one
# prediction column per target, named after the training target columns.
# The predictions go through abs() just like the validation scoring does, so
# the submitted values are the ones that were actually scored and no negative
# prediction reaches the file.
test_pred = np.abs(np.column_stack((test0, test1, test2)))
sub_pred = pd.DataFrame(test_pred, columns=target.columns)
sub_pred_df = pd.concat([date_test, sub_pred], axis=1)
sub_pred_df.to_csv('submit_0.csv', index=False)