# Import data & libraries

In [12]:
import pandas as pd
import numpy as np
y_train = pd.read_csv('train.csv').iloc[:,1]
X_train = pd.read_csv('EDA_X_train.csv')
X_test = pd.read_csv('EDA_X_test.csv')

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score,  mean_absolute_error

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
import xgboost as xgb
import lightgbm as lgb
from mlxtend.regressor import StackingCVRegressor

import warnings
warnings.filterwarnings('ignore') # this is to clear the warnings from this page, typically you would leave them on


# Additive Boosting

In [15]:
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore') # this is to clear the warnings from this page, typically you would leave them on

alphas = list(np.logspace(-15, 15, 151, base=2))

def cross_validation(X, y, cv=10):
    kf = KFold(n_splits = cv, random_state = None, shuffle = True)
    rmse_scores = []
    for train_index, test_index in kf.split(X):
        X_trn, X_tst = X.iloc[train_index], X.iloc[test_index]
        y_trn, y_tst = y.iloc[train_index], y.iloc[test_index]
        
        ### straight ridge
        ridge1 = RidgeCV(alphas=alphas, cv=10, normalize=True)
        ridge1.fit(X_train, y_train)

               
        
        
        y_fit = ridge1.predict(X_trn)
        resid = y_trn - y_fit

        model = lgb.LGBMRegressor(objective='regression')


        tuning_parameters = {
            'learning_rate': [0.01, 0.05, 0.1 ],
            'n_estimators' : [250, 500, 750, 1000, 1500],
            'max_depth' : [2, 3, 4],
            'subsample' : [0.6, 0.8, 1.0],
        }

        gb_search = RandomizedSearchCV(model, tuning_parameters, n_iter = 16, cv = 5, 
                                       return_train_score=False, n_jobs=4, random_state = 42)

        gb_search.fit(X_trn, resid)

        abst = gb_search.best_estimator_
        
        
        
        
        y_pred = ridge1.predict(X_tst) + abst.predict(X_tst)
        mse = np.sum((y_pred - y_tst)**2) / len(y_pred)
        rmse_score = np.sqrt(mse)
        print(rmse_score)
        rmse_scores.append(rmse_score)
    
    print('%d-fold cross-validation RMSE: %.3f +/- %.3f' % (cv, np.mean(rmse_scores), np.std(rmse_scores)))
        
cross_validation(X_train, y_train, cv=10)

66.97038828594964
55.87187598607969
67.98673608496098
73.40776174069202
69.72438566002535
69.223469642714
54.0565312380459
51.85743961964429
87.10226089717916
64.88472883519087
10-fold cross-validation RMSE: 66.109 +/- 9.878


In [16]:
ridge1 = RidgeCV(alphas=alphas, cv=10, normalize=True)
ridge1.fit(X_train, y_train)


RidgeCV(alphas=array([3.05176e-05, 3.50555e-05, ..., 2.85262e+04, 3.27680e+04]),
    cv=10, fit_intercept=True, gcv_mode=None, normalize=True, scoring=None,
    store_cv_values=False)

In [17]:
y_fit = ridge1.predict(X_train)
resid = y_train - y_fit

model = lgb.LGBMRegressor(objective='regression')


tuning_parameters = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators' : [250, 500, 750, 1000, 1500],
    'max_depth' : [2, 3, 4],
    'subsample' : [0.6, 0.8, 1.0],
}

gb_search = RandomizedSearchCV(model, tuning_parameters, n_iter = 16, cv = 5, 
                               return_train_score=False, n_jobs=4, random_state = 42)

gb_search.fit(X_train, resid)

abst = gb_search.best_estimator_


print('Best parameters found by randomised search:', gb_search.best_params_, '\n')

Best parameters found by randomised search: {'subsample': 0.8, 'n_estimators': 250, 'max_depth': 4, 'learning_rate': 0.01} 



## ABST prediction on test

In [18]:
y_pred_abst=ridge1.predict(X_test)+abst.predict(X_test) # combining predictions

In [19]:
y_pred_abst_train = ridge1.predict(X_train)+abst.predict(X_train)
print(y_pred_abst_train)

[129.98680452 109.14662974 149.67451499 161.21299829 140.6688517
 141.84304972 189.72171108 123.13531247 315.58539279 252.49256217
 161.89861376 237.79009169 182.21874873  84.33160376 167.72395841
 209.40597311 259.24208707 150.09753804 259.47682138 265.7483875
 152.71620737 206.2218147  144.4906199  276.93015027  48.35246289
  63.81024601  98.31307843 192.27189207 205.75933407 198.0635575
  54.33132915  78.0401173  227.55286925 348.69747592 117.9983264
 123.13457987 102.97602726  94.26959443  76.74340945  68.8715342
  94.54823589 276.28337099 201.67668942  62.40255017 152.46055892
  64.02713548  95.51711639  79.25491235 257.2941721  252.36450377
 313.01179883 152.38285754  82.22461501 168.67286475 148.09727727
 221.65529392 104.39722184 326.87999745 357.71882498 126.0027109
 223.61520355 163.18412114  92.7639697   55.34702796 372.28924223
 237.46642709 154.31469591  71.5700399  204.93071654 235.95783263
 225.273019   159.28391425 315.12871556 229.16711543 275.92371383
 115.60914768 20

# Model Stacking Cross-Validation

In [20]:
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore') # this is to clear the warnings from this page, typically you would leave them on

def cross_validation(X, y, cv=5):
    kf = KFold(n_splits = cv, random_state = None, shuffle = True)
    rmse_scores = []
    for train_index, test_index in kf.split(X):
        X_trn, X_tst = X.iloc[train_index], X.iloc[test_index]
        y_trn, y_tst = y.iloc[train_index], y.iloc[test_index]
        
        # OLS
        ols = LinearRegression()
        ols.fit(X_trn, y_trn)
        
        # Lasso
        lasso = Pipeline((
            #('scaler', StandardScaler()),
            ('estimator', LassoCV(cv=5)),
        ))
        lasso.fit(X_trn, y_trn)
        
        
        # Ridge
        alphas = list(np.logspace(-15, 15, 151, base=2))

        ridge = Pipeline((
            #('scaler', StandardScaler()),
            ('estimator', RidgeCV(alphas=alphas, cv=5)),
        ))

        ridge.fit(X_trn, y_trn)
        
        
        # xgb
        model = xgb.XGBRegressor()

        tuning_parameters = {
            'learning_rate': [0.01, 0.05, 0.1],
            'n_estimators' : [250, 500, 750, 1000, 1500],
            'max_depth' : [2, 3, 4],
            'subsample' : [0.6, 0.8, 1.0],
        }

        gb_search = RandomizedSearchCV(model, tuning_parameters, n_iter = 16, cv = 5, return_train_score=False, 
                                       n_jobs=4, random_state = 20)
        gb_search.fit(X_trn, y_trn)

        xbst = gb_search.best_estimator_


        print('Best parameters found by randomised search:', gb_search.best_params_, '\n')
        
        
        
        # Stack
        models = [ols, lasso, ridge, xbst]
        stack = StackingCVRegressor(models, meta_regressor = LinearRegression(), cv=10)
        stack.fit(X_trn.values, y_trn.ravel())
        y_prd = stack.predict(X_tst.values)
        
        mse = np.sum((y_prd - y_tst)**2) / len(y_prd)
        rmse_score = np.sqrt(mse)
        print(rmse_score)
        rmse_scores.append(rmse_score)
    
    print('%d-fold cross-validation RMSE: %.3f +/- %.3f' % (cv, np.mean(rmse_scores), np.std(rmse_scores)))
        
#cross_validation(X_train, y_train, cv=10)

## Stacking prediction on test

In [21]:
# OLS
ols = LinearRegression()
ols.fit(X_train, y_train)

# Lasso
lasso = Pipeline((
    #('scaler', StandardScaler()),
    ('estimator', LassoCV(cv=5)),
))
lasso.fit(X_train, y_train)


# Ridge
alphas = list(np.logspace(-15, 15, 151, base=2))

ridge = Pipeline((
    #('scaler', StandardScaler()),
    ('estimator', RidgeCV(alphas=alphas, cv=5)),
))

ridge.fit(X_train, y_train)


# xgb
model = xgb.XGBRegressor()

tuning_parameters = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators' : [250, 500, 750, 1000, 1500],
    'max_depth' : [2, 3, 4],
    'subsample' : [0.6, 0.8, 1.0],
}

gb_search = RandomizedSearchCV(model, tuning_parameters, n_iter = 16, cv = 5, return_train_score=False, 
                               n_jobs=4, random_state = 20)
gb_search.fit(X_train, y_train)

xbst = gb_search.best_estimator_


print('Best parameters found by randomised search:', gb_search.best_params_, '\n')



# Stack
models = [ols, lasso, ridge, xbst]
stack = StackingCVRegressor(models, meta_regressor = LinearRegression(), cv=10)
stack.fit(X_train.values, y_train.ravel())

y_pred_stack=stack.predict(X_test.values)
#y_pred

Best parameters found by randomised search: {'subsample': 1.0, 'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.01} 



In [22]:
y_prediction = y_pred_abst

output = pd.concat([pd.DataFrame(y_prediction)], axis=1)
output.columns=['price']

print(output.to_csv(index=False))

price
90.43047979849878
175.46356472728485
97.30284097737871
56.65147536100356
199.8654323823291
122.21768062969446
292.6925890057014
166.08574258780914
169.03192320873794
47.11389663324014
308.88505323251957
141.3452211621975
315.57511438238
164.24169757566563
139.85272905399958
254.2964508406394
122.90133893493899
220.0518571241235
212.4377053203917
140.64503386347562
142.96904447477732
234.08314747148626
115.68642894582493
203.93667525797235
59.98437740990537
93.72365500890137
64.2320958477549
124.44517212495496
100.88805564044443
168.01683682344742
172.8294084270949
211.2696830142566
76.30306253066357
157.16730304124405
176.24006214746947
83.99648798226681
255.99690886583315
119.51803856353038
474.47276921900755
90.25497169921962
220.2369648506085
160.78012266885645
178.1872592895995
103.2616421366014
140.05109399177675
122.68650107263429
211.0485781784068
65.64679843531286
131.69706125750318
121.11032709805458
153.7538802514174
126.93265579158732
53.37185032648309
169.549338672545