# This notebook uses sklearn regression models to predict rent prices in Kiev

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split

# Reading Prepared Data from first notebook

In [2]:
data = pd.read_csv('data_prepared.csv')
data = data.drop('Unnamed: 0', axis = 1)

In [3]:
X = data.drop('Price', axis = 1).values
y = data.Price.values

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y ,test_size=0.1)

In [5]:
def RMSE(y1, y2):
    return np.sqrt(mean_squared_error(y1, y2))

# Creating Regression Models

In [6]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0001, random_state=1))
Elnet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
KRR = make_pipeline(RobustScaler(), KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5))
GBoost = make_pipeline(RobustScaler(), GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5))

### Function to evaluate models using cross validation score

In [7]:
n_folds = 5

def rmse_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train)
    rmse = np.sqrt(-cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [8]:
score = rmse_cv(lasso)
print("Lasso score on RMSE with", n_folds, "folds val score: mean - {:.4f}, std - {:.4f}".format(score.mean(), score.std()))

score = rmse_cv(Elnet)
print("Elastic Net score on RMSE with", n_folds, "folds val score: mean - {:.4f}, std - {:.4f}".format(score.mean(), score.std()))

score = rmse_cv(KRR)
print("Kernel Ridge score on RMSE with", n_folds, "folds val score: mean - {:.4f}, std - {:.4f}".format(score.mean(), score.std()))

score = rmse_cv(GBoost)
print("Gradient Boost score on RMSE with", n_folds, "folds val score: mean - {:.4f}, std - {:.4f}".format(score.mean(), score.std()))


Lasso score on RMSE with 5 folds val score: mean - 0.3149, std - 0.0773
Elastic Net score on RMSE with 5 folds val score: mean - 0.3147, std - 0.0775
Kernel Ridge score on RMSE with 5 folds val score: mean - 0.2994, std - 0.0794
Gradient Boost score on RMSE with 5 folds val score: mean - 0.2818, std - 0.0776


## Evaluating Models using simple RMSE score on train and test data

### Lasso

In [9]:
lasso.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('lasso', Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=1,
   selection='cyclic', tol=0.0001, warm_start=False))])

In [10]:
y_pred = lasso.predict(X_train)
print("Lasso RMSE score on Train data: ", RMSE(y_pred, y_train))
y_pred = lasso.predict(X_test)
print("Lasso RMSE score on Test data: ", RMSE(y_pred, y_test))

Lasso RMSE score on Train data:  0.32043835988544644
Lasso RMSE score on Test data:  0.28372195165911157


### Elastic Net

In [11]:
Elnet.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('elasticnet', ElasticNet(alpha=0.0005, copy_X=True, fit_intercept=True, l1_ratio=0.9,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=3, selection='cyclic', tol=0.0001, warm_start=False))])

In [12]:
y_pred = Elnet.predict(X_train)
print("Elastic Net RMSE score on Train data: ", RMSE(y_pred, y_train))
y_pred = Elnet.predict(X_test)
print("Elastic Net RMSE score on Test data: ", RMSE(y_pred, y_test))

Elastic Net RMSE score on Train data:  0.3208292351796001
Elastic Net RMSE score on Test data:  0.2837761026131384


### Kernel Ridge

In [13]:
KRR.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('kernelridge', KernelRidge(alpha=0.6, coef0=2.5, degree=2, gamma=None, kernel='polynomial',
      kernel_params=None))])

In [14]:
y_pred = KRR.predict(X_train)
print("Kernel Ridge RMSE score on Train data: ", RMSE(y_pred, y_train))
y_pred = KRR.predict(X_test)
print("Kernel Ridge RMSE score on Test data: ", RMSE(y_pred, y_test))

Kernel Ridge RMSE score on Train data:  0.2994591760745504
Kernel Ridge RMSE score on Test data:  0.2613285071138791


### Gradient Boost

In [15]:
GBoost.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('gradientboostingregressor', GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.05, loss='huber', max_depth=4,
          ...     subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False))])

In [16]:
y_pred = GBoost.predict(X_train)
print("Gradient Boost RMSE score on Train data: ", RMSE(y_pred, y_train))
y_pred = GBoost.predict(X_test)
print("Gradient Boost RMSE score on Test data: ", RMSE(y_pred, y_test))

Gradient Boost RMSE score on Train data:  0.19903356564069694
Gradient Boost RMSE score on Test data:  0.25879226840607417


### Now we average model's prediction on test data

In [17]:
class AverageModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        for model in self.models_:
            model.fit(X, y)

        return self
    
    def predict(self, X):
        predictions = np.column_stack([model.predict(X) for model in self.models_])
        return np.mean(predictions, axis=1)

In [18]:
averaged_models = AverageModels(models = (lasso, Elnet, KRR, GBoost))

score = rmse_cv(averaged_models)
print("Averaged base models on RMSE score with", n_folds, "folds val score: mean - {:.4f}, std - {:.4f}".format(score.mean(), score.std()))

Averaged base models on RMSE score with 5 folds val score: mean - 0.2909, std - 0.0804


In [19]:
averaged_models.fit(X_train, y_train)

AverageModels(models=(Pipeline(memory=None,
     steps=[('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('lasso', Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_...   subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False))])))

### The Results

In [20]:
y_pred = averaged_models.predict(X_train)
print("Averaged base models RMSE score on Train data: ", RMSE(y_pred, y_train))
y_pred = averaged_models.predict(X_test)
print("Averaged base models RMSE score on Test data: ", RMSE(y_pred, y_test))

Averaged base models RMSE score on Train data:  0.2747030065813238
Averaged base models RMSE score on Test data:  0.25725559815127114


Eventually Gradient Boost looks more attractive but it takes more time to fit and hard in implementation, for example my gradient boost that I managed to code is extremely slow compared to sklearn gboost, maybe the reason is python language but I am not sure:)