## Import Libraries

In [58]:
import pandas as pd
import numpy as np

from pandas.api.types import is_string_dtype, is_numeric_dtype

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, StandardScaler
from sklearn.feature_selection import RFE

from sklearn.impute import KNNImputer, SimpleImputer

from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, Lasso, Ridge, ElasticNet
from sklearn.pipeline import Pipeline


import sys
import os

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
    
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" 
    

## Import Data

In [59]:
train = pd.read_csv('../../datasets/cleaned_data/train_preproc.csv')
test = pd.read_csv('../../datasets/cleaned_data/test_preproc.csv')

## Get Model Preprocessing Features

In [63]:
numeric = list(set(['overall_qual', 'year_built', 'year_remod/add',  'mas_vnr_area', 'bsmtfin_sf_1', 'total_bsmt_sf', '1st_flr_sf', 'gr_liv_area', 'full_bath', 'totrms_abvgrd', 'fireplaces', 'garage_cars', 'garage_area', 'actual_totrms_abvgrd', 'g_gr_area', 'total_sf', 'actual_totrms_abvgrd', 'g_gr_area', 'total_sf'])) 
categorical = ['neighborhood']

## Train Test Split

In [64]:
y = train['saleprice']
X = train[numeric + categorical]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.30, random_state=2000)

## Baseline Model

In [65]:
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error

# create a dummy regressor
dummy_reg = DummyRegressor(strategy='mean')
# fit it on the training set
dummy_reg.fit(X_train, y_train)
# make predictions on the test set
y_pred = dummy_reg.predict(X_val)

In [66]:
r2_score(y_val, y_pred) # how much of the varience that is explained by the model

-0.004601670112839962

In [67]:
# calculate root mean squared error
mean_squared_error(y_val, y_pred, squared=False)

83671.63553408739

## Feature Engineering and Model Testing

In [69]:
ct_poly = ColumnTransformer([
    ('poly', PolynomialFeatures(), [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]), 
], remainder = 'passthrough')

ct_ohe = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False), [16]),# one hot encode categorical columns
], remainder = 'drop')

## Lasso Model Hyperparam Tuning

In [70]:
lasso_p_params = {
    'poly__poly__degree': [1, 2],
    'poly__poly__include_bias': [True, False],
    'poly__poly__interaction_only': [True, False],
    'ohe__ohe__handle_unknown': ['ignore'],
    'ohe__ohe__sparse': [True, False],
    'l__alpha': np.logspace(1, 10, 10),
    'l__fit_intercept': [True, False],
    'l__normalize': [True, False]
}

lasso_pipe = Pipeline([
            ('poly', ct_poly), 
            ('ohe', ct_ohe), 
            ('ss', StandardScaler(with_mean=False)),
            ('l', Lasso()), # use lasso to optimize parameter selection
])

lasso_gs = GridSearchCV( # use gridsearch to optimize feature selection
    lasso_pipe, 
    lasso_p_params,
    cv = 4,
    n_jobs = -1,
    error_score='raise' 
)
lasso_gs.fit(X_train, y_train) 
print(f'Train: {lasso_gs.score(X_train, y_train)}, Test: {lasso_gs.score(X_val, y_val)}, Best Score: {lasso_gs.best_score_}') 

Train: 0.7856091041887385, Test: 0.661367659018561, Best Score: 0.5789848305518114


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


In [71]:
lasso_gs.best_estimator_

Pipeline(steps=[('poly',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('poly',
                                                  PolynomialFeatures(include_bias=False,
                                                                     interaction_only=True),
                                                  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                   10, 11, 12, 13, 14, 15])])),
                ('ohe',
                 ColumnTransformer(transformers=[('ohe',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  [16])])),
                ('ss', StandardScaler(with_mean=False)),
                ('l', Lasso(alpha=10.0, normalize=True))])

### Scoring

In [72]:
preds_y = lasso_gs.predict(X_val)

In [73]:
r2_score(y_val, preds_y) # how much of the varience that is explained by the model

0.661367659018561

In [74]:
mean_squared_error(y_val, preds_y, squared=False)

48578.659529746255

In [75]:
columns = lasso_gs.best_estimator_.named_steps['poly'].get_feature_names_out()
coefs = lasso_gs.best_estimator_.named_steps['l'].coef_

In [76]:
temp = pd.DataFrame(zip(columns, coefs))
temp = temp[temp[1] != 0]

In [77]:
temp.sort_values(by=1)

Unnamed: 0,0,1
15,poly__garage_cars,-11145.905906
49,poly__mas_vnr_area actual_totrms_abvgrd,-8942.653565
5,poly__fireplaces,-7978.153239
3,poly__1st_flr_sf,-6252.783844
31,poly__overall_qual mas_vnr_area,-4765.558000
...,...,...
86,poly__fireplaces total_bsmt_sf,302.464289
95,poly__full_bath total_bsmt_sf,530.267382
116,poly__bsmtfin_sf_1 total_bsmt_sf,544.801600
123,poly__totrms_abvgrd total_sf,964.692235


## Ridge Model Hyperparam Tuning

In [78]:
ridge_p_params = {
    'poly__poly__degree': [1, 2, 3],
    'poly__poly__include_bias': [True, False],
    'poly__poly__interaction_only': [True, False],
    'r__alpha': np.logspace(1, 10, 20),
    'r__fit_intercept': [True, False],
    'r__normalize': [True, False],
    'r__solver': ['auto'],
}

ridge_pipe = Pipeline([
            ('poly', ct_poly), 
            ('ohe', ct_ohe), 
            ('ss', StandardScaler(with_mean=False)),
            ('r', Ridge()), # use lasso to optimize parameter selection
])

ridge_gs = GridSearchCV( # use gridsearch to optimize feature selection
    ridge_pipe, 
    ridge_p_params,
    cv = 4,
    n_jobs = -1
)
ridge_gs.fit(X_train, y_train)
print(f'Train: {ridge_gs.score(X_train, y_train)}, Test: {ridge_gs.score(X_val, y_val)}, Best Score: {ridge_gs.best_score_}') 

Train: 0.7887936794684451, Test: 0.6570953560959208, Best Score: 0.5833540972137253




In [79]:
ridge_gs.best_estimator_

Pipeline(steps=[('poly',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('poly',
                                                  PolynomialFeatures(include_bias=False,
                                                                     interaction_only=True),
                                                  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                   10, 11, 12, 13, 14, 15])])),
                ('ohe',
                 ColumnTransformer(transformers=[('ohe',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  [16])])),
                ('ss', StandardScaler(with_mean=False)),
                ('r', Ridge(alpha=88.58667904100822, normalize=False))])

### Scoring

In [80]:
preds_y = ridge_gs.predict(X_val)

In [81]:
r2_score(y_val, preds_y) # how much of the varience that is explained by the model

0.6570953560959208

In [82]:
mean_squared_error(y_val, preds_y, squared=False)

48884.14163134596

In [83]:
columns = ridge_gs.best_estimator_.named_steps['poly'].get_feature_names_out()
coefs = ridge_gs.best_estimator_.named_steps['r'].coef_

In [84]:
temp = pd.DataFrame(zip(columns, coefs))
temp = temp[temp[1] != 0]

In [85]:
temp.sort_values(by=1)

Unnamed: 0,0,1
15,poly__garage_cars,-13980.100429
49,poly__mas_vnr_area actual_totrms_abvgrd,-12712.508405
5,poly__fireplaces,-9646.965625
3,poly__1st_flr_sf,-7410.879185
108,poly__g_gr_area bsmtfin_sf_1,-6565.607046
...,...,...
95,poly__full_bath total_bsmt_sf,-96.436738
116,poly__bsmtfin_sf_1 total_bsmt_sf,-82.778781
86,poly__fireplaces total_bsmt_sf,-32.214315
123,poly__totrms_abvgrd total_sf,591.526952


## Elastic Net Model Hyperparam Tuning

In [86]:
elastic_net_p_params = {
    'poly__poly__degree': [1, 2],
    'poly__poly__include_bias': [True, False],
    'poly__poly__interaction_only': [True, False],
    'en__alpha': np.logspace(1, 10, 15),
    'en__fit_intercept': [True, False],
    'en__normalize': [True, False],
}

en_pipe = Pipeline([
            ('poly', ct_poly), 
            ('ohe', ct_ohe), 
            ('ss', StandardScaler(with_mean=False)),
            ('en', ElasticNet()), # use lasso to optimize parameter selection
])

elastic_net_gs = GridSearchCV( # use gridsearch to optimize feature selection
    en_pipe, 
    elastic_net_p_params,
    cv = 4,
    n_jobs = -1
)
elastic_net_gs.fit(X_train, y_train)
print(f'Train: {elastic_net_gs.score(X_train, y_train)}, Test: {elastic_net_gs.score(X_val, y_val)}, Best Score: {elastic_net_gs.best_score_}') 

Train: 0.24332824798088082, Test: 0.18889245865974869, Best Score: 0.19252331403360898




In [87]:
elastic_net_gs.best_estimator_

Pipeline(steps=[('poly',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('poly',
                                                  PolynomialFeatures(include_bias=False,
                                                                     interaction_only=True),
                                                  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                   10, 11, 12, 13, 14, 15])])),
                ('ohe',
                 ColumnTransformer(transformers=[('ohe',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  [16])])),
                ('ss', StandardScaler(with_mean=False)),
                ('en', ElasticNet(alpha=10.0, normalize=False))])

### Scoring

In [88]:
preds_y = elastic_net_gs.predict(X_val)

In [89]:
r2_score(y_val, preds_y) # how much of the varience that is explained by the model

0.18889245865974869

In [90]:
mean_squared_error(y_val, preds_y, squared=False)

75183.15212887275

In [91]:
columns = elastic_net_gs.best_estimator_.named_steps['poly'].get_feature_names_out()
coefs = elastic_net_gs.best_estimator_.named_steps['en'].coef_

In [92]:
temp = pd.DataFrame(zip(columns, coefs))
temp = temp[temp[1] != 0]

In [93]:
temp.sort_values(by=1)

Unnamed: 0,0,1
15,poly__garage_cars,-2529.722077
49,poly__mas_vnr_area actual_totrms_abvgrd,-2334.047992
5,poly__fireplaces,-1717.225254
3,poly__1st_flr_sf,-1313.493787
108,poly__g_gr_area bsmtfin_sf_1,-1186.920073
...,...,...
95,poly__full_bath total_bsmt_sf,-15.178836
116,poly__bsmtfin_sf_1 total_bsmt_sf,-12.756582
86,poly__fireplaces total_bsmt_sf,-4.103048
123,poly__totrms_abvgrd total_sf,104.724675


## Submission

In [None]:
# submission = pd.DataFrame()
# submission['Id'] = test['id'].astype(int)
# test.drop(columns = ['id', 'pid'], inplace = True)
# preds = gs.predict(test[columns + ['actual_totrms_abvgrd', 'g_gr_area', 'total_sf', 'neighborhood', 'utilities']])
# submission['SalePrice'] = preds
# submission.to_csv('../../datasets/submissions/submission_5.csv', index = False)