In [2]:
# Packages
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_val_score
%matplotlib inline

In [3]:
# Options
sns.set_style("whitegrid")
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', -1)

In [4]:
# Parameters
infile = './data/transformed_data/model.csv'

In [9]:
# Import + preprocess
df = pd.read_csv(infile)
df.drop([
    'OverallQual', 'Neighborhood', 'MSZoning', 'BldgType', 'PropertyAge', 'TotalBsmtSF', '1stFlrSF',
    '2ndFlrSF', 'GrLivArea', 'TotalArea', 'FullBath', 'HalfBath', 'TotalBath', 'TotRmsAbvGrd',
    'BedroomAbvGr', 'Remodelled'
], axis=1, inplace=True)
df.columns.tolist()

['Id',
 'IsNew',
 'IsPartial',
 'SalePrice',
 'Neighborhood_Blmngtn',
 'Neighborhood_Blueste',
 'Neighborhood_BrDale',
 'Neighborhood_BrkSide',
 'Neighborhood_ClearCr',
 'Neighborhood_CollgCr',
 'Neighborhood_Crawfor',
 'Neighborhood_Edwards',
 'Neighborhood_Gilbert',
 'Neighborhood_IDOTRR',
 'Neighborhood_MeadowV',
 'Neighborhood_Mitchel',
 'Neighborhood_NAmes',
 'Neighborhood_NPkVill',
 'Neighborhood_NWAmes',
 'Neighborhood_NoRidge',
 'Neighborhood_NridgHt',
 'Neighborhood_OldTown',
 'Neighborhood_SWISU',
 'Neighborhood_Sawyer',
 'Neighborhood_SawyerW',
 'Neighborhood_Somerst',
 'Neighborhood_StoneBr',
 'Neighborhood_Timber',
 'Neighborhood_Veenker',
 'MSZoning_C (all)',
 'MSZoning_FV',
 'MSZoning_RH',
 'MSZoning_RL',
 'MSZoning_RM',
 'BldgType_1Fam',
 'BldgType_2fmCon',
 'BldgType_Duplex',
 'BldgType_Twnhs',
 'BldgType_TwnhsE',
 'OverallQualLog',
 'PropertyAgeLog',
 'TotalBsmtSFLog',
 '1stFlrSFLog',
 '2ndFlrSFLog',
 'GrLivAreaLog',
 'TotalAreaLog',
 'FullBathLog',
 'HalfBathLog',
 '

In [10]:
# Define Features & Target
features = df.drop(['Id', 'SalePriceLog'], axis=1)
target = df[['SalePriceLog']]

In [11]:
# Linear Regression
model = LinearRegression()
scores = cross_val_score(model, features, target, cv=10)
print(np.mean(scores))
print(np.mean(np.log1p(scores)))

0.9525559960970638
0.6691264200999051


In [12]:
# Ridge Regression
model = Ridge()
scores = cross_val_score(model, features, target, cv=10)
print(np.mean(scores))
print(np.mean(np.log1p(scores)))

0.9503796446530345
0.6680117816634195


In [39]:
# Lasso Regression
model = Lasso()
scores = cross_val_score(model, features, target, cv=10)
print(np.mean(scores))
print(np.mean(np.log1p(scores)))

-0.006820054474503534
-0.0068622269506132


#### Sources & Reference

Tutorials & Papers:  
* [Cardinality Reduction](https://pkghosh.wordpress.com/2017/10/09/combating-high-cardinality-features-in-supervised-machine-learning/)  
* [FA & PCA](https://www.dummies.com/programming/big-data/data-science/data-science-using-python-to-perform-factor-and-principal-component-analysis/)  
* [Factor Analysis for Decomposition](https://www.packtpub.com/mapt/book/big_data_and_business_intelligence/9781783989485/1/ch01lvl1sec19/using-factor-analysis-for-decomposition)  
* [SKL Decomposition](http://scikit-learn.org/stable/modules/decomposition.html)  
* [Clustering Mixed Data](https://datascience.stackexchange.com/questions/8681/clustering-for-mixed-numeric-and-nominal-discrete-data)  
* [Box Cox Transformation](https://www.statisticshowto.datasciencecentral.com/box-cox-transformation/)  
* [Log Transforms](http://onlinestatbook.com/2/transformations/log.html)   
* [SKL ensembling](http://scikit-learn.org/stable/modules/ensemble.html)  

Kaggle Kernels & Notebooks: 
* [Data Exploration Kernel](https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python)
* [Good Overall Kaggle Kernel](https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard)  
* [Applied Regression](https://www.kaggle.com/juliencs/a-study-on-regression-applied-to-the-ames-dataset)  
* [Regularized Linear Models](https://www.kaggle.com/apapiu/regularized-linear-models)  