# Housing Demo for Data Club

This is a demo notebook that explains some basic ML concepts using scikit-learn.

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

Get data and information from https://www.kaggle.com/c/house-prices-advanced-regression-techniques

## Prep Data

In [3]:
# Load the training data; it carries the SalePrice target used further down.
train_csv = 'data/train.csv'
df = pd.read_csv(train_csv)

Need to get dummy variables to deal with categoricals

In [8]:
# MSSubClass is a coded category stored as a number: cast it to str so that
# get_dummies one-hot encodes it together with the other non-numeric columns.
df = pd.get_dummies(df.astype({'MSSubClass': str}))

In [11]:
# Target vector first, then the feature matrix: SalePrice and Id are dropped
# so that neither is used as a predictor.
y = df['SalePrice'].values
X = df.drop(columns=['SalePrice', 'Id']).values

There are missing values, so we need to impute them.

In [13]:
# Fill missing entries using SimpleImputer's default strategy.
imputer = SimpleImputer()
X = imputer.fit_transform(X)

## Training a model

In [14]:
# Ordinary least squares fitted on the complete data set (no hold-out yet).
m_linear = LinearRegression()
m_linear = m_linear.fit(X, y)

  linalg.lstsq(X, y)


In [15]:
# Score on the very same rows the model was fitted on.
train_score = m_linear.score(X, y)
train_score

0.9336835673541317

Some things we did wrong:

* We didn't look at the data
* We didn't set a target
* We tested our model on the same data we trained on

## Train-Validation Split

In [17]:

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = split

In [18]:
# Refit on the training portion only, then score on the held-out rows.
m_linear = LinearRegression()
m_linear.fit(X_train, y_train)
m_linear.score(X_val, y_val)

0.4862796324549694

In [19]:
# Same procedure with a Ridge model at its default alpha.
m_ridge = Ridge()
m_ridge.fit(X_train, y_train)
m_ridge.score(X_val, y_val)

0.6308230373105395

In [20]:
# Hand-picked alternative penalty strength, scored on the same validation rows.
m_ridge2 = Ridge(alpha=0.5)
m_ridge2.fit(X_train, y_train)
m_ridge2.score(X_val, y_val)

0.5880300791348494

Some things we did wrong:

* We used the ridge model without understanding what it is
* We iteratively tried models and stopped at arbitrary points
* Testing each model once leaves an element of chance for which model performs best

## Cross Validation

In [21]:
# 5-fold cross-validated search over the ridge penalty strength.
m_ridge = Ridge()
param_grid = dict(alpha=[0.1, 1, 10])
grid = GridSearchCV(
    estimator=m_ridge,
    param_grid=param_grid,
    cv=5,
    return_train_score=False,
)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': [0.1, 1, 10]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=False, scoring=None, verbose=0)

In [22]:
# One row per candidate alpha, with per-fold and mean test scores.
res = pd.DataFrame(data=grid.cv_results_)
res


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005981,0.000543,0.00054,4e-05,0.1,{'alpha': 0.1},0.875625,0.763627,0.916745,0.877418,0.897386,0.866124,0.053447,3
1,0.005668,0.000303,0.000577,0.000104,1.0,{'alpha': 1},0.89741,0.768181,0.928593,0.881662,0.904139,0.875968,0.056029,2
2,0.005536,0.001874,0.000481,4.5e-05,10.0,{'alpha': 10},0.910037,0.762902,0.933358,0.879313,0.90675,0.878447,0.060321,1


In [23]:
# Score the search object on the held-out validation rows.
val_score = grid.score(X_val, y_val)
val_score

0.7036441114654699

Some things we did wrong:

* Doing better but we still had to do all of our preprocessing manually at the start
* Any preprocessing that requires a fit call should be part of the CV process

## Making a pipeline


In [24]:
# Rebuild X from the encoded frame without imputing it here: the pipeline
# defined below has its own imputation step.
non_features = ['SalePrice', 'Id']
X = df.drop(non_features, axis=1).values
y = df['SalePrice'].values

In [25]:
# Same 80/20 split and seed as before, now applied to the un-imputed features.
(X_train, X_val,
 y_train, y_val) = train_test_split(X, y, test_size=0.2, random_state=0)

In [26]:
# Imputation, scaling and the model are chained so the grid search fits the
# preprocessing steps together with the regressor for every candidate.
steps = [
    ('impute', SimpleImputer(strategy="median")),
    ('scaler', MinMaxScaler()),
    ('regressor', Ridge()),
]
pipe = Pipeline(steps)

# Candidates: imputation strategy x scaler (None disables the step) x model x alpha.
param_grid = {
    'impute__strategy': ["median", "mean"],
    'scaler': [MinMaxScaler(), RobustScaler(), None],
    'regressor': [Ridge(), Lasso()],
    'regressor__alpha': [0.1, 1, 10],
}

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    return_train_score=False,
)
grid.fit(X_train, y_train)







GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('impute', SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)), ('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('regressor', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'impute__strategy': ['median', 'mean'], 'scaler': [MinMaxScaler(copy=True, feature_range=(0, 1)), RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True), None], 'regressor': [Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None,
   n...state=None,
   selection='cyclic', tol=0.0001, warm_start=False)], 'regressor__alpha': [0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring=None, verbose=0)

In [27]:
# Cross-validation results, best-ranked candidate first.
res = pd.DataFrame(data=grid.cv_results_)
res.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_impute__strategy,param_regressor,param_regressor__alpha,param_scaler,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
25,0.132467,0.035121,0.001895,0.000352,mean,"Ridge(alpha=10, copy_X=True, fit_intercept=Tru...",10.0,"RobustScaler(copy=True, quantile_range=(25.0, ...","{'impute__strategy': 'mean', 'regressor': Ridg...",0.910205,0.771442,0.933014,0.877463,0.906635,0.879731,0.057005,1
7,0.094973,0.00626,0.001611,0.000181,median,"Ridge(alpha=10, copy_X=True, fit_intercept=Tru...",10.0,"RobustScaler(copy=True, quantile_range=(25.0, ...","{'impute__strategy': 'median', 'regressor': Ri...",0.909961,0.771292,0.933083,0.877443,0.906932,0.879721,0.057077,2
3,0.021821,0.001376,0.001394,0.000124,median,"Ridge(alpha=10, copy_X=True, fit_intercept=Tru...",1.0,"MinMaxScaler(copy=True, feature_range=(0, 1))","{'impute__strategy': 'median', 'regressor': Ri...",0.899908,0.780749,0.928274,0.877914,0.905727,0.878491,0.051482,3
21,0.016644,0.002646,0.001875,0.000174,mean,"Ridge(alpha=10, copy_X=True, fit_intercept=Tru...",1.0,"MinMaxScaler(copy=True, feature_range=(0, 1))","{'impute__strategy': 'mean', 'regressor': Ridg...",0.900006,0.780803,0.928261,0.877911,0.90555,0.878484,0.051449,4
26,0.013316,0.001714,0.001821,0.000385,mean,"Ridge(alpha=10, copy_X=True, fit_intercept=Tru...",10.0,,"{'impute__strategy': 'mean', 'regressor': Ridg...",0.910037,0.762879,0.933348,0.879307,0.906737,0.878437,0.060327,5
8,0.021047,0.000972,0.001198,9.3e-05,median,"Ridge(alpha=10, copy_X=True, fit_intercept=Tru...",10.0,,"{'impute__strategy': 'median', 'regressor': Ri...",0.909797,0.762714,0.933423,0.879292,0.907026,0.878425,0.060406,6
4,0.090692,0.006982,0.001498,7.4e-05,median,"Ridge(alpha=10, copy_X=True, fit_intercept=Tru...",1.0,"RobustScaler(copy=True, quantile_range=(25.0, ...","{'impute__strategy': 'median', 'regressor': Ri...",0.897503,0.769568,0.928477,0.88133,0.904205,0.876188,0.055481,7
22,0.113082,0.04349,0.001805,0.000285,mean,"Ridge(alpha=10, copy_X=True, fit_intercept=Tru...",1.0,"RobustScaler(copy=True, quantile_range=(25.0, ...","{'impute__strategy': 'mean', 'regressor': Ridg...",0.897615,0.769635,0.92842,0.881332,0.904029,0.876178,0.055435,8
5,0.019864,0.000597,0.001165,4.9e-05,median,"Ridge(alpha=10, copy_X=True, fit_intercept=Tru...",1.0,,"{'impute__strategy': 'median', 'regressor': Ri...",0.89729,0.768102,0.928639,0.881653,0.904292,0.875966,0.056075,9
23,0.031702,0.032416,0.001983,0.000358,mean,"Ridge(alpha=10, copy_X=True, fit_intercept=Tru...",1.0,,"{'impute__strategy': 'mean', 'regressor': Ridg...",0.897403,0.76817,0.928582,0.881654,0.904117,0.875956,0.056029,10


In [28]:
# Validation score of the pipeline search on the held-out rows.
pipe_val_score = grid.score(X_val, y_val)
pipe_val_score

0.7072562932539285

How can we do better?

## More ML!!!??

In [29]:
# The 'scaler' and 'regressor' steps below are placeholders: every entry of
# param_grid replaces both of them.
pipe = Pipeline([
    ('impute', SimpleImputer(strategy="mean")),
    ('scaler', MinMaxScaler()),
    ('regressor', Ridge())
])

# A list of grids lets each model family be searched over its own
# hyper-parameters. The tree ensembles are randomised, so they are seeded
# (with the same seed as the train/validation split) to make the
# cross-validation ranking reproducible across re-runs.
param_grid = [{'scaler': [MinMaxScaler(), RobustScaler()],
               'regressor': [RandomForestRegressor(n_estimators=100, random_state=0)],
               'regressor__min_samples_split': [5, 10],
               'regressor__max_features': ['sqrt', 0.3],
              },
              {'scaler': [MinMaxScaler(), RobustScaler()],
               'regressor': [GradientBoostingRegressor(loss='huber', random_state=0)],
               'regressor__max_depth': [2, 3],
               'regressor__n_estimators': [500, 1000],
               'regressor__max_features': ['sqrt', 0.3],
              },
              # Ridge(alpha=10) with RobustScaler is kept as a linear baseline.
              {'scaler': [RobustScaler()],
               'regressor': [Ridge(alpha=10)],
              }
             ]

grid = GridSearchCV(pipe, param_grid, cv=5, return_train_score=False)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('impute', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('regressor', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'scaler': [MinMaxScaler(copy=True, feature_range=(0, 1)), RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)], 'regressor': [RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=0.3, max_leaf_node...t_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring=None, verbose=0)

In [30]:
# Results of the multi-model search, best-ranked candidate first.
res = pd.DataFrame(data=grid.cv_results_)
res.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor,param_regressor__max_features,param_regressor__min_samples_split,param_scaler,param_regressor__max_depth,param_regressor__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
24,0.076905,0.002828,0.001493,6.5e-05,"Ridge(alpha=10, copy_X=True, fit_intercept=Tru...",,,"RobustScaler(copy=True, quantile_range=(25.0, ...",,,"{'regressor': Ridge(alpha=10, copy_X=True, fit...",0.910205,0.771442,0.933014,0.877463,0.906635,0.879731,0.057005,1
23,3.427067,0.142039,0.004953,8.1e-05,([DecisionTreeRegressor(criterion='friedman_ms...,0.3,,"RobustScaler(copy=True, quantile_range=(25.0, ...",3.0,1000.0,{'regressor': ([DecisionTreeRegressor(criterio...,0.906974,0.762762,0.921228,0.890701,0.908364,0.877969,0.058473,2
4,0.572238,0.025881,0.009182,0.000296,"(DecisionTreeRegressor(criterion='mse', max_de...",0.3,5.0,"MinMaxScaler(copy=True, feature_range=(0, 1))",,,{'regressor': (DecisionTreeRegressor(criterion...,0.898711,0.79318,0.904778,0.857722,0.903977,0.871658,0.042981,3
16,0.975334,0.044585,0.003345,0.000109,([DecisionTreeRegressor(criterion='friedman_ms...,sqrt,,"MinMaxScaler(copy=True, feature_range=(0, 1))",3.0,500.0,{'regressor': ([DecisionTreeRegressor(criterio...,0.89159,0.734588,0.909027,0.894779,0.925376,0.871005,0.06932,4
17,1.019371,0.014867,0.003437,5.9e-05,([DecisionTreeRegressor(criterion='friedman_ms...,sqrt,,"RobustScaler(copy=True, quantile_range=(25.0, ...",3.0,500.0,{'regressor': ([DecisionTreeRegressor(criterio...,0.90574,0.736368,0.918232,0.872243,0.921185,0.870709,0.069446,5
5,0.605458,0.018138,0.009237,0.000761,"(DecisionTreeRegressor(criterion='mse', max_de...",0.3,5.0,"RobustScaler(copy=True, quantile_range=(25.0, ...",,,{'regressor': (DecisionTreeRegressor(criterion...,0.89949,0.790836,0.900334,0.851611,0.905845,0.869608,0.044019,6
21,1.736963,0.076984,0.003391,0.000248,([DecisionTreeRegressor(criterion='friedman_ms...,0.3,,"RobustScaler(copy=True, quantile_range=(25.0, ...",3.0,500.0,{'regressor': ([DecisionTreeRegressor(criterio...,0.911448,0.74768,0.908246,0.863159,0.915778,0.869228,0.06373,7
6,0.496943,0.013928,0.008755,0.000892,"(DecisionTreeRegressor(criterion='mse', max_de...",0.3,10.0,"MinMaxScaler(copy=True, feature_range=(0, 1))",,,{'regressor': (DecisionTreeRegressor(criterion...,0.900787,0.806674,0.899013,0.83987,0.897755,0.86882,0.038669,8
19,2.079374,0.069987,0.006105,0.001063,([DecisionTreeRegressor(criterion='friedman_ms...,sqrt,,"RobustScaler(copy=True, quantile_range=(25.0, ...",3.0,1000.0,{'regressor': ([DecisionTreeRegressor(criterio...,0.900059,0.757521,0.917524,0.852188,0.915118,0.868456,0.060292,9
22,3.33359,0.073307,0.005015,0.00014,([DecisionTreeRegressor(criterion='friedman_ms...,0.3,,"MinMaxScaler(copy=True, feature_range=(0, 1))",3.0,1000.0,{'regressor': ([DecisionTreeRegressor(criterio...,0.910691,0.727752,0.919672,0.876613,0.903681,0.867643,0.071479,10


In [None]:
# Validation score of the multi-model search on the held-out rows.
final_val_score = grid.score(X_val, y_val)
final_val_score

In [None]:
# Parameter setting of the top-ranked candidate.
best_params = grid.best_params_
best_params

## Prepare submission

In [None]:
# Encode the competition test set the same way as the training data.
test = pd.read_csv('data/test.csv')
test.MSSubClass = test.MSSubClass.astype(str)
test = pd.get_dummies(test)

# Align the test columns with the training features. `df` is the one-hot
# encoded training frame. reindex adds an all-zero column for every dummy
# whose category does not occur in the test file, and drops columns (Id and
# test-only categories) that the model was never trained on.
feature_columns = df.drop(['SalePrice', 'Id'], axis=1).columns
X_test = test.reindex(columns=feature_columns, fill_value=0).values

# Refit a single fixed pipeline on all labelled rows before predicting.
pipe = Pipeline([
    ('impute', SimpleImputer(strategy="median")),
    ('scaler', RobustScaler()),
    ('regressor', Ridge(alpha=10))
])
pipe.fit(X, y)
y_pred = pipe.predict(X_test)

In [None]:
# Write the predictions, indexed by the Id column of the test file.
test_ids = pd.read_csv('data/test.csv').Id
submission = pd.DataFrame({'SalePrice': y_pred}, index=test_ids)
submission.to_csv('data/pred.csv')

## Tips to improve this

* Actually look at the data
* Understand the models you are using
* Do some kind of feature selection (but include in pipeline)