# Housing Demo for Data Club

This is a demo notebook that explains some basic ML concepts using scikit-learn.

In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

The data and its description come from the Kaggle competition page: https://www.kaggle.com/c/house-prices-advanced-regression-techniques

## Prep Data

In [None]:
# Load the training data; it holds the features as well as the SalePrice target.
df = pd.read_csv(filepath_or_buffer='data/train.csv')

The data contains categorical columns, which need to be converted to dummy (one-hot) variables before modelling.

In [None]:
# One-hot encode the categorical columns so that every feature is numeric.
# MSSubClass is cast to str first so that it is encoded as a category too
# (presumably it is a numeric class code -- see the Kaggle data description).
df_dummy = df.copy()
if 'MSSubClass' in df_dummy.columns:  # absent once this cell has already run
    df_dummy['MSSubClass'] = df_dummy['MSSubClass'].astype(str)
# dtype=float keeps the whole frame numeric, so `.values` yields a float array.
df_dummy = pd.get_dummies(df_dummy, dtype=float)
# The following cells build X and y from `df`, so point it at the encoded frame.
df = df_dummy

In [None]:
# Feature matrix: every column except the target and the row identifier.
X = df.drop(columns=['SalePrice', 'Id']).to_numpy()
# Target vector: the SalePrice column.
y = df['SalePrice'].to_numpy()

The data contains missing values, so we need to impute them.

In [None]:
# Fill missing values with the column mean (SimpleImputer's default strategy);
# the linear models fitted below reject inputs that contain NaN.
# NOTE: the imputer is fitted on all rows here, i.e. before any train/validation
# split; the pipeline section later moves imputation inside the model.
X = SimpleImputer().fit_transform(X)

## Training a model

In [None]:
# Fit an ordinary least-squares model on the full data set.
# LinearRegression takes no `metric` argument; passing one raises a TypeError.
m_linear = LinearRegression().fit(X, y)

In [None]:
# R^2 measured on the very same rows the model was fitted on.
m_linear.score(X=X, y=y)

Some things we did wrong:

* ...

## Train-Validation Split

In [None]:

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
# Fit on the training part only, then report R^2 on the held-out part.
m_linear = LinearRegression()
m_linear.fit(X_train, y_train)
m_linear.score(X=X_val, y=y_val)

In [None]:
# Same experiment with an L2-regularised model at its default strength.
m_ridge = Ridge()
m_ridge.fit(X_train, y_train)
m_ridge.score(X=X_val, y=y_val)

In [None]:
# Ridge again, this time with a hand-picked regularisation strength.
m_ridge2 = Ridge(alpha=0.5)
m_ridge2.fit(X_train, y_train)
m_ridge2.score(X=X_val, y=y_val)

Some things we did wrong:

* ...

## Cross Validation

In [None]:
# 5-fold cross-validated search over the Ridge regularisation strength,
# run on the training part only.
m_ridge = Ridge()
param_grid = dict(alpha=[0.1, 1, 10])
grid = GridSearchCV(
    estimator=m_ridge,
    param_grid=param_grid,
    cv=5,
    return_train_score=False,
)
grid.fit(X_train, y_train)

In [None]:
# Tabulate the cross-validation results, one row per candidate setting.
res = pd.DataFrame(data=grid.cv_results_)
res


In [None]:
# Score the searched model on the held-out validation rows.
grid.score(X=X_val, y=y_val)

Some things we did wrong:

* ...

## Making a pipeline


In [None]:
# Start again from the DataFrame: imputation now happens inside the pipeline.
X = df.drop(columns=['SalePrice', 'Id']).to_numpy()
y = df['SalePrice'].to_numpy()

In [None]:
# Same 80/20 split as before (same seed), applied to the rebuilt X and y.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [None]:
# Imputation, scaling and regression chained into one estimator, so the search
# below cross-validates the preprocessing together with the model.
steps = [
    ('impute', SimpleImputer(strategy="median")),
    ('scaler', MinMaxScaler()),
    ('regressor', Ridge()),
]
pipe = Pipeline(steps)

# Every combination is tried: 2 * 3 * 2 * 3 = 36 candidates.
# A bare step name swaps the whole step; `step__param` sets one of its parameters.
param_grid = {
    'impute__strategy': ["median", "mean"],
    'scaler': [MinMaxScaler(), RobustScaler(), None],  # None: no scaling step
    'regressor': [Ridge(), Lasso()],
    'regressor__alpha': [0.1, 1, 10],
}

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    return_train_score=False,
)
grid.fit(X_train, y_train)

In [None]:
# Cross-validation results for the pipeline search, best-ranked candidates first.
res = pd.DataFrame(data=grid.cv_results_)
res.sort_values(by='rank_test_score')

In [None]:
# Score the searched pipeline on the held-out validation rows.
grid.score(X=X_val, y=y_val)

How can we do better?

## More ML!

In [None]:
# Base pipeline; the 'scaler' and 'regressor' steps are replaced by each grid below.
pipe = Pipeline([
    ('impute', SimpleImputer(strategy="mean")),
    ('scaler', MinMaxScaler()),
    ('regressor', Ridge())
])

# A list of grids: each model family is combined only with its own
# hyper-parameters. The tree ensembles are randomised, so random_state is
# fixed (as for the train/validation split) to make the results repeatable.
param_grid = [{'scaler': [MinMaxScaler(), RobustScaler()],
               'regressor': [RandomForestRegressor(n_estimators=100, random_state=0)],
               'regressor__min_samples_split': [5, 10],
               'regressor__max_features': ['sqrt', 0.3],
              },
              {'scaler': [MinMaxScaler(), RobustScaler()],
               'regressor': [GradientBoostingRegressor(loss='huber', random_state=0)],
               'regressor__max_depth': [2, 3],
               'regressor__n_estimators': [500, 1000],
               'regressor__max_features': ['sqrt', 0.3],
              },
              # Ridge reference point with fixed settings.
              {'scaler': [RobustScaler()],
               'regressor': [Ridge(alpha=10)],
              }
             ]

grid = GridSearchCV(pipe, param_grid, cv=5, return_train_score=False)
grid.fit(X_train, y_train)

In [None]:
# Cross-validation results for all three model families, best-ranked first.
res = pd.DataFrame(data=grid.cv_results_)
res.sort_values(by='rank_test_score')

In [None]:
# Score the winning candidate on the held-out validation rows.
grid.score(X=X_val, y=y_val)

In [None]:
# Parameter combination that achieved the best cross-validated score in the search.
grid.best_params_

## Prepare submission

In [None]:
test = pd.read_csv('data/test.csv')


def one_hot(frame):
    """One-hot encode the categorical columns of `frame`.

    MSSubClass, when present, is cast to str first so that it is encoded as a
    category as well. A frame without categorical columns is returned with the
    same columns, so this also accepts an already encoded frame.
    """
    if 'MSSubClass' in frame.columns:
        frame = frame.assign(MSSubClass=frame['MSSubClass'].astype(str))
    return pd.get_dummies(frame, dtype=float)


# Build the training features from `df` in this cell so that the test columns
# can be aligned against exactly the columns the model is fitted on.
train_features = one_hot(df).drop(columns=['SalePrice', 'Id'])

# reindex (unlike .loc, which raises KeyError on unknown labels) copes with
# categories that do not occur in the test file: dummy columns missing from the
# test set are added as all-zero, and test-only dummy columns are dropped.
test_features = one_hot(test).reindex(columns=train_features.columns, fill_value=0)
X_test = test_features.values

# Refit the chosen configuration on all labelled rows before predicting.
pipe = Pipeline([
    ('impute', SimpleImputer(strategy="median")),
    ('scaler', RobustScaler()),
    ('regressor', Ridge(alpha=10))
])
pipe.fit(train_features.values, df['SalePrice'].values)
y_pred = pipe.predict(X_test)

In [None]:
# Write the predictions to CSV, indexed by the Id column of the test file.
test_ids = pd.read_csv('data/test.csv')['Id']
submission = pd.DataFrame({'SalePrice': y_pred}, index=test_ids)
submission.to_csv('data/pred.csv')

## Tips to improve this

*