# Machine Learning with Pipelines
### (Incomplete)
### Train, tune, and pickle models using Sweta's cleaned dataset
**Each pipeline will:**
- Standardize data with MinMax (vs standardscaler to accomodate binary dummy variables) 
- Select features with RFECV (recursive feature elimination with cross-validation)
- Tune hyperparameters with GridSearchCV

**Additional:**
- Evaluate performance
- Pickle
- Ensemble for flask predictor tool

In [111]:
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression, HuberRegressor, BayesianRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, r2_score
import pickle

#### Pickle Code

In [2]:
# # save the model to disk
# filename = 'finalized_model.sav'
# pickle.dump(model, open(filename, 'wb'))
 
# # some time later...
 
# # load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))
# result = loaded_model.score(X_test, Y_test)
# print(result)

### Load Dummified Data

In [3]:
train = pd.read_csv('dummi data/train_final.csv')
test = pd.read_csv('dummi data/test_final.csv')
# test[test.isna().any(axis=1)]
test = test.dropna(axis=0)

In [4]:
X_train = train.drop(columns=['SalePrice', 'PID', 'Unnamed: 0'])
y_train = train['SalePrice']
X_test = test.drop(columns=['SalePrice', 'PID', 'Unnamed: 0'])
y_test = test['SalePrice']

### Feature Selection

In [57]:
# Lasso GridSearchCV
search = GridSearchCV(Lasso(random_state=0, tol=.1), 
                      {'alpha': np.arange(0.01,1000,1)},
                     cv=5,
        scoring='neg_mean_squared_error'
                     )
search.fit(X_train, y_train)
search.best_params_

{'alpha': 98.01}

In [68]:
pickle.dump(search, open('FS_LassoGridSearch.sav', 'wb'))

In [92]:
coefficients = search.best_estimator_.coef_
importance = np.abs(coefficients)
features = X_train.columns
keep_feat = np.array(features)[importance > 10]
X_train[keep_feat].columns
X_train = X_train[keep_feat]
X_test = X_test[keep_feat]

In [93]:
# RFECV and RandomForestRegressor.

In [94]:
# pickle.dump(rfecv, open('FS_RFECV.sav', 'wb'))

### OLS Pipeline
- HuberRegressor to mitigate impact of outliers
- L2 Regularization

In [95]:
# hb = HuberRegressor(alpha=1)

# pipe = make_pipeline(
#     # standardize scale
# #     MinMaxScaler(), 
#     HuberRegressor()    
#     # feature selection 
# #     RFECV(hb, cv=4, scoring='accuracy')
# )

# # hyperparameter tuning
# grid_pipe = GridSearchCV(pipe,
#         param_grid={
#             'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
#         }, cv=4, refit=True
#     )

# grid_pipe.fit(X_train, y_train)

In [96]:
# grid_pipe.best_score_

In [97]:
# hb = HuberRegressor()
# hb.fit(X_train,y_train)

# lm = LinearRegression()
# lm.fit(X_train, y_train)

In [98]:
ols = make_pipeline(
    MinMaxScaler(),
    LinearRegression()
)

ols.fit(X_train, y_train)

Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('linearregression', LinearRegression())])

In [99]:
# evaluate model
y_pred = ols.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print('mean abs err:', mae)
r2 = r2_score(y_test, y_pred)
print('r2:', r2)

mean abs err: 15613.513333333334
r2: 0.886479063191782


In [100]:
pickle.dump(ols, open('Linear.sav', 'wb'))

### Bayesian Ridge Pipeline

In [101]:
br = make_pipeline(
    MinMaxScaler(),
    BayesianRidge()
)

br.fit(X_train, y_train)

Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('bayesianridge', BayesianRidge())])

In [102]:
# evaluate model
y_pred = br.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print('mean abs err:', mae)
r2 = r2_score(y_test, y_pred)
print('r2:', r2)

mean abs err: 15502.735972678454
r2: 0.8868992723194256


In [103]:
pickle.dump(br, open('BayesianRidge.sav', 'wb'))

### Random Forest Pipeline

In [104]:
rf = make_pipeline(
    MinMaxScaler(),
    RandomForestRegressor()
)

rf.fit(X_train, y_train)

Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('randomforestregressor', RandomForestRegressor())])

In [105]:
# evaluate model
y_pred = rf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print('mean abs err:', mae)
r2 = r2_score(y_test, y_pred)
print('r2:', r2)

mean abs err: 14543.00231851852
r2: 0.8897111619717812


In [106]:
pickle.dump(rf, open('RandomForest.sav', 'wb'))

### Gradient Booster Pipeline

In [107]:
gb = make_pipeline(
    MinMaxScaler(),
    GradientBoostingRegressor()
)

gb.fit(X_train, y_train)

Pipeline(steps=[('minmaxscaler', MinMaxScaler()),
                ('gradientboostingregressor', GradientBoostingRegressor())])

In [108]:
# evaluate model
y_pred = gb.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print('mean abs err:', mae)
r2 = r2_score(y_test, y_pred)
print('r2:', r2)

mean abs err: 13230.552917928684
r2: 0.9229897622643011


In [109]:
pickle.dump(gb, open('GBooster.sav', 'wb'))

### SVM SVR Pipeline (poor performance)

In [112]:
svr = make_pipeline(
    MinMaxScaler(),
    SVR()
)

svr.fit(X_train, y_train)

Pipeline(steps=[('minmaxscaler', MinMaxScaler()), ('svr', SVR())])

In [113]:
# evaluate model
y_pred = svr.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print('mean abs err:', mae)
r2 = r2_score(y_test, y_pred)
print('r2:', r2)

mean abs err: 47645.3450582701
r2: -0.03931731190896737


In [114]:
# pickle.dump(pipe, open('SupportVector.sav', 'wb'))

### Sale Range from all Features

In [115]:
def salePrice_range(features):
    pred_list = [ols.predict(features), 
                 br.predict(features), 
                 rf.predict(features),
                 gb.predict(features)]
    low = min(pred_list)
    low_fmt = '${:,.2f}'.format(low[0])
    high = max(pred_list)
    high_fmt = '${:,.2f}'.format(high[0])
    return print(low_fmt,'-',high_fmt) 

pretend_features = X_test.sample(1, random_state=0)

salePrice_range(pretend_features)

### Ensemble (incomplete)

In [None]:
# combine predictions of weak learners ?
x_train = np.column_stack((ols_pred, br_pred, rf_pred, gb_pred))

In [None]:
# def trainStackModel(x_train, y_train, X_test, n_folds, seed):
#     cv = KFold(n_splits=N_folds, random_state=seed)
#     gbm = xgb.XGBRegressor(
#         n_estimators= 600,
#     )

### PCA Pipeline

### Clustering Pipeline