# **(ADD HERE THE NOTEBOOK NAME)**

## Objectives

* Write here your notebook objective, for example, "Fetch data from Kaggle and save as raw data", or "engineer features for modelling"

## Inputs

* Write here which data or information you need to run the notebook 

## Outputs

* Write here which files, code or artifacts you generate by the end of the notebook 

## CRISP-DM

* Modelling


---

# Change working directory

* We are assuming you will store the notebooks in a sub folder, therefore when running the notebook in the editor, you will need to change the working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [1]:
import os
current_dir = os.getcwd()
current_dir

'/workspace/milestone-project-heritage-housing-issues/jupyter_notebooks'

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chir() defines the new current directory

In [2]:
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

You set a new current directory


Confirm the new current directory

In [3]:
current_dir = os.getcwd()
current_dir

'/workspace/milestone-project-heritage-housing-issues'

# Load Data

In [4]:
import pandas as pd
df = (pd.read_csv(f"inputs/datasets/unzipped/house_prices_records.csv")
        .drop(labels=['EnclosedPorch', 'WoodDeckSF'],axis=1))
print(df.shape)
df.head()

(1460, 22)


Unnamed: 0,1stFlrSF,2ndFlrSF,BedroomAbvGr,BsmtExposure,BsmtFinSF1,BsmtFinType1,BsmtUnfSF,GarageArea,GarageFinish,GarageYrBlt,...,LotArea,LotFrontage,MasVnrArea,OpenPorchSF,OverallCond,OverallQual,TotalBsmtSF,YearBuilt,YearRemodAdd,SalePrice
0,856,854.0,3.0,No,706,GLQ,150,548,RFn,2003.0,...,8450,65.0,196.0,61,5,7,856,2003,2003,208500
1,1262,0.0,3.0,Gd,978,ALQ,284,460,RFn,1976.0,...,9600,80.0,0.0,0,8,6,1262,1976,1976,181500
2,920,866.0,3.0,Mn,486,GLQ,434,608,RFn,2001.0,...,11250,68.0,162.0,42,5,7,920,2001,2002,223500
3,961,,,No,216,ALQ,540,642,Unf,1998.0,...,9550,60.0,0.0,35,5,7,756,1915,1970,140000
4,1145,,4.0,Av,655,GLQ,490,836,RFn,2000.0,...,14260,84.0,350.0,84,5,8,1145,2000,2000,250000


### Load variables

In [5]:
arbitrary_imputation_vars = ['2ndFlrSF']
median_imputation_vars = ['BedroomAbvGr', 'LotFrontage','GarageYrBlt','MasVnrArea']
most_frequent_vars = ['BsmtFinType1']

In [6]:
categorical_encoding_vars =df.select_dtypes(include=['object']).columns.to_list()
categorical_encoding_vars

['BsmtExposure', 'BsmtFinType1', 'GarageFinish', 'KitchenQual']

In [7]:
log_transformation_vars = ['1stFlrSF', 'LotArea']
yeojohnson_vars = ['GrLivArea']
boxcox_vars =[]

In [8]:
smart_correlation_features = df.columns.to_list() 
smart_correlation_features.pop()
smart_correlation_features

['1stFlrSF',
 '2ndFlrSF',
 'BedroomAbvGr',
 'BsmtExposure',
 'BsmtFinSF1',
 'BsmtFinType1',
 'BsmtUnfSF',
 'GarageArea',
 'GarageFinish',
 'GarageYrBlt',
 'GrLivArea',
 'KitchenQual',
 'LotArea',
 'LotFrontage',
 'MasVnrArea',
 'OpenPorchSF',
 'OverallCond',
 'OverallQual',
 'TotalBsmtSF',
 'YearBuilt',
 'YearRemodAdd']

* We Will handle Data cleaning for 'GarageFinish' outside of the pipeline

In [9]:
nan_GarageFinish_vals=df[df['GarageFinish'].isna()]
nan_GarageFinish_index = list(nan_GarageFinish_vals.index.values)

df['GarageFinish'] = df['GarageFinish'].fillna(0)

for x in nan_GarageFinish_index:
    garage_area_value = df.iloc[x,8]
    if garage_area_value == 0:
        df.at[x,'GarageFinish'] = 'None'
    else:
        df.at[x,'GarageFinish'] = 'Unf'

df.filter(['GarageFinish']).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   GarageFinish  1460 non-null   object
dtypes: object(1)
memory usage: 11.5+ KB


# Create ML Pipelines

1. Data Cleaning and Feature Engineering

In [50]:
from sklearn.pipeline import Pipeline

### Data Cleaning
from feature_engine.imputation import MeanMedianImputer
from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.imputation import CategoricalImputer

### Feature Engineering
from feature_engine.selection import SmartCorrelatedSelection
from feature_engine.encoding import OrdinalEncoder
from feature_engine import transformation as vt

def PipelineDataCleaningAndFeatureEngineering():
  pipeline_base = Pipeline([
    ("ArbitraryImputer", ArbitraryNumberImputer(arbitrary_number=0,
                                            variables=arbitrary_imputation_vars)),
                                
    ("MedianImputation", MeanMedianImputer(imputation_method='median',
                                            variables=median_imputation_vars)),
    
    ("CategoricalImputer", CategoricalImputer(imputation_method='frequent', 
                                        variables=most_frequent_vars)),

    ("OrdinalCategoricalEncoder",OrdinalEncoder(encoding_method='arbitrary', 
                                                variables = categorical_encoding_vars)),
    
    ("LogTransformer", vt.YeoJohnsonTransformer(variables = yeojohnson_vars)),

    ("YeoJohnsonTransformer", vt.YeoJohnsonTransformer(variables = yeojohnson_vars)),

    # ("BoxCoxTransformer", vt.BoxCoxTransformer(variables = boxcox_vars)),
      
    ("SmartCorrelatedSelection",SmartCorrelatedSelection(variables=smart_correlation_features, 
                                                          method="pearson", threshold=0.6, 
                                                          selection_method="variance") ),
       
  ])

  return pipeline_base

PipelineDataCleaningAndFeatureEngineering()

Pipeline(steps=[('ArbitraryImputer',
                 ArbitraryNumberImputer(arbitrary_number=0,
                                        variables=['2ndFlrSF'])),
                ('MedianImputation',
                 MeanMedianImputer(variables=['BedroomAbvGr', 'LotFrontage',
                                              'GarageYrBlt', 'MasVnrArea'])),
                ('CategoricalImputer',
                 CategoricalImputer(imputation_method='frequent',
                                    variables=['BsmtFinType1'])),
                ('OrdinalCategoricalEncoder',
                 OrdinalEnco...
                 SmartCorrelatedSelection(selection_method='variance',
                                          threshold=0.6,
                                          variables=['1stFlrSF', '2ndFlrSF',
                                                     'BedroomAbvGr',
                                                     'BsmtExposure',
                                                     

# ML Pipeline for Modelling and Hyperparameter Optimization

* Our next step is choosing the optimal algorithm for our ML model and the most effective hyperparameter for our selected algorithm
* We will do so with the below function and custom class.
    * Below code taken from CI lesson: *Scikit-Learn Unit 6: Cross Validation Search Part 2*

In [52]:
import warnings
warnings.filterwarnings('ignore')
### Feat Scaling
from sklearn.preprocessing import StandardScaler

### Feat Selection
from sklearn.feature_selection import SelectFromModel

### ML algorithms 
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor

def PipelineRegressor(model):
  pipeline_base = Pipeline([
     
       ("scaler",StandardScaler() ),
       ("feat_selection",SelectFromModel(model) ),
       ("model",model ),
  ])

  return pipeline_base

In [58]:
from sklearn.model_selection import GridSearchCV

class HyperparameterOptimizationSearch:

    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv, n_jobs, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print(f"\nRunning GridSearchCV for {key} \n")

            model =  PipelineClf(self.models[key])
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs, verbose=verbose, scoring=scoring, )
            gs.fit(X,y)
            self.grid_searches[key] = gs    

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params,**d})

        rows = []
        for k in self.grid_searches:
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            for i in range(self.grid_searches[k].cv):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]        
                scores.append(r.reshape(len(params),1))

            all_scores = np.hstack(scores)
            for p, s in zip(params,all_scores):
                rows.append((row(k, s, p)))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]
        return df[columns], self.grid_searches

# Split Train & Test Set

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train, y_test = train_test_split(
                                    df.drop(['SalePrice'],axis=1),
                                    df['SalePrice'],
                                    test_size = 0.2,
                                    random_state = 0,
                                    )

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(1168, 21) (1168,) (292, 21) (292,)


* We apply our first pipeline to our train and test sets

In [14]:
pipeline_data_cleaning_feat_eng = PipelineDataCleaningAndFeatureEngineering()
X_train = pipeline_data_cleaning_feat_eng.fit_transform(X_train)
X_test = pipeline_data_cleaning_feat_eng.transform(X_test)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(1168, 17) (1168,) (292, 17) (292,)


# Scikit Learn Cross Validation Search

* We first search for the most suitable algorithm using tandard hyper parameters

In [34]:
models_quick_search = {
    # "LogisticRegression": LogisticRegression(random_state=0),
    "LinearRegression": LinearRegression(),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0), 
    "XGBRegressor": XGBRegressor(random_state=0),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=0),
    "RandomForestRegressor": RandomForestRegressor(random_state=0),
    "ExtraTreesRegressor": ExtraTreesRegressor(random_state=0),
    "AdaBoostRegressor": AdaBoostRegressor(random_state=0),
}

params_quick_search = {
    # "LogisticRegression":{},
    "LinearRegression": {},
    "GradientBoostingRegressor": {},
    "XGBRegressor":{},
    "DecisionTreeRegressor":{},
    "RandomForestRegressor":{},
    "GradientBoostingClassifier":{},
    "ExtraTreesRegressor":{},
    "AdaBoostRegressor":{},
}

* At this stage we made an attempt at finding the best algorithm for out model, and set our cross validation value, initially, to cv=2, to allow LogisticRegression to run.
* Logistic regression scored worse that other algorithms, so we have commented it out above, in order to allow a more extensive search wit a cross validation value of cv=5

In [60]:
from sklearn.metrics import make_scorer, r2_score
search = HyperparameterOptimizationSearch(models=models_quick_search, params=params_quick_search)
search.fit(X_train, y_train, 
            scoring=make_scorer(r2_score), n_jobs=-1, cv=5)


Running GridSearchCV for LinearRegression 

Fitting 5 folds for each of 1 candidates, totalling 5 fits


Traceback (most recent call last):
  File "/workspace/.pip-modules/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/workspace/.pip-modules/lib/python3.8/site-packages/sklearn/pipeline.py", line 341, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/workspace/.pip-modules/lib/python3.8/site-packages/sklearn/pipeline.py", line 303, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/workspace/.pip-modules/lib/python3.8/site-packages/joblib/memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "/workspace/.pip-modules/lib/python3.8/site-packages/sklearn/pipeline.py", line 754, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/workspace/.pip-modules/lib/python3.8/site-packages/sklearn/base.py", line 702, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "/workspace/.p

NotFittedError: All estimators failed to fit

* We add the results to a DataFrame and assess.

In [59]:
import numpy as np
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
print(grid_search_summary.shape)
grid_search_summary 

(729, 11)


Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,model__learning_rate,model__max_depth,model__min_impurity_decrease,model__min_samples_split,model__min_weight_fraction_leaf,model__n_estimators
279,GradientBoostingRegressor,0.688071,0.788019,0.822261,0.05064,0.01,3,3,3,0.0,400
252,GradientBoostingRegressor,0.688071,0.788019,0.822261,0.05064,0.01,3,0,3,0.0,400
306,GradientBoostingRegressor,0.688071,0.788019,0.822261,0.05064,0.01,3,10,3,0.0,400
270,GradientBoostingRegressor,0.687524,0.78782,0.822032,0.05083,0.01,3,3,2,0.0,400
297,GradientBoostingRegressor,0.687524,0.78782,0.822032,0.05083,0.01,3,10,2,0.0,400
...,...,...,...,...,...,...,...,...,...,...,...
564,GradientBoostingRegressor,-0.033214,0.067398,0.141992,0.071831,0.001,3,10,5,0.5,400
699,GradientBoostingRegressor,-0.033214,0.067398,0.141992,0.071831,0.001,,3,5,0.5,400
555,GradientBoostingRegressor,-0.033214,0.067398,0.141992,0.071831,0.001,3,10,3,0.5,400
546,GradientBoostingRegressor,-0.033214,0.067398,0.141992,0.071831,0.001,3,10,2,0.5,400


* We can see that GradientBoostingRegressor provides us with the best mean score.
* The R2 score of 0.77 is sufficient to meet the performance goal of our model as per our client's business case.
* Next we will try find the best hyperparameters for our model and try to fine tune it so that we can improve it's score on our TrainSet

In [44]:
models_search = {
    "GradientBoostingRegressor":GradientBoostingRegressor(random_state=0),
}

params_search = {
    "GradientBoostingRegressor":{
        'model__n_estimators': [400,450,500],
        'model__learning_rate': [1e-1,1e-2,1e-3], 
        'model__max_depth': [3,10,None],
        'model__min_samples_split': [2,3,5],
        'model__min_weight_fraction_leaf':[0.0, 0.3, 0.5],
        'model__min_impurity_decrease': [0, 3, 10]
    }
}

* We perform another Cross Validation search using only "GradientBoostingRegressor" as our model and a set of hyperparameters to apply, in order to find the best hyperparameter permutation for our model

In [45]:
warnings.filterwarnings('ignore')
search = HyperparameterOptimizationSearch(models=models_search, params=params_search)
search.fit(X_train, y_train, scoring = 'r2', n_jobs=-1, cv=5)


Running GridSearchCV for GradientBoostingRegressor 

Fitting 5 folds for each of 729 candidates, totalling 3645 fits


* We add the results to a DataFrame and assess

In [54]:
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary

Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,model__learning_rate,model__max_depth,model__min_impurity_decrease,model__min_samples_split,model__min_weight_fraction_leaf,model__n_estimators
279,GradientBoostingRegressor,0.688071,0.788019,0.822261,0.05064,0.01,3,3,3,0.0,400
252,GradientBoostingRegressor,0.688071,0.788019,0.822261,0.05064,0.01,3,0,3,0.0,400
306,GradientBoostingRegressor,0.688071,0.788019,0.822261,0.05064,0.01,3,10,3,0.0,400
270,GradientBoostingRegressor,0.687524,0.78782,0.822032,0.05083,0.01,3,3,2,0.0,400
297,GradientBoostingRegressor,0.687524,0.78782,0.822032,0.05083,0.01,3,10,2,0.0,400
...,...,...,...,...,...,...,...,...,...,...,...
564,GradientBoostingRegressor,-0.033214,0.067398,0.141992,0.071831,0.001,3,10,5,0.5,400
699,GradientBoostingRegressor,-0.033214,0.067398,0.141992,0.071831,0.001,,3,5,0.5,400
555,GradientBoostingRegressor,-0.033214,0.067398,0.141992,0.071831,0.001,3,10,3,0.5,400
546,GradientBoostingRegressor,-0.033214,0.067398,0.141992,0.071831,0.001,3,10,2,0.5,400


* We can see that we have an improvement in the mean R2 score, bringing it up to -----insert score----
* We will use these hyperparameters for our model

* We print our best model below

In [55]:
best_model = grid_search_summary.iloc[0,0]
best_model

'GradientBoostingRegressor'

* We print our best combination of hyperparameters for our algorithm

In [56]:
best_hyperparams = grid_search_pipelines[best_model].best_params_
best_hyperparams

{'model__learning_rate': 0.01,
 'model__max_depth': 3,
 'model__min_impurity_decrease': 0,
 'model__min_samples_split': 3,
 'model__min_weight_fraction_leaf': 0.0,
 'model__n_estimators': 400}

* We define our best pipeline at this stage

In [57]:
pipeline_clf = grid_search_pipelines[best_model].best_estimator_
pipeline_clf

Pipeline(steps=[('scaler', StandardScaler()),
                ('feat_selection',
                 SelectFromModel(estimator=GradientBoostingRegressor(random_state=0))),
                ('model',
                 GradientBoostingRegressor(learning_rate=0.01,
                                           min_impurity_decrease=0,
                                           min_samples_split=3,
                                           n_estimators=400, random_state=0))])

---

# Push files to Repo

* In case you don't need to push files to Repo, you may replace this section for "Conclusions and Next Steps" and state your conclusions and next steps.

In [None]:
import os
try:
  # create here your folder
  # os.makedirs(name='')
except Exception as e:
  print(e)
