# Predict Sale Price 

## Objectives

* Fit and tune a regression pipeline that predicts the sale price (`SalePrice`) of a house, comparing algorithms and hyperparameters with a cross-validated grid search.

## Inputs

* outputs/datasets/collection/house_prices_records.csv

## Outputs

* Write here which files, code or artefacts you generate by the end of the notebook 

## Additional Comments

* In case you have any additional comments that don't fit in the previous bullets, please state them here. 


---

# Change working directory

* We are assuming you will store the notebooks in a subfolder, therefore when running the notebook in the editor, you will need to change the working directory

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [None]:
import os
# Capture the directory the notebook is currently running from; the bare name
# on the last line makes the notebook display it.
current_dir = os.getcwd()
current_dir

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [None]:
# Make the parent of the previously captured directory the working directory.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

Confirm the new current directory

In [None]:
# Re-read the working directory after the change and display it for confirmation.
current_dir = os.getcwd()
current_dir

# Load Data 

In [None]:
import numpy as np
import pandas as pd
# Read the collected house price records into a DataFrame.
df = pd.read_csv("outputs/datasets/collection/house_prices_records.csv")

# Report the dataset size, then display the first three rows.
print(df.shape)
df.head(3)

---

## ML Pipeline with all Data

### Data Cleaning and Feature Engineering 

In [None]:
### Data Cleaning

from sklearn.pipeline import Pipeline

### Feature Engineering
from feature_engine.selection import SmartCorrelatedSelection
from feature_engine.encoding import OrdinalEncoder

### Feature Scaling


### Feature Selection 


### ML algorithms


#### TODO: go through the feature engineering notebook and fit the model as per the selected features.


def PipelineOptimization(model):
    """Build the cleaning / feature-engineering pipeline that ends in ``model``.

    Parameters
    ----------
    model : estimator
        Unfitted estimator appended as the final step under the name
        ``"model"``, so hyperparameter grids can address it with the
        ``model__<param>`` prefix.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.

    Notes
    -----
    Every ``variables=['']`` below is still a template placeholder and must be
    replaced with real column names before the pipeline can be fitted.
    """
    # Imported locally so the function does not depend on names that the
    # surrounding notebook cell never imports.
    from feature_engine import transformation as vt
    from feature_engine.imputation import CategoricalImputer, MeanMedianImputer
    from feature_engine.outliers import Winsorizer

    pipeline_base = Pipeline([

        ### Data Cleaning
        # NOTE(review): this step is named "ArbitraryImputer" but is an
        # OrdinalEncoder - confirm which transformer was intended.
        ("ArbitraryImputer", OrdinalEncoder(encoding_method='arbitrary',
                                            variables=[''])),  # TODO: set columns

        # NOTE(review): this step is named "CategoricalEncoder" but is a
        # CategoricalImputer - confirm the name/transformer pairing.
        ("CategoricalEncoder", CategoricalImputer(imputation_method='missing',
                                                  fill_value='Unf',
                                                  variables=[''])),  # TODO: set columns

        ("MeanMedianImputer", MeanMedianImputer(imputation_method='median',
                                                variables=[''])),  # TODO: set columns

        ### Feature Engineering
        ("OrdinalCategoricalEncoder",
         OrdinalEncoder(encoding_method='arbitrary',
                        variables=[''])),  # TODO: set columns

        ("LogTransformer", vt.LogTransformer(
            variables=[''])),  # TODO: set columns

        ("PowerTransformer", vt.PowerTransformer(
            variables=[''])),  # TODO: set columns

        ("YeoJohnsonTransformer", vt.YeoJohnsonTransformer(
            variables=[''])),  # TODO: set columns

        ("Winsorizer", Winsorizer(capping_method='iqr', tail='both', fold=1.5,
                                  variables=[''])),  # TODO: set columns

        # The original passed empty strings for `method` and
        # `selection_method`, which are not valid options.
        # NOTE(review): "spearman" / "variance" are assumed choices - confirm
        # against the feature engineering notebook.
        ("SmartCorrelatedSelection",
         SmartCorrelatedSelection(variables=None, method="spearman",
                                  threshold=0.6, selection_method="variance")),

        ### ML algorithm
        # Without this step the `model` argument was ignored and the pipeline
        # had no final estimator to fit or score.
        ("model", model),
    ])

    return pipeline_base

# Build the pipeline once to check that every step can be constructed.
# PipelineOptimization requires an estimator; LinearRegression is only a
# stand-in here (TODO: swap in the estimator you actually want to inspect).
from sklearn.linear_model import LinearRegression
PipelineOptimization(model=LinearRegression())

## ML Pipeline for Modelling and Hyperparameter Optimization 

Custom Class for Hyperparameter Optimization

In [None]:
from sklearn.model_selection import GridSearchCV

class HyperparameterOptimizationSearch:
    """Run one GridSearchCV per candidate estimator and tabulate the scores.

    Parameters
    ----------
    models : dict
        Maps a label to an unfitted estimator.
    params : dict
        Maps the same labels to the parameter grid passed to GridSearchCV.
    """

    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        # label -> fitted GridSearchCV, filled in by fit()
        self.grid_searches = {}

    def fit(self, X, y, cv, n_jobs, verbose=1, scoring=None, refit=False):
        """Fit a grid search for every registered estimator.

        Each estimator is wrapped by ``PipelineOptimization`` and searched
        over its grid from ``params``; the fitted search is stored in
        ``self.grid_searches`` under the estimator's label.

        ``refit`` is kept in the signature for backward compatibility but is
        not forwarded, so GridSearchCV's own ``refit`` default applies.
        """
        for key in self.keys:
            print(f"\nRunning GridSearchCV for {key} \n")

            # The pipeline builder defined in this notebook is
            # PipelineOptimization (the original called an undefined name).
            model = PipelineOptimization(self.models[key])
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Summarise the per-split test scores of every fitted search.

        Parameters
        ----------
        sort_by : str
            Column used to sort the summary in descending order.

        Returns
        -------
        tuple
            ``(summary, grid_searches)``: a DataFrame with one row per
            estimator / parameter combination (columns ``estimator``,
            ``min_score``, ``mean_score``, ``max_score``, ``std_score``
            followed by the parameter columns), and the dict of fitted
            searches.

        Raises
        ------
        ValueError
            If no search has been fitted yet.
        """
        if not self.grid_searches:
            raise ValueError(
                "No grid searches available; call fit() before score_summary()."
            )

        def row(key, scores, params):
            stats = {
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': np.mean(scores),
                'std_score': np.std(scores),
            }
            return pd.Series({**params, **stats})

        rows = []
        for key, gs in self.grid_searches.items():
            results = gs.cv_results_
            params = results['params']
            # n_splits_ is the fitted number of folds, so this also works when
            # `cv` was given as a splitter object rather than an integer.
            # Result: one row per parameter combination, one column per split.
            all_scores = np.column_stack([
                results[f"split{i}_test_score"] for i in range(gs.n_splits_)
            ])
            rows.extend(row(key, s, p) for p, s in zip(params, all_scores))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]
        return df[columns], self.grid_searches


### Split Train and Test Set 

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; SalePrice is the target and every
# other column is a feature. The fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['SalePrice']),
    df['SalePrice'],
    test_size=0.2,
    random_state=0,
)

# Show the shapes of the four resulting partitions.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)


## Grid Search CV - Sklearn

We will use the default hyperparameters to find the most suitable algorithm

In [None]:
# The search is scored with r2 on a sale price target, so the candidates must
# be regressors (the original listed classifiers and imported none of them).
from sklearn.ensemble import (
    AdaBoostRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

# label -> unfitted estimator; LinearRegression takes no random_state.
models_quick_search = {
    "LinearRegression": LinearRegression(),
    "XGBRegressor": XGBRegressor(random_state=0),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=0),
    "RandomForestRegressor": RandomForestRegressor(random_state=0),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=0),
    "ExtraTreesRegressor": ExtraTreesRegressor(random_state=0),
    "AdaBoostRegressor": AdaBoostRegressor(random_state=0),
}

# Empty grids: each estimator is evaluated once with its default
# hyperparameters. The keys must match models_quick_search.
params_quick_search = {
    "LinearRegression": {},
    "XGBRegressor": {},
    "DecisionTreeRegressor": {},
    "RandomForestRegressor": {},
    "GradientBoostingRegressor": {},
    "ExtraTreesRegressor": {},
    "AdaBoostRegressor": {},
}


### Quick GridSearch CV - Regression

In [None]:
from sklearn.metrics import make_scorer, recall_score
# Run the quick search: 5-fold cross-validation scored with r2, using all
# available cores.
search = HyperparameterOptimizationSearch(
    models=models_quick_search,
    params=params_quick_search,
)
search.fit(X_train, y_train, cv=5, n_jobs=-1, scoring='r2')

Check results 

In [None]:

# Collect the score table (sorted by mean score) and the fitted searches,
# then display the table.
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary 

### Do extensive search on most suitable algorithm to find best hyperparameter configuration

Define model and parameters, for Extensive Search

In [None]:
# TODO: replace ExtraTreesRegressor and the grid values below with the
# algorithm that ranked best in the quick search and the ranges you want to
# explore. The original cell was an unfilled template and did not parse.
from sklearn.ensemble import ExtraTreesRegressor

models_search = {
    "ExtraTreesRegressor": ExtraTreesRegressor(random_state=0),
}

# The "model__" prefix addresses the pipeline step that holds the estimator.
# The key must match the one used in models_search.
params_search = {
    "ExtraTreesRegressor": {
        'model__n_estimators': [100, 300],
        'model__max_depth': [3, 10, None],
    }
}


Extensive GridSearch CV 

In [None]:
from sklearn.metrics import make_scorer, recall_score
# Run the extensive search over the dictionaries defined for it; the original
# cell re-ran the quick-search dictionaries by mistake.
search = HyperparameterOptimizationSearch(models=models_search, params=params_search)
search.fit(X_train, y_train,
           scoring='r2',
           n_jobs=-1, cv=5)

Check results 

In [None]:

# Collect the score table (sorted by mean score) and the fitted searches from
# the latest search, then display the table.
grid_search_summary, grid_search_pipelines = search.score_summary(sort_by='mean_score')
grid_search_summary 


Check best model 

In [None]:

# The summary is sorted best-first and 'estimator' is its first column, so the
# top row's estimator label names the winning model.
best_model = grid_search_summary['estimator'].iloc[0]
best_model

Parameters for best model

In [None]:
# Look up the fitted search of the winning label and read its best parameter set.
best_parameters = grid_search_pipelines[best_model].best_params_
best_parameters

Define the best regressor

In [None]:
# Take the best estimator found by the winning search and display it.
regressor_pipeline = grid_search_pipelines[best_model].best_estimator_
regressor_pipeline

### Assess Feature Importance 

---

NOTE

* You may add as many sections as you want, as long as they support your project workflow.
* All notebook cells should run top-down: never create a workflow in which you have to go back to an earlier cell to run a task, such as refreshing the content of a variable.

---

# Push files to Repo

* If you do not need to push files to Repo, you may replace this section with "Conclusions and Next Steps" and state your conclusions and next steps.

In [None]:
import os
try:
    # create here your folder
    # os.makedirs(name='')
    # `pass` keeps the block valid Python while the line above stays
    # commented out; a try body holding only comments is a syntax error.
    pass
except OSError as e:
    # os.makedirs reports failures (e.g. the folder already exists) as OSError.
    print(e)
