# Import Libraries

In [1]:
!pip install hydra-core





In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pycaret.regression import *

import mlflow
import mlflow.sklearn
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import os
import hydra
from omegaconf import DictConfig
from hydra.experimental import compose, initialize_config_dir

# Data Preprocessing
1. Drop Duplicates
2. Change month data type
3. Downsample the data, as Jupyter crashes during model tuning on the full dataset
4. pycaret

### Pycaret Processing
1. target is resale price
2. train size of 0.8
3. transformation and normalize set to true to normalize data
4. remove outliers
5. 10-fold cross-validation (`fold: 10` in the config)
6. ignore features not required
- block number can be repeated throughout different towns and it will not indicate a higher or lower price
- latitude and longitude not required as users will not always know the exact coordinates of their house
7. bin postal code since it should be categorical and to avoid too many unique values
8. Explicitly set month as a date feature

# Train Model
1. Drop models that will take too long/ crash jupyter notebook
2. Tune Model
3. Evaluate Model
4. Save Pipeline
5. Register Model

In [None]:
def process_train(root_directory: str = "C:/Github/MLOPs_WebApp/MLOps_Assignment") -> None:
    """Process the raw data, then train, evaluate, save and register a model.

    Steps performed, in order:
        1. Load the Hydra configuration ``process_RJ`` from
           ``<root_directory>/config/process``.
        2. Read the raw CSV, drop duplicate rows, parse ``month`` as a date and
           sample a fraction of the rows.
        3. Run the PyCaret ``setup`` and write the transformed dataset to
           ``hdb_processed.csv`` in the configured processed-data folder.
        4. Compare models, tune the best one and predict on the test set.
        5. Finalize the tuned model and save it as ``hdb_pipeline_final3``.
        6. Log parameters, metrics and the tuned model to MLflow under the
           ``Final_HDB`` experiment.

    Args:
        root_directory: Root folder of the MLOps project; the Hydra config is
            read from its ``config/process`` sub-folder. Defaults to the path
            that was previously hard-coded.

    Returns:
        None. Results are printed, written to disk and logged to MLflow.
    """
    # Construct the absolute path to the configuration directory
    config_dir = os.path.normpath(os.path.join(root_directory, "config", "process"))

    # Use Hydra's initializer as a context manager so its global state is
    # restored afterwards; otherwise re-running this cell in the same kernel
    # fails because Hydra is already initialized.
    with initialize_config_dir(config_dir=config_dir):
        config = compose(config_name="process_RJ")
    print('All parameters in process_RJ.yaml: ' + str(config))

    # Load raw data
    hdb = pd.read_csv(config.data.raw)

    # Remove duplicates and convert 'month' to datetime. `.assign` builds a new
    # frame instead of writing into the result of drop_duplicates(), which
    # avoids pandas' SettingWithCopyWarning.
    hdb_nodup = hdb.drop_duplicates().assign(
        month=lambda frame: pd.to_datetime(frame['month'], format='%Y-%m')
    )

    # Sample the data based on the provided configuration
    hdb_sampled = hdb_nodup.sample(frac=config.setup.sample, random_state=123)

    # Perform PyCaret setup
    hdbsetup = setup(
        data=hdb_sampled,
        target=config.setup.target,
        train_size=config.setup.trainsize,
        transformation=config.setup.transform,
        normalize=config.setup.norm,
        session_id=config.setup.session,
        log_experiment=config.setup.logexp,
        experiment_name=config.setup.expname,
        remove_outliers=config.setup.rmoutlier,
        fold=config.setup.fold,
        ignore_features=config.setup.ignore,
        bin_numeric_features=[config.setup.bin],
        date_features=[config.setup.date]
    )

    # Print features information
    print(f'Numeric features: {hdbsetup._fxs["Numeric"]}')
    print(f'Categorical features: {hdbsetup._fxs["Categorical"]}')
    print(f'Date features: {hdbsetup._fxs["Date"]}')

    # Get transformed dataset
    df = hdbsetup.get_config('dataset_transformed')

    # Save processed data (create the target folder first so a fresh checkout
    # does not fail on a missing directory)
    file_name = 'hdb_processed.csv'
    os.makedirs(config.data.ppath, exist_ok=True)
    full_file = os.path.join(config.data.ppath, file_name)
    df.to_csv(full_file, index=False)

    print(f'Data was processed and written at {full_file}.')

    # Compare all available models and keep the best one.
    # NOTE(review): no models are excluded here even though the notebook notes
    # mention dropping slow models -- confirm whether an exclusion list is wanted.
    best = compare_models()
    print(best)

    # Tune the best model; with return_tuner=True the result is indexed and its
    # first element is taken as the tuned model.
    tuned_best = tune_model(best, return_tuner=True)
    best_tuned_model = tuned_best[0]
    print(best_tuned_model)

    # predict on test set
    hdb_pred = predict_model(best_tuned_model)
    print(hdb_pred)

    plot_model(best_tuned_model, plot= 'residuals')
    evaluate_model(best_tuned_model)

    # SAVE PIPELINE
    # finalize the model
    finalized = finalize_model(best_tuned_model)

    # save model to disk
    save_model(finalized, 'hdb_pipeline_final3')
    # reload it to confirm the saved pipeline can be read back
    load_model('hdb_pipeline_final3')

    # MODEL REGISTRATION
    # Calculate evaluation metrics; the target column name comes from the
    # config so it cannot drift from what was passed to setup().
    y_true = hdb_pred[config.setup.target]
    y_pred = hdb_pred['prediction_label']
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    # RMSE is derived from MSE directly: the `squared=False` argument of
    # mean_squared_error is deprecated and removed in newer scikit-learn.
    rmse = float(np.sqrt(mse))
    r2 = r2_score(y_true, y_pred)

    # Close any run that is still active before starting our own.
    mlflow.end_run()
    # The experiment must be selected BEFORE the run is started; calling
    # set_experiment inside an active run does not move that run.
    mlflow.set_experiment('Final_HDB')
    # Register the pipeline with MLflow
    with mlflow.start_run() as run:
        # Log the experiment name and run ID
        mlflow.log_param('experiment_name', 'Final_HDB')
        mlflow.log_param('run_id', run.info.run_id)

        # Log the setup parameters actually used (read from the config rather
        # than hard-coded copies that could go stale).
        mlflow.log_param('train_size', config.setup.trainsize)
        mlflow.log_param('transformation', config.setup.transform)
        mlflow.log_param('normalize', config.setup.norm)
        mlflow.log_param('remove_outliers', config.setup.rmoutlier)
        mlflow.log_param('fold', config.setup.fold)

        # Log metrics
        mlflow.log_metric('MAE', mae)
        mlflow.log_metric('MSE', mse)
        mlflow.log_metric('RMSE', rmse)
        mlflow.log_metric('R2', r2)

        # Log and register the tuned model using mlflow.sklearn.
        # NOTE(review): this registers `best_tuned_model`, not the `finalized`
        # object saved to disk above -- confirm which one consumers expect.
        mlflow.sklearn.log_model(sk_model=best_tuned_model, registered_model_name='hdb_LIGHTGBMmodel', artifact_path='sk_model')


# Execute the complete processing, training, and registration workflow.
process_train()

All parameters in process_RJ.yaml: {'data': {'raw': '../../data/raw/01_hdb_resale_transactions.csv', 'ppath': '../../data/processed/'}, 'setup': {'target': 'resale_price', 'trainsize': 0.8, 'transform': True, 'norm': True, 'session': 123, 'logexp': True, 'expname': 'hdb_experiment', 'rmoutlier': True, 'fold': 10, 'date': 'month', 'bin': 'postal_code', 'sample': 0.6, 'ignore': ['block', 'latitude', 'longitude']}}


Unnamed: 0,Description,Value
0,Session id,123
1,Target,resale_price
2,Target type,Regression
3,Original data shape,"(116326, 15)"
4,Transformed data shape,"(111673, 56)"
5,Transformed train set shape,"(88407, 56)"
6,Transformed test set shape,"(23266, 56)"
7,Ignore features,3
8,Numeric features,5
9,Date features,1


Numeric features: ['postal_code', 'floor_area_sqm', 'lease_commence_date', 'cbd_dist', 'min_dist_mrt']
Categorical features: ['street_name', 'town', 'flat_type', 'storey_range', 'flat_model']
Date features: ['month']
Data was processed and written at ../../data/processed/hdb_processed.csv.


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,24108.257,1136455767.6581,33706.546,0.9574,0.0675,0.0515,2.464
et,Extra Trees Regressor,22418.144,1138768644.1415,33717.7901,0.9573,0.065,0.0472,26.658
rf,Random Forest Regressor,22605.039,1164175661.5401,34107.5199,0.9564,0.0648,0.0473,29.413
dt,Decision Tree Regressor,30738.6329,2136283409.514,46207.7311,0.9199,0.089,0.0644,1.776
knn,K Neighbors Regressor,36241.6773,2869922816.0,53565.9691,0.8924,0.1003,0.0741,4.648
gbr,Gradient Boosting Regressor,38690.1639,2926311666.2748,54092.3726,0.8903,0.1066,0.0823,7.367
lr,Linear Regression,51391.7864,4433394257.7041,66582.6551,0.8338,0.1634,0.1156,6.403
ridge,Ridge Regression,54453.3115,5041200268.2309,70999.5081,0.811,0.165,0.1218,2.31
br,Bayesian Ridge,54452.5293,5041269156.4593,70999.9932,0.811,0.165,0.1218,2.57
lasso,Lasso Regression,54453.2998,5041205356.5423,70999.5439,0.811,0.165,0.1218,6.299
