# 5. Deploy Model

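This section loads a previously trained model together with its normalization parameters and feature list, applies it to a new input CSV file, and writes the predictions (with lower and upper bounds) to a timestamped CSV file.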

## 5.1 Environment Configuration

### 5.1.1 Import Libraries

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import norm

import joblib
import os

from datetime import datetime as dt

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor

### 5.1.2 Define Utility Functions

In [2]:
def normalize_new_data(new_data, normalization_params):
    """Standardize new observations with previously stored statistics.

    Args:
        new_data: A numpy array or pandas DataFrame of feature values.
        normalization_params: Mapping whose ``'mean'`` entry is subtracted
            from ``new_data`` and whose ``'std'`` entry the result is
            divided by.

    Returns:
        The value of ``(new_data - mean) / std``.

    Raises:
        ValueError: If ``new_data`` is neither a numpy array nor a DataFrame.
    """
    # Both supported containers use the same arithmetic, so reject anything
    # else up front and share a single code path.
    if not isinstance(new_data, (np.ndarray, pd.DataFrame)):
        raise ValueError("Input data must be a numpy array or pandas DataFrame")

    centered = new_data - normalization_params['mean']
    return centered / normalization_params['std']

In [3]:
def prediction_interval(x, model, alpha=0.05):
    """Return rounded lower and upper bounds around the model's predictions.

    The half-width of the interval is the two-sided normal critical value
    for ``alpha`` multiplied by the standard deviation of the predictions
    made for ``x`` itself, so every row receives the same margin.

    Args:
        x: Feature data passed straight to ``model.predict``.
        model: Object exposing a ``predict`` method.
        alpha: Significance level; the critical value used is
            ``|norm.ppf(alpha / 2)|``.

    Returns:
        Tuple ``(lower_bound, upper_bound)``, each rounded to the nearest
        integer and cast to ``int``.
    """
    point_estimates = model.predict(x)

    # Critical value times the spread of this batch of predictions
    half_width = np.abs(norm.ppf(alpha / 2)) * np.std(point_estimates)

    # Round first, then cast, so the bounds are nearest integers rather
    # than truncated values
    lower = np.round(point_estimates - half_width).astype(int)
    upper = np.round(point_estimates + half_width).astype(int)
    return lower, upper

In [4]:
def get_predictions(x, model):
    """Produce rounded point predictions together with interval bounds.

    Args:
        x: Feature data forwarded to ``model.predict``.
        model: Object exposing a ``predict`` method.

    Returns:
        Three lists ``(predictions, lower_bound, upper_bound)``: the
        predictions rounded to the nearest integer, and the bounds returned
        by ``prediction_interval``.
    """
    # Point estimates, rounded to the nearest integer
    point_estimates = np.round(model.predict(x)).astype(int)

    # NOTE(review): alpha=0.75 corresponds to a narrow (25%) two-sided
    # interval -- confirm this significance level is intended.
    interval_low, interval_high = prediction_interval(x, model, alpha=0.75)

    return list(point_estimates), list(interval_low), list(interval_high)

## 5.2 Load Model

### 5.2.1 Import Model & Parameters

In [5]:
# Path to the model to load: the directory holding the serialized artifacts
# (model, normalization parameters, feature list). It is a relative path,
# so it resolves against the current working directory.
model_dir_path = "../models/GBR-20240218-01/"

In [6]:
# Artifact paths are built with os.path.join so they resolve correctly
# whether or not model_dir_path ends with a path separator.
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts
# from a trusted model directory.

# Load the trained model
model = joblib.load(os.path.join(model_dir_path, 'model.pkl'))

# Load the normalization parameters
normalization_params = joblib.load(os.path.join(model_dir_path, 'normalization_params.pkl'))

# Load the feature names
feature_list = joblib.load(os.path.join(model_dir_path, 'feature_list.pkl'))

## 5.3 Load New Data

In [7]:
# Ask for the location of the CSV file holding the new observations
file_path = input("Path to CSV File: ")

# Read the file and keep only the columns named in feature_list, in that order
input_df = pd.read_csv(file_path)[feature_list]

Path to CSV File:  /Users/simonaytes/Documents/GitHub/cherry-blossom/data/inputs/competition_input.csv


In [8]:
# Scale the new observations with the mean/std stored alongside the model
input_df = normalize_new_data(
    new_data=input_df,
    normalization_params=normalization_params,
)

## 5.4 Predict

### 5.4.1 Get Predictions

In [9]:
# Point predictions plus their lower and upper bounds, each as a list
predictions, lower_bound, upper_bound = get_predictions(x=input_df, model=model)



### 5.4.2 Create Prediction Dataframe

In [10]:
# Start from the location column of the input file, then attach the point
# predictions and their bounds as new columns in the final output order
predictions_df = pd.read_csv(file_path)[['location']].assign(
    prediction=predictions,
    lower=lower_bound,
    upper=upper_bound,
)[['location', 'prediction', 'lower', 'upper']]

### 5.4.3 Output Prediction Dataframe

In [11]:
# Destination directory for prediction files
output_dir = "../data/output"

# Create the directory if it is missing so that to_csv does not fail when
# the output folder has not been created yet
os.makedirs(output_dir, exist_ok=True)

# Create an output filename using the current timestamp
output_file_name = f"{output_dir}/{dt.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv"

# Output the dataframe (without the index column)
predictions_df.to_csv(output_file_name, index=False)