## Build Model

Libraries

In [2]:
import numpy as np
import pandas as pd
import sklearn
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import make_scorer
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from typing import Union, List


Global Variables

In [3]:
#path variables
path_to_file = '../data/processed_df.parquet' 
model_store = '../models/model.joblib'    
model_encoder_1 = '../models/encoder.joblib'    
model_scaler_1 = '../models/scaler.joblib'    

target = 'SalePrice'
feature_list = ['Id', 'LotArea', 'YearBuilt', 'BsmtFinSF1', 'BedroomAbvGr',
                 'KitchenAbvGr', 'GarageArea',
                 '1stFlrSF', 'MSZoning', 'Heating']

# identifying and splitting features into continuous and categorical
numeric_features = ['Id', 'LotArea', 'YearBuilt',
                 'BsmtFinSF1', 'BedroomAbvGr',
                    'KitchenAbvGr', 'GarageArea','1stFlrSF']

categorical_features = ['MSZoning', 'Heating']

df = pd.read_parquet(path_to_file)




Processing functions

In [4]:
def scale_numeric(df):
  scaler = StandardScaler()
  scaler.fit(df[numeric_features])
  scaled_data = scaler.transform(df[numeric_features])
  scaled_df = pd.DataFrame(data=scaled_data, columns=numeric_features)
  joblib.dump(scaler, model_scaler_1)
  return scaled_df


def encode_categorical(df):
  encoder = OrdinalEncoder()
  encoder.fit(df[categorical_features])
  encoded_data = encoder.transform(df[categorical_features])
  encoded_df = pd.DataFrame(data=encoded_data, columns=categorical_features)
  joblib.dump(encoder, model_encoder_1)
  return encoded_df


# This function holds the target variable if it is trained
# data in a bid to reunite it with its features
def hold_on_target(df: pd.DataFrame) -> Union[pd.DataFrame, None]:
    if target in df.columns:
        held_target = df[target]
        held_df = pd.DataFrame(held_target)
        return held_df
    else:
        return None


# In a training data, this function recombines the scaled
# and encoded data with the target for the purpose of training
def comb_scal_enco(*dfs: List[pd.DataFrame]) -> pd.DataFrame:
    if len(dfs) == 3:
        merged_coders = dfs[0].join(dfs[1]).join(dfs[2])
    elif len(dfs) == 2:
        merged_coders = dfs[0].join(dfs[1])
    else:
        raise ValueError("The function requires 2 or 3 DataFrames")
    return merged_coders


def preprocessing_step(df):
   #scaling
  output_scale = scale_numeric(df)
  #encode cat
  output_encode = encode_categorical(df)
  #holding taget in place
  output_hold = hold_on_target(df)
  #Feature engineering by combining
  preprocessed_output = comb_scal_enco(output_scale, output_encode, output_hold)
  return preprocessed_output


Model Building

In [5]:
def training_data(preprocessed_df):
    # Splitting the data
    X, y = preprocessed_df.drop([target], axis=1), preprocessed_df[target]
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Training the model
    hgb_regressor = HistGradientBoostingRegressor()
    trained_model = hgb_regressor.fit(X_train, y_train)
    joblib.dump(hgb_regressor, model_store)
    return X_val, y_val, trained_model




def evaluation_trained_model(X_val, y_val, trained_model):
    # Predicting values for evaluation
    y_pred = trained_model.predict(X_val)

    # Evaluation metrics score
    rmsle = np.sqrt(mean_squared_log_error(y_val, y_pred))

    return {'RMSLE': rmsle}




def build_model(data: pd.DataFrame) -> dict[str, str]:
    # Preprocessing
    preprocessed_df = preprocessing_step(data)

    # Training
    X_val, y_val, trained_model = training_data(preprocessed_df)

    # Evaluation and result
    performance = evaluation_trained_model(X_val, y_val, trained_model)

    return performance


In [6]:
# Call the build_model function with numeric and categorical features
result = build_model(df)
result

{'RMSLE': 0.20361223177145288}

## Inference

In [7]:
test_raw = pd.read_csv('../data/test_df.parquet')
test = test_raw.copy()

test = test[feature_list]

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x82 in position 11: invalid start byte

In [8]:
def scale_numeric(df):
  scaler = joblib.load(model_scaler_1)
  scaled_data = scaler.transform(df[numeric_features])
  scaled_df = pd.DataFrame(data=scaled_data, columns=numeric_features)
  return scaled_df



def encode_categorical(df):
  encoder = joblib.load(model_encoder_1)
  encoded_data = encoder.transform(df[categorical_features])
  encoded_df = pd.DataFrame(data=encoded_data, columns=categorical_features)
  return encoded_df




def preprocessing_step(df):
#   #scaling
   output_scale = scale_numeric(df)
#   #encode cat
   output_encode = encode_categorical(df)
#   #Feature engineering by combining
   preprocessed_output = comb_scal_enco(output_scale, output_encode)
   return preprocessed_output


def make_predictions(input_data: pd.DataFrame) -> np.ndarray:
  #preprocessing dataframe
  preprocessed_data = preprocessing_step(input_data)

  # #Loading the model
  model = joblib.load(model_store)

  prediction = model.predict(preprocessed_data)

  return prediction



In [9]:
result11 = make_predictions(test)
result11

array([156457.08565053, 187653.43843743, 219465.81430615, ...,
       231850.35648543, 150269.60302678, 232055.46813541])