In [1]:
import mlflow
from mlflow.tracking import MlflowClient

# Connect to the tracking backend and report where it lives.
client = MlflowClient()
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

# Fetch every experiment known to the backend and show how many there are.
experiments = client.search_experiments()
print(len(experiments))

# One summary line per experiment: its name followed by its ID.
for experiment in experiments:
    print(
        f"Experiment Name: {experiment.name}, Experiment ID: {experiment.experiment_id}"
    )

tracking URI: 'file:///Users/viviane/Desktop/MLOps/NYC-home-value/mlruns'
5
Experiment Name: nyc_house_price_linear, Experiment ID: 296524759276928674
Experiment Name: nyc_house_price_ensemble, Experiment ID: 624000600208382101
Experiment Name: nyc_house_price_xgb, Experiment ID: 879898173824708793
Experiment Name: nyc_house_price_randomforest, Experiment ID: 686346702712954572
Experiment Name: Default, Experiment ID: 0


In [26]:
import pandas as pd
def load_data(path):
    """Parse the CSV found at ``path`` with ``pandas.read_csv`` and return the frame."""
    frame = pd.read_csv(path)
    return frame

# Load the raw rolling-sales CSV (path is relative to the working directory)
# and display its first rows as the cell output.
df = load_data('data/nyc-rolling-sales.csv')
df.head()
# df.shape

Unnamed: 0,NO:,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
0,4,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392,6,,C2,153 AVENUE B,...,5,0,5,1633,6440,1900,2,C2,6625000,19-07-2017 00:00
1,5,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,26,,C7,234 EAST 4TH STREET,...,28,3,31,4616,18690,1900,2,C7,-,14-12-2016 00:00
2,6,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,39,,C7,197 EAST 3RD STREET,...,16,1,17,2212,7803,1900,2,C7,-,09-12-2016 00:00
3,7,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,402,21,,C4,154 EAST 7TH STREET,...,10,0,10,2272,6794,1913,2,C4,3936272,23-09-2016 00:00
4,8,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,404,55,,C2,301 EAST 10TH STREET,...,6,0,6,2369,4615,1900,2,C2,8000000,17-11-2016 00:00


In [29]:
from datetime import datetime

def clean_data(df):
    """Turn the raw rolling-sales frame into the table used for modelling.

    Steps, in order:
      1. drop rows containing any missing value;
      2. parse 'SALE DATE' and derive ``house_age`` (sale year - 'YEAR BUILT');
      3. drop rows whose 'ZIP CODE' or 'YEAR BUILT' is 0;
      4. keep only the modelling columns and rename them to snake_case;
      5. coerce price and square feet to numbers (unparseable text -> NaN);
      6. strip whitespace from ``building_category``;
      7. replace zeros and NaNs in total_unit / square_feet / house_age with
         the column mean;
      8. keep rows with 20000 <= price <= 3000000 (NaN prices are dropped).

    Args:
        df: raw sales DataFrame containing the original upper-case columns.

    Returns:
        A new DataFrame (original index labels preserved). The input frame is
        never modified.

    Raises:
        KeyError: if a required raw column is missing, e.g. when the function
            is called a second time on an already-cleaned frame.
    """
    kept_columns = ['NEIGHBORHOOD', 'BUILDING CLASS CATEGORY', 'BUILDING CLASS AT PRESENT',
                    'ZIP CODE', 'TOTAL UNITS', 'GROSS SQUARE FEET', 'house_age', 'SALE PRICE']
    required = ['SALE DATE', 'YEAR BUILT'] + [c for c in kept_columns if c != 'house_age']
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise KeyError(f"Columns {missing} not found in the DataFrame.")

    # Work on explicit copies: assigning columns onto a filtered slice is what
    # triggers pandas' chained-assignment (SettingWithCopy) warnings.
    cleaned = df.dropna().copy()
    cleaned['SALE DATE'] = pd.to_datetime(cleaned['SALE DATE'], format="%d-%m-%Y %H:%M")
    cleaned['house_age'] = cleaned['SALE DATE'].dt.year - cleaned['YEAR BUILT']

    # A zero zip code or build year is treated as "unknown" and the row is dropped.
    has_location_and_year = (cleaned['ZIP CODE'] != 0) & (cleaned['YEAR BUILT'] != 0)
    cleaned = cleaned.loc[has_location_and_year, kept_columns].copy()

    cleaned['SALE PRICE'] = pd.to_numeric(cleaned['SALE PRICE'], errors='coerce')
    cleaned['GROSS SQUARE FEET'] = pd.to_numeric(cleaned['GROSS SQUARE FEET'], errors='coerce')
    cleaned = cleaned.rename(columns={'BUILDING CLASS CATEGORY': 'building_category',
                                      'BUILDING CLASS AT PRESENT': 'building_class',
                                      'ZIP CODE': 'zip_code',
                                      'TOTAL UNITS': 'total_unit',
                                      'GROSS SQUARE FEET': 'square_feet',
                                      'SALE PRICE': 'price'})
    cleaned['building_category'] = cleaned['building_category'].str.strip()

    # NOTE(review): the means are computed while the zeros are still present,
    # so the imputed value is pulled towards 0. Kept as-is to leave results
    # unchanged; confirm whether zeros should be excluded from the mean.
    columns_to_replace = ['total_unit', 'square_feet', 'house_age']
    mean_values = cleaned[columns_to_replace].mean()
    cleaned[columns_to_replace] = (
        cleaned[columns_to_replace].replace(0, mean_values).fillna(mean_values)
    )

    # between() is inclusive on both ends and False for NaN prices.
    in_price_range = cleaned['price'].between(20000, 3000000)
    return cleaned.loc[in_price_range].copy()

# Replace the raw frame with its cleaned version. This cell is not re-runnable
# on its own: clean_data reads the raw column names (e.g. 'SALE DATE'), which
# no longer exist once `df` has been cleaned, so reload the data first.
df = clean_data(df)

In [30]:
# Display the first rows of the cleaned frame as the cell output.
# df.describe()
df.head()

Unnamed: 0,NEIGHBORHOOD,building_category,building_class,zip_code,total_unit,square_feet,house_age,price
13,ALPHABET CITY,09 COOPS - WALKUP APARTMENTS,C6,10009,2.394866,4256.917964,97.0,499000.0
15,ALPHABET CITY,09 COOPS - WALKUP APARTMENTS,C6,10009,2.394866,4256.917964,97.0,529500.0
16,ALPHABET CITY,09 COOPS - WALKUP APARTMENTS,C6,10009,2.394866,4256.917964,97.0,423000.0
17,ALPHABET CITY,09 COOPS - WALKUP APARTMENTS,C6,10009,2.394866,4256.917964,92.0,501000.0
18,ALPHABET CITY,09 COOPS - WALKUP APARTMENTS,C6,10009,2.394866,4256.917964,96.0,450000.0


In [None]:
# Import libraries
# !pip install plotly
# !pip install nbformat==4.2.0


import seaborn as sns
import matplotlib.pyplot as plt
import re
import plotly.express as px

# Box plot of the `price` column, to eyeball its spread and outliers.
fig = px.box(
    df,
    y='price',
    color_discrete_sequence=px.colors.sequential.Agsunset,
    width=600,
    height=500,
)
fig.update_layout(title_text='Box Plot of Price')
fig.show()

In [49]:
# Count the distinct values in each column of `df`.
# df.describe()
df.nunique()
# df.isna().sum()

NEIGHBORHOOD          235
building_category      40
building_class        119
zip_code              167
square_feet          3475
year_built            143
price                4986
dtype: int64

In [4]:
from sklearn.model_selection import train_test_split
# Feature columns of the cleaned frame, grouped by kind.
CATEGORICAL_COLS = [
    "NEIGHBORHOOD",
    "building_category",
    "building_class",
]
NUMERICAL_COLS = [
    "zip_code",
    "total_unit",
    "square_feet",
    "house_age",
]

In [47]:
from typing import List
from sklearn.feature_extraction import DictVectorizer

def encode_cols(df: pd.DataFrame, categorical_cols: List[str] = None, numerical_cols: List[str] = None) -> pd.DataFrame:
    """Return a copy of ``df`` whose feature columns are cast to strings.

    Numerical columns have missing values filled with -1 before the cast;
    categorical columns are cast directly. The input frame is left untouched,
    so the function can safely be applied to slices such as the frames
    returned by ``train_test_split``.

    Args:
        df: frame containing every listed column.
        categorical_cols: categorical feature names; defaults to
            NEIGHBORHOOD, building_category and building_class.
        numerical_cols: numerical feature names; defaults to zip_code,
            total_unit, square_feet and house_age.

    Returns:
        A new DataFrame with the listed columns converted to ``str``.

    Raises:
        KeyError: if any listed column is absent from ``df``.
    """
    if categorical_cols is None:
        categorical_cols = ["NEIGHBORHOOD", "building_category", "building_class"]
    if numerical_cols is None:
        numerical_cols = ["zip_code", "total_unit", "square_feet", "house_age"]

    missing_cols = [c for c in [*categorical_cols, *numerical_cols] if c not in df.columns]
    if missing_cols:
        raise KeyError(f"Columns {missing_cols} not found in the DataFrame.")

    # Copy first: writing into the caller's frame is a hidden side effect and
    # raises chained-assignment warnings when `df` is a slice of another frame.
    encoded = df.copy()

    # NOTE(review): casting numbers to strings means a vectorizer that treats
    # string values as categories will one-hot encode each distinct number.
    # Kept to stay compatible with already-saved preprocessors; confirm intent.
    if numerical_cols:
        encoded[numerical_cols] = encoded[numerical_cols].fillna(-1).astype("str")
    if categorical_cols:
        encoded[categorical_cols] = encoded[categorical_cols].astype("str")
    return encoded


def extract_x_y(
    df: pd.DataFrame,
    categorical_cols: List[str] = None,
    numerical_cols: List[str] = None,
    dv: "DictVectorizer" = None,
    with_target: bool = True,
) -> tuple:
    """Vectorize the feature columns of ``df`` and optionally pull out the target.

    Args:
        df: frame holding the feature columns (and ``price`` when
            ``with_target`` is True).
        categorical_cols: categorical feature names; defaults to
            NEIGHBORHOOD, building_category and building_class.
        numerical_cols: numerical feature names; defaults to zip_code,
            total_unit, square_feet and house_age.
        dv: vectorizer to reuse. When omitted and ``with_target`` is True, a
            new ``DictVectorizer`` is fitted on ``df``.
        with_target: whether to also return the ``price`` column as ``y``.

    Returns:
        A 3-tuple ``(x, y, dv)``: the transformed feature matrix, the target
        values (``None`` when ``with_target`` is False) and the vectorizer
        that produced ``x``.

    Raises:
        ValueError: if ``with_target`` is False and no ``dv`` is supplied;
            there is then nothing to transform the features with.
    """
    if categorical_cols is None:
        categorical_cols = ["NEIGHBORHOOD", "building_category", "building_class"]
    if numerical_cols is None:
        numerical_cols = ["zip_code", "total_unit", "square_feet", "house_age"]

    # Fail early with a readable message instead of crashing later on
    # `None.transform(...)`.
    if dv is None and not with_target:
        raise ValueError(
            "A fitted DictVectorizer must be passed as `dv` when with_target=False."
        )

    dicts = df[[*categorical_cols, *numerical_cols]].to_dict(orient="records")

    y = None
    if with_target:
        y = df["price"].values
        if dv is None:
            # No vectorizer supplied: fit a fresh one on this frame.
            dv = DictVectorizer()
            dv.fit(dicts)

    x = dv.transform(dicts)
    return x, y, dv

# save the preprocessor into saved_pkl folder
import pickle
def save_picked(path: str, dv: object):
    """Pickle ``dv`` to ``path``, creating the parent folder when it is missing.

    Despite the parameter name, any picklable object is accepted; this
    notebook uses the helper for both the fitted vectorizer and the model.

    Args:
        path: destination file; an existing file is overwritten.
        dv: object to serialize.
    """
    import os

    # Without this, a first run on a fresh checkout fails with
    # FileNotFoundError because the output folder does not exist yet.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, "wb") as f:
        pickle.dump(dv, f)

In [17]:
# Train model
# !pip install xgboost

import numpy as np
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def train_model(x_train: pd.DataFrame, y_train: np.ndarray, model_type):
    """Build the estimator named by ``model_type`` and fit it on the training data.

    Supported values of ``model_type``:
      * ``"randomforest"`` - RandomForestRegressor
      * ``"xgb"``          - xgboost XGBRegressor
      * ``"linear"``       - LinearRegression
      * ``"ensemble"``     - VotingRegressor over the random forest and the
        XGBoost regressor

    Args:
        x_train: training feature matrix, passed straight to ``fit``.
        y_train: training targets.
        model_type: one of the keys listed above.

    Returns:
        The fitted estimator.

    Raises:
        ValueError: if ``model_type`` is not a supported key.
    """
    def _random_forest():
        return RandomForestRegressor(random_state=42, n_estimators=25,
                                     max_depth=70, min_samples_leaf=1, min_samples_split=7)

    def _xgb():
        return xgb.XGBRegressor(objective='reg:squarederror', learning_rate=0.5, n_estimators=600,
                                max_depth=10, subsample=1, colsample_bytree=1)

    def _linear():
        return LinearRegression(fit_intercept=True, copy_X=True)

    def _ensemble():
        return VotingRegressor([('random_forest', _random_forest()), ('xgb', _xgb())])

    # Builders are looked up first and called afterwards, so only the requested
    # estimator is constructed and an unknown name is rejected before any work.
    builders = {
        "randomforest": _random_forest,
        "xgb": _xgb,
        "linear": _linear,
        # Previously constructed but never registered, so it could not be selected.
        "ensemble": _ensemble,
    }
    builder = builders.get(model_type)
    if builder is None:
        raise ValueError(f"Invalid model type: {model_type}")

    model = builder()
    model.fit(x_train, y_train)
    return model

def predict_price(input_data, model):
    """Return whatever ``model.predict`` yields for ``input_data``."""
    predictions = model.predict(input_data)
    return predictions

def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray):
    """Score predictions against the ground truth.

    Returns:
        ``(rmse, mae, r2)`` - root mean squared error and mean absolute error
        rounded to 2 decimals, and R^2 rounded to 4 decimals.
    """
    # Pair each raw score with the number of decimals it is reported at.
    raw_scores = (
        (np.sqrt(mean_squared_error(y_true, y_pred)), 2),
        (mean_absolute_error(y_true, y_pred), 2),
        (r2_score(y_true, y_pred), 4),
    )
    rmse, mae, r2 = (round(score, digits) for score, digits in raw_scores)
    return rmse, mae, r2

In [32]:
# try all steps
# Paths are relative to the notebook's working directory, like the other cells
# that read and write these files, so the cell is not tied to one machine.
df = load_data('data/nyc-rolling-sales.csv')
df = clean_data(df)
df.to_csv('data/nyc-house-price-cleaned.csv')

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df = encode_cols(train_df)
test_df = encode_cols(test_df)

# Fit the vectorizer on the training split only, then reuse it for the test split.
X_train, y_train, dv = extract_x_y(train_df)
X_test, y_test, _ = extract_x_y(test_df, dv=dv)

In [33]:
# Random forest: fit on the training split, score on the held-out split.
model = train_model(x_train=X_train, y_train=y_train, model_type="randomforest")
pred = predict_price(input_data=X_test, model=model)
rmse, mae, r2 = evaluate_model(y_true=y_test, y_pred=pred)
print(f"r2: {r2}")


r2: 0.5338


In [34]:
# XGBoost: fit on the training split, score on the held-out split.
model = train_model(x_train=X_train, y_train=y_train, model_type="xgb")
pred = predict_price(input_data=X_test, model=model)
rmse, mae, r2 = evaluate_model(y_true=y_test, y_pred=pred)
print(f"r2: {r2}")


r2: 0.5504


In [35]:
# Linear regression: fit on the training split, score on the held-out split.
model = train_model(x_train=X_train, y_train=y_train, model_type="linear")
pred = predict_price(input_data=X_test, model=model)
rmse, mae, r2 = evaluate_model(y_true=y_test, y_pred=pred)
print(f"r2: {r2}")


r2: 0.4702


In [18]:
# run MLflow
import mlflow.sklearn
from lib.config import PATH_TO_PREPROCESSOR

model_type = "xgb"
# The same string names both the MLflow experiment and the registered model.
mlflow_experiment_path = f"nyc_house_price_{model_type}"

# Set the experiment name
mlflow.set_experiment(mlflow_experiment_path)
data_path = 'data/nyc-rolling-sales.csv'

# Start a run
with mlflow.start_run() as run:
    run_id = run.info.run_id

    # Set tags for the run
    mlflow.set_tag("stage", "training")

    # Load data
    df = load_data(data_path)
    df = clean_data(df)
    df.to_csv('data/nyc-house-price-cleaned.csv')

    # Split train, test dataset
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Encode categorical and numerical columns
    train_df = encode_cols(train_df)

    # Extract X and y (this also fits the vectorizer on the training split)
    X_train, y_train, dv = extract_x_y(train_df)

    # Save preprocessor
    save_picked(PATH_TO_PREPROCESSOR, dv)

    # Train model, log model
    model = train_model(X_train, y_train, model_type)
    mlflow.sklearn.log_model(model, "model")

    # Define params, log params
    params = model.get_params()
    mlflow.log_params(params)

    # Evaluate model on the training set.
    # evaluate_model returns (RMSE, MAE, R2): the first value is the *root*
    # mean squared error, so it is named and logged as RMSE rather than MSE.
    prediction = predict_price(X_train, model)
    train_rmse, train_mae, train_r2 = evaluate_model(y_train, prediction)

    # Log metrics
    mlflow.log_metric("Train-RMSE", train_rmse)
    mlflow.log_metric("Train-MAE", train_mae)
    mlflow.log_metric("Train-R2", train_r2)

    # Evaluate model on test set, reusing the vectorizer fitted on the training split
    test_df = encode_cols(test_df)
    X_test, y_test, _ = extract_x_y(test_df, dv=dv)
    y_pred_test = predict_price(X_test, model)
    test_rmse, test_mae, test_r2 = evaluate_model(y_test, y_pred_test)

    # Log metrics
    mlflow.log_metric("Test-RMSE", test_rmse)
    mlflow.log_metric("Test-MAE", test_mae)
    mlflow.log_metric("Test-R2", test_r2)

    # Register the model in the MLflow model registry
    registered_model = mlflow.register_model(f"runs:/{run_id}/model", mlflow_experiment_path)

Registered model 'nyc_house_price_xgb' already exists. Creating a new version of this model...
2024/01/27 23:19:08 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: nyc_house_price_xgb, version 2
Created version '2' of model 'nyc_house_price_xgb'.


In [20]:
from mlflow.tracking import MlflowClient
client = MlflowClient()

# Name of the registered model to promote.
model_type = "xgb"
mlflow_experiment_path = f"nyc_house_price_{model_type}"

# Move the chosen registry version into the "Production" stage; the returned
# ModelVersion is displayed as the cell output.
production_version = 1
client.transition_model_version_stage(
    name=mlflow_experiment_path,
    version=production_version,
    stage="Production",
)

<ModelVersion: aliases=[], creation_timestamp=1706393754589, current_stage='Production', description=None, last_updated_timestamp=1706394148585, name='nyc_house_price_xgb', run_id='ffb0877c87f74cce99303e75fd3f2583', run_link=None, source='file:///Users/viviane/Desktop/MLOps/NYC-home-value/mlruns/578894006709759806/ffb0877c87f74cce99303e75fd3f2583/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [19]:
# Start the MLflow UI on port 5002, listening on all network interfaces
# (0.0.0.0). The command runs in the foreground: the cell keeps running until
# it is interrupted.
!mlflow ui --host 0.0.0.0 --port 5002

[2024-01-27 23:19:17 +0100] [48214] [INFO] Starting gunicorn 21.2.0
[2024-01-27 23:19:17 +0100] [48214] [INFO] Listening at: http://0.0.0.0:5002 (48214)
[2024-01-27 23:19:17 +0100] [48214] [INFO] Using worker: sync
[2024-01-27 23:19:17 +0100] [48215] [INFO] Booting worker with pid: 48215
[2024-01-27 23:19:17 +0100] [48216] [INFO] Booting worker with pid: 48216
[2024-01-27 23:19:17 +0100] [48217] [INFO] Booting worker with pid: 48217
[2024-01-27 23:19:17 +0100] [48218] [INFO] Booting worker with pid: 48218
^C
[2024-01-27 23:19:48 +0100] [48214] [INFO] Handling signal: int
[2024-01-27 23:19:48 +0100] [48217] [INFO] Worker exiting (pid: 48217)
[2024-01-27 23:19:48 +0100] [48218] [INFO] Worker exiting (pid: 48218)
[2024-01-27 23:19:48 +0100] [48215] [INFO] Worker exiting (pid: 48215)
[2024-01-27 23:19:48 +0100] [48216] [INFO] Worker exiting (pid: 48216)


In [40]:
from lib.config import PATH_TO_MODEL, PATH_TO_PREPROCESSOR
# Load the model version that currently holds the "production" stage of the
# registered model, then pickle it to PATH_TO_MODEL.
model_uri = f"models:/{mlflow_experiment_path}/production"
model = mlflow.sklearn.load_model(model_uri=model_uri)
save_picked(path=PATH_TO_MODEL, dv=model)

def load_preprocessor(path):
    """Unpickle and return the object stored in the file at ``path``."""
    # Unpickling can execute arbitrary code: only point this at files that
    # were produced by this project.
    with open(path, "rb") as handle:
        return pickle.load(handle)

# Reload the vectorizer that was pickled during training.
dv = load_preprocessor(PATH_TO_PREPROCESSOR)
# Show the loaded model. The previous `print(model.)` was an unfinished
# attribute access, i.e. a SyntaxError that stopped the whole cell from running.
print(model)

In [49]:
# Try predicting house price
features = ['NEIGHBORHOOD', 'building_category', 'building_class', 'zip_code','total_unit', 'square_feet', 'house_age']

# input() always returns text. Cast the numeric answers to the types the
# cleaned training frame uses (integer zip code, float for the rest) so that
# encode_cols renders them exactly like the training values ("97" -> "97.0").
# Otherwise the vectorizer sees an unknown value and silently drops the feature.
numeric_casts = {'zip_code': int, 'total_unit': float, 'square_feet': float, 'house_age': float}
user_input = {}

for feature in features:
    answer = input(f"Enter value for {feature}: ")
    cast = numeric_casts.get(feature)
    user_input[feature] = cast(answer) if cast is not None else answer

X_pred = pd.DataFrame([user_input])
X_pred = encode_cols(X_pred)
dicts = X_pred.to_dict(orient="records")
X_pred = dv.transform(dicts)
y_pred = predict_price(X_pred, model)
# predict returns one value per input row; report the single prediction as a
# whole number of dollars instead of printing the array.
print(f"Predicted house price: {round(float(y_pred[0]))} USD")

[1258670.5]
