In [1]:
import mlflow
from mlflow.tracking import MlflowClient

print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

client = MlflowClient()

# Use the search_experiments function to get a list of available experiments
experiments = client.search_experiments()
print(len(experiments))

# Display the list of available experiments
for experiment in experiments:
    print(f"Experiment Name: {experiment.name}, Experiment ID: {experiment.experiment_id}")

tracking URI: 'file:///Users/viviane/Desktop/MLOps/NYC-home-value/mlruns'
2
Experiment Name: nyc_house_price_randomforest, Experiment ID: 686346702712954572
Experiment Name: Default, Experiment ID: 0


In [142]:
import pandas as pd
def load_data(path):
    return pd.read_csv(path)

df = load_data('data/nyc-rolling-sales.csv')
df.head()
# df.shape

Unnamed: 0.1,Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
0,4,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392,6,,C2,153 AVENUE B,...,5,0,5,1633,6440,1900,2,C2,6625000,2017-07-19 00:00:00
1,5,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,26,,C7,234 EAST 4TH STREET,...,28,3,31,4616,18690,1900,2,C7,-,2016-12-14 00:00:00
2,6,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,39,,C7,197 EAST 3RD STREET,...,16,1,17,2212,7803,1900,2,C7,-,2016-12-09 00:00:00
3,7,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,402,21,,C4,154 EAST 7TH STREET,...,10,0,10,2272,6794,1913,2,C4,3936272,2016-09-23 00:00:00
4,8,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,404,55,,C2,301 EAST 10TH STREET,...,6,0,6,2369,4615,1900,2,C2,8000000,2016-11-17 00:00:00


In [144]:
def clean_data(df):
    df = df[['BOROUGH', 'NEIGHBORHOOD', 'BUILDING CLASS CATEGORY', 'BUILDING CLASS AT PRESENT', 'ZIP CODE', 'TOTAL UNITS', 'GROSS SQUARE FEET', 'YEAR BUILT', 'SALE PRICE']]
    df['SALE PRICE'] = pd.to_numeric(df['SALE PRICE'], errors='coerce')
    df['GROSS SQUARE FEET'] = pd.to_numeric(df['GROSS SQUARE FEET'], errors='coerce')
    df.rename(columns={'BUILDING CLASS CATEGORY': 'building_category',
                    'BUILDING CLASS AT PRESENT': 'building_class',
                    'ZIP CODE': 'zip_code',
                    'TOTAL UNITS': 'total_unit',
                    'GROSS SQUARE FEET': 'square_feet',
                    'YEAR BUILT': 'year_built',
                    'SALE PRICE': 'price'
                    }, inplace=True)
    df = df[(df['zip_code']!=0) & (df['year_built']!=0)]
    df = df[(df['price'] >=20000) & (df['price'] <=3000000)]
    # df = df.dropna()
    return df

# df.shape
# df.head()

In [60]:
# Import libraries
# !pip install plotly
import seaborn as sns
import matplotlib.pyplot as plt
import re
import plotly.express as px

fig = px.box(df, y='price',color_discrete_sequence=px.colors.sequential.Agsunset,
             width=600, height=500)
fig.update_layout(title_text='Box Plot of Price by Gearbox Type')
fig.show()

In [112]:
# df.describe()
df.nunique()
# df.isna().sum()

NEIGHBORHOOD          251
building_category      42
building_class        136
zip_code              181
total_unit            114
square_feet          4179
year_built            150
price                7445
dtype: int64

In [54]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from typing import List
# from scipy.sparse import csr_matrix

In [145]:
from sklearn.model_selection import train_test_split
CATEGORICAL_COLS = ["BOROUGH", "NEIGHBORHOOD", "building_category", "building_class"]
NUMERICAL_COLS = ["zip_code", "total_unit", "square_feet", "year_built"]

In [147]:
def encode_cols(df: pd.DataFrame, categorical_cols: List[str] = None, numerical_cols: List[str] = None) -> pd.DataFrame:
    if categorical_cols is None:
        categorical_cols = ["BOROUGH","NEIGHBORHOOD", "building_category", "building_class"]
    if numerical_cols is None:
        numerical_cols = ["zip_code", "total_unit", "square_feet", "year_built"]
    # df[categorical_cols] = df[categorical_cols].fillna(-1).astype("int")
    df[numerical_cols] = df[numerical_cols].fillna(0).astype("float")
    df[categorical_cols] = df[categorical_cols].astype("str")
    
    return df

def extract_x_y(
    df: pd.DataFrame,
    categorical_cols: List[str] = None,
    numerical_cols: List[str] = None,
    dv: DictVectorizer = None,
    with_target: bool = True,
) -> dict:
    if categorical_cols is None:
        categorical_cols = ["BOROUGH","NEIGHBORHOOD", "building_category", "building_class"]
    if numerical_cols is None:
        numerical_cols = ["zip_code", "total_unit", "square_feet", "year_built"]
    dicts = df[[*categorical_cols, *numerical_cols]].to_dict(orient="records")

    y = None
    if with_target:
        if dv is None:
            dv = DictVectorizer()
            dv.fit(dicts)
        y = df["price"].values

    x = dv.transform(dicts)
    return x, y, dv

# save the preprocessor into saved_pkl folder
import pickle
def save_picked(path: str, dv: DictVectorizer):
    with open(path, "wb") as f:
        pickle.dump(dv, f)

In [136]:
# Train model
import xgboost as xgb
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def train_model(x_train: pd.DataFrame, y_train: np.ndarray, model_type):
    # model = None
    random = RandomForestRegressor(random_state=42, n_estimators=25,
                              max_depth=30, min_samples_leaf=1, min_samples_split=10)
    xgb_reg = xgb.XGBRegressor(objective='reg:squarederror', learning_rate=0.5, n_estimators=370,
                                max_depth=8, subsample=1, colsample_bytree=1)
    ensemble = VotingRegressor([('random_forest', random), ('xgb', xgb_reg)])

    model_map = {
        "randomforest": random,
        "xgb": xgb_reg,
        "ensemble": ensemble
    }
    model = model_map.get(model_type, None)
    if model is None:
        raise ValueError(f"Invalid model type: {model_type}")
    
    model.fit(x_train, y_train)
    return model

def predict_price(input_data, model):
    return model.predict(input_data)

def evaluate_model(y_true: np.ndarray, y_pred: np.ndarray):
    rmse = round(np.sqrt(mean_squared_error(y_true, y_pred)), 2)
    mae = round(mean_absolute_error(y_true, y_pred), 2)
    r2 = round(r2_score(y_true, y_pred), 4)
    return rmse, mae, r2

In [148]:
# try all steps
df = load_data('data/nyc-rolling-sales.csv')
df = clean_data(df)
df.to_csv("data/nyc-house-price-cleaned.csv")
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df = encode_cols(train_df)
test_df = encode_cols(test_df)
X_train, y_train, dv = extract_x_y(train_df)
X_test, y_test, _ = extract_x_y(test_df, dv=dv)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [149]:
model = train_model(X_train, y_train, "randomforest")
pred = predict_price(X_test, model)
rmse, mae, r2 = evaluate_model(y_test, pred)
print(f"r2: {r2}")


r2: 0.5948


In [150]:
model = train_model(X_train, y_train, "xgb")
pred = predict_price(X_test, model)
rmse, mae, r2 = evaluate_model(y_test, pred)
print(f"r2: {r2}")


r2: 0.5723


In [151]:
model = train_model(X_train, y_train, "ensemble")
pred = predict_price(X_test, model)
rmse, mae, r2 = evaluate_model(y_test, pred)
print(f"r2: {r2}")


r2: 0.6003


In [140]:
# run MLflow
import mlflow.sklearn
from lib.config import PATH_TO_PREPROCESSOR
# from lib.config import MODEL_TYPE

model_type = "xgb"
mlflow_experiment_path = f"nyc_house_price_{model_type}"

# Set the experiment name
mlflow.set_experiment(mlflow_experiment_path)
data_path = 'data/nyc-rolling-sales.csv'

# Start a run
with mlflow.start_run() as run:
    run_id = run.info.run_id

    # Set tags for the run
    mlflow.set_tag("stage", "training")

    # Load data
    df = load_data(data_path)
    df = clean_data(df)

    # Split train, test dataset
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Encode categorical and numerical columns
    train_df = encode_cols(train_df)

    # Extract X and y
    X_train, y_train, dv = extract_x_y(train_df)

    # Save preprocessor
    save_picked(PATH_TO_PREPROCESSOR, dv)

    # Train model, log model
    model = train_model(X_train, y_train, model_type)
    mlflow.sklearn.log_model(model, "model")

    # Log individual parameters of each regressor
    params = model.get_params()
    mlflow.log_params(params)

    # Evaluate model
    prediction = predict_price(X_train, model)
    train_mse, train_mae, train_r2 = evaluate_model(y_train, prediction)

    # Log metrics
    mlflow.log_metric("Train-MSE", train_mse)
    mlflow.log_metric("Train-MAE", train_mae)
    mlflow.log_metric("Train-R2", train_r2)

    # Evaluate model on test set
    test_df = encode_cols(test_df)
    X_test, y_test, _ = extract_x_y(test_df, dv=dv)
    y_pred_test = predict_price(X_test, model)
    test_mse, test_mae, test_r2 = evaluate_model(y_test, y_pred_test)

    # Log metrics
    mlflow.log_metric("Test-MSE", test_mse)
    mlflow.log_metric("Test-MAE", test_mae)
    mlflow.log_metric("Test-R2", test_r2)

    # Register your model in mlfow model registry
    registered_model = mlflow.register_model(f"runs:/{run_id}/model", mlflow_experiment_path)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Registered model 'nyc_house_price_randomforest' already exists. Creating a new version of this model...
2024/01/27 09:46:53 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: nyc_house_pric

In [153]:
from mlflow.tracking import MlflowClient
client = MlflowClient()

model_type = "xgb"
mlflow_experiment_path = f"nyc_house_price_{model_type}"

def update_models_info(mv, experiment_name):
    for m in mv:
        production_version = 2
        client.transition_model_version_stage(name=experiment_name, version=production_version, stage="Production")
        print(f"Success update stage Production for the lastest version: {m.version}")
        print(f"name: {m.name}")
        print(f"run_id: {m.run_id}")
        print(f"current_stage: {m.current_stage}")

update_models_info(client.get_latest_versions(mlflow_experiment_path, stages=["Production"]), mlflow_experiment_path)

In [152]:
!mlflow ui --host 0.0.0.0 --port 5002

[2024-01-27 10:22:19 +0100] [32959] [INFO] Starting gunicorn 21.2.0
[2024-01-27 10:22:19 +0100] [32959] [INFO] Listening at: http://0.0.0.0:5002 (32959)
[2024-01-27 10:22:19 +0100] [32959] [INFO] Using worker: sync
[2024-01-27 10:22:19 +0100] [32960] [INFO] Booting worker with pid: 32960
[2024-01-27 10:22:19 +0100] [32961] [INFO] Booting worker with pid: 32961
[2024-01-27 10:22:20 +0100] [32962] [INFO] Booting worker with pid: 32962
[2024-01-27 10:22:20 +0100] [32963] [INFO] Booting worker with pid: 32963
^C
[2024-01-27 10:24:22 +0100] [32959] [INFO] Handling signal: int
[2024-01-27 10:24:22 +0100] [32960] [INFO] Worker exiting (pid: 32960)
[2024-01-27 10:24:22 +0100] [32963] [INFO] Worker exiting (pid: 32963)
[2024-01-27 10:24:22 +0100] [32962] [INFO] Worker exiting (pid: 32962)
[2024-01-27 10:24:22 +0100] [32961] [INFO] Worker exiting (pid: 32961)


In [154]:
from lib.config import PATH_TO_MODEL, CATEGORICAL_COLS, NUMERICAL_COLS
# Load production model
model_uri = f"models:/{mlflow_experiment_path}/production"
model = mlflow.sklearn.load_model(model_uri)
save_picked(PATH_TO_MODEL, model)

# Make predictions
# features = ['NEIGHBORHOOD', 'BUILDING CLASS CATEGORY', 'BUILDING CLASS AT PRESENT', 'ZIP CODE', 'TOTAL UNITS', 'GROSS SQUARE FEET', 'YEAR BUILT', 'SALE PRICE']
# user_input = {}
# for f in features:
#     user_input[f] = input(f"Enter value for {f}: ")
# X_pred = pd.DataFrame([user_input])
# # Convert numeric columns to numeric type
# X_pred[NUMERICAL_COLS] = X_pred[NUMERICAL_COLS].apply(pd.to_numeric, errors='coerce')

# # Impute missing values in features
# imputer = SimpleImputer(strategy='mean')
# X_pred = pd.DataFrame(imputer.fit_transform(X_pred), columns=X_pred.columns)

# X_pred = encode_cols(X_pred)
# y_pred = predict_price(X_pred, model)

MlflowException: No versions of model with name 'nyc_house_price_xgb' and stage 'production' found