In [1]:
import sqlite3

# Database connection
conn = sqlite3.connect('group_project.db')
cursor = conn.cursor()

# Create tables
cursor.execute('''
CREATE TABLE IF NOT EXISTS City (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT UNIQUE
)''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS StateZip (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    code TEXT UNIQUE
)''')

cursor.execute('''
CREATE TABLE IF NOT EXISTS Housing (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    date TEXT,
    price REAL,
    bedrooms INTEGER,
    bathrooms REAL,
    sqft_living INTEGER,
    sqft_lot INTEGER,
    floors REAL,
    waterfront INTEGER,
    view INTEGER,
    condition INTEGER,
    sqft_above INTEGER,
    sqft_basement INTEGER,
    yr_built INTEGER,
    yr_renovated INTEGER,
    city_id INTEGER,
    statezip_id INTEGER,
    country TEXT,
    FOREIGN KEY (city_id) REFERENCES City(id),
    FOREIGN KEY (statezip_id) REFERENCES StateZip(id)
)''')


conn.commit()

# Function to insert and obtain foreign key
def get_or_create_fk(cursor, table, column, value):
    cursor.execute(f'SELECT id FROM {table} WHERE {column} = ?', (value,))
    result = cursor.fetchone()
    if result:
        return result[0]
    else:
        cursor.execute(f'INSERT INTO {table} ({column}) VALUES (?)', (value,))
        conn.commit()
        return cursor.lastrowid

# Read and insert data
with open('property.csv', 'r') as file:
    next(file)  
    for line in file:
        row = line.strip().split(',')
        if len(row) < 18:  
            continue
        city_id = get_or_create_fk(cursor, 'City', 'name', row[15])
        statezip_id = get_or_create_fk(cursor, 'StateZip', 'code', row[16])
        housing_data = row[:15] + [city_id, statezip_id, row[17]]
        cursor.execute('''
        INSERT INTO Housing (
            date, price, bedrooms, bathrooms, sqft_living, sqft_lot, floors,
            waterfront, view, condition, sqft_above, sqft_basement, yr_built,
            yr_renovated, city_id, statezip_id, country
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', group_project)


conn.commit()
conn.close()

In [13]:
import sqlite3
import pandas as pd
from sklearn.model_selection import train_test_split
from ydata_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler  # Import StandardScaler


conn = sqlite3.connect('group_project.db')


query = '''
        SELECT h.date,h.price, h.bedrooms, h.bathrooms, h.sqft_living, h.sqft_lot, h.floors,
        h.waterfront, h.view, h.condition, h.sqft_above, h.sqft_basement, h.yr_built,
        h.yr_renovated, h.country, c.name AS city, s.code AS statezip
        FROM  housing h
        JOIN city c ON h.city_id = c.id
        JOIN statezip s ON h.statezip_id = s.id;
'''
df = pd.read_sql_query(query, conn)


conn.close()


In [15]:
df

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,country,city,statezip
0,2014-05-02 00:00:00,3.130000e+05,3,1.50,1340,7912,1.5,0,0,3,1340,0,1955,2005,USA,Shoreline,WA 98133
1,2014-05-02 00:00:00,2.384000e+06,5,2.50,3650,9050,2.0,0,4,5,3370,280,1921,0,USA,Seattle,WA 98119
2,2014-05-02 00:00:00,3.420000e+05,3,2.00,1930,11947,1.0,0,0,4,1930,0,1966,0,USA,Kent,WA 98042
3,2014-05-02 00:00:00,4.200000e+05,3,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,USA,Bellevue,WA 98008
4,2014-05-02 00:00:00,5.500000e+05,4,2.50,1940,10500,1.0,0,0,4,1140,800,1976,1992,USA,Redmond,WA 98052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9195,2014-07-09 00:00:00,3.081667e+05,3,1.75,1510,6360,1.0,0,0,4,1510,0,1954,1979,USA,Seattle,WA 98133
9196,2014-07-09 00:00:00,5.343333e+05,3,2.50,1460,7573,2.0,0,0,3,1460,0,1983,2009,USA,Bellevue,WA 98007
9197,2014-07-09 00:00:00,4.169042e+05,3,2.50,3010,7014,2.0,0,0,3,3010,0,2009,0,USA,Renton,WA 98059
9198,2014-07-10 00:00:00,2.034000e+05,4,2.00,2090,6630,1.0,0,0,3,1070,1020,1974,0,USA,Seattle,WA 98178


In [43]:
from ydata_profiling import ProfileReport
profile = ProfileReport(df, title="Profiling Report")
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]


KeyboardInterrupt



In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
for column in train.columns:
    plt.figure(figsize=(8, 6))
    sns.histplot(train[column], kde=True)
    plt.title(f'Distribution of {column}')
    plt.tight_layout()
    plt.show()

In [148]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2, random_state=42, stratify=pd.qcut(df['price'], q=10))

In [146]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Load the CSV file into a DataFrame
df = pd.read_csv('property.csv')

# Inspect the unique values and data types in the 'street' column
print(df['city'].unique())
print(df['city'].dtype)

# Handle missing values if any
df['city'].fillna('Unknown', inplace=True)

# Initialize the OneHotEncoder
onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit and transform the 'street' column
city_encoded = onehot_encoder.fit_transform(df[['city']])

# Create a DataFrame with the encoded features
city_encoded_df = pd.DataFrame(city_encoded, columns=onehot_encoder.get_feature_names_out(['city']))

# Concatenate the encoded features with the original DataFrame
df = pd.concat([df, city_encoded_df], axis=1)

# Drop the original 'street' column
df.drop('city', axis=1, inplace=True)

# Display the DataFrame with the encoded 'street' column
print(df.head())


['Shoreline' 'Seattle' 'Kent' 'Bellevue' 'Redmond' 'Maple Valley'
 'North Bend' 'Lake Forest Park' 'Sammamish' 'Auburn' 'Des Moines'
 'Bothell' 'Federal Way' 'Kirkland' 'Issaquah' 'Woodinville'
 'Normandy Park' 'Fall City' 'Renton' 'Carnation' 'Snoqualmie' 'Duvall'
 'Burien' 'Covington' 'Inglewood-Finn Hill' 'Kenmore' 'Newcastle'
 'Mercer Island' 'Black Diamond' 'Ravensdale' 'Clyde Hill' 'Algona'
 'Skykomish' 'Tukwila' 'Vashon' 'Yarrow Point' 'SeaTac' 'Medina'
 'Enumclaw' 'Snoqualmie Pass' 'Pacific' 'Beaux Arts Village' 'Preston'
 'Milton']
object
            date      price  bedrooms  bathrooms  sqft_living  sqft_lot  \
0  5/2/2014 0:00   313000.0         3       1.50         1340      7912   
1  5/2/2014 0:00  2384000.0         5       2.50         3650      9050   
2  5/2/2014 0:00   342000.0         3       2.00         1930     11947   
3  5/2/2014 0:00   420000.0         3       2.25         2000      8030   
4  5/2/2014 0:00   550000.0         4       2.50         1940     10500

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['city'].fillna('Unknown', inplace=True)


In [202]:
train.drop('date', axis=1, inplace=True)
test.drop('date',  axis=1, inplace=True)

KeyError: "['date'] not found in axis"

In [170]:
train.columns


Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement',
       'yr_built', 'yr_renovated', 'city_Algona', 'city_Auburn',
       'city_Beaux Arts Village', 'city_Bellevue', 'city_Black Diamond',
       'city_Bothell', 'city_Burien', 'city_Carnation', 'city_Clyde Hill',
       'city_Covington', 'city_Des Moines', 'city_Duvall', 'city_Enumclaw',
       'city_Fall City', 'city_Federal Way', 'city_Inglewood-Finn Hill',
       'city_Issaquah', 'city_Kenmore', 'city_Kent', 'city_Kirkland',
       'city_Lake Forest Park', 'city_Maple Valley', 'city_Medina',
       'city_Mercer Island', 'city_Milton', 'city_Newcastle',
       'city_Normandy Park', 'city_North Bend', 'city_Pacific', 'city_Preston',
       'city_Ravensdale', 'city_Redmond', 'city_Renton', 'city_Sammamish',
       'city_SeaTac', 'city_Seattle', 'city_Shoreline', 'city_Skykomish',
       'city_Snoqualmie', 'city_Snoqualmie Pass', 'city_Tu

In [172]:
test.columns

Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement',
       'yr_built', 'yr_renovated', 'city_Algona', 'city_Auburn',
       'city_Beaux Arts Village', 'city_Bellevue', 'city_Black Diamond',
       'city_Bothell', 'city_Burien', 'city_Carnation', 'city_Clyde Hill',
       'city_Covington', 'city_Des Moines', 'city_Duvall', 'city_Enumclaw',
       'city_Fall City', 'city_Federal Way', 'city_Inglewood-Finn Hill',
       'city_Issaquah', 'city_Kenmore', 'city_Kent', 'city_Kirkland',
       'city_Lake Forest Park', 'city_Maple Valley', 'city_Medina',
       'city_Mercer Island', 'city_Milton', 'city_Newcastle',
       'city_Normandy Park', 'city_North Bend', 'city_Pacific', 'city_Preston',
       'city_Ravensdale', 'city_Redmond', 'city_Renton', 'city_Sammamish',
       'city_SeaTac', 'city_Seattle', 'city_Shoreline', 'city_Skykomish',
       'city_Snoqualmie', 'city_Snoqualmie Pass', 'city_Tu

In [150]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

class DataPreprocessor(BaseEstimator, TransformerMixin):
    categorical_columns = train.select_dtypes(include=['object']).columns
    numerical_columns = train.select_dtypes(include=['int64', 'float64']).columns

    def fit(self, X, y=None): 
        # Create and fit simple imputer
        self.imputer = SimpleImputer(strategy='median')
        self.imputer.fit(X[self.numerical_columns])
        
        # Create and fit Standard Scaler 
        self.scaler = StandardScaler()
        self.scaler.fit(X[self.numerical_columns]) 
        
        # Create and fit one hot encoder
        self.onehot = OneHotEncoder(handle_unknown='ignore')
        self.onehot.fit(X[self.categorical_columns])
        
        return self 
 

    def transform(self, X): 
        # Apply simple imputer 
        imputed_cols = self.imputer.transform(X[self.numerical_columns])
        onehot_cols = self.onehot.transform(X[self.categorical_columns])
        
        # Copy the df 
        transformed_df = X.copy()
         
        # Apply transformed columns
        transformed_df[self.numerical_columns] = imputed_cols
        transformed_df[self.numerical_columns] = self.scaler.transform(transformed_df[self.numerical_columns])        
        
        # Drop existing categorical columns and replace with one hot equivalent
        transformed_df = transformed_df.drop(self.categorical_columns, axis=1) 
        transformed_df[self.onehot.get_feature_names_out()] = onehot_cols.toarray().astype(int)
        
        return transformed_df

In [200]:
preprocessor = DataPreprocessor()
preprocessor.fit(train)
train_fixed = preprocessor.transform(train)

KeyError: "None of [Index(['date', 'country'], dtype='object')] are in the [columns]"

In [154]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
rfr = make_pipeline(DataPreprocessor(), RandomForestRegressor(n_estimators=50))

In [156]:
params = rfr.get_params()
params 

{'memory': None,
 'steps': [('datapreprocessor', DataPreprocessor()),
  ('randomforestregressor', RandomForestRegressor(n_estimators=50))],
 'verbose': False,
 'datapreprocessor': DataPreprocessor(),
 'randomforestregressor': RandomForestRegressor(n_estimators=50),
 'randomforestregressor__bootstrap': True,
 'randomforestregressor__ccp_alpha': 0.0,
 'randomforestregressor__criterion': 'squared_error',
 'randomforestregressor__max_depth': None,
 'randomforestregressor__max_features': 1.0,
 'randomforestregressor__max_leaf_nodes': None,
 'randomforestregressor__max_samples': None,
 'randomforestregressor__min_impurity_decrease': 0.0,
 'randomforestregressor__min_samples_leaf': 1,
 'randomforestregressor__min_samples_split': 2,
 'randomforestregressor__min_weight_fraction_leaf': 0.0,
 'randomforestregressor__n_estimators': 50,
 'randomforestregressor__n_jobs': None,
 'randomforestregressor__oob_score': False,
 'randomforestregressor__random_state': None,
 'randomforestregressor__verbose':

In [176]:
y_train = train["price"]
X_train = train.drop("price",axis=1)
y_test = test['price']
X_test = test.drop('price', axis=1)

In [178]:
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)

In [179]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
y_train_hat=rfr.predict(X_train)



In [182]:
rmse = mean_squared_error(y_train, y_train_hat)

In [184]:
mae = mean_absolute_error(y_train, y_train_hat)

In [186]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
y_test_hat=rfr.predict(X_test)


In [188]:
mean_squared_error(y_test, y_test_hat,squared=False)


263852.01278111385

In [190]:
r2 = r2_score(y_train, y_train_hat)

In [192]:
mean_absolute_error(y_test, y_test_hat)

127796.55544685041

In [194]:
from sklearn.metrics import r2_score

In [196]:
r2_score(y_test, y_test_hat)

0.4497157723498919

In [68]:
test.columns

Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement',
       'yr_built', 'yr_renovated', 'country', 'city_Algona', 'city_Auburn',
       'city_Beaux Arts Village', 'city_Bellevue', 'city_Black Diamond',
       'city_Bothell', 'city_Burien', 'city_Carnation', 'city_Clyde Hill',
       'city_Covington', 'city_Des Moines', 'city_Duvall', 'city_Enumclaw',
       'city_Fall City', 'city_Federal Way', 'city_Inglewood-Finn Hill',
       'city_Issaquah', 'city_Kenmore', 'city_Kent', 'city_Kirkland',
       'city_Lake Forest Park', 'city_Maple Valley', 'city_Medina',
       'city_Mercer Island', 'city_Milton', 'city_Newcastle',
       'city_Normandy Park', 'city_North Bend', 'city_Pacific', 'city_Preston',
       'city_Ravensdale', 'city_Redmond', 'city_Renton', 'city_Sammamish',
       'city_SeaTac', 'city_Seattle', 'city_Shoreline', 'city_Skykomish',
       'city_Snoqualmie', 'city_Snoqualmie Pass

In [164]:
train.drop('country', axis=1, inplace=True)
test.drop('country',  axis=1, inplace=True)

In [212]:
y_train.columns


AttributeError: 'Series' object has no attribute 'columns'

In [216]:
import mlflow
from mlflow.models import infer_signature

# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Create a new MLflow Experiment
mlflow.set_experiment("Housing Price Prediction")
experiment_name = "RANDOM_FOREST_REGRESSI0N"
try:
    mlflow.set_experiment(experiment_name)
except MlflowException as e:
    if "RESOURCE_DOES_NOT_EXIST" in str(e):
        mlflow.create_experiment(experiment_name)
        mlflow.set_experiment(experiment_name)
    else:
        raise e
# Start an MLflow run

model = RandomForestRegressor()
model.fit(X_train, y_train)

with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(params)

    # Log metrics
    mlflow.log_metric("root_mean_squared_error", rmse)
    mlflow.log_metric("mean_absolute_error", mae)
    mlflow.log_metric("r2_score", r2)

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Model Info", "RandomForestRegressor for housing data")

    # Infer the model signature
    signature = infer_signature(X_train, model.predict(X_train))

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="housing_model",
        signature=signature,
        registered_model_name="rfr_housing_model",
    )

Successfully registered model 'rfr_housing_model'.
2024/05/15 22:59:02 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: rfr_housing_model, version 1
Created version '1' of model 'rfr_housing_model'.


In [230]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB 217.9 kB/s eta 0:07:38
   ---------------------------------------- 0.1/99.8 MB 469.7 kB/s eta 0:03:33
   ---------------------------------------- 0.3/99.8 MB 1.9 MB/s eta 0:00:53
   ---------------------------------------- 1.2/99.8 MB 5.2 MB/s eta 0:00:19
   - -------------------------------------- 2.5/99.8 MB 10.0 MB/s eta 0:00:10
   - -------------------------------------- 4.1/99.8 MB 13.8 MB/s eta 0:00:07
   -- ------------------------------------- 6.0/99.8 MB 17.5 MB/s eta 0:00:06
   --- ------------------------------------ 8.1/99.8 MB 20.0 MB/s eta 0:00:05
   --- ------------------------------------ 10.0/99.8 MB 21.9 MB/s eta 0:00:05
   ---- --

In [232]:
import mlflow
from mlflow.models import infer_signature
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Create a new MLflow Experiment
experiment_name = "Housing Price Prediction"
try:
    mlflow.set_experiment(experiment_name)
except mlflow.exceptions.MlflowException as e:
    if "RESOURCE_DOES_NOT_EXIST" in str(e):
        mlflow.create_experiment(experiment_name)
        mlflow.set_experiment(experiment_name)
    else:
        raise e

# Define the model
model = XGBRegressor()

# Fit the model
model.fit(X_train, y_train)

# Start an MLflow run
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(model.get_params())

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate and log metrics
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    mlflow.log_metric("root_mean_squared_error", rmse)
    mlflow.log_metric("mean_absolute_error", mae)
    mlflow.log_metric("r2_score", r2)

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Model Info", "XGBRegressor for housing data")

    # Infer the model signature
    signature = infer_signature(X_train, model.predict(X_train))

    # Log the model
    model_info = mlflow.xgboost.log_model(
        xgb_model=model,
        artifact_path="housing_model",
        signature=signature,
        registered_model_name="xgb_housing_model",
    )


Successfully registered model 'xgb_housing_model'.
2024/05/15 23:26:20 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: xgb_housing_model, version 1
Created version '1' of model 'xgb_housing_model'.


In [238]:
import mlflow
from mlflow.models import infer_signature
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Create a new MLflow Experiment
experiment_name = "Housing Price Prediction"
try:
    mlflow.set_experiment(experiment_name)
except mlflow.exceptions.MlflowException as e:
    if "RESOURCE_DOES_NOT_EXIST" in str(e):
        mlflow.create_experiment(experiment_name)
        mlflow.set_experiment(experiment_name)
    else:
        raise e

# Define the model using BaggingRegressor
model = BaggingRegressor()

# Fit the model
model.fit(X_train, y_train)

# Start an MLflow run
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(model.get_params())

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate and log metrics
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    mlflow.log_metric("root_mean_squared_error", rmse)
    mlflow.log_metric("mean_absolute_error", mae)
    mlflow.log_metric("r2_score", r2)

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Model Info", "BaggingRegressor for housing data")

    # Infer the model signature
    signature = infer_signature(X_train, model.predict(X_train))

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="housing_model",
        signature=signature,
        registered_model_name="bagging_housing_model",
    )


Successfully registered model 'bagging_housing_model'.
2024/05/15 23:40:13 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: bagging_housing_model, version 1
Created version '1' of model 'bagging_housing_model'.


In [240]:
train.drop('view', axis=1, inplace=True)
test.drop('view',  axis=1, inplace=True)
train.drop('waterfront', axis=1, inplace=True)
test.drop('waterfront',  axis=1, inplace=True)

In [242]:
import mlflow
from mlflow.models import infer_signature
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Create a new MLflow Experiment
experiment_name = "Housing Price Prediction"
try:
    mlflow.set_experiment(experiment_name)
except mlflow.exceptions.MlflowException as e:
    if "RESOURCE_DOES_NOT_EXIST" in str(e):
        mlflow.create_experiment(experiment_name)
        mlflow.set_experiment(experiment_name)
    else:
        raise e

# Define the model using GradientBoostingRegressor
model = GradientBoostingRegressor()

# Fit the model
model.fit(X_train, y_train)

# Start an MLflow run
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(model.get_params())

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate and log metrics
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    mlflow.log_metric("root_mean_squared_error", rmse)
    mlflow.log_metric("mean_absolute_error", mae)
    mlflow.log_metric("r2_score", r2)

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Model Info", "GradientBoostingRegressor for housing data")

    # Infer the model signature
    signature = infer_signature(X_train, model.predict(X_train))

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="housing_model",
        signature=signature,
        registered_model_name="gbr_housing_model",
    )


Successfully registered model 'gbr_housing_model'.
2024/05/15 23:47:04 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: gbr_housing_model, version 1
Created version '1' of model 'gbr_housing_model'.
