# House Sales in King Country, USA

## Using MLFLOW 

### Preprocessing the dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


data = pd.read_csv('kc_house_data.csv')
data = pd.get_dummies(data, drop_first=True)
X = data.drop('price', axis=1)
y = data['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [2]:
X

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,date_20150508T000000,date_20150509T000000,date_20150510T000000,date_20150511T000000,date_20150512T000000,date_20150513T000000,date_20150514T000000,date_20150515T000000,date_20150524T000000,date_20150527T000000
0,7129300520,3,1.00,1180,5650,1.0,0,0,3,7,...,False,False,False,False,False,False,False,False,False,False
1,6414100192,3,2.25,2570,7242,2.0,0,0,3,7,...,False,False,False,False,False,False,False,False,False,False
2,5631500400,2,1.00,770,10000,1.0,0,0,3,6,...,False,False,False,False,False,False,False,False,False,False
3,2487200875,4,3.00,1960,5000,1.0,0,0,5,7,...,False,False,False,False,False,False,False,False,False,False
4,1954400510,3,2.00,1680,8080,1.0,0,0,3,8,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,3,2.50,1530,1131,3.0,0,0,3,8,...,False,False,False,False,False,False,False,False,False,False
21609,6600060120,4,2.50,2310,5813,2.0,0,0,3,8,...,False,False,False,False,False,False,False,False,False,False
21610,1523300141,2,0.75,1020,1350,2.0,0,0,3,7,...,False,False,False,False,False,False,False,False,False,False
21611,291310100,3,2.50,1600,2388,2.0,0,0,3,8,...,False,False,False,False,False,False,False,False,False,False


In [3]:
y

0        221900.0
1        538000.0
2        180000.0
3        604000.0
4        510000.0
           ...   
21608    360000.0
21609    400000.0
21610    402101.0
21611    400000.0
21612    325000.0
Name: price, Length: 21613, dtype: float64

###   Train the ML models

In [4]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'RandomForest': RandomForestRegressor(),
    'GradientBoosting': GradientBoostingRegressor(),
}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    predictions = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, predictions)
    print(f'{name} MSE: {mse}')


LinearRegression MSE: 6.016735988668334e+32
Ridge MSE: 45878422643.50392
RandomForest MSE: 21249184544.23715
GradientBoosting MSE: 22271469626.60548


### Track model performance

In [None]:
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Define the models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor()
}

# Initialize an MLflow experiment
mlflow.start_run()

# Loop through the models
for name, model in models.items():
    # Train the model
    model.fit(X_train_scaled, y_train)

    # Make predictions on the test data
    predictions = model.predict(X_test_scaled)

    # Calculate Mean Squared Error (MSE)
    mse = mean_squared_error(y_test, predictions)

    # Calculate Mean Absolute Error (MAE)
    mae = mean_absolute_error(y_test, predictions)

    # Calculate R-squared (R2) score
    r2 = r2_score(y_test, predictions)

    # Log the model parameters and metrics to the MLflow run
    with mlflow.start_run(nested=True):
        mlflow.log_params({
            'model_name': name,
        })
        mlflow.log_metrics({
            'mse': mse,
            'mae': mae,
            'r2': r2
        })

        # Log the model itself
        mlflow.sklearn.log_model(model, "model")

# After running your code, start the MLflow dashboard
# Use the desired port, e.g., 1234
!mlflow ui --port 1234




[2023-12-01 18:27:46 +0100] [46011] [INFO] Starting gunicorn 21.2.0
[2023-12-01 18:27:46 +0100] [46011] [INFO] Listening at: http://127.0.0.1:1234 (46011)
[2023-12-01 18:27:46 +0100] [46011] [INFO] Using worker: sync
[2023-12-01 18:27:46 +0100] [46012] [INFO] Booting worker with pid: 46012
[2023-12-01 18:27:46 +0100] [46013] [INFO] Booting worker with pid: 46013
[2023-12-01 18:27:46 +0100] [46014] [INFO] Booting worker with pid: 46014
[2023-12-01 18:27:46 +0100] [46015] [INFO] Booting worker with pid: 46015


In [19]:
mlflow.end_run()

### ONNX format

In [4]:
from sklearn.ensemble import RandomForestRegressor

random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)  # You can adjust n_estimators as needed

random_forest_model.fit(X_train_scaled, y_train)


In [None]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
import onnx

# Convert the Random Forest model to ONNX format
initial_type = [('float_input', FloatTensorType([None, X_train_scaled.shape[1]]))]
onnx_model = convert_sklearn(random_forest_model, initial_types=initial_type, target_opset=15) 
# Set the target opset to 15, or the opset that is compatible with your Random Forest model

# Save the model in ONNX format
onnx_file_path = 'random_forest_regression.onnx'
with open(onnx_file_path, "wb") as f:
    f.write(onnx_model.SerializeToString())

print(f"Random Forest Regression model saved in ONNX format at {onnx_file_path}")


In [36]:
import joblib

# After scaling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler to a pickle file
with open('scaler.pkl', 'wb') as f:
    joblib.dump(scaler, f)
