In [1]:
import os
import warnings
import sys

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split



In [4]:
import mlflow
import mlflow.sklearn

In [21]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_transformer

In [10]:
HOUSING_PATH = 'datasets/housing'
def load_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

In [11]:
housing = load_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [12]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [15]:
col_names = "total_rooms", "total_bedrooms", "population", "households"
rooms_ix, bedrooms_ix, population_ix, households_ix = [
    housing.columns.get_loc(c) for c in col_names]



class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
        
        

def train_test(data):
    housing=data
    train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
    housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(housing, housing["income_cat"]):
        strat_train_set = housing.loc[train_index]
        strat_test_set = housing.loc[test_index]
    
    return strat_train_set,strat_test_set


def data_prep(data):
    housing=data
    
    housing_labels = housing["median_house_value"].copy()
    housing = housing.drop("median_house_value", axis=1)
    
    
    housing_num = housing.drop("ocean_proximity", axis=1)
    
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
    
    
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]

    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])
    
    housing_prepared = full_pipeline.fit_transform(housing)
    
    return housing_prepared,housing_labels

In [16]:
remote_server_uri = "http://127.0.0.1:5000"
mlflow.set_tracking_uri(remote_server_uri)

In [17]:
mlflow.tracking.get_tracking_uri()

'http://127.0.0.1:5000'

In [18]:
exp_name = "Housing_exp"
mlflow.set_experiment(exp_name)

INFO: 'Housing_exp' does not exist. Creating a new experiment


In [22]:
def eval_metrics(actual, pred):
    # compute relevant metrics
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2



def train(max_features=5, n_estimators=25):
    warnings.filterwarnings("ignore")

    data_path = "datasets/housing/housing.csv"

    # Useful for multiple runs (only doing one run in this sample notebook)    
    with mlflow.start_run(run_name='PARENT_housing') as parent_run:
        mlflow.log_param("parent", "yes")
        with mlflow.start_run(run_name='CHILD_DATA_PREP', nested=True) as child_run:
            mlflow.log_param("child_dataprep", "yes")
            data = load_data()
    
            train_data, test_data = train_test(data)
    
            train_x, train_y = data_prep(train_data)
    
            test_x, test_y = data_prep(test_data)
        
        with mlflow.start_run(run_name='CHILD_TRAIN_MODEL', nested=True) as child_run:
            mlflow.log_param("child_trainmodel", "yes")
        
            forest_reg=RandomForestRegressor(max_features=max_features, n_estimators=n_estimators, random_state=42)
       
            forest_reg.fit(test_x, test_y)
        
        with mlflow.start_run(run_name='CHILD_SCORING', nested=True) as child_run:
            mlflow.log_param("child_scoring", "yes")
        
            # Evaluate Metrics
            predicted_qualities = forest_reg.predict(test_x)
            (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)
    
        
        # Print out metrics
        print("Elasticnet model (max_features=%f, n_estimators=%f):" % (max_features, n_estimators))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Log parameter, metrics, and model to MLflow
        mlflow.log_param(key="max_features", value=max_features)
        mlflow.log_param(key="n_estimators", value=n_estimators)
        mlflow.log_metric(key="rmse", value=rmse)
        mlflow.log_metrics({"mae": mae, "r2": r2})
        mlflow.log_artifact(data_path)
        print("Save to: {}".format(mlflow.get_artifact_uri()))
        
        mlflow.sklearn.log_model(forest_reg, "model")

In [23]:
train(5, 30)

Elasticnet model (max_features=5.000000, n_estimators=30.000000):
  RMSE: 20831.96844633584
  MAE: 14203.298183139535
  R2: 0.966702531284177
Save to: mlruns/2/1915dd78a4bd4cc6ab11afd444269ef1/artifacts


In [24]:
train(8, 25)

Elasticnet model (max_features=8.000000, n_estimators=25.000000):
  RMSE: 21029.779321999176
  MAE: 14268.797916666666
  R2: 0.9660671738388706
Save to: mlruns/2/e31683711c5840318ca51bf09d05acdd/artifacts


In [25]:
train()

Elasticnet model (max_features=5.000000, n_estimators=25.000000):
  RMSE: 21050.25312854327
  MAE: 14312.002209302327
  R2: 0.9660010702158843
Save to: mlruns/2/69fb8dd4f7cf42cdb61ad92f976fca58/artifacts


In [26]:
train(8, 8)

Elasticnet model (max_features=8.000000, n_estimators=8.000000):
  RMSE: 24891.862213969343
  MAE: 15817.338299418605
  R2: 0.9524593212382285
Save to: mlruns/2/65a2e3522285491badacc3d5e22fc818/artifacts
