In [2]:
#!/usr/bin/env python
# coding: utf-8

## Importing Libraries

In[ ]:

In [3]:
import os
import warnings
import sys

In [4]:
import tarfile
import urllib

In [5]:
import pandas as pd
import numpy as np

In [6]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet

In [8]:
#!pip install mlflow

Collecting mlflow
  Downloading mlflow-1.30.1-py3-none-any.whl (17.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.0/17.0 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
Collecting cloudpickle<3
  Using cached cloudpickle-2.2.1-py3-none-any.whl (25 kB)
Collecting alembic<2
  Downloading alembic-1.11.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting Flask<3
  Downloading Flask-2.2.5-py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.8/101.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting gunicorn<21
  Using cached gunicorn-20.1.0-py3-none-any.whl (79 kB)
Collecting protobuf<5,>=3.12.0
  Downloading protobuf-4.23.2-cp37-abi3-manylinux2014_x86_64.whl (304 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m304.5/304.5 kB[0m [31m2.7 MB/s

In [9]:

import mlflow
import mlflow.sklearn

## Loading the data

In[ ]:

In [10]:
# Where the dataset archive lives remotely, and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = f"{DOWNLOAD_ROOT}datasets/housing/housing.tgz"

In[ ]:

In [11]:
def fetch_housing_data(housing_url=HOUSING_URL,housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Args:
        housing_url: URL of the ``housing.tgz`` archive.
        housing_path: Local directory that receives both the archive and its
            extracted contents; created if it does not exist.

    The archive is downloaded on every call, even if it is already present.
    """
    # A bare `import urllib` does not guarantee that the `request` submodule
    # is loaded, so import it explicitly where it is used.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        # NOTE(review): extractall() on a downloaded archive trusts its member
        # paths; consider an extraction filter if the source is not trusted.
        housing_tgz.extractall(path=housing_path)

In[ ]:

In [12]:
# Download and unpack the dataset; the archive is re-fetched each time this cell runs.
fetch_housing_data(housing_url=HOUSING_URL,housing_path=HOUSING_PATH)

In[ ]:

In [13]:
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In[ ]:

In [14]:
# Read the extracted housing.csv into a DataFrame for inspection.
housing = load_housing_data(housing_path=HOUSING_PATH)

In[ ]:

In [15]:
# Preview the first rows to check that the columns loaded as expected.
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Setting up the MLflow tracking server

In[ ]:

In [16]:
# Point the MLflow client at the tracking server.
remote_server_uri = "http://localhost:5000" # replace with the URI of your own tracking server
mlflow.set_tracking_uri(remote_server_uri)

In[ ]:

In [17]:
# Confirm which tracking URI is currently active.
mlflow.tracking.get_tracking_uri()

'http://localhost:5000'

In[ ]:

In [16]:
# Select the experiment used for the runs started after this cell.
exp_name = "ElasticNet_house"
mlflow.set_experiment(exp_name)

2023/01/16 14:13:03 INFO mlflow.tracking.fluent: Experiment with name 'ElasticNet_house' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlruns/158930113435695454', creation_time=1673858589900, experiment_id='158930113435695454', last_update_time=1673858589900, lifecycle_stage='active', name='ElasticNet_house', tags={}>

## Tracking parameters and metrics with MLflow

In[ ]:

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [18]:
# Column positions read by CombinedAttributesAdder when it indexes the feature array.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6

In [19]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends per-household ratio columns to a 2-D array.

    The input is indexed positionally using the module-level ``rooms_ix``,
    ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

    Args:
        add_bedrooms_per_room: When true, a bedrooms-per-room column is
            appended after the two per-household columns.
    """

    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns appended on the right."""
        rooms = X[:, rooms_ix]
        households = X[:, households_ix]
        # Appended order: rooms/household, population/household, then the
        # optional bedrooms/room column.
        extra_columns = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            extra_columns.append(X[:, bedrooms_ix] / rooms)
        return np.c_[(X, *extra_columns)]

In [20]:
# Numeric preprocessing: median imputation, ratio features, then standard scaling.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

In [21]:
def eval_metrics(actual, pred):
    """Score predictions against the true values.

    Returns:
        A ``(rmse, mae, r2)`` tuple.
    """
    mse = mean_squared_error(actual, pred)
    return np.sqrt(mse), mean_absolute_error(actual, pred), r2_score(actual, pred)

In [22]:
def _prepare_training_data(housing):
    """Turn the raw housing frame into a model-ready training matrix and labels.

    Takes an income-stratified 80/20 split (only the training part is used),
    separates the ``median_house_value`` target, then fits a ColumnTransformer
    that applies ``num_pipeline`` to the numeric columns and one-hot encodes
    ``ocean_proximity``.

    Returns:
        A ``(housing_prepared, housing_labels)`` tuple.
    """
    # Stratify on binned median income. The bins are passed directly as the
    # stratification target instead of being added to the frame and dropped.
    income_cat = pd.cut(housing["median_income"],
                        bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                        labels=[1, 2, 3, 4, 5])
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_index, _ = next(split.split(housing, income_cat))
    strat_train_set = housing.loc[train_index]

    features = strat_train_set.drop("median_house_value", axis=1)
    housing_labels = strat_train_set["median_house_value"].copy()

    cat_attribs = ["ocean_proximity"]
    num_attribs = list(features.drop(cat_attribs, axis=1))

    # Missing numeric values are filled with the column median by the imputer
    # step of num_pipeline, so no separate fillna pass is needed.
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])
    return full_pipeline.fit_transform(features), housing_labels


def train(alpha=0.5, l1_ratio=0.5):
    """Train an ElasticNet on the housing data and record the run in MLflow.

    A parent run named ``Main`` wraps three nested runs: ``Data_Preparation``,
    ``Model_Training`` and ``Model_Performance``. Hyper-parameters, metrics,
    the ``datasets`` directory and the fitted model are logged in the
    ``Model_Performance`` run.

    Args:
        alpha: ElasticNet regularisation strength.
        l1_ratio: ElasticNet mix between L1 and L2 penalties.

    NOTE(review): RMSE/MAE/R2 are computed on the same data the model was
    fitted on, so they are training metrics, not held-out performance.
    """
    # Silence warnings only for the duration of this call instead of changing
    # the process-wide warning filter permanently.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        np.random.seed(40)

        housing = load_housing_data(housing_path=HOUSING_PATH)

        with mlflow.start_run(run_name='Main'):
            mlflow.log_param("Main", "yes")

            with mlflow.start_run(run_name='Data_Preparation', nested=True):
                mlflow.log_param("Data Preparation", "yes")
                housing_prepared, housing_labels = _prepare_training_data(housing)

            with mlflow.start_run(run_name='Model_Training', nested=True):
                mlflow.log_param("Model Training", "yes")

                lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
                lr.fit(housing_prepared, housing_labels)

                predicted_qualities = lr.predict(housing_prepared)
                (rmse, mae, r2) = eval_metrics(housing_labels, predicted_qualities)

            with mlflow.start_run(run_name='Model_Performance', nested=True):
                mlflow.log_param("Model Performance", "yes")
                print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
                print("  RMSE: %s" % rmse)
                print("  MAE: %s" % mae)
                print("  R2: %s" % r2)

                mlflow.log_param(key="alpha", value=alpha)
                mlflow.log_param(key="l1_ratio", value=l1_ratio)
                mlflow.log_metric(key="rmse", value=rmse)
                mlflow.log_metrics({"mae": mae, "r2": r2})
                mlflow.log_artifact('datasets')
                print("Save to: {}".format(mlflow.get_artifact_uri()))
                mlflow.sklearn.log_model(lr, "model")

In[ ]:

In [23]:
# Baseline run using the function's default hyper-parameter values.
train(alpha=0.5, l1_ratio=0.5)

Elasticnet model (alpha=0.500000, l1_ratio=0.500000):
  RMSE: 73689.95546046467
  MAE: 54624.63812598483
  R2: 0.59433603239391
Save to: mlruns/158930113435695454/bb7df96da8d149aeadfb6bc0b3eb2a8f/artifacts


In[ ]:

In [24]:
# Second run with a smaller alpha and l1_ratio, for comparison with the baseline.
train(alpha=0.2, l1_ratio=0.2)

Elasticnet model (alpha=0.200000, l1_ratio=0.200000):
  RMSE: 71734.82432989801
  MAE: 52579.03875426437
  R2: 0.6155765038209369
Save to: mlruns/158930113435695454/0a1dc352e2e3414f8cb915f8bf4c4744/artifacts


In[ ]: