In [1]:
import os
import warnings
import sys
import tarfile


import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import OrdinalEncoder,StandardScaler

from six.moves import urllib

import mlflow
import mlflow.sklearn

In [2]:
# Location of the housing archive and the local directory it is unpacked into.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives both the downloaded ``housing.tgz`` and the
        extracted files. It is created (including parents) if missing.

    Raises
    ------
    ValueError
        If an archive member would be written outside ``housing_path``.
    """
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)

    dest_root = os.path.realpath(housing_path)
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        # The archive comes from the network: refuse member names such as
        # "../x" or absolute paths that would escape the destination.
        # NOTE(review): this validates member names only; link members are
        # not inspected here.
        for member in housing_tgz.getmembers():
            member_target = os.path.realpath(os.path.join(dest_root, member.name))
            if os.path.commonpath([dest_root, member_target]) != dest_root:
                raise ValueError(
                    "Refusing to extract %r outside of %r" % (member.name, housing_path)
                )
        housing_tgz.extractall(path=housing_path)


In [3]:
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))


In [4]:
# Download and unpack the dataset on first use so the notebook survives
# "Restart Kernel -> Run All" on a machine that has no local copy yet.
if not os.path.isfile(os.path.join(HOUSING_PATH, "housing.csv")):
    fetch_housing_data()
housing = load_housing_data()

In [5]:
# Add per-household / per-room ratio columns to `housing` (in place).
households = housing["households"]
total_rooms = housing["total_rooms"]
housing["rooms_per_household"] = total_rooms / households
housing["bedrooms_per_room"] = housing["total_bedrooms"] / total_rooms
housing["population_per_household"] = housing["population"] / households

In [6]:
# Feature columns whose dtype is not object, excluding the regression target.
num_col = [
    column
    for column, dtype in housing.dtypes.items()
    if dtype != 'O' and column != 'median_house_value'
]

In [7]:
# Display the columns of `housing` listed in num_col (no state is changed).
housing[num_col]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,6.984127,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,6.238137,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,8.288136,0.129516,2.802260
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,5.817352,0.184458,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,6.281853,0.172096,2.181467
...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,5.045455,0.224625,2.560606
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,6.114035,0.215208,3.122807
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,5.205543,0.215173,2.325635
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,5.329513,0.219892,2.123209


In [8]:
# Imputer configured with the "median" strategy; it is fitted in a later cell.
imputer = SimpleImputer(strategy="median")

In [9]:
# Fit the imputer on the num_col columns and rebuild a DataFrame with the same
# column names from the returned values.
# NOTE(review): the imputer is fitted on every row of `housing`, before the
# train/test split further down, so test rows contribute to the fitted values.
imputed_values = imputer.fit_transform(housing[num_col])
X = pd.DataFrame(data=imputed_values, columns=num_col)

In [10]:
# Display X after imputation.
X

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,6.984127,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,6.238137,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,8.288136,0.129516,2.802260
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,5.817352,0.184458,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,6.281853,0.172096,2.181467
...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,5.045455,0.224625,2.560606
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,6.114035,0.215208,3.122807
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,5.205543,0.215173,2.325635
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,5.329513,0.219892,2.123209


In [11]:
# Select ocean_proximity with a list key, which yields a one-column DataFrame.
cat=housing[['ocean_proximity']]

In [12]:
# Encoder used for the ocean_proximity column.
# NOTE(review): the resulting codes are fed to a linear model as a numeric
# feature, which implies an ordering between categories - confirm this is
# intended rather than one-hot encoding.
ordinal_encoder = OrdinalEncoder()

In [13]:
# Fit the encoder on `cat` and encode it in one step.
# NOTE(review): like the imputer, this is fitted before the train/test split.
housing_cat_encoded = ordinal_encoder.fit_transform(cat)

In [14]:
# Add the encoded category to X as a new feature column (mutates X in place).
X['ocean_proximity']=housing_cat_encoded

In [15]:
# Display X with the ocean_proximity column added.
X

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,6.984127,0.146591,2.555556,3.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,6.238137,0.155797,2.109842,3.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,8.288136,0.129516,2.802260,3.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,5.817352,0.184458,2.547945,3.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,6.281853,0.172096,2.181467,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,5.045455,0.224625,2.560606,1.0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,6.114035,0.215208,3.122807,1.0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,5.205543,0.215173,2.325635,1.0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,5.329513,0.219892,2.123209,1.0


In [16]:
# Regression target: the median_house_value column.
y=housing['median_house_value']

In [17]:
# Hold out 33% of the rows as a test set; random_state is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)

In [18]:
# Scaler for the feature matrix; fitted on the training split in the next cell.
s=StandardScaler()

In [19]:
# Fit the scaler on the training rows only and scale them.
# NOTE(review): this rebinds X_train to the scaled output, so re-running this
# cell alone would refit the scaler on already-scaled data.
X_train=s.fit_transform(X_train)

In [20]:
# Scale the test rows with the scaler fitted on the training split (no refit).
X_test=s.transform(X_test)

In [21]:
# Name of the MLflow experiment used by this notebook.
exp_name='housing_price_prediction'

In [23]:
# Make exp_name the active MLflow experiment for the runs started below.
mlflow.set_experiment(exp_name)

<Experiment: artifact_location='file:///mnt/c/Users/rushishwar.gattu/mlruns/2', experiment_id='2', lifecycle_stage='active', name='housing_price_prediction', tags={}>

In [63]:
def eval_metrics(actual, pred):
    """Score predictions against ground truth.

    Returns a ``(rmse, mae, r2)`` tuple computed from ``actual`` and ``pred``.
    """
    mse = mean_squared_error(actual, pred)
    return np.sqrt(mse), mean_absolute_error(actual, pred), r2_score(actual, pred)

def train(alpha=0.5, l1_ratio=0.5):
    """Fit an ElasticNet model and record the experiment in MLflow.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. One parent run is opened with two nested child runs:

    * the training child run logs ``alpha``, ``l1_ratio`` and the metrics
      ``train_rmse``, ``train_mae`` and ``r2`` measured on the training split;
    * the test child run logs ``test_rmse``, ``test_mae`` and ``r2`` measured
      on the test split.

    The fitted model is logged to the parent run under the artifact path
    ``"model"``, and the test-split metrics are printed.

    Parameters
    ----------
    alpha : float
        Passed to ``ElasticNet`` as ``alpha``.
    l1_ratio : float
        Passed to ``ElasticNet`` as ``l1_ratio``.

    Returns
    -------
    None
    """
    np.random.seed(40)

    # Silence warnings only while this function runs. The previous
    # warnings.filterwarnings("ignore") call stayed in effect for the rest of
    # the kernel session and hid warnings from every later cell.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        with mlflow.start_run(nested=True) as parent_run:
            mlflow.log_param("parent", "yes")
            mlflow.log_param("parent_run_id", parent_run.info.run_id)

            # Child run 1: fit the model and score it on the training split.
            # The run handle is not called `train`, so it no longer shadows
            # this function's own name.
            with mlflow.start_run(nested=True) as train_run:
                mlflow.log_param("train_runid", train_run.info.run_id)
                mlflow.log_param("train", "yes")
                lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
                lr.fit(X_train, y_train)
                train_rmse, train_mae, train_r2 = eval_metrics(
                    y_train, lr.predict(X_train)
                )
                print(train_rmse, train_mae, train_r2)
                mlflow.log_param(key="alpha", value=alpha)
                mlflow.log_param(key="l1_ratio", value=l1_ratio)
                mlflow.log_metric(key="train_rmse", value=train_rmse)
                mlflow.log_metrics({"train_mae": train_mae, "r2": train_r2})

            # Child run 2: score the same fitted model on the held-out split.
            with mlflow.start_run(nested=True) as test_run:
                mlflow.log_param("test_runid", test_run.info.run_id)
                mlflow.log_param("test", "yes")
                test_rmse, test_mae, test_r2 = eval_metrics(
                    y_test, lr.predict(X_test)
                )
                mlflow.log_metric(key="test_rmse", value=test_rmse)
                mlflow.log_metrics({"test_mae": test_mae, "r2": test_r2})

            # Print out metrics (these are the test-split values).
            print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
            print("  RMSE: %s" % test_rmse)
            print("  MAE: %s" % test_mae)
            print("  R2: %s" % test_r2)

            # The model artifact is attached to the parent run.
            mlflow.sklearn.log_model(lr, "model")
    
    

In [64]:
# Run one training/evaluation pass with alpha=0.5 and l1_ratio=0.8.
train(0.5,0.8)

yes
run
71810.32764791348 52853.431658699585 0.6125753343256262
Elasticnet model (alpha=0.500000, l1_ratio=0.800000):
  RMSE: 72300.70097937273
  MAE: 53139.2396350826
  R2: 0.6077274899638425


In [None]:
# NOTE(review): this cell has never been executed (In [None]) and cannot run as
# written: the first statement is at column 0 while the rest is indented, and
# alpha / l1_ratio are not defined at cell scope in the visible notebook. It
# looks like a leftover copy of an earlier train() body - remove it or fold it
# back into train().
lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(X_train,y_train)

        # Evaluate Metrics
        predicted_qualities = lr.predict(X_test)
        (rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)

        # Print out metrics
        print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        # Log parameter, metrics, and model to MLflow
        mlflow.log_param(key="alpha", value=alpha)
        mlflow.log_param(key="l1_ratio", value=l1_ratio)
        mlflow.log_metric(key="rmse", value=rmse)
        mlflow.log_metrics({"mae": mae, "r2": r2})
        mlflow.sklearn.log_model(lr, "model")

In [36]:
# Run one training/evaluation pass with alpha=0.5 and l1_ratio=0.5.
# NOTE(review): the saved output below carries execution count 36, earlier than
# the train() definition (63), and shows a RunInfo line that the current
# train() does not print - re-run this cell to refresh it.
train(0.5,0.5)

<RunInfo: artifact_uri='file:///mnt/c/Users/rushishwar.gattu/mlruns/2/63eb50e2a9a940878f0bb010751f0309/artifacts', end_time=None, experiment_id='2', lifecycle_stage='active', run_id='63eb50e2a9a940878f0bb010751f0309', run_uuid='63eb50e2a9a940878f0bb010751f0309', start_time=1653886740301, status='RUNNING', user_id='rushishwar'>
Elasticnet model (alpha=0.500000, l1_ratio=0.500000):
  RMSE: 76545.30039842239
  MAE: 57487.11554234574
  R2: 0.560316749198435
