In [1]:
import os
import pickle

In [2]:
import pandas as pd

In [3]:
import seaborn as sns
import matplotlib as plt

In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression,Lasso,Ridge

from sklearn.metrics import mean_squared_error
import sklearn

In [5]:
import mlflow


# Use a local SQLite database file as the MLflow tracking backend.
mlflow.set_tracking_uri(uri="sqlite:///mide.db")

In [14]:

# Activate the first experiment (the logged output below shows it is created when missing).
mlflow.set_experiment(experiment_name="mide_first_experiment")


2022/05/28 17:21:06 INFO mlflow.tracking.fluent: Experiment with name 'mide_first_experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='mide_first_experiment', tags={}>

In [6]:
def read_dataframe(filename):
    """Load a trip-data parquet file and prepare it for modelling.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps
    only trips lasting between 1 and 60 minutes inclusive, and casts the
    pickup/drop-off location IDs to strings so they can be treated as
    categorical features.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file; it must contain the columns
        ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        An independent, filtered frame with the extra ``duration`` column.
    """
    data = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    trip_time = data.lpep_dropoff_datetime - data.lpep_pickup_datetime
    data['duration'] = trip_time.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows: assigning columns onto a
    # boolean-indexed slice otherwise triggers pandas' SettingWithCopyWarning.
    data = data[(data.duration >= 1) & (data.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    data[categorical] = data[categorical].astype(str)

    return data
    

In [7]:
# January 2021 trips are used for training, February 2021 for validation.
train_path = './data/green_tripdata_2021-01.parquet'
val_path = './data/green_tripdata_2021-02.parquet'

df_train = read_dataframe(train_path)
df_val = read_dataframe(val_path)

In [8]:
# Row counts of the training and validation frames after filtering.
df_train.shape[0], df_val.shape[0]

(73908, 61921)

In [9]:

# Combine pickup and drop-off location IDs into one "<pickup>_<dropoff>" feature
# on both the training and the validation frame.
for frame in (df_train, df_val):
    frame['PU_DO'] = frame['PULocationID'] + '_' + frame['DOLocationID']

In [10]:
# Feature columns: the combined pickup/drop-off pair plus the trip distance.
categorical = ['PU_DO']  # alternative: 'PULocationID', 'DOLocationID' as separate features
numerical = ['trip_distance']
feature_columns = categorical + numerical

# Convert each row into a {column: value} record for the vectorizer.
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# Fit the vectorizer on the training records only; the validation records are
# encoded with the mapping learned from the training data.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [11]:
# Regression target: trip duration in minutes, as plain NumPy arrays.
target = 'duration'
y_train, y_val = (frame[target].to_numpy() for frame in (df_train, df_val))

## MLflow Tracking

In [15]:

# Train each linear baseline in its own MLflow run, logging the data paths,
# the model name, the validation RMSE, the pickled preprocessor and the model.

# The fitted DictVectorizer is identical for every run, so pickle it once
# before the loop. Create the target folder first: open() raises
# FileNotFoundError when "models/" does not exist yet.
os.makedirs("models", exist_ok=True)
with open("models/preprocesser.b","wb") as pre_out:
    pickle.dump(dv,pre_out)

for model_class in (LinearRegression, Lasso, Ridge):

    # Derive the label from the class itself instead of a parallel list of names.
    name = model_class.__name__

    with mlflow.start_run():

        mlflow.set_tag("developer","olamide")

        mlflow.log_param("train_data_path","./data/green_tripdata_2021-01.parquet")

        mlflow.log_param("test_data_path","./data/green_tripdata_2021-02.parquet")

        mlflow.log_param("model",name)

        lr = model_class()
        lr.fit(X_train, y_train)

        y_pred = lr.predict(X_val)

        # Root mean squared error on the validation set. Taking the square root
        # explicitly avoids the `squared=False` argument, which newer
        # scikit-learn releases deprecate and then remove.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5

        # Logged as 'rmse' (previously mislabelled 'mse' although the value was
        # already the root of the MSE); this matches the later experiments.
        mlflow.log_metric('rmse',rmse)

        mlflow.log_artifact("models/preprocesser.b",artifact_path="preprocessor")

        mlflow.sklearn.log_model(lr, artifact_path="models_mlflow")

        print(f'successfully ran {name} model =======================================================> ')



# MLflow Prediction

In [16]:
# URI of the "models_mlflow" artifact of one specific, hard-coded run ID.
# NOTE(review): this run must exist in the configured tracking store; a fresh
# re-run of the training cell produces different run IDs.
logged_model = 'runs:/d8d8b51e1484439b89b52688a5ead3ff/models_mlflow'

# Load the artifact through the generic pyfunc interface (yields a PyFuncModel).
loaded_model = mlflow.pyfunc.load_model(model_uri=logged_model)


In [17]:
# Display the loaded pyfunc wrapper's summary (artifact path, flavor, run ID).
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.sklearn
  run_id: d8d8b51e1484439b89b52688a5ead3ff

In [18]:
# Score the vectorized validation features with the reloaded model.
predicted = loaded_model.predict(data=X_val)

In [19]:
# Inspect the predictions returned by the reloaded model for the validation features.
predicted

array([12.78586593, 10.48794163, 13.30708888, ..., 12.68717523,
        5.11874822,  6.76573538])

# Experiment two

In [20]:
# Switch the active experiment; subsequent runs are recorded under this name.
mlflow.set_experiment(experiment_name="mide_second_experiment")

2022/05/28 17:22:05 INFO mlflow.tracking.fluent: Experiment with name 'mide_second_experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='./mlruns/2', experiment_id='2', lifecycle_stage='active', name='mide_second_experiment', tags={}>

In [12]:
# Second experiment: the same linear baselines, with scikit-learn autologging
# recording parameters and models automatically on fit().
mlflow.sklearn.autolog()

# Select the experiment by name instead of a hard-coded numeric ID: IDs are
# assigned by the tracking store and differ between stores, whereas
# set_experiment() resolves the name and creates the experiment if missing.
mlflow.set_experiment("mide_second_experiment")

for model_class in (LinearRegression,Lasso,Ridge):

    with mlflow.start_run():

        # Log the files that were actually loaded (parquet, not csv).
        mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.parquet")
        mlflow.log_param("valid-data-path", "./data/green_tripdata_2021-02.parquet")

        # Requires models/preprocesser.b to have been written by the
        # first-experiment cell earlier in the notebook.
        mlflow.log_artifact("models/preprocesser.b", artifact_path="preprocessor")

        mlmodel = model_class()
        mlmodel.fit(X_train, y_train)

        y_pred = mlmodel.predict(X_val)

        # Explicit square root instead of `squared=False`, which newer
        # scikit-learn releases deprecate and then remove.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)



In [18]:
# Switch the active experiment for the tree-ensemble / SVR runs.
# NOTE(review): the name contains a double underscore; it is kept as-is because
# a different name would select a different experiment.
mlflow.set_experiment(experiment_name="mide_third__experiment")

2022/05/28 18:32:14 INFO mlflow.tracking.fluent: Experiment with name 'mide_third__experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='./mlruns/4', experiment_id='4', lifecycle_stage='active', name='mide_third__experiment', tags={}>

In [19]:
from sklearn.ensemble import RandomForestRegressor ,GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR

# Third experiment: tree ensembles and a linear SVR, with scikit-learn
# autologging recording parameters and models automatically on fit().
mlflow.sklearn.autolog()

# Select the experiment by name instead of a hard-coded numeric ID: IDs are
# assigned by the tracking store and differ between stores, whereas
# set_experiment() resolves the name and creates the experiment if missing.
mlflow.set_experiment("mide_third__experiment")

# All four estimators are stochastic; a fixed seed makes reruns comparable.
SEED = 42

for model_class in (RandomForestRegressor ,GradientBoostingRegressor, ExtraTreesRegressor,LinearSVR):

    with mlflow.start_run():

        # Log the files that were actually loaded (parquet, not csv).
        mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.parquet")
        mlflow.log_param("valid-data-path", "./data/green_tripdata_2021-02.parquet")

        # Requires models/preprocesser.b to have been written by the
        # first-experiment cell earlier in the notebook.
        mlflow.log_artifact("models/preprocesser.b", artifact_path="preprocessor")

        mlmodel = model_class(random_state=SEED)
        mlmodel.fit(X_train, y_train)

        y_pred = mlmodel.predict(X_val)

        # Explicit square root instead of `squared=False`, which newer
        # scikit-learn releases deprecate and then remove.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)

