In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error

In [3]:
import mlflow

In [4]:
import os
import pickle

In [5]:
# MLflow tracking backend: a SQLite database file.
TRACKING_URI = "sqlite:///ml_flow.db"
# Name of the experiment that groups the runs started below.
EXPERIMENT_NAME = "usman-practice-experiment-1"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)


<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='usman-practice-experiment-1', tags={}>

In [6]:
def read_dataframe(file_name):
    """Load a trip-record parquet file and prepare it for modelling.

    A ``duration`` column (trip length in minutes) is derived from the
    ``lpep_pickup_datetime`` and ``lpep_dropoff_datetime`` columns, only trips
    lasting between 1 and 60 minutes (inclusive) are kept, and the
    ``PULocationID`` / ``DOLocationID`` columns are cast to strings.

    Parameters
    ----------
    file_name : str or path-like
        Location of the parquet file; passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with the extra ``duration`` column. The original
        row index is preserved (it is not reset after filtering).
    """
    df = pd.read_parquet(file_name)

    # Vectorised timedelta -> minutes conversion instead of a per-row lambda.
    df['duration'] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60

    # Copy the filtered rows explicitly so the column assignment below writes
    # to an independent frame rather than to a slice of the unfiltered one
    # (this is what triggers pandas' SettingWithCopyWarning).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [7]:
# January trips are used for training, February trips for testing.
train_path = './Data/green_tripdata_2021-01.parquet'
test_path = './Data/green_tripdata_2021-02.parquet'

train_df = read_dataframe(train_path)
test_df = read_dataframe(test_path)

In [8]:
# Report the number of rows and columns of each split.
train_rows, train_cols = train_df.shape
test_rows, test_cols = test_df.shape
print(f'The size of the train set is: {train_rows} rows and {train_cols} columns')
print(f'The size of the test set is: {test_rows} rows and {test_cols} columns')

The size of the train set is: 73908 rows and 21 columns
The size of the test set is: 61921 rows and 21 columns


In [8]:
# Summarise the training frame: column names, non-null counts and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73908 entries, 0 to 76517
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   VendorID               73908 non-null  int64         
 1   lpep_pickup_datetime   73908 non-null  datetime64[ns]
 2   lpep_dropoff_datetime  73908 non-null  datetime64[ns]
 3   store_and_fwd_flag     38175 non-null  object        
 4   RatecodeID             38175 non-null  float64       
 5   PULocationID           73908 non-null  object        
 6   DOLocationID           73908 non-null  object        
 7   passenger_count        38175 non-null  float64       
 8   trip_distance          73908 non-null  float64       
 9   fare_amount            73908 non-null  float64       
 10  extra                  73908 non-null  float64       
 11  mta_tax                73908 non-null  float64       
 12  tip_amount             73908 non-null  float64       
 13  t

In [9]:
def preprocess_data(train_df, test_df):
    """Vectorise the train and test frames for modelling.

    A combined ``PU_DO`` feature (``PULocationID`` + ``'_'`` + ``DOLocationID``)
    is added to BOTH input frames in place. The ``PU_DO`` and ``trip_distance``
    columns are then turned into record dicts and encoded with a
    ``DictVectorizer`` that is fitted on the training records only.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames providing the ``PULocationID``, ``DOLocationID``,
        ``trip_distance`` and ``duration`` columns.

    Returns
    -------
    tuple
        ``(dv, X_train, y_train, X_test, y_test)``: the fitted vectorizer, the
        encoded feature matrices and the ``duration`` target arrays.
    """
    feature_columns = ['PU_DO', 'trip_distance']
    label_column = 'duration'

    # Note: this writes the new column onto the caller's frames.
    for frame in (train_df, test_df):
        frame['PU_DO'] = frame['PULocationID'] + '_' + frame['DOLocationID']

    def as_records(frame):
        # One {column: value} dict per row, restricted to the model features.
        return frame[feature_columns].to_dict(orient='records')

    dv = DictVectorizer()
    # Fit on the training records only; the test records reuse that mapping.
    X_train = dv.fit_transform(as_records(train_df))
    X_test = dv.transform(as_records(test_df))

    return dv, X_train, train_df[label_column].values, X_test, test_df[label_column].values


In [10]:
# Build the fitted vectorizer, the train/test feature matrices and the target arrays.
dv, X_train, y_train, X_test, y_test = preprocess_data(train_df, test_df)

In [11]:
## Train a linear regression model on the training data and evaluate it on the test data
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

# Take the square root of the MSE explicitly: the `squared=False` argument of
# mean_squared_error is deprecated in recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'rmse of the test_data: {rmse}')

rmse of the test_data: 7.758715199477344


In [12]:
# Persist the fitted DictVectorizer so it can be logged as an MLflow artifact.
# Create the target directory first: open() does not create missing folders,
# so the dump would fail with FileNotFoundError when ./models does not exist.
os.makedirs("./models", exist_ok=True)

with open("./models/preprocessor.b", 'wb') as processor:
    pickle.dump(dv, processor)

## MY FIRST EXPERIMENT 

In [13]:
## Use MLflow to track an experiment over four linear algorithms.
## Compare the error of plain Linear Regression with its regularised variants: Ridge Regression, LASSO and ElasticNet.

model_dict = {
    'lin_reg': LinearRegression,
    'ridge_reg': Ridge,
    'elastic_net': ElasticNet,
    'lasso': Lasso
}

# Resolve the experiment by name instead of hard-coding its numeric id: ids are
# assigned by the tracking backend, so "1" is only right for one particular store.
experiment = mlflow.set_experiment("usman-practice-experiment-1")

# One MLflow run per algorithm; each estimator is used with its default settings.
for model_name, model in model_dict.items():
    with mlflow.start_run(experiment_id=experiment.experiment_id):

        mlflow.set_tag('developer', 'Usman')

        mlflow.set_tag("model", model_name)

        mlflow.log_param('train-data-path', './Data/green_tripdata_2021-01.parquet')

        mlflow.log_param('test-data-path', './Data/green_tripdata_2021-02.parquet')

        # `reg` holds the estimator instance only; the name string has its own
        # variable so the two are never confused.
        reg = model()

        reg.fit(X_train, y_train)

        y_pred = reg.predict(X_test)

        mlflow.log_artifact("./models/preprocessor.b", artifact_path="preprocessor")

        # Explicit square root: the `squared=False` argument of
        # mean_squared_error is deprecated in recent scikit-learn releases.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        mlflow.log_metric('rmse', rmse)

## MY SECOND EXPERIMENT WITH AUTOLOG

In [14]:
# Make the second experiment the active one for the runs that follow.
mlflow.set_experiment("usman-practice-experiment-2")

<Experiment: artifact_location='./mlruns/2', experiment_id='2', lifecycle_stage='active', name='usman-practice-experiment-2', tags={}>

In [15]:
model_list = [
    LinearRegression, Lasso, Ridge, ElasticNet
]

# Enable MLflow's scikit-learn autologging for the fits below.
mlflow.sklearn.autolog()

# Resolve the experiment by name instead of hard-coding its numeric id: ids are
# assigned by the tracking backend, so "2" is only right for one particular store.
experiment = mlflow.set_experiment("usman-practice-experiment-2")

# One MLflow run per algorithm; each estimator is used with its default settings.
for model in model_list:
    with mlflow.start_run(experiment_id=experiment.experiment_id):

        mlflow.log_param('train-data-path', './Data/green_tripdata_2021-01.parquet')

        mlflow.log_param('test-data-path', './Data/green_tripdata_2021-02.parquet')

        mlflow.log_artifact("./models/preprocessor.b", artifact_path="preprocessor")

        reg = model()

        reg.fit(X_train, y_train)

        y_pred = reg.predict(X_test)

        # Explicit square root: the `squared=False` argument of
        # mean_squared_error is deprecated in recent scikit-learn releases.
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        # Test-set RMSE is logged by hand alongside whatever autolog records.
        mlflow.log_metric('rmse', rmse)

        




## LOAD MODEL AND TEST FOR PREDICTION

In [18]:
# URI of a model logged under a specific MLflow run.
# NOTE(review): the run id is hard-coded; presumably it only exists in the
# author's tracking store - replace it with a run id from your own runs.
logged_model = 'runs:/0f4f17495d994d5fa1ef936b91f741ef/model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

In [19]:
# Display the loaded model's summary (artifact path, flavor and run id).
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: model
  flavor: mlflow.sklearn
  run_id: 0f4f17495d994d5fa1ef936b91f741ef

In [20]:
# Score the test set with the model loaded from MLflow and show its RMSE.
y_pred = loaded_model.predict(X_test)
# Explicit square root: the `squared=False` argument of mean_squared_error is
# deprecated in recent scikit-learn releases.
np.sqrt(mean_squared_error(y_test, y_pred))

7.758715199477344