# Packages

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score


import mlflow
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.linear_model import LinearRegression

from google.cloud import storage


# Global Variable Declaration

In [2]:
# Host and port of the MLflow tracking server. Both are kept as strings so they
# can be interpolated directly into the tracking URL.
TRACKING_SERVER_HOST = "0.0.0.0"
TRACKING_SERVER_PORT = "5000"
# Parquet files used as the training and validation sets.
# NOTE(review): these are absolute, machine-specific paths; the notebook will not
# run elsewhere without editing them.
TRAIN_DATA_PATH = "/home/Pranoy/MLOPS-Practice/data/yellow_tripdata_2023-01.parquet"
VALID_DATA_PATH = "/home/Pranoy/MLOPS-Practice/data/yellow_tripdata_2023-02.parquet"
# Name of the MLflow experiment that all runs in this notebook are logged under.
EXPERIMENT_NAME = "practice_nyc_experiment"

# "host:port" with no URL scheme.
# NOTE(review): not referenced by the cells visible here (the set_tracking_uri
# call builds its own "http://host:port" string) - confirm whether it is still needed.
TRACKING_URI = f'{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}'

# MLflow Initialization

Start the tracking server before running the cells below:
- `mlflow server -h 0.0.0.0 -p 5000 --backend-store-uri postgresql://[username]:[password]@[private_IP_address]:5432/[database_name] --default-artifact-root gs://[bucket_name]`

In [3]:
# Point the MLflow client at the tracking server over HTTP, then select the
# experiment that subsequent runs will be logged under.
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='gs://mlflow_bucket_1408/1', creation_time=1702158711950, experiment_id='1', last_update_time=1702158711950, lifecycle_stage='active', name='practice_nyc_experiment', tags={}>

# Data Processing

### Functions

In [4]:
def read_dataframe(filename, frac=0.1, random_state=None):
    """Load taxi trip records and derive the columns used for modelling.

    The file is read, a ``duration`` column (minutes between
    ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``) is added, trips
    outside the 1-60 minute range are dropped, the location ids are cast to
    strings, a random sample is taken, and a combined ``PU_DO`` feature
    ("<PULocationID>_<DOLocationID>") is added.

    Parameters
    ----------
    filename : str
        Path to a ``.csv`` or ``.parquet`` file.
    frac : float, default 0.1
        Fraction of the filtered rows to keep.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample``. ``None`` keeps the sampling
        non-deterministic; pass an int for reproducible runs.

    Returns
    -------
    pandas.DataFrame
        The sampled frame with a fresh ``RangeIndex``. The pre-sampling row
        labels are kept in an ``index`` column.

    Raises
    ------
    ValueError
        If ``filename`` does not end in ``.csv`` or ``.parquet``.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)
        # CSV carries no dtype information, so the timestamps arrive as strings.
        # Parse them back into the same columns that the duration is computed from.
        df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
        df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename, engine='pyarrow')
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected '.csv' or '.parquet'"
        )

    # Trip duration in minutes, computed with the vectorized timedelta accessor.
    trip_time = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Copy after filtering so the column assignments below write to an
    # independent frame rather than a view of the unfiltered data.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    df = df.sample(frac=frac, random_state=random_state)
    # reset_index() without drop keeps the old row labels as an "index" column.
    df = df.reset_index()

    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    return df

In [5]:
# Shared vectorizer instance: fitted once on the training data and reused for
# every later transform so all matrices share the same feature columns.
dv = DictVectorizer()
def vectorizer(df: pd.DataFrame, fit = False):
    """Encode the ``PU_DO`` and ``trip_distance`` columns with the shared ``dv``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing at least the ``PU_DO`` and ``trip_distance`` columns.
    fit : bool, default False
        When true, ``dv`` is (re)fitted on ``df`` before transforming;
        otherwise the already fitted ``dv`` is applied as-is.

    Returns
    -------
    The matrix returned by ``dv.fit_transform`` / ``dv.transform``.
    """
    records = df[['PU_DO', 'trip_distance']].to_dict(orient='records')
    # Pick the bound method up front; both accept the same list of row dicts.
    encode = dv.fit_transform if fit else dv.transform
    return encode(records)

### Load Data

In [6]:
# Load the training and validation months. read_dataframe also filters trips by
# duration, samples the rows and adds the PU_DO feature.
df_train = read_dataframe(TRAIN_DATA_PATH)
df_val = read_dataframe(VALID_DATA_PATH)

In [7]:
# (rows, columns) of the processed training frame.
df_train.shape

(300917, 22)

In [8]:
# (rows, columns) of the processed validation frame.
df_val.shape

(285595, 22)

In [9]:
# Fit the shared DictVectorizer on the training frame only, then reuse the
# fitted instance for validation so both matrices have the same columns.
x_train = vectorizer(df_train,fit=True)
x_val = vectorizer(df_val)

In [11]:
# Sanity check: the column counts of the two matrices should match.
print(f"Shape of x_train is {x_train.shape}")
print(f"Shape of x_val is {x_val.shape}")

Shape of x_train is (300917, 9984)
Shape of x_val is (285595, 9984)


In [12]:
# Regression target: trip duration in minutes (computed in read_dataframe).
target = 'duration'
# Plain arrays of the target column for the train and validation frames.
y_train = df_train[target].values
y_val = df_val[target].values

### Model Train

In [13]:

# Baseline run (experiment number 0): one Lasso fit with a fixed alpha, logged
# to MLflow with its parameters, tags, validation metrics and the fitted model.
baseline_alpha = 0.01  # single source for both the logged parameter and the model
with mlflow.start_run(run_name = f"Lasso_Exp_{0}", description=f"Lasso Regression of experiment number {0}"):
    mlflow.log_params({
        'alpha' : baseline_alpha,
        'model'  : 'Lasso',
        'train_data' : TRAIN_DATA_PATH,
        'validation_data'  : VALID_DATA_PATH
        })
    mlflow.set_tags({'FirstName':'Pranoy'
                 ,'LastName':'Dewanjee'})
    ls = Lasso(alpha=baseline_alpha)
    ls.fit(x_train,y_train)
    y_pred = ls.predict(x_val)
    # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error is
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2score = r2_score(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2_score", r2score)
    mlflow.sklearn.log_model(ls,artifact_path="model")

In [14]:
# Number of runs started through model_train; used to number the MLflow runs.
count = 0
def model_train(x_train,y_train,x_val,y_val,alpha):
    """Fit a Lasso model and log the run to MLflow.

    Each call increments the module-level ``count`` and opens a run named
    ``Lasso_Exp_<count>``. The run records the alpha, the data paths, the
    author tags, the validation RMSE and R2, and the fitted model under the
    ``model`` artifact path.

    Parameters
    ----------
    x_train, y_train
        Training feature matrix and target values passed to ``Lasso.fit``.
    x_val, y_val
        Validation feature matrix and target values used for the metrics.
    alpha : float
        Regularisation strength passed to ``Lasso``.

    Returns
    -------
    None
    """
    global count
    count+=1
    # The context manager ends the run even if fitting or logging raises, so a
    # failure cannot leave an active run that blocks the next start_run call.
    with mlflow.start_run(run_name = f"Lasso_Exp_{count}", description=f"Lasso Regression of experiment number {count}"):
        mlflow.log_params({
            'alpha' : alpha,
            'model'  : 'Lasso',
            'train_data' : TRAIN_DATA_PATH,
            'validation_data'  : VALID_DATA_PATH
            })
        mlflow.set_tags({'FirstName':'Pranoy'
                     ,'LastName':'Dewanjee'})

        ls = Lasso(alpha=alpha)
        ls.fit(x_train,y_train)
        y_pred = ls.predict(x_val)
        # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error
        # is deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        r2score = r2_score(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2_score", r2score)
        mlflow.sklearn.log_model(ls,artifact_path="model")

In [15]:
import random

random_numbers = [random.random() for _ in range(5)]

print(random_numbers)

[0.5985607175928325, 0.029874608995759888, 0.1802763495770121, 0.3691135911554605, 0.12544579854842985]


In [16]:
# Train and log one Lasso run per candidate alpha.
for alpha in random_numbers:
    model_train(x_train,y_train,x_val,y_val,alpha)

