<a href="https://colab.research.google.com/github/Sapphirevic/MLOp/blob/main/duration_explore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Report the version of the `python` executable found on the runtime's PATH.
!python -V

Python 3.10.12


In [None]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error
import os
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [None]:
# Colab-only: open the browser upload dialog for the January (training) file.
# Per the cell output, the upload is saved as green_tripdata_2021-01.parquet.
from google.colab import files
upload = files.upload()

Saving green_tripdata_2021-01.parquet to green_tripdata_2021-01.parquet


In [None]:
# Colab-only: open the browser upload dialog for the February (validation) file.
# Per the cell output, the upload is saved as green_tripdata_2021-02.parquet.
# Note that this rebinds `upload`, replacing the result of the previous upload cell.
from google.colab import files
upload = files.upload()

Saving green_tripdata_2021-02.parquet to green_tripdata_2021-02.parquet


In [None]:
!pip install mlflow #--quiet
!pip install pyngrok  #ngrok helps integrate mlflow in colab

In [None]:
import mlflow
import subprocess
from pyngrok import ngrok, conf
import getpass

In [None]:
# Point MLflow at a SQLite backend stored in the file mlflow.db (relative path).
mlflow.set_tracking_uri('sqlite:///mlflow.db')
# Runs started later in this notebook are recorded under this experiment name.
mlflow.set_experiment('Learn MLFLOW')

In [None]:
# Authenticate ngrok and open a public tunnel to a local port.
# The authtoken is a credential: take it from the NGROK_AUTHTOKEN environment
# variable, or prompt for it without echo. Never write it into the notebook
# source or print it, because cell outputs are saved with the notebook.
conf.get_default().auth_token = (
    os.environ.get('NGROK_AUTHTOKEN') or getpass.getpass('ngrok authtoken: ')
)

# NOTE(review): port 5000 is presumably where the MLflow UI is served; nothing
# in the visible notebook starts that server, so confirm it is running.
port = 5000
public_url = ngrok.connect(port).public_url
print(f" *ngrok tunnel '{public_url}' -> 'http://127.0.0.1:{port}'")

ngrok config add-authtoken <REDACTED>
··········
 *ngrok tunnel 'https://e119-34-148-32-177.ngrok-free.app' -> 'http://127.0.0.1:5000'


In [None]:
def read_dataframe(filename):
  """Load a green-taxi trip file and prepare it for duration modelling.

  Args:
    filename: Path of a parquet file containing the columns
      ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``, ``PULocationID``
      and ``DOLocationID``.

  Returns:
    A new DataFrame holding only trips that lasted between 1 and 60 minutes
    (both inclusive), with an added ``duration`` column in minutes and both
    location-ID columns converted to strings.
  """
  lab = pd.read_parquet(filename)

  # Make sure both timestamps are datetimes (not strings) so they can be subtracted.
  lab['lpep_pickup_datetime'] = pd.to_datetime(lab['lpep_pickup_datetime'])
  lab['lpep_dropoff_datetime'] = pd.to_datetime(lab['lpep_dropoff_datetime'])

  # Trip length in minutes, computed on the whole column at once instead of
  # calling a Python lambda for every row.
  elapsed = lab['lpep_dropoff_datetime'] - lab['lpep_pickup_datetime']
  lab['duration'] = elapsed.dt.total_seconds() / 60

  # Keep plausible trips only. The explicit copy detaches the result from the
  # unfiltered frame, so the column assignment below does not write to a slice.
  lab = lab[(lab['duration'] >= 1) & (lab['duration'] <= 60)].copy()

  # Location IDs are categories, not quantities: store them as strings.
  categ = ['PULocationID', 'DOLocationID']
  lab[categ] = lab[categ].astype(str)

  return lab

In [None]:
# January trips are used for fitting, February trips for validation.
lab_train, lab_val = map(
    read_dataframe,
    ('green_tripdata_2021-01.parquet', 'green_tripdata_2021-02.parquet'),
)

In [None]:
# Number of trips left in each split after the duration filter.
lab_train.shape[0], lab_val.shape[0]

(73908, 61921)

In [None]:
# Join pickup and drop-off location IDs into one categorical feature of the
# form '<pickup>_<dropoff>', applied identically to both splits.
for trips in (lab_train, lab_val):
    trips['PU_DO'] = trips['PULocationID'].astype(str) + '_' + trips['DOLocationID'].astype(str)

In [None]:
# Model inputs: the combined pickup/drop-off pair (categorical) and the trip
# distance (numeric).
categ = ['PU_DO']
num = ['trip_distance']
feature_cols = categ + num

# DictVectorizer turns a list of per-row dictionaries into a feature matrix.
dv = DictVectorizer()

# Fit the feature mapping on the training records only ...
train_dict = lab_train[feature_cols].to_dict(orient='records')
x_train = dv.fit_transform(train_dict)

# ... and reuse the fitted mapping, unchanged, for the validation records.
val_dicts = lab_val[feature_cols].to_dict(orient='records')
x_val = dv.transform(val_dicts)

In [None]:
# Check the container type returned by the vectorizer (the output shows a SciPy CSR sparse matrix).
type(x_train)

scipy.sparse._csr.csr_matrix

In [None]:
# Same check for the validation matrix (the output shows a SciPy CSR sparse matrix).
type(x_val)

scipy.sparse._csr.csr_matrix

In [None]:
# Regression target: the trip duration column, extracted as arrays for both splits.
target = 'duration'
y_train, y_val = (trips[target].values for trips in (lab_train, lab_val))

In [None]:
# Check the container type of the target (the output shows a NumPy ndarray).
type(y_val)

numpy.ndarray

In [None]:
# Baseline model: plain linear regression on the vectorized features.
lr = LinearRegression()
lr.fit(x_train, y_train)

y_pred = lr.predict(x_val)

# Validation RMSE, taken as the square root of the MSE. This avoids the
# `squared=False` argument, which newer scikit-learn releases deprecate and
# then remove, so the cell works across versions.
mean_squared_error(y_val, y_pred) ** 0.5

7.758715206462274

In [None]:
import pathlib

# Make sure the output directory exists; exist_ok=True keeps the cell safe to
# re-run, so no separate existence check is needed.
pathlib.Path("models").mkdir(exist_ok=True)

# Store the fitted vectorizer together with the model, so the preprocessing
# that produced the features travels with the model that consumes them.
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)


In [23]:
# Close any run that is still active so start_run() below opens a fresh one.
mlflow.end_run()

# Track one Lasso experiment: tags, data provenance, hyperparameter, metric, artifact.
with mlflow.start_run():
    mlflow.set_tag('developer', 'Victoria')

    mlflow.log_param('train-data-path', 'green_tripdata_2021-01.parquet')
    mlflow.log_param('val-data-path', 'green_tripdata_2021-02.parquet')

    alpha = 0.01
    mlflow.log_param('alpha', alpha)

    lr = Lasso(alpha=alpha)
    lr.fit(x_train, y_train)

    y_pred = lr.predict(x_val)

    # RMSE as sqrt(MSE); `squared=False` is deprecated/removed in newer scikit-learn.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric('rmse', rmse)

    # artifact_path names the destination *directory* inside the run's artifact
    # store, so the file ends up at models_pickle/lin_reg.bin.
    # NOTE(review): models/lin_reg.bin is the file pickled by the earlier
    # linear-regression cell, not the Lasso model fitted in this run - confirm
    # that this is the artifact you intend to attach.
    mlflow.log_artifact(local_path='models/lin_reg.bin', artifact_path='models_pickle')


In [24]:
# Wrap features and targets in xgboost DMatrix objects; these are what the
# xgb.train call in `objective` receives as dtrain and in evals.
train = xgb.DMatrix(x_train, label=y_train)
valid = xgb.DMatrix(x_val, label=y_val)

In [25]:
def objective(params):
    """Train one XGBoost model for a hyperopt trial and record it in MLflow.

    Relies on the notebook-level ``train`` and ``valid`` DMatrix objects and on
    ``y_val`` for scoring.

    Args:
        params: Hyperparameter dictionary sampled by hyperopt; it is logged to
            MLflow and passed unchanged to ``xgb.train``.

    Returns:
        A dict with ``'loss'`` set to the validation RMSE and ``'status'`` set
        to ``STATUS_OK``, which is the result format ``fmin`` consumes.
    """
    with mlflow.start_run():
        mlflow.set_tag('model', 'xgboost')
        mlflow.log_params(params)

        # Up to 1000 boosting rounds, stopping once the validation metric has
        # not improved for 50 consecutive rounds.
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )

        y_pred = booster.predict(valid)
        # RMSE as sqrt(MSE); `squared=False` is deprecated/removed in newer scikit-learn.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric('rmse', rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
 # Search space handed to hyperopt. Each hp.* label matches its dictionary key
 # so the keys of `best_result` line up with the XGBoost parameter names.
 search_space = {
     # quniform yields floats; scope.int casts them to the integer XGBoost expects.
     'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
     # loguniform bounds are exponents: values are drawn from [exp(low), exp(high)].
     'learning_rate': hp.loguniform('learning_rate', -3, 0),
     'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
     'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
     'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
     # 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective.
     'objective': 'reg:squarederror',
     'seed': 42,
 }

 # Run 50 TPE-guided trials; every trial is logged to MLflow inside `objective`.
 best_result = fmin(
     fn=objective,
     space=search_space,
     algo=tpe.suggest,
     max_evals=50,
     trials=Trials())

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR

# Let MLflow capture parameters and the fitted model of each scikit-learn estimator.
mlflow.sklearn.autolog()

# Write the fitted DictVectorizer to disk first: every run below attaches this
# file, and no earlier cell creates it.
os.makedirs("models", exist_ok=True)
with open("models/preprocessor.b", "wb") as f_out:
    pickle.dump(dv, f_out)

# Fit each candidate regressor with its default settings, one MLflow run per model.
for model_class in (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, LinearSVR):

    with mlflow.start_run():

        # Record the files that were actually loaded by this notebook.
        mlflow.log_param("train-data-path", "green_tripdata_2021-01.parquet")
        mlflow.log_param("valid-data-path", "green_tripdata_2021-02.parquet")
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

        mlmodel = model_class()
        # The feature matrices are named x_train / x_val (lower case) in this notebook.
        mlmodel.fit(x_train, y_train)

        y_pred = mlmodel.predict(x_val)
        # RMSE as sqrt(MSE); `squared=False` is deprecated/removed in newer scikit-learn.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric("rmse", rmse)