<a href="https://colab.research.google.com/github/Sapphirevic/MLOp/blob/main/MLOps_Zoomcamp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Optional Labwork

---
Cloning Repository (Using Git in Colaboratory)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer   #DictVectorizer is particularly useful when dealing with categorical features, and it helps convert such features into a format suitable for machine learning models that require numerical input.
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
import os
import pickle
from sklearn.linear_model import Ridge #An instance of the Ridge class is created with a specified value for the regularization parameter (alpha)

In [None]:
# Colab-only: upload the January 2021 green-taxi parquet file into the runtime.
# `upload` holds whatever files.upload() returns for the chosen file(s).
from google.colab import files
upload = files.upload()

Saving green_tripdata_2021-01.parquet to green_tripdata_2021-01.parquet


In [None]:
# Peek at the raw January file before any cleaning is applied.
raw_january_path = 'green_tripdata_2021-01.parquet'
lab = pd.read_parquet(raw_january_path)
lab.head()

In [None]:
def read_dataframe(filename):
  """Load a green-taxi trip parquet file and prepare it for modelling.

  Steps:
    1. parse the pickup/dropoff columns as datetimes,
    2. add a ``duration`` column holding the trip length in minutes,
    3. keep only trips lasting between 1 and 60 minutes (inclusive),
    4. cast the pickup/dropoff location ids to strings so they are treated
       as categorical features.

  Args:
    filename: path of the parquet file to read.

  Returns:
    A new DataFrame containing the filtered trips and the extra
    ``duration`` column.
  """
  lab = pd.read_parquet(filename)

  # Make sure pandas treats these columns as datetimes, not strings.
  lab['lpep_pickup_datetime'] = pd.to_datetime(lab['lpep_pickup_datetime'])
  lab['lpep_dropoff_datetime'] = pd.to_datetime(lab['lpep_dropoff_datetime'])

  # Vectorised conversion of the timedelta to minutes (no per-row lambda).
  trip_time = lab['lpep_dropoff_datetime'] - lab['lpep_pickup_datetime']
  lab['duration'] = trip_time.dt.total_seconds() / 60

  # .copy() detaches the filtered rows from the original frame so the column
  # assignment below does not operate on a slice (SettingWithCopyWarning).
  lab = lab[(lab['duration'] >= 1) & (lab['duration'] <= 60)].copy()

  categ = ['PULocationID', 'DOLocationID']
  lab[categ] = lab[categ].astype(str)

  return lab

In [None]:
# Colab-only: upload the February 2021 green-taxi parquet file (used as the validation set below).
from google.colab import files
upload = files.upload()

Saving green_tripdata_2021-02.parquet to green_tripdata_2021-02.parquet


In [None]:
# January trips are used for training, February trips for validation.
train_path = 'green_tripdata_2021-01.parquet'
val_path = 'green_tripdata_2021-02.parquet'

lab_train = read_dataframe(train_path)
lab_val = read_dataframe(val_path)

In [None]:
# Number of trips kept in the training and validation frames.
lab_train.shape[0], lab_val.shape[0]

(73908, 61921)

In [None]:
# Combine pickup and dropoff zone into a single categorical "route" feature
# on both the training and the validation frame.
for frame in (lab_train, lab_val):
  frame['PU_DO'] = frame['PULocationID'].astype(str) + '_' + frame['DOLocationID'].astype(str)

In [None]:
categ = ['PU_DO']         # categorical feature(s)
num = ['trip_distance']   # numerical feature(s)
feature_cols = categ + num

# DictVectorizer turns a list of {feature: value} dicts into a feature matrix.
dv = DictVectorizer()

# One dict per row; the vectorizer's vocabulary is fitted on the training rows only.
train_dict = lab_train[feature_cols].to_dict(orient='records')
x_train = dv.fit_transform(train_dict)

# Validation rows are encoded with the already-fitted vectorizer.
val_dicts = lab_val[feature_cols].to_dict(orient='records')
x_val = dv.transform(val_dicts)

In [None]:
# Regression target: trip duration in minutes, as plain arrays.
target = 'duration'
y_train, y_val = (frame[target].values for frame in (lab_train, lab_val))

In [None]:
# Baseline: ordinary least squares on the vectorized features.
lr = LinearRegression()
lr.fit(x_train, y_train)

y_pred = lr.predict(x_val)

# RMSE on the validation set. The square root is taken explicitly because the
# `squared=False` keyword is deprecated/removed in recent scikit-learn releases.
mean_squared_error(y_val, y_pred) ** 0.5

7.758715206462274

In [None]:
# Make sure the output directory exists. exist_ok=True makes this a no-op on
# re-runs and avoids the separate exists()/makedirs() check.
os.makedirs('models', exist_ok=True)

# Persist the fitted DictVectorizer together with the model so both can be
# restored with a single pickle.load().
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)


In [None]:
# Lasso (L1-regularised) regression with alpha=0.01.
# Note: this rebinds `lr`, replacing the model fitted in the previous cell.
lr = Lasso(0.01)
lr.fit(x_train, y_train)

y_pred = lr.predict(x_val)

# RMSE on the validation set. The square root is taken explicitly because the
# `squared=False` keyword is deprecated/removed in recent scikit-learn releases.
mean_squared_error(y_val, y_pred) ** 0.5

In [None]:
# Ridge (L2-regularised) regression with the default regularisation strength.
# Note: this rebinds `lr`, replacing the model fitted in the previous cell.
lr = Ridge()
lr.fit(x_train, y_train)

y_pred = lr.predict(x_val)

# RMSE on the validation set. The square root is taken explicitly because the
# `squared=False` keyword is deprecated/removed in recent scikit-learn releases.
mean_squared_error(y_val, y_pred) ** 0.5

In [None]:
# `lab` is the raw frame and has no `duration` column (it is only added inside
# read_dataframe), so plot the processed training frame instead.
# Note: lab_train is already restricted to trips of 1-60 minutes.
sns.displot(lab_train.duration)

In [None]:
# `lab` is the raw frame and has no `duration` column (it is only added inside
# read_dataframe), so summarise the processed training frame instead.
# Note: lab_train is already restricted to trips of 1-60 minutes.
lab_train.duration.describe(percentiles=[.95, .98, .99])

In [None]:
# Overlay predicted and actual durations on ONE axes so they can be compared.
# (displot is figure-level and would create two separate figures.)
fig, ax = plt.subplots()
sns.histplot(y_pred, label='prediction', ax=ax)
# y_pred was computed from x_val, so the matching ground truth is y_val.
sns.histplot(y_val, label='actual', ax=ax)

ax.set(xlabel='duration (minutes)', title='Predicted vs. actual trip duration (validation set)')
# legend must be *called*; a bare `plt.legend` does nothing.
ax.legend();

# MLOps Maturity

In [None]:
!pip install mlflow #--quiet
!pip install pyngrok  #ngrok helps integrate mlflow in colab

In [None]:
import mlflow
import subprocess
from pyngrok import ngrok, conf
import getpass

In [None]:
# Launch the MLflow UI as a background process, backed by the local SQLite store.
mlflow_ui_cmd = ['mlflow', 'ui', '--backend-store-uri', 'sqlite:///mlflow.db']
subprocess.Popen(mlflow_ui_cmd)

In [None]:
# Point the MLflow client at the same SQLite store the UI reads from,
# then select the experiment that the following runs are logged under.
tracking_db_uri = 'sqlite:///mlflow.db'
mlflow.set_tracking_uri(tracking_db_uri)
mlflow.set_experiment('Learn MLFLOW')

In [None]:
# SECURITY: the ngrok authtoken must never be hard-coded or printed in the
# notebook. It is requested interactively and kept out of the saved file.
# Any token that was previously committed should be revoked and regenerated.
conf.get_default().auth_token = getpass.getpass('ngrok authtoken: ')

# Expose the local MLflow UI port through an ngrok tunnel.
port = 5000
public_url = ngrok.connect(port).public_url
print(f' *ngrok tunnel \'{public_url}\' -> \'http://127.0.0.1:{port}\'')

In [None]:
# Make sure the output directory exists. exist_ok=True makes this a no-op on
# re-runs and avoids the separate exists()/makedirs() check.
os.makedirs('models', exist_ok=True)

# Persist the DictVectorizer together with the current `lr`.
# NOTE(review): `lr` is whichever model the most recently executed cell
# assigned, even though the file name says lin_reg.
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)


In [None]:
# Close any run left active by an earlier (possibly interrupted) cell before
# starting a new one.
mlflow.end_run()

# Train a Lasso model and record its parameters, metric and pickle in MLflow.
with mlflow.start_run():
    mlflow.set_tag('developer', 'Victoria')

    mlflow.log_param('train-data-path', 'green_tripdata_2021-01.parquet')
    mlflow.log_param('val-data-path', 'green_tripdata_2021-02.parquet')

    alpha = 0.01
    mlflow.log_param('alpha', alpha)

    lr = Lasso(alpha)
    lr.fit(x_train, y_train)

    y_pred = lr.predict(x_val)

    # Explicit square root: `squared=False` is deprecated/removed in recent
    # scikit-learn releases.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric('rmse', rmse)

    # Write the pickle for the model fitted in THIS run, so the logged
    # artifact matches the logged parameters and metric rather than whatever
    # an earlier cell left on disk.
    os.makedirs('models', exist_ok=True)
    with open('models/lin_reg.bin', 'wb') as f_out:
        pickle.dump((dv, lr), f_out)

    # artifact_path is the destination *directory* inside the run's artifact
    # store; the file keeps its own name (models_pickle/lin_reg.bin).
    mlflow.log_artifact(local_path='models/lin_reg.bin', artifact_path='models_pickle')


In [None]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [None]:
# Wrap the feature matrices and targets in XGBoost's DMatrix containers.
train, valid = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((x_train, y_train), (x_val, y_val))
)

In [None]:
def objective(params):
  """Hyperopt objective: train one XGBoost model and report its validation RMSE.

  Each call is logged as its own MLflow run (tagged ``model=xgboost``) with
  the sampled hyper-parameters and the resulting ``rmse`` metric.

  Args:
    params: dict of XGBoost parameters sampled from the search space.

  Returns:
    A dict with the ``loss`` to minimise (the validation RMSE) and the
    hyperopt ``status`` flag.
  """
  with mlflow.start_run():
    mlflow.set_tag('model', 'xgboost')
    mlflow.log_params(params)
    booster = xgb.train(
        params=params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )
    y_pred = booster.predict(valid)
    # Explicit square root: `squared=False` is deprecated/removed in recent
    # scikit-learn releases.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric('rmse', rmse)

  return {'loss': rmse, 'status': STATUS_OK}

In [None]:
 # Hyper-parameter search space explored by hyperopt.
 # Each hp label matches its dict key so the keys of `best_result` line up
 # with the XGBoost parameter names.
 search_space = {
     'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
     'learning_rate': hp.loguniform('learning_rate', -3, 0),
     'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
     'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
     'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
     # 'reg:linear' is the deprecated alias of this objective.
     'objective': 'reg:squarederror',
     'seed': 42,
 }

 # Run 50 TPE-guided trials; every trial calls objective() and is logged to MLflow.
 best_result = fmin(
     fn=objective,
     space=search_space,
     algo=tpe.suggest,
     max_evals=50,
     trials=Trials())

In [None]:
# Hyper-parameters hard-coded here for a final training run.
best_params = {
    'learning_rate': 0.0842682702128579,
    'max_depth': 84,
    'min_child_weight': 4.563936109962006,
    'objective': 'reg:squarederror',  # Use a valid regression objective
    'reg_alpha': 0.18802869296387734,
    'reg_lambda': 0.01761052467223192,
    'seed': 42
}

# Turn on MLflow's XGBoost autologging before training.
mlflow.xgboost.autolog()

# Validation set monitored during boosting (drives early stopping).
watchlist = [(valid, 'validation')]
booster = xgb.train(
    params=best_params,
    dtrain=train,
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=50,
)

In [None]:
# Switch XGBoost autologging back off; the next cell logs parameters, metric and model explicitly.
mlflow.xgboost.autolog(disable=True)

In [None]:
# Train the final XGBoost model inside an explicit MLflow run and log the
# parameters, validation RMSE, the fitted DictVectorizer and the model itself.
with mlflow.start_run():

  mlflow.set_tag('developer', 'Victoria')
  train = xgb.DMatrix(x_train, label=y_train)
  valid = xgb.DMatrix(x_val, label=y_val)

  best_params = {
    'learning_rate': 0.0842682702128579,
    'max_depth': 84,
    'min_child_weight': 4.563936109962006,
    # 'reg:linear' is the deprecated alias of this objective.
    'objective': 'reg:squarederror',
    'reg_alpha': 0.18802869296387734,
    'reg_lambda': 0.01761052467223192,
    'seed': 42
  }

  mlflow.log_params(best_params)

  booster = xgb.train(
      params=best_params,
      dtrain=train,
      num_boost_round=1000,
      evals=[(valid, 'validation')],
      early_stopping_rounds=50
  )

  y_pred = booster.predict(valid)
  # Explicit square root: `squared=False` is deprecated/removed in recent
  # scikit-learn releases.
  rmse = mean_squared_error(y_val, y_pred) ** 0.5
  mlflow.log_metric('rmse', rmse)

  # Do not rely on an earlier cell having created the output directory.
  os.makedirs('models', exist_ok=True)
  with open('models/preprocessor.b', 'wb') as f_out:
    pickle.dump(dv, f_out)

  # The vectorizer is stored next to the model so both can be fetched from the same run.
  mlflow.log_artifact('models/preprocessor.b', artifact_path='preprocessor')
  mlflow.xgboost.log_model(booster, artifact_path='models_mlflow')

In [None]:
# URI of the model logged under the 'models_mlflow' artifact path of a specific run.
# NOTE(review): the run id is hard-coded from an earlier session; after a fresh
# run of the notebook it has to be replaced with the id of the new run.
logged_model = 'runs:/b849064b539040d9919ff819d274fffc/models_mlflow'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

In [None]:
# Display the pyfunc wrapper returned by mlflow.pyfunc.load_model.
loaded_model

In [None]:
# Load the same logged model through the XGBoost flavour instead of pyfunc.
xgboost_model = mlflow.xgboost.load_model(logged_model)
xgboost_model

In [None]:
# Predict on `valid`, the DMatrix built from the validation features and targets.
y_pred = xgboost_model.predict(valid)

In [None]:
# Show only the first ten predictions rather than the whole array.
y_pred[:10]

In [None]:
from mlflow.tracking import MlflowClient

# Same SQLite store that mlflow.set_tracking_uri was pointed at earlier.
MLFLOW_TRACKING_URI = 'sqlite:///mlflow.db'

# Client object used below for experiment, run and model-registry calls.
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [None]:
# Create the experiment only if it does not exist yet, so the cell can be
# re-run safely (create_experiment raises when the name is already taken).
experiment_name = 'My_next_experiment'
existing_experiment = client.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    experiment_id = client.create_experiment(name=experiment_name)
else:
    experiment_id = existing_experiment.experiment_id
experiment_id

In [None]:
from mlflow.entities import ViewType

# Fetch the five best active runs (lowest RMSE first) of experiment '1'
# whose validation RMSE is below 6.8.
runs = client.search_runs(
    experiment_ids=['1'],
    filter_string='metric.rmse < 6.8',
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=['metrics.rmse ASC']
)

In [None]:
# Print one summary line per run returned by the search above.
for run in runs:
    rmse_value = run.data.metrics.get('rmse')
    if rmse_value is None:
        # The run has no 'rmse' metric logged.
        print(f"run id: {run.info.run_id}, rmse not available")
        continue
    print(f"run id: {run.info.run_id}, rmse: {rmse_value:.4f}")

In [None]:
import mlflow
# set_tracking_uri is a function: it must be called (not subscripted) and given
# the MLFLOW_TRACKING_URI constant defined with the client, not its name as a string.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [None]:
# run_id must be the bare run identifier: it is reused later as the `run_id`
# argument when downloading the preprocessor artifact.
# NOTE(review): hard-coded from an earlier session; replace after a fresh run.
run_id = 'b849064b539040d9919ff819d274fffc'

# The model was logged under the 'models_mlflow' artifact path of that run.
model_uri = f'runs:/{run_id}/models_mlflow'
mlflow.register_model(model_uri=model_uri, name='NYC-TAXI-REGRESSION-MODEL')

In [None]:
# Display the URI that was passed to mlflow.register_model.
model_uri

In [None]:
# Name of the registered model; reused by the stage-transition and test cells below.
model_name = 'NYC-TAXI-REGRESSION-MODEL'
latest_version = client.get_latest_versions(name=model_name)

# Print the version number and current stage of each returned model version.
for version in latest_version:
  print(f'version = {version.version}, stage = {version.current_stage}')

In [None]:
# Move version 1 of the registered model to the staging stage.
# `new_stage` is the single source of truth: it is passed to the transition
# call here and reused in the version description written afterwards.
model_version = 1
new_stage = 'staging'
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

In [None]:
from datetime import datetime

# Record on the model version when it was moved to the new stage.
date = datetime.today().date()
description = f'This model version {model_version} was updated to {new_stage} on {date}'
client.update_model_version(name=model_name, version=model_version, description=description)

In [None]:
# Colab-only: upload the March 2021 green-taxi parquet file (used as the test set below).
from google.colab import files
upload = files.upload()

Saving green_tripdata_2021-03.parquet to green_tripdata_2021-03.parquet


In [None]:
def read_dataframe(filename):
  """Load a green-taxi trip parquet file and prepare it for modelling.

  NOTE(review): this repeats the definition from the data-loading section of
  the notebook; keep the two in sync or delete one of them.

  Steps:
    1. parse the pickup/dropoff columns as datetimes,
    2. add a ``duration`` column holding the trip length in minutes,
    3. keep only trips lasting between 1 and 60 minutes (inclusive),
    4. cast the pickup/dropoff location ids to strings so they are treated
       as categorical features.

  Args:
    filename: path of the parquet file to read.

  Returns:
    A new DataFrame containing the filtered trips and the extra
    ``duration`` column.
  """
  lab = pd.read_parquet(filename)

  # Make sure pandas treats these columns as datetimes, not strings.
  lab['lpep_pickup_datetime'] = pd.to_datetime(lab['lpep_pickup_datetime'])
  lab['lpep_dropoff_datetime'] = pd.to_datetime(lab['lpep_dropoff_datetime'])

  # Vectorised conversion of the timedelta to minutes (no per-row lambda).
  trip_time = lab['lpep_dropoff_datetime'] - lab['lpep_pickup_datetime']
  lab['duration'] = trip_time.dt.total_seconds() / 60

  # .copy() detaches the filtered rows from the original frame so the column
  # assignment below does not operate on a slice (SettingWithCopyWarning).
  lab = lab[(lab['duration'] >= 1) & (lab['duration'] <= 60)].copy()

  categ = ['PULocationID', 'DOLocationID']
  lab[categ] = lab[categ].astype(str)

  return lab

def preprocess(lab, dv):
  """Turn a cleaned trip DataFrame into the feature matrix the model expects.

  Builds the combined ``PU_DO`` route feature, selects it together with
  ``trip_distance`` and encodes the rows with the already-fitted vectorizer.
  The input frame is left untouched: the extra column is added to a copy.

  Args:
    lab: DataFrame with ``PULocationID``, ``DOLocationID`` and
      ``trip_distance`` columns.
    dv: fitted vectorizer exposing ``transform(list_of_dicts)``.

  Returns:
    Whatever ``dv.transform`` returns for the row dictionaries.
  """
  with_route = lab.assign(
      PU_DO=lab['PULocationID'].astype(str) + '_' + lab['DOLocationID'].astype(str)
  )
  categ = ['PU_DO']
  num = ['trip_distance']
  records = with_route[categ + num].to_dict(orient='records')
  return dv.transform(records)

def test_model(name, stage, x_test, y_test):
  """Score a registered model on a held-out set.

  Args:
    name: name of the model in the MLflow model registry.
    stage: registry stage to load the model from.
    x_test: feature matrix to predict on.
    y_test: true target values for ``x_test``.

  Returns:
    A dict with a single ``rmse`` entry (root mean squared error).
  """
  model = mlflow.pyfunc.load_model(f'models:/{name}/{stage}')
  y_pred = model.predict(x_test)
  # Explicit square root: `squared=False` is deprecated/removed in recent
  # scikit-learn releases.
  return {'rmse': mean_squared_error(y_test, y_pred) ** 0.5}


In [None]:
# Load and clean the March file as the test set.
# Note: this rebinds `lab`, which earlier held the raw January frame.
lab = read_dataframe('green_tripdata_2021-03.parquet')

In [None]:
# Download the 'preprocessor' artifact directory of the run into the current directory.
# NOTE(review): `run_id` comes from the model-registration cell and must be a
# bare run id (no artifact sub-path) for this call.
client.download_artifacts(run_id=run_id, path='preprocessor', dst_path='.')

In [None]:
# Restore the DictVectorizer that was pickled and logged with the training run.
# Only unpickle files that come from a trusted source.
preprocessor_path = 'preprocessor/preprocessor.b'
with open(preprocessor_path, 'rb') as preprocessor_file:
  dv = pickle.load(preprocessor_file)

In [None]:
# Encode the test frame with the vectorizer restored from the run's artifacts.
x_test = preprocess(lab, dv)

In [None]:
# Ground-truth trip durations for the test set, as a plain array.
target ='duration'
y_test = lab[target].values

In [None]:
# Score the model currently in the 'staging' stage on the test set; %time also reports how long the call took.
%time test_model(name=model_name, stage='staging', x_test=x_test, y_test=y_test)

In [None]:
# mlflow.list_experiments() was removed in MLflow 2.x; search_experiments()
# is its replacement and, with no arguments, returns the active experiments.
mlflow.search_experiments()