<a href="https://colab.research.google.com/github/WHC-1998/CI-CD-for-ML-demo/blob/main/Monitoring_ML_with_NannyML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [76]:
!pip install nannyml --quiet

In [77]:
!pip install lightgbm --quiet

In [78]:
import nannyml
import numpy as np
import pandas as pd
import lightgbm
from sklearn.metrics import mean_absolute_error

In [79]:
# Load the taxi-trip dataset from a CSV file expected in the working directory,
# then print the first rows as a quick sanity check of the columns.
data = pd.read_csv('green_taxi_dataset.csv')
print(data.head())

  lpep_pickup_datetime  PULocationID  DOLocationID  trip_distance  \
0  2016-12-01 00:00:02            82           129           0.60   
1  2016-12-01 00:01:57           255             7           4.53   
2  2016-12-01 00:04:17            65           195           1.94   
3  2016-12-01 00:06:45            41            41           1.00   
4  2016-12-01 00:09:18            74            42           2.02   

   fare_amount  tip_amount  pickup_time partition  
0          5.0         1.0            0     train  
1         17.5         4.7            0     train  
2          9.5         0.0            0     train  
3          5.5         2.0            0     train  
4          8.5         1.0            0     train  


In [80]:
# Show the dtype of every column; the pickup timestamp is parsed into a
# datetime in a later cell.
print(data.dtypes)

lpep_pickup_datetime     object
PULocationID              int64
DOLocationID              int64
trip_distance           float64
fare_amount             float64
tip_amount              float64
pickup_time               int64
partition                object
dtype: object


In [81]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
print(data.describe())

       PULocationID  DOLocationID  trip_distance   fare_amount    tip_amount  \
count  24244.000000  24244.000000   24244.000000  24244.000000  24244.000000   
mean     114.508084    133.155585       2.948626     12.803840      2.369033   
std       77.978822     77.648204       3.030861     10.589611      2.679303   
min        1.000000      1.000000       0.000000      0.000000      0.000000   
25%       48.500000     65.000000       1.070000      6.500000      1.060000   
50%       89.000000    133.000000       1.930000     10.000000      1.960000   
75%      181.000000    197.000000       3.720000     15.500000      3.000000   
max      265.000000    265.000000      73.740000    499.000000     93.970000   

        pickup_time  
count  24244.000000  
mean      13.907853  
std        6.637918  
min        0.000000  
25%        9.000000  
50%       15.000000  
75%       19.000000  
max       23.000000  


In [82]:
# Parse the pickup timestamp into datetimes: the time-based partitioning and
# the NannyML `timestamp_column_name` argument both use this column.
data['lpep_pickup_datetime'] = pd.to_datetime(data['lpep_pickup_datetime'])

**Create data partition**

In [83]:
# Time-based split on the pickup timestamp. Bins are left-closed (right=False):
#   [2016-12-01, 2016-12-08) -> 'train'
#   [2016-12-08, 2016-12-16) -> 'test'
#   [2016-12-16, 2017-01-01) -> 'prod'
# The assignment replaces any 'partition' column already present in `data`.
data['partition'] = pd.cut(
    data['lpep_pickup_datetime'],
    bins=[
        pd.to_datetime(edge)
        for edge in ('2016-12-01', '2016-12-08', '2016-12-16', '2017-01-01')
    ],
    labels=['train', 'test', 'prod'],
    right=False,
)

**Target and features column names**

In [84]:
# Column the model is trained to predict.
target = 'tip_amount'

# Model input columns, in the order they are handed to training and to NannyML.
features = [
    'PULocationID',
    'DOLocationID',
    'trip_distance',
    'pickup_time',
]

**Training set**

In [85]:
# Select the rows labelled 'train' once, then slice out inputs and target.
in_train = data['partition'] == 'train'
X_train = data.loc[in_train, features]
y_train = data.loc[in_train, target]

**Test set (later reference set)**

In [86]:
# Select the rows labelled 'test' once, then slice out inputs and target.
# This partition is reused further down as the NannyML reference set.
in_test = data['partition'] == 'test'
X_test = data.loc[in_test, features]
y_test = data.loc[in_test, target]

**Production set (later analysis set)**

In [87]:
# Select the rows labelled 'prod' once, then slice out inputs and target.
# This partition is reused further down as the NannyML analysis set.
in_prod = data['partition'] == 'prod'
X_prod = data.loc[in_prod, features]
y_prod = data.loc[in_prod, target]

**Train the model**

In [88]:
# LightGBM regressor with only `random_state` set (a fixed seed); every other
# hyperparameter is left at the library default.
model = lightgbm.LGBMRegressor(random_state =42)
# Fit on the training partition; as the last expression of the cell, the
# fitted estimator is also what the notebook displays.
model.fit(X_train, y_train)

**Make predictions**

In [89]:
# Predictions on the data the model was fitted on and on the test partition;
# both are scored with MAE in the next cell.
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

**Evaluate the model on training and test set**

In [90]:
# Mean absolute error on the data the model was fitted on (train) and on the
# test partition it has not seen.
mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_test)

# Report the scores: without this the evaluation cell shows nothing, and the
# two values are not used anywhere else in the notebook.
print(f'MAE train: {mae_train:.4f}')
print(f'MAE test:  {mae_test:.4f}')

**Deploy the model to production**

In [91]:
# Score the 'prod' partition; these predictions stand in for what the deployed
# model would serve and feed the analysis set below.
y_pred_prod = model.predict(X_prod)

**Create reference set**

In [92]:
# Reference set for NannyML: test-partition features plus the model's
# predictions, the realized target and the pickup timestamp.
reference = X_test.copy()  # Test set features (copy so X_test stays untouched)
reference['y_pred'] = y_pred_test  # Predictions
# Labels, aligned on the index. The column name comes from `target` instead of
# a repeated literal so it cannot drift from the target definition.
reference[target] = y_test
reference = reference.join(data['lpep_pickup_datetime'])  # Timestamp, joined on the index

**Create analysis set**

In [100]:
# Analysis set for NannyML: production-partition features plus the model's
# predictions, the realized target and the pickup timestamp.
analysis = X_prod.copy()  # Production features (copy so X_prod stays untouched)
analysis['y_pred'] = y_pred_prod  # Predictions
# Realized labels, aligned on the index. The PerformanceCalculator cell reads
# this column as y_true when it computes realized performance on `analysis`.
# The column name comes from `target` instead of a repeated literal.
analysis[target] = y_prod
analysis = analysis.join(data['lpep_pickup_datetime'])  # Timestamp, joined on the index

**Initialize the Direct Loss Estimation (DLE) algorithm**

In [94]:
# Configure the Direct Loss Estimation estimator.
# - y_true / y_pred name the label and prediction columns of the frames
#   passed to fit()/estimate(); y_true reuses `target` rather than repeating
#   the literal column name.
# - chunk_period='d' groups rows into chunks by the timestamp column.
# - tune_hyperparameters=False skips hyperparameter tuning of the estimator.
# NOTE(review): the model itself is scored with MAE earlier in the notebook,
# while monitoring tracks RMSE -- confirm the metric mismatch is intended.
estimator_dle = nannyml.DLE(
    y_true=target,
    y_pred='y_pred',
    metrics=['rmse'],
    timestamp_column_name='lpep_pickup_datetime',
    chunk_period='d',
    feature_column_names=features,
    tune_hyperparameters=False,
)

**Fit the algorithm on the reference set and estimate performance on the analysis set**

In [95]:
# Fit the estimator on the reference (test) set ...
estimator_dle.fit(reference)
# ... then produce the performance estimate for the analysis (production) set.
results_estimator_dle = estimator_dle.estimate(analysis)


Using categorical_feature in Dataset.



In [96]:
# Render the estimated-performance figure.
results_estimator_dle.plot().show()

**Initialize the performance calculator**

In [101]:
# Configure the realized-performance calculator with the same label column,
# prediction column, metric, timestamp column and chunking as the DLE
# estimator, so the two results can be compared later. y_true reuses `target`
# rather than repeating the literal column name.
# Despite the `_dle` suffix this object is a PerformanceCalculator, not a DLE
# estimator; the name is kept because the following cells reference it.
calculator_dle = nannyml.PerformanceCalculator(
    y_true=target,
    y_pred='y_pred',
    metrics=['rmse'],
    timestamp_column_name='lpep_pickup_datetime',
    chunk_period='d',
    problem_type='regression',
)

**Fit the calculator on the reference set and calculate realized performance on the analysis set**

In [102]:
# Fit the calculator on the reference (test) set ...
calculator_dle.fit(reference)
# ... then compute the performance on the analysis set, which carries the
# label column the calculator was configured with as y_true.
realized_results = calculator_dle.calculate(analysis)



**Plot the realized performance results**

In [103]:
# Render the realized-performance figure.
realized_results.plot().show()

**Compare the estimated performance with the realized performance**

In [104]:
# Combine the realized results with the DLE estimates and render them in a
# single comparison figure.
realized_results.compare(results_estimator_dle).plot().show()