In [None]:
import requests
import datetime
import pandas as pd
import evidently

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric, RegressionQualityMetric, ColumnQuantileMetric

from joblib import load, dump
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

import warnings
import os

# Ignore specific FutureWarnings from sklearn
warnings.filterwarnings("ignore", category=FutureWarning, module='sklearn')

In [None]:
# Run once, before the imports cell, if evidently is not installed:
# %pip install evidently


In [None]:
# (file name, destination directory) pairs to download from the trip-data URL below
files = [('green_tripdata_2024-03.parquet', './data')] #Tuples (file, destination)

print("Download files:")
for file, path in files:
    # Create every destination listed in `files` (not only './data'); safe on reruns
    os.makedirs(path, exist_ok=True)

    url=f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    save_path=f"{path}/{file}"
    # The context manager releases the connection even if the download fails midway
    with requests.get(url, stream=True, timeout=60) as resp:
        # Fail loudly on 4xx/5xx instead of saving an error page as a .parquet file
        resp.raise_for_status()
        # Content-Length may be absent; tqdm then shows a counter without a total
        total_bytes = int(resp.headers.get("Content-Length", 0)) or None
        with open(save_path, "wb") as handle, tqdm(
            desc=f"{file}",
            postfix=f"save to {save_path}",
            total=total_bytes,
            unit="B",
            unit_scale=True,
        ) as progress:
            # iter_content() defaults to 1-byte chunks, i.e. one write and one
            # progress tick per byte; use 1 MiB chunks and advance by chunk size
            for chunk in resp.iter_content(chunk_size=1024 * 1024):
                handle.write(chunk)
                progress.update(len(chunk))

In [None]:
# Load the downloaded March 2024 green-taxi trip file into a DataFrame
mar_data = pd.read_parquet('data/green_tripdata_2024-03.parquet')

In [None]:
# Summary statistics of the raw data (shown as the cell output)
mar_data.describe()

In [None]:
# (rows, columns) of the raw data, before any filtering
mar_data.shape

In [None]:
# create target (duration in minutes)
# Vectorized through the .dt accessor instead of a per-row Python lambda, and
# assigned with bracket syntax so the column is always set on the frame itself.
mar_data["duration_min"] = (
    mar_data.lpep_dropoff_datetime - mar_data.lpep_pickup_datetime
).dt.total_seconds() / 60

In [None]:
# filter out outliers: keep trips of 0-60 minutes with 1-8 passengers
duration_ok = mar_data.duration_min.between(0, 60)
passengers_ok = mar_data.passenger_count.gt(0) & mar_data.passenger_count.le(8)
mar_data = mar_data[duration_ok & passengers_ok]

In [None]:
# Histogram of trip durations after the outlier filter (values lie in [0, 60] minutes)
mar_data.duration_min.hist()

In [None]:
# data labeling: column roles shared by the model and the Evidently column mapping
target = "duration_min"
num_features = [
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "total_amount",
]
cat_features = [
    "PULocationID",
    "DOLocationID",
]

In [None]:
# Shape after filtering; the split in the next cell takes the first 30000 rows for training
mar_data.shape

In [None]:
# Positional split: the first 30000 filtered rows train the model, the rest validate it.
# .copy() makes both frames independent of mar_data, so adding the 'prediction'
# column later writes to a real frame and does not raise SettingWithCopyWarning.
train_data = mar_data[:30000].copy()
val_data = mar_data[30000:].copy()

In [None]:
# Linear regression with default settings, used to predict trip duration
model = LinearRegression()

In [None]:
# Fit on the numeric and categorical columns together; the location IDs are
# passed as plain numbers (no encoding step is applied in this notebook)
model.fit(train_data[num_features + cat_features], train_data[target])

In [None]:
# Predict on the training rows and store the result alongside the features
train_inputs = train_data[num_features + cat_features]
train_preds = model.predict(train_inputs)
train_data['prediction'] = train_preds

In [None]:
# Predict on the validation rows and store the result alongside the features
val_inputs = val_data[num_features + cat_features]
val_preds = model.predict(val_inputs)
val_data['prediction'] = val_preds

In [None]:
# Mean absolute error in minutes: training split first, then validation split
for split in (train_data, val_data):
    print(mean_absolute_error(split.duration_min, split.prediction))

In [None]:
# Ensure the 'models' directory exists
# exist_ok=True avoids the check-then-create race and is a no-op on reruns
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

# Build the file path from models_dir so the directory that was created and
# the directory that is written to cannot drift apart
with open(os.path.join(models_dir, 'lin_reg.bin'), 'wb') as f_out:
    dump(model, f_out)

In [None]:
# Persist the validation split (with the 'prediction' column added above) as the reference file
# NOTE(review): the report below is run with train_data as reference, not this file — confirm which is intended
val_data.to_parquet('data/reference.parquet')

In [None]:
# Tell Evidently which columns hold the target, the model output and each feature type
column_mapping = ColumnMapping(
    categorical_features=cat_features,
    numerical_features=num_features,
    prediction='prediction',
    target=target,
)

In [None]:
# Metric order matters: later cells read the results by position in this list
monitoring_metrics = [
    ColumnDriftMetric(column_name='prediction'),
    DatasetDriftMetric(),
    DatasetMissingValuesMetric(),
    RegressionQualityMetric(),
    ColumnQuantileMetric(column_name="fare_amount", quantile=0.5),
]
report = Report(metrics=monitoring_metrics)

In [None]:
# Evaluate the validation split (current) against the training split (reference)
report.run(reference_data=train_data, current_data=val_data, column_mapping=column_mapping)

In [None]:
# Render the report inside the notebook output
report.show(mode='inline')

In [None]:
#A dictionary is much easier to use in automation
# (the following cells index into it by metric position)
result = report.as_dict()

In [None]:
#prediction drift
# Index 0 = ColumnDriftMetric on 'prediction', the first metric passed to Report
# (presumably as_dict() keeps that order — the positional lookup relies on it)
result['metrics'][0]['result']['drift_score']

In [None]:
#number of drifted columns
# Index 1 = DatasetDriftMetric, the second metric passed to Report
result['metrics'][1]['result']['number_of_drifted_columns']

In [None]:
#share of missing values
# Index 2 = DatasetMissingValuesMetric; 'current' presumably refers to current_data (val_data) — verify
result['metrics'][2]['result']['current']['share_of_missing_values']

In [None]:
#Regression Quality RMSE
# Index 3 = RegressionQualityMetric; 'current' presumably refers to current_data (val_data) — verify
result['metrics'][3]['result']['current']['rmse']

In [None]:
from evidently.metric_preset import DataDriftPreset, DataQualityPreset

from evidently.ui.workspace import Workspace
from evidently.ui.dashboards import DashboardPanelCounter, DashboardPanelPlot, CounterAgg, PanelValue, PlotType, ReportFilter
from evidently.renderers.html_widgets import WidgetSize

In [None]:
# Open the Evidently workspace named "workspace"
# (presumably a local directory relative to the notebook — TODO confirm)
ws = Workspace("workspace")

In [None]:
# Create the monitoring project, set its description, and persist it
# NOTE(review): rerunning this cell presumably creates another project with the same name — confirm
project = ws.create_project("NYC Taxi Data Quality Project")
project.description = "monitoring for ML batch services"
project.save()

In [None]:
# Compute the median fare_amount for each day of March 2024 and keep the
# largest daily median seen (that is what maxFare holds at the end).
maxFare = float('-inf')
# March has 31 days; the previous range(1,31) silently skipped March 31.
for i in range(1, 32):
    day_start = datetime.datetime(2024, 3, i)
    # A timedelta rolls over to 2024-04-01 after the 31st; a string built as
    # f'2024-03-{i+1:02}' would become the invalid date '2024-03-32'.
    day_end = day_start + datetime.timedelta(days=1)

    regular_report = Report(
        metrics=[
            ColumnQuantileMetric(column_name="fare_amount", quantile=0.5)
        ],
        timestamp=day_start
    )

    # Half-open window [day_start, day_end) so each trip falls in exactly one day
    day_rows = mar_data.loc[
        mar_data.lpep_pickup_datetime.between(day_start, day_end, inclusive="left")
    ]
    regular_report.run(reference_data=None,
                    current_data=day_rows,
                    column_mapping=column_mapping)

    # Own name so the earlier `result` dict (full report) is not overwritten
    daily_result = regular_report.as_dict()
    maxFare = max(maxFare, daily_result['metrics'][0]['result']['current']['value'])

print(f"The max fare is ${maxFare}")


In [None]:
#configure the dashboard
# Panels are declared up front and registered in order: a title counter,
# then two half-width plots fed by DatasetSummaryMetric fields.
dashboard_panels = [
    DashboardPanelCounter(
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        agg=CounterAgg.NONE,
        title="NYC taxi data dashboard"
    ),
    DashboardPanelPlot(
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        title="Inference Count",
        values=[
            PanelValue(
                metric_id="DatasetSummaryMetric",
                field_path="current.number_of_rows",
                legend="count"
            ),
        ],
        plot_type=PlotType.BAR,
        size=WidgetSize.HALF,
    ),
    DashboardPanelPlot(
        filter=ReportFilter(metadata_values={}, tag_values=[]),
        title="Number of Missing Values",
        values=[
            PanelValue(
                metric_id="DatasetSummaryMetric",
                field_path="current.number_of_missing_values",
                legend="count"
            ),
        ],
        plot_type=PlotType.LINE,
        size=WidgetSize.HALF,
    ),
]

for panel in dashboard_panels:
    project.dashboard.add_panel(panel)

# Persist the panel configuration on the project
project.save()

In [None]:
# Report over the validation rows picked up in March 2024: data-quality preset,
# median fare_amount, and regression quality.
regular_report = Report(
    metrics=[
        DataQualityPreset(),
        ColumnQuantileMetric(column_name="fare_amount", quantile=0.5),
        RegressionQualityMetric()
    ]
)

# inclusive="left" makes the window half-open, so the end bound must be April 1;
# the previous '2024-03-31' end bound dropped every trip from March 31.
march_rows = val_data.loc[
    val_data.lpep_pickup_datetime.between('2024-03-01', '2024-04-01', inclusive="left")
]

regular_report.run(reference_data=None,
                  current_data=march_rows,
                  column_mapping=column_mapping)

regular_report

In [None]:
#Add report to the workspace
# Attaches the report computed in the previous cell to the project created above
ws.add_report(project.id, regular_report)