In [1]:
import pandas as pd
import uuid

In [2]:
DATA_RAW_DIR = "taxi_data/"
DATA_FEATURES_DIR = "taxi_data/features"

files = [
        "green_tripdata_2024-01.parquet",
    #    "green_tripdata_2024-02.parquet",
    #    "green_tripdata_2024-03.parquet",
    #    "green_tripdata_2024-04.parquet",
    #    "green_tripdata_2024-05.parquet",
        
    ]




#### Make data preprocession

In [3]:
for file in files:

    path_source = f"{DATA_RAW_DIR}/{file}"
    data = pd.read_parquet(path_source)
    print("data shape : ",data.shape)
    print("Generate UID")

    data["uuid"] = [uuid.uuid4() for x in range(len(data))]
    data["uuid"] = data["uuid"].astype("str")

    # Generate target variable (duration in minutes)
    dropoff_dt = data.lpep_dropoff_datetime
    pickup_dt = data.lpep_pickup_datetime
    data["duration_min"] = dropoff_dt - pickup_dt

    data.duration_min = data.duration_min.apply(
        lambda td: float(td.total_seconds() / 60)
    )

 
    numeric_columns = data.select_dtypes(include='number').columns
    medians = data[numeric_columns].median()
    data = data.fillna(medians).fillna(0)
    data = data[data['duration_min'] != 0]






    print("Save data")
    path_destination = f"{DATA_FEATURES_DIR}/{file}"
    data.to_parquet(path_destination)

data shape :  (56475, 21)
Generate UID
Save data


In [4]:
features_path = "taxi_data/features/"

all_features_df = pd.concat([pd.read_parquet(features_path+file) for file in files], ignore_index=True)

# Optional: display the shape or first few rows
print(all_features_df.shape)
all_features_df.head()


(56475, 21)


Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,...,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,uuid,duration_min
0,2,2024-01-01 00:46:55,2024-01-01 00:58:25,1.0,236,239,1.0,1.98,12.8,1.0,...,3.61,0.0,0.0,1.0,21.66,1.0,1.0,2.75,380c4118-85e9-4593-bcf4-8ba3933fb1f4,11.5
1,2,2024-01-01 00:31:42,2024-01-01 00:52:34,1.0,65,170,5.0,6.54,30.3,1.0,...,7.11,0.0,0.0,1.0,42.66,1.0,1.0,2.75,408fb41f-96cb-41c2-9b59-b62e5cc5068f,20.866667
2,2,2024-01-01 00:30:21,2024-01-01 00:49:23,1.0,74,262,1.0,3.08,19.8,1.0,...,3.0,0.0,0.0,1.0,28.05,1.0,1.0,2.75,fade57c0-9c1d-4e65-b6d0-bf9553116622,19.033333
3,1,2024-01-01 00:30:20,2024-01-01 00:42:12,1.0,74,116,1.0,2.4,14.2,1.0,...,0.0,0.0,0.0,1.0,16.7,2.0,1.0,0.0,309fd581-56a7-4b09-99fb-085714709164,11.866667
4,2,2024-01-01 00:32:38,2024-01-01 00:43:37,1.0,74,243,1.0,5.14,22.6,1.0,...,6.28,0.0,0.0,1.0,31.38,1.0,1.0,0.0,cdbb464b-03c6-425f-acb8-770207dba60d,10.983333


In [5]:

# Shuffle rows
all_features_shuffled = all_features_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [6]:
 # Filter out outliers
all_features_shuffled = all_features_shuffled[(all_features_shuffled.duration_min >= 1) & (all_features_shuffled.duration_min <= 60)]
all_features_shuffled = all_features_shuffled[(all_features_shuffled.passenger_count > 0) & (all_features_shuffled.passenger_count <= 6)]

In [7]:
train_amount = 30000 #200000
# Split data into training and validation sets
train_data = all_features_shuffled.loc[:train_amount, :]
val_data = all_features_shuffled.loc[train_amount:, :]

### Train model

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

# data labeling

target = "duration_min"
num_features = ["passenger_count", "trip_distance", "fare_amount", "total_amount"]
cat_features = ["PULocationID", "DOLocationID"]

dates = ["lpep_pickup_datetime","lpep_dropoff_datetime"]

In [9]:

model = LinearRegression()
model.fit(
    X=train_data[num_features + cat_features],
    y=train_data[target],
)

In [10]:
train_data[num_features + cat_features].head()

Unnamed: 0,passenger_count,trip_distance,fare_amount,total_amount,PULocationID,DOLocationID
0,5.0,3.04,17.0,18.5,181,89
2,1.0,0.65,7.2,14.56,134,28
3,1.0,7.29,35.2,50.34,24,79
4,1.0,1.16,9.3,19.26,75,236
5,1.0,1.16,7.9,10.4,166,74


In [11]:
train_preds = model.predict(train_data[num_features + cat_features])
val_preds = model.predict(val_data[num_features + cat_features])

#### Calculate validation metrics: MAE

In [12]:
print(mean_absolute_error(train_data[target], train_preds))
print(mean_absolute_error(val_data[target], val_preds))

3.129596837857931
3.1285812359555156


#### Calculate validation metrics: MAPE

In [13]:
print(mean_absolute_percentage_error(train_data[target], train_preds))
print(mean_absolute_percentage_error(val_data[target], val_preds))

0.33096136773242485
0.3301708327503358


In [14]:
import joblib
model_path = "data/taxi_lr_model.pkl"
joblib.dump(model, model_path)

['data/taxi_lr_model.pkl']

In [15]:
train_data.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,...,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,uuid,duration_min
0,2,2024-01-26 10:33:25,2024-01-26 10:48:32,1.0,181,89,5.0,3.04,17.0,0.0,...,0.0,0.0,0.0,1.0,18.5,2.0,1.0,0.0,0206b51f-30aa-412e-9698-47221468f817,15.116667
2,2,2024-01-19 17:32:02,2024-01-19 17:38:05,1.0,134,28,1.0,0.65,7.2,2.5,...,3.36,0.0,0.0,1.0,14.56,1.0,1.0,0.0,48e64228-c2ed-4fd7-9250-333710d3516a,6.05
3,2,2024-01-05 16:47:39,2024-01-05 17:18:16,1.0,24,79,1.0,7.29,35.2,2.5,...,8.39,0.0,0.0,1.0,50.34,1.0,1.0,2.75,d8afe70a-1c4f-44aa-abae-bb354429128c,30.616667
4,2,2024-01-25 17:48:05,2024-01-25 17:55:54,1.0,75,236,1.0,1.16,9.3,2.5,...,3.21,0.0,0.0,1.0,19.26,1.0,1.0,2.75,27221636-251a-440d-a005-2b3d81f8adfb,7.816667
5,2,2024-01-06 00:11:51,2024-01-06 00:17:37,1.0,166,74,1.0,1.16,7.9,1.0,...,0.0,0.0,0.0,1.0,10.4,2.0,1.0,0.0,945b3f13-9b32-48fb-9046-46053f6c9750,5.766667


In [16]:
train_data['prediction']= train_preds
reference_data_df = train_data[num_features + cat_features+['lpep_pickup_datetime','prediction','duration_min']]


reference_data_df.rename(columns={
    'lpep_pickup_datetime': 'timestamp',
    'PULocationID' : 'pulocationid',
    'DOLocationID':'dolocationid'
}, inplace=True)
print(reference_data_df.shape)
reference_data_df.head()



(28621, 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['prediction']= train_preds
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reference_data_df.rename(columns={


Unnamed: 0,passenger_count,trip_distance,fare_amount,total_amount,pulocationid,dolocationid,timestamp,prediction,duration_min
0,5.0,3.04,17.0,18.5,181,89,2024-01-26 10:33:25,13.294961,15.116667
2,1.0,0.65,7.2,14.56,134,28,2024-01-19 17:32:02,7.545782,6.05
3,1.0,7.29,35.2,50.34,24,79,2024-01-05 16:47:39,25.396091,30.616667
4,1.0,1.16,9.3,19.26,75,236,2024-01-25 17:48:05,9.835356,7.816667
5,1.0,1.16,7.9,10.4,166,74,2024-01-06 00:11:51,8.068688,5.766667


In [17]:
reference_data_df.to_parquet("taxi_reference_data.parquet")

### Test Trainin model

In [18]:
new_model = None
new_model = joblib.load(model_path)
val_preds = new_model.predict(val_data[num_features + cat_features])
print(mean_absolute_percentage_error(val_data[target], val_preds))

0.3301708327503358


In [19]:
temp_datatemp1 = val_data[num_features + cat_features].head()

In [20]:
temp_data = [1.00, 4.16, 23.30, 29.80,260.00,138.00]

In [21]:
model.predict(temp_datatemp1)

array([ 9.43646432, 11.0007599 , 11.2889677 , 20.32910466, 18.14415896])

### Test with schema

In [22]:
from pydantic import BaseModel,Field

class taxi_model(BaseModel):
    passenger_count :float =      1.00
    trip_distance   :float =      4.16
    fare_amount   :float =       23.30
    total_amount  :float =       29.80
    PULocationID   :float =     260.00
    DOLocationID :float =       138.00
  

In [23]:
tempt = taxi_model()

In [24]:
features = pd.DataFrame([tempt.dict()])
features.head()

/var/folders/xd/3z5vvpds0zxf_pypxd4cn3d80000gn/T/ipykernel_13344/2037298444.py:1: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  features = pd.DataFrame([tempt.dict()])


Unnamed: 0,passenger_count,trip_distance,fare_amount,total_amount,PULocationID,DOLocationID
0,1.0,4.16,23.3,29.8,260.0,138.0


In [25]:
print(type(features))
print(features.shape)
print(features.dtypes)

<class 'pandas.core.frame.DataFrame'>
(1, 6)
passenger_count    float64
trip_distance      float64
fare_amount        float64
total_amount       float64
PULocationID       float64
DOLocationID       float64
dtype: object


In [26]:
_pred = model.predict(features)
_pred

array([17.76030184])

In [27]:
features["predictions"] = _pred

In [28]:
features

Unnamed: 0,passenger_count,trip_distance,fare_amount,total_amount,PULocationID,DOLocationID,predictions
0,1.0,4.16,23.3,29.8,260.0,138.0,17.760302


In [29]:
from evidently import ColumnMapping
from evidently.metrics import (
    RegressionQualityMetric,
    RegressionPredictedVsActualScatter,
    RegressionPredictedVsActualPlot,
    RegressionErrorPlot,
    RegressionAbsPercentageErrorPlot,
    RegressionErrorDistribution,
    RegressionErrorNormality,
    RegressionTopErrorMetric
)

ImportError: cannot import name 'ColumnMapping' from 'evidently' (/opt/anaconda3/envs/ths_dev/lib/python3.10/site-packages/evidently/__init__.py)

In [None]:
target = "duration_min"
num_features = ["passenger_count", "trip_distance", "fare_amount", "total_amount"]
cat_features = ["pulocationid", "dolocationid"]

In [None]:
column_mapping = ColumnMapping(
            target=target,
            prediction='prediction',
            numerical_features=num_features,
            categorical_features=cat_features
        )

from evidently.report import Report

In [30]:
current_data = reference_data_df
current_data.head()

Unnamed: 0,passenger_count,trip_distance,fare_amount,total_amount,pulocationid,dolocationid,timestamp,prediction,duration_min
0,5.0,3.04,17.0,18.5,181,89,2024-01-26 10:33:25,13.294961,15.116667
2,1.0,0.65,7.2,14.56,134,28,2024-01-19 17:32:02,7.545782,6.05
3,1.0,7.29,35.2,50.34,24,79,2024-01-05 16:47:39,25.396091,30.616667
4,1.0,1.16,9.3,19.26,75,236,2024-01-25 17:48:05,9.835356,7.816667
5,1.0,1.16,7.9,10.4,166,74,2024-01-06 00:11:51,8.068688,5.766667


In [31]:
reference_data = reference_data_df

In [32]:
model_performance_report = Report(metrics=[
    RegressionQualityMetric(),
    RegressionPredictedVsActualScatter(),
    RegressionPredictedVsActualPlot(),
    RegressionErrorPlot(),
    RegressionAbsPercentageErrorPlot(),
    RegressionErrorDistribution(),
    RegressionErrorNormality(),
    RegressionTopErrorMetric()
])
model_performance_report.run(
    reference_data=reference_data,
    current_data=current_data,
    column_mapping=column_mapping
)
report_path = f'/fastapi/reports/model_performance.html'
#model_performance_report.save_html(report_path)

NameError: name 'Report' is not defined

In [33]:
model_performance_report

NameError: name 'model_performance_report' is not defined

In [34]:
model_performance_report.save_html("test.html")

NameError: name 'model_performance_report' is not defined