In [256]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Turn into function

In [15]:
import pandas as pd
import numpy as np
import pickle
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import TargetEncoder, StandardScaler

In [43]:
def read_dataframe(filename: str):
    # Check file format
    if not filename.endswith(".csv") and not filename.endswith(".parquet"):
        raise ValueError(
            "The file format is not supported. Please provide a CSV or Parquet file."
        )

    # Read the file
    try:
        if filename.endswith(".csv"):
            df = pd.read_csv(filename)
            df["tpep_pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"])
            df["tpep_dropoff_datetime"] = pd.to_datetime(df["tpep_dropoff_datetime"])
        elif filename.endswith(".parquet"):
            df = pd.read_parquet(filename)
    except FileNotFoundError:
        raise FileNotFoundError(f"The file {filename} was not found.")
    except Exception as e:
        raise IOError(f"Error reading the file: {str(e)}")

    # Calculate the trip duration
    try:
        df["trip_duration"] = (
            df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
        ).dt.total_seconds() / 60
    except Exception as e:
        raise ValueError(
            "Error calculating the trip duration. Please check the columns 'tpep_pickup_datetime' and 'tpep_dropoff_datetime'."
        )
    df = df[(df["trip_duration"] > 0) & (df["trip_duration"] < 60)]

    # Extract date features
    df["tpep_pickup_month"] = df["tpep_pickup_datetime"].dt.month
    df["tpep_pickup_day"] = df["tpep_pickup_datetime"].dt.day
    df["tpep_pickup_day_of_week"] = df["tpep_pickup_datetime"].dt.day_of_week
    df["tpep_pickup_hour"] = df["tpep_pickup_datetime"].dt.hour

    # Rearrange columns
    column_order = [
        "trip_distance",
        "tpep_pickup_month",
        "tpep_pickup_day",
        "tpep_pickup_day_of_week",
        "tpep_pickup_hour",
        "PULocationID",
        "DOLocationID",
        "trip_duration",
    ]
    df = df[column_order]

    # Cast the columns to the appropriate data types
    df[["PULocationID", "DOLocationID"]] = df[["PULocationID", "DOLocationID"]].astype(
        "category"
    )

    return df

In [44]:
train_df = read_dataframe('./data/yellow_tripdata_2024-01.parquet')
test_df = read_dataframe('./data/yellow_tripdata_2024-02.parquet')

In [45]:
train_df.head()

Unnamed: 0,trip_distance,tpep_pickup_month,tpep_pickup_day,tpep_pickup_day_of_week,tpep_pickup_hour,PULocationID,DOLocationID,trip_duration
0,1.72,1,1,0,0,186,79,19.8
1,1.8,1,1,0,0,140,236,6.6
2,4.7,1,1,0,0,236,79,17.916667
3,1.4,1,1,0,0,79,211,8.3
4,0.8,1,1,0,0,211,148,6.1


In [46]:
test_df.head()

Unnamed: 0,trip_distance,tpep_pickup_month,tpep_pickup_day,tpep_pickup_day_of_week,tpep_pickup_hour,PULocationID,DOLocationID,trip_duration
0,4.39,2,1,3,0,68,236,15.216667
1,7.71,2,1,3,0,48,243,14.366667
2,28.69,2,1,3,0,132,261,35.366667
3,1.1,2,1,3,0,161,163,8.966667
4,2.6,2,1,3,0,246,79,13.666667


In [108]:
from sklearn.base import TransformerMixin, BaseEstimator

class CyclialEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cyclial_dict, drop_original=True):
        self.cyclial_dict = cyclial_dict
        self.drop_original = drop_original

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        for col, max_val in self.cyclial_dict.items():
            X[col + "_sin"] = np.sin(2 * np.pi * X[col] / max_val)
            X[col + "_cos"] = np.cos(2 * np.pi * X[col] / max_val)
            if self.drop_original:
                X = X.drop(col, axis=1)
        return X

    def fit_transform(self, X, y=None):
        return self.transform(X)

In [85]:
cyclial_dict = {
    "tpep_pickup_month": 12,
    "tpep_pickup_day": 31,
    "tpep_pickup_day_of_week": 6,
    "tpep_pickup_hour": 23
}

In [157]:
def remove_outlier(data, column_name, threshold=1.5):
    try:
        iqr = data[column_name].quantile(0.75) - data[column_name].quantile(0.25)
        lower_bound = data[column_name].quantile(0.25) - (threshold * iqr)
        upper_bound = data[column_name].quantile(0.75) + (threshold * iqr)
        data_copy = data[
            (data[column_name] > lower_bound) & (data[column_name] < upper_bound)
        ]
    except Exception as e:
        raise ValueError(f"Error removing outliers: {str(e)}")

    return data_copy


def prepare_X_y(data, mode="train", return_preprocessor=False, preprocessor=None):
    if mode == "train" and preprocessor is not None:
        print(
            "The preprocessor should be None when the mode is 'train'. Setting preprocessor to None."
        )
        preprocessor = None
    elif mode == "test" and preprocessor is None:
        raise ValueError("The preprocessor should be provided when the mode is 'test'.")

    # Remove outliers
    data = remove_outlier(data, "trip_distance")

    # Split data into features and target
    X = data.drop("trip_duration", axis=1)
    y = data["trip_duration"]

    # Define preprocessor if not provided
    if preprocessor is None:
        # Define the preprocessor
        preprocessor = ColumnTransformer(
            transformers=[
                (
                    "cyclial_encoder",
                    CyclialEncoder(cyclial_dict),
                    [
                        "tpep_pickup_month",
                        "tpep_pickup_day",
                        "tpep_pickup_day_of_week",
                        "tpep_pickup_hour",
                    ],
                ),
                (
                    "target_encoder",
                    TargetEncoder(target_type="continuous"),
                    ["PULocationID", "DOLocationID"],
                ),
                ("scaler", StandardScaler(), ["trip_distance"]),
            ],
        )

    if mode == "train":
        X = preprocessor.fit_transform(X, y)
    elif mode == "test":
        X = preprocessor.transform(X)
    else:
        raise ValueError("The mode should be either 'train' or 'test'.")

    if return_preprocessor:
        return X, y, preprocessor
    return X, y

In [146]:
X_train, y_train, preprocessor = prepare_X_y(train_df, return_preprocessor=True)

In [147]:
X_test, y_test = prepare_X_y(test_df, mode="test", preprocessor=preprocessor)

In [148]:
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")

Shape of X_train: (2570269, 11)
Shape of y_train: (2570269,)


In [149]:
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_test: {y_test.shape}")

Shape of X_test: (2615609, 11)
Shape of y_test: (2615609,)


In [150]:
def train(X, y, model):
    model.fit(X, y)
    return model

def evaluate(X, y, model):
    y_pred = model.predict(X)
    mse = root_mean_squared_error(y, y_pred)
    print(y_pred[0], y[0])
    return mse

In [151]:
from xgboost import XGBRegressor
from sklearn.metrics import root_mean_squared_error

In [154]:
xgb_model = XGBRegressor()
xgb_model = train(X_train, y_train, xgb_model)
xgb_rmse = evaluate(X_test, y_test, xgb_model)
print(f"XGBoost RMSE: {xgb_rmse}")

20.177097 15.216666666666667
XGBoost RMSE: 4.037091180649304


In [156]:
from lightgbm import LGBMRegressor

lgbm_model = LGBMRegressor()
lgbm_model = train(X_train, y_train, lgbm_model)
lgbm_rmse = evaluate(X_test, y_test, lgbm_model)
print(f"LGBM RMSE: {lgbm_rmse}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034265 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 873
[LightGBM] [Info] Number of data points in the train set: 2570269, number of used features: 9
[LightGBM] [Info] Start training from score 11.741983
21.282791260353758 15.216666666666667
LGBM RMSE: 4.128901499928223


In [155]:
with open('model/model.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)