In [None]:
import os

import hopsworks
import lightgbm as lgb
import mlflow
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Step 1: Login and load feature group
# The Hopsworks API key is a credential and must not be embedded in the
# notebook source. Read it from the environment so it never reaches version
# control; any key that was previously committed should be revoked.
hopsworks_api_key = os.environ.get("HOPSWORKS_API_KEY")
if not hopsworks_api_key:
    raise RuntimeError(
        "Set the HOPSWORKS_API_KEY environment variable before running this cell."
    )

project = hopsworks.login(
    project="Citi_Bike_TripData",
    api_key_value=hopsworks_api_key,
)
fs = project.get_feature_store()
# Version is pinned so the training data schema does not change silently.
fg = fs.get_feature_group("citibike_features_hourly", version=1)
# Materialize the whole feature group for the preprocessing step below.
df = fg.read()

# Step 2: Preprocessing
# Normalise the timestamp column to a real datetime dtype.
df["date"] = pd.to_datetime(df["date"])
# Store the station name as a pandas categorical rather than raw strings.
df["start_station_name"] = df["start_station_name"].astype("category")

# Separate the target from the predictors; the raw timestamp is excluded
# from the feature matrix.
target_column = "rides"
y = df[target_column]
X = df.drop([target_column, "date"], axis=1)

# Step 3: Train/test split
# The feature group is time-indexed and the run is tagged as using lag
# features, so a shuffled split would put later observations in the training
# set and make the reported MAE optimistic. Hold out the most recent 20% of
# rows instead: order rows by timestamp (stable, so ties keep their original
# order) and split without shuffling.
chronological_positions = (
    df["date"].reset_index(drop=True).sort_values(kind="stable").index
)
X_train, X_test, y_train, y_test = train_test_split(
    X.iloc[chronological_positions],
    y.iloc[chronological_positions],
    test_size=0.2,
    shuffle=False,  # keep chronological order; random_state is unused here
)

# Step 4: Train model
# Fit a LightGBM regressor with the library's default hyperparameters.
model = lgb.LGBMRegressor().fit(X_train, y_train)

# Step 5: Evaluation
# Score the held-out rows with mean absolute error and report it.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f"MAE: {mae:.2f}")

# Step 6: Log to MLflow
# Point the client at the remote tracking server and select the experiment.
mlflow.set_tracking_uri("https://dagshub.com/SaiRishi9/Citi_Bike_tripdata.mlflow")
mlflow.set_experiment("CitiBike_Demand_Prediction")

# Descriptive tags attached to the run after the model has been logged.
run_tags = {
    "model_type": "LightGBM",
    "feature_set": "All 28 lag features",
}

with mlflow.start_run(run_name="LightGBM_All28Lags"):
    # Record the evaluation metric computed in the previous step.
    mlflow.log_metric("mae", mae)
    # Store the fitted model as a run artifact and register it by name.
    mlflow.lightgbm.log_model(
        model,
        artifact_path="model",
        registered_model_name="citibike_demand_model",
    )
    for tag_key, tag_value in run_tags.items():
        mlflow.set_tag(tag_key, tag_value)


2025-05-09 22:40:20,641 INFO: Initializing external client
2025-05-09 22:40:20,644 INFO: Base URL: https://c.app.hopsworks.ai:443




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'


2025-05-09 22:40:21,956 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1225931
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.22s) 
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001212 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5268
[LightGBM] [Info] Number of data points in the train set: 7053, number of used features: 29
[LightGBM] [Info] Start training from score 41.243017
MAE: 8.05
