<a href="https://colab.research.google.com/github/Nishviprp/citibike-redistribution-ml/blob/main/citibike_redistribution_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import HistGradientBoostingRegressor

import matplotlib.pyplot as plt


In [2]:
from google.colab import files
uploaded = files.upload()
print("Uploaded files:", list(uploaded.keys()))


Saving JC-202506-citibike-tripdata.csv.zip to JC-202506-citibike-tripdata.csv.zip
Saving JC-202507-citibike-tripdata.csv.zip to JC-202507-citibike-tripdata.csv.zip
Saving JC-202508-citibike-tripdata.csv.zip to JC-202508-citibike-tripdata.csv.zip
Saving JC-202509-citibike-tripdata.csv.zip to JC-202509-citibike-tripdata.csv.zip
Saving JC-202510-citibike-tripdata.zip to JC-202510-citibike-tripdata.zip
Saving JC-202511-citibike-tripdata.csv.zip to JC-202511-citibike-tripdata.csv.zip
Uploaded files: ['JC-202506-citibike-tripdata.csv.zip', 'JC-202507-citibike-tripdata.csv.zip', 'JC-202508-citibike-tripdata.csv.zip', 'JC-202509-citibike-tripdata.csv.zip', 'JC-202510-citibike-tripdata.zip', 'JC-202511-citibike-tripdata.csv.zip']


In [3]:
def read_tripdata_csv(filepath, usecols=None):
    # Low-memory read for big files
    return pd.read_csv(filepath, low_memory=False, usecols=usecols)


In [4]:
def standardize_trip_columns(df):
    df = df.copy()
    df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]

    # Common variations
    col_map_candidates = {
        "started_at": ["started_at", "starttime", "start_time"],
        "ended_at": ["ended_at", "stoptime", "stop_time"],
        "start_station_id": ["start_station_id", "start_station_id", "start_station_name", "start_station"],
        "end_station_id": ["end_station_id", "end_station_id", "end_station_name", "end_station"],
        "rideable_type": ["rideable_type", "bikeid", "bike_id", "bike_type"]
    }

    # pick first existing column from candidates
    def pick_col(options):
        for o in options:
            if o in df.columns:
                return o
        return None

    started = pick_col(col_map_candidates["started_at"])
    ended = pick_col(col_map_candidates["ended_at"])
    start_station = pick_col(col_map_candidates["start_station_id"])
    end_station = pick_col(col_map_candidates["end_station_id"])
    ride_type = pick_col(col_map_candidates["rideable_type"])

    if started is None or ended is None or start_station is None or end_station is None:
        raise ValueError(f"Could not find required time/station columns. Columns found: {df.columns.tolist()}")

    # Create unified columns
    df["started_at_std"] = pd.to_datetime(df[started], errors="coerce")
    df["ended_at_std"] = pd.to_datetime(df[ended], errors="coerce")

    # If station "id" is missing and only names exist, we still use it as categorical
    df["start_station_std"] = df[start_station].astype(str)
    df["end_station_std"] = df[end_station].astype(str)

    if ride_type is None:
        df["bike_type_std"] = "unknown"
    else:
        df["bike_type_std"] = df[ride_type].astype(str)

    # Basic cleanup
    df = df.dropna(subset=["started_at_std", "ended_at_std", "start_station_std", "end_station_std"])
    return df


In [6]:
import zipfile
from pathlib import Path

# Define the paths for the uploaded zip files
# I've chosen July and November 2025 files based on your uploaded data.
JULY_ZIP_FILE = Path("/content/JC-202507-citibike-tripdata.csv.zip")
NOV_ZIP_FILE = Path("/content/JC-202511-citibike-tripdata.csv.zip")

# Define the expected paths for the extracted CSV files
JULY_FILE = Path("/content/JC-202507-citibike-tripdata.csv")
NOV_FILE = Path("/content/JC-202511-citibike-tripdata.csv")

# Function to extract a single zip file if it hasn't been extracted already
def extract_zip_if_needed(zip_path, extract_path_expected):
    if not extract_path_expected.exists():
        print(f"Extracting {zip_path.name}...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # Extract all contents to the directory where the zip file resides
            zip_ref.extractall(zip_path.parent)
    else:
        print(f"{extract_path_expected.name} already extracted.")

# Extract the files
extract_zip_if_needed(JULY_ZIP_FILE, JULY_FILE)
extract_zip_if_needed(NOV_ZIP_FILE, NOV_FILE)

df_july = read_tripdata_csv(JULY_FILE)
df_nov  = read_tripdata_csv(NOV_FILE)

df_july = standardize_trip_columns(df_july)
df_nov  = standardize_trip_columns(df_nov)

df = pd.concat([df_july, df_nov], ignore_index=True)
print("Total rows:", len(df))
df.head()

Extracting JC-202507-citibike-tripdata.csv.zip...
Extracting JC-202511-citibike-tripdata.csv.zip...
Total rows: 184450


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,started_at_std,ended_at_std,start_station_std,end_station_std,bike_type_std
0,8974E7FE6E43119D,electric_bike,2025-07-13 21:07:32.221,2025-07-14 03:12:34.695,Bergen Ave & Sip Ave,JC109,,,40.731009,-74.064437,,,casual,2025-07-13 21:07:32.221,2025-07-14 03:12:34.695,JC109,,electric_bike
1,31EF8E5CBC2FA664,electric_bike,2025-07-10 19:02:28.736,2025-07-10 19:19:43.664,Stevens - River Ter & 6 St,HB602,,,40.743133,-74.026989,,,casual,2025-07-10 19:02:28.736,2025-07-10 19:19:43.664,HB602,,electric_bike
2,F0FB644DB2CDEE04,electric_bike,2025-07-04 14:02:35.827,2025-07-04 15:48:04.024,Grove St PATH,JC115,,,40.71941,-74.04309,,,casual,2025-07-04 14:02:35.827,2025-07-04 15:48:04.024,JC115,,electric_bike
3,DB7BEA3FEC700AC5,electric_bike,2025-07-18 14:46:32.568,2025-07-18 14:59:42.892,14 St Ferry - 14 St & Shipyard Ln,HB202,,,40.752961,-74.024353,,,member,2025-07-18 14:46:32.568,2025-07-18 14:59:42.892,HB202,,electric_bike
4,772B1CAA099E7DA5,classic_bike,2025-07-13 10:49:03.936,2025-07-14 11:48:59.171,Southwest Park - Jackson St & Observer Hwy,HB401,,,40.737551,-74.041664,,,casual,2025-07-13 10:49:03.936,2025-07-14 11:48:59.171,HB401,,classic_bike


In [7]:
# مثال: replace these with your actual uploaded file names
JULY_FILE = "/content/201907-citibike-tripdata.csv"      # example
NOV_FILE  = "/content/201911-citibike-tripdata.csv"      # example

df_july = read_tripdata_csv(JULY_FILE)
df_nov  = read_tripdata_csv(NOV_FILE)

df_july = standardize_trip_columns(df_july)
df_nov  = standardize_trip_columns(df_nov)

df = pd.concat([df_july, df_nov], ignore_index=True)
print("Total rows:", len(df))
df.head()


FileNotFoundError: [Errno 2] No such file or directory: '/content/201907-citibike-tripdata.csv'

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184450 entries, 0 to 184449
Data columns (total 18 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   ride_id             184450 non-null  object        
 1   rideable_type       184450 non-null  object        
 2   started_at          184450 non-null  object        
 3   ended_at            184450 non-null  object        
 4   start_station_name  184450 non-null  object        
 5   start_station_id    184450 non-null  object        
 6   end_station_name    183885 non-null  object        
 7   end_station_id      183643 non-null  object        
 8   start_lat           184450 non-null  float64       
 9   start_lng           184450 non-null  float64       
 10  end_lat             183646 non-null  float64       
 11  end_lng             183646 non-null  float64       
 12  member_casual       184450 non-null  object        
 13  started_at_std      184450 no

In [9]:
df["date"] = df["started_at_std"].dt.date
df["year"] = df["started_at_std"].dt.year
df["month"] = df["started_at_std"].dt.month
df["day"] = df["started_at_std"].dt.day
df["dow"] = df["started_at_std"].dt.dayofweek  # 0=Mon

# first week filter (days 1 to 7)
df_first_week = df[(df["day"] >= 1) & (df["day"] <= 7)].copy()

# define rush window for "shortage" calculation (start time)
rush_start_hour = 7
rush_end_hour   = 16  # 4 PM

df_rush = df_first_week[
    (df_first_week["started_at_std"].dt.hour >= rush_start_hour) &
    (df_first_week["started_at_std"].dt.hour < rush_end_hour)
].copy()

print("First week rows:", len(df_first_week))
print("Rush-window rows:", len(df_rush))


First week rows: 43817
Rush-window rows: 21022


In [10]:
# Count departures
dep = df_rush.groupby(["year","month","date","start_station_std","bike_type_std"]).size().reset_index(name="departures")

# Count arrivals
arr = df_rush.groupby(["year","month","date","end_station_std","bike_type_std"]).size().reset_index(name="arrivals")
arr = arr.rename(columns={"end_station_std": "start_station_std"})  # align key name

# Merge
daily = dep.merge(arr, on=["year","month","date","start_station_std","bike_type_std"], how="outer").fillna(0)

daily["net_outflow"] = daily["departures"] - daily["arrivals"]
daily["shortage_target"] = daily["net_outflow"].clip(lower=0)  # only shortage part

# add day-of-week
daily["date_dt"] = pd.to_datetime(daily["date"])
daily["dow"] = daily["date_dt"].dt.dayofweek

daily.head()


Unnamed: 0,year,month,date,start_station_std,bike_type_std,departures,arrivals,net_outflow,shortage_target,date_dt,dow
0,2025,7,2025-07-01,4962.01,classic_bike,0.0,1.0,-1.0,0.0,2025-07-01,1
1,2025,7,2025-07-01,4993.15,electric_bike,0.0,1.0,-1.0,0.0,2025-07-01,1
2,2025,7,2025-07-01,5216.07,electric_bike,0.0,1.0,-1.0,0.0,2025-07-01,1
3,2025,7,2025-07-01,5247.1,electric_bike,0.0,1.0,-1.0,0.0,2025-07-01,1
4,2025,7,2025-07-01,5297.02,classic_bike,0.0,2.0,-2.0,0.0,2025-07-01,1


In [12]:
X = daily[["year","month","dow","start_station_std","bike_type_std"]]
y = daily["shortage_target"]

# time-based split (simple): train on July, test on Nov
train_mask = X["month"] == 7
test_mask  = X["month"] == 11

X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test   = X[test_mask],  y[test_mask]

cat_features = ["start_station_std","bike_type_std"]
num_features = ["year","month","dow"]

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_features),
        ("num", "passthrough", num_features),
    ]
)

model = HistGradientBoostingRegressor(max_depth=6, learning_rate=0.1, random_state=42)

pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", model)
])

pipe.fit(X_train, y_train)
pred = pipe.predict(X_test)

mae = mean_absolute_error(y_test, pred)
rmse = mean_squared_error(y_test, pred) ** 0.5

print("Evaluation on November (test):")
print("MAE :", round(mae, 4))
print("RMSE:", round(rmse, 4))


Evaluation on November (test):
MAE : 2.0545
RMSE: 2.7875


In [13]:
daily["pred_shortage"] = pipe.predict(X)

# Produce separate outputs for July and November first week
out_july = daily[daily["month"] == 7].copy()
out_nov  = daily[daily["month"] == 11].copy()

def make_plan(out_df, month_name):
    plan = out_df.groupby(["date","start_station_std","bike_type_std"], as_index=False)["pred_shortage"].mean()
    plan["recommended_bikes_to_add"] = np.ceil(plan["pred_shortage"]).astype(int)
    plan = plan.drop(columns=["pred_shortage"])
    plan = plan.sort_values(["date","recommended_bikes_to_add"], ascending=[True, False])
    plan.to_csv(f"redistribution_plan_{month_name}.csv", index=False)
    return plan

plan_july = make_plan(out_july, "july")
plan_nov  = make_plan(out_nov,  "november")

print("Saved:")
print("redistribution_plan_july.csv")
print("redistribution_plan_november.csv")

plan_july.head(10)


Saved:
redistribution_plan_july.csv
redistribution_plan_november.csv


Unnamed: 0,date,start_station_std,bike_type_std,recommended_bikes_to_add
0,2025-07-01,4962.01,classic_bike,3
4,2025-07-01,5297.02,classic_bike,3
16,2025-07-01,HB101,classic_bike,3
18,2025-07-01,HB103,classic_bike,3
20,2025-07-01,HB105,classic_bike,3
22,2025-07-01,HB106,classic_bike,3
24,2025-07-01,HB201,classic_bike,3
26,2025-07-01,HB202,classic_bike,3
28,2025-07-01,HB203,classic_bike,3
30,2025-07-01,HB301,classic_bike,3


In [14]:
from google.colab import files
files.download("redistribution_plan_july.csv")
files.download("redistribution_plan_november.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>