In [5]:
# ...existing code...
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

csv_path = r"C:\Users\asara\OneDrive\Documents\Desktop\BIG DATA\data\silver\ride_demand_silver.csv"

# Stream the CSV in fixed-size chunks and tally rides per pickup hour, so the
# whole file never has to be held in memory at once (avoids MemoryError).
usecols = ["tpep_pickup_datetime"]
parse_dates = ["tpep_pickup_datetime"]
chunksize = 100_000

# Running ride totals for hours 0-23; stays int64 for the whole aggregation.
hour_counts = pd.Series(0, index=range(24), dtype="int64")

try:
    for chunk in pd.read_csv(csv_path, usecols=usecols, parse_dates=parse_dates, chunksize=chunksize, low_memory=True):
        # errors="coerce" turns any unparseable timestamp into NaT; its hour is
        # NaN and is removed by dropna() before the integer cast.
        pickup_times = pd.to_datetime(chunk["tpep_pickup_datetime"], errors="coerce")
        hrs = pickup_times.dt.hour.dropna().astype(int)
        if hrs.empty:
            continue
        # Align this chunk's counts to the full 0-23 index *before* adding.
        # Adding a partial index with fill_value=0 would pass through NaN during
        # alignment and silently upcast the running totals to float64.
        counts = hrs.value_counts().reindex(hour_counts.index, fill_value=0)
        hour_counts = hour_counts + counts
except MemoryError as err:
    raise MemoryError("Reading the CSV still failed due to memory. Try reducing chunksize or processing on a machine with more RAM.") from err

# Turn the per-hour totals into a two-column frame: one row per hour of day.
demand_df = hour_counts.rename_axis("hour").reset_index(name="demand")
demand_df["hour"] = demand_df["hour"].astype(int)

# Baseline model: a straight-line fit of demand against hour of day.
X = demand_df.loc[:, ["hour"]]
y = demand_df.loc[:, "demand"]

# Hold out 20% of the rows for evaluation. The frame has one row per hour,
# so there are at most 24 rows to split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report the dataset size and the held-out mean absolute error.
mae = mean_absolute_error(y_test, y_pred)
print("Rows in demand_df:", len(demand_df))
print("Mean Absolute Error:", mae)
# ...existing code...

Rows in demand_df: 24
Mean Absolute Error: 52213.249496981894
