In [3]:
# 1️⃣ Import libraries
import pandas as pd
import os

# 2️⃣ Read the raw Uber CSV into a DataFrame
data_path = '../data/raw/uber-raw-data-sep14.csv'
data = pd.read_csv(filepath_or_buffer=data_path)

# 3️⃣ Preview the top of the frame; the bare last expression is what the
#    notebook renders as the cell's output
print("First 5 rows of the dataset:")
data.head(n=5)


First 5 rows of the dataset:


Unnamed: 0,Date/Time,Lat,Lon,Base
0,9/1/2014 0:01:00,40.2201,-74.0021,B02512
1,9/1/2014 0:01:00,40.75,-74.0027,B02512
2,9/1/2014 0:03:00,40.7559,-73.9864,B02512
3,9/1/2014 0:06:00,40.745,-73.9889,B02512
4,9/1/2014 0:11:00,40.8145,-73.9444,B02512


In [2]:
import pandas as pd
import os

# Input (raw) and output (processed) CSV locations, relative to the notebook
raw_path = '../data/raw/uber-raw-data-sep14.csv'
processed_path = '../data/processed/uber_processed.csv'

# Load the raw data only when the file is present; otherwise stop right away
# with a message that names the missing path
if os.path.exists(raw_path):
    data = pd.read_csv(raw_path)
    print("✅ Raw data loaded successfully!")
else:
    raise FileNotFoundError(f"File not found: {raw_path}")

# Convert Date/Time to datetime.
# Raw values look like "9/1/2014 0:01:00" (month/day/year, no AM/PM marker), so the
# format is spelled out: the dates can never be read day-first, pandas skips
# format inference, and a row that does not match raises instead of being guessed.
data['Date/Time'] = pd.to_datetime(data['Date/Time'], format='%m/%d/%Y %H:%M:%S')

# Feature engineering: calendar parts of each timestamp
data['hour'] = data['Date/Time'].dt.hour
data['day_of_week'] = data['Date/Time'].dt.dayofweek  # Monday=0 ... Sunday=6
data['month'] = data['Date/Time'].dt.month

# Aggregate ride counts per Base and hour/day/month:
# one output row per (Base, hour, day_of_week, month) combination that occurs in
# the data, with the number of matching rows in 'ride_count'
ride_counts = data.groupby(['Base', 'hour', 'day_of_week', 'month']).size().reset_index(name='ride_count')

# Ensure the output folder exists.
# The folder is derived from processed_path so the two cannot drift apart, and
# exist_ok=True replaces the separate exists() check (no check-then-create race,
# and the cell stays safe to re-run). `or '.'` covers a bare file name with no
# directory part.
os.makedirs(os.path.dirname(processed_path) or '.', exist_ok=True)

# Save processed data (index=False: the positional index is not written as a column)
ride_counts.to_csv(processed_path, index=False)
print(f"✅ Processed data saved at {processed_path}")


✅ Raw data loaded successfully!
✅ Processed data saved at ../data/processed/uber_processed.csv
