In [3]:
# 1️⃣ Import libraries
import pandas as pd
import os

# 2️⃣ Read the raw Uber trips CSV into a DataFrame
data_path = '../data/raw/uber-raw-data-sep14.csv'
data = pd.read_csv(filepath_or_buffer=data_path)

# 3️⃣ Peek at the top of the frame (the bare last expression is the cell output)
print("First 5 rows of the dataset:")
data.head(n=5)


First 5 rows of the dataset:


Unnamed: 0,Date/Time,Lat,Lon,Base
0,9/1/2014 0:01:00,40.2201,-74.0021,B02512
1,9/1/2014 0:01:00,40.75,-74.0027,B02512
2,9/1/2014 0:03:00,40.7559,-73.9864,B02512
3,9/1/2014 0:06:00,40.745,-73.9889,B02512
4,9/1/2014 0:11:00,40.8145,-73.9444,B02512


In [4]:
# Convert Date/Time to datetime.
# The raw values look like "9/1/2014 0:01:00" (month/day/year), so the format
# is spelled out explicitly: this avoids relying on format inference and
# removes any day/month ambiguity.
data['Date/Time'] = pd.to_datetime(data['Date/Time'], format='%m/%d/%Y %H:%M:%S')

# Remove exact duplicate rows
data = data.drop_duplicates()

# Remove invalid coordinates (optional): a row is dropped when either its
# latitude or its longitude is exactly 0.
# `.copy()` makes the filtered frame independent of the unfiltered one, so
# later cells can add columns to `data` without SettingWithCopyWarning.
data = data[(data['Lat'] != 0) & (data['Lon'] != 0)].copy()

# Check cleaned data
data.head()


Unnamed: 0,Date/Time,Lat,Lon,Base
0,2014-09-01 00:01:00,40.2201,-74.0021,B02512
1,2014-09-01 00:01:00,40.75,-74.0027,B02512
2,2014-09-01 00:03:00,40.7559,-73.9864,B02512
3,2014-09-01 00:06:00,40.745,-73.9889,B02512
4,2014-09-01 00:11:00,40.8145,-73.9444,B02512


In [5]:
# Derive calendar features from the parsed ride timestamp
ride_time = data['Date/Time'].dt
data['hour'] = ride_time.hour
data['day_of_week'] = ride_time.dayofweek  # 0 = Monday ... 6 = Sunday
data['month'] = ride_time.month

# Show the first rows together with the new columns
data.head()


Unnamed: 0,Date/Time,Lat,Lon,Base,hour,day_of_week,month
0,2014-09-01 00:01:00,40.2201,-74.0021,B02512,0,0,9
1,2014-09-01 00:01:00,40.75,-74.0027,B02512,0,0,9
2,2014-09-01 00:03:00,40.7559,-73.9864,B02512,0,0,9
3,2014-09-01 00:06:00,40.745,-73.9889,B02512,0,0,9
4,2014-09-01 00:11:00,40.8145,-73.9444,B02512,0,0,9


In [6]:
# Count rides for every (Base, hour, day_of_week, month) combination
group_keys = ['Base', 'hour', 'day_of_week', 'month']
rides_per_group = data.groupby(group_keys).size()

# Turn the grouped counts back into a flat table with a named count column
ride_counts = rides_per_group.reset_index(name='ride_count')
ride_counts.head()


Unnamed: 0,Base,hour,day_of_week,month,ride_count
0,B02512,0,0,9,41
1,B02512,0,1,9,37
2,B02512,0,2,9,51
3,B02512,0,3,9,49
4,B02512,0,4,9,58


In [7]:
# Destination file for the aggregated ride-count features
processed_path = '../data/processed/uber_features.csv'

# Ensure the output folder exists. The folder is derived from the target path
# so the two cannot drift apart, and exist_ok=True makes the call a no-op when
# the folder is already there (no separate check-then-create step).
os.makedirs(os.path.dirname(processed_path), exist_ok=True)

# index=False keeps the positional row index out of the CSV
ride_counts.to_csv(processed_path, index=False)
print(f"✅ Features saved successfully at {processed_path}")


✅ Features saved successfully at ../data/processed/uber_features.csv
