In [1]:
import pandas as pd

# Phase 1 left its cleaned rides in this CSV; read it back as the working frame.
cleaned_csv = 'uber_cleaned.csv'
df = pd.read_csv(cleaned_csv)

# Sanity check: render the first rows as the cell output.
df.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5


In [3]:
# read_csv was called without date parsing, so make sure 'pickup_datetime'
# holds real datetimes (harmless if the column is already converted).
pickup_raw = df['pickup_datetime']
df['pickup_datetime'] = pd.to_datetime(pickup_raw)

In [4]:
# Derive calendar features from the pickup timestamp via the .dt accessor.
# NOTE(review): the previewed timestamps carry a +00:00 offset, so these look
# like UTC clock values rather than local pickup time - confirm that is intended.
pickup_dt = df['pickup_datetime'].dt
calendar_parts = {
    'hour': pickup_dt.hour,              # hour of day (0-23)
    'day_of_week': pickup_dt.dayofweek,  # Monday=0, Sunday=6
    'month': pickup_dt.month,            # month (1-12)
    'year': pickup_dt.year,              # calendar year
}
for feature_name, feature_values in calendar_parts.items():
    df[feature_name] = feature_values

# Preview the source column next to the derived features.
preview_cols = ['pickup_datetime'] + list(calendar_parts)
df[preview_cols].head()


Unnamed: 0,pickup_datetime,hour,day_of_week,month,year
0,2015-05-07 19:52:06+00:00,19,3,5,2015
1,2009-07-17 20:04:56+00:00,20,4,7,2009
2,2009-08-24 21:45:00+00:00,21,0,8,2009
3,2009-06-26 08:22:21+00:00,8,4,6,2009
4,2014-08-28 17:47:00+00:00,17,3,8,2014


In [6]:
# Peak windows as inclusive (first_hour, last_hour) pairs: 7-9 AM and 5-7 PM.
MORNING_PEAK_HOURS = (7, 9)
EVENING_PEAK_HOURS = (17, 19)

# Flag peak-hour rides with vectorised comparisons instead of a per-row lambda.
# Series.between is inclusive on both ends, matching `7 <= x <= 9`, and yields
# False for missing values, so those rows get 0 exactly as the lambda produced.
in_morning_peak = df['hour'].between(*MORNING_PEAK_HOURS)
in_evening_peak = df['hour'].between(*EVENING_PEAK_HOURS)
df['is_peak'] = (in_morning_peak | in_evening_peak).astype('int64')

# Label weekdays vs. weekends (day_of_week: 5=Sat, 6=Sun).
df['is_weekend'] = df['day_of_week'].ge(5).astype('int64')

# Check counts
print("Peak vs. Off-Peak Rides:")
print(df['is_peak'].value_counts())

Peak vs. Off-Peak Rides:
is_peak
0    139844
1     60044
Name: count, dtype: int64


In [7]:
from math import radians, sin, cos, sqrt, atan2

def haversine(lon1, lat1, lon2, lat2, radius_km=6371):
    """Great-circle distance between two points given in decimal degrees.

    Args:
        lon1, lat1: Longitude and latitude of the first point, in degrees.
        lon2, lat2: Longitude and latitude of the second point, in degrees.
        radius_km: Radius of the sphere in kilometres; defaults to 6371 (Earth).

    Returns:
        Distance along the sphere's surface, in kilometres. NaN coordinates
        propagate to a NaN result.
    """
    # Convert degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Floating-point rounding can push `a` a hair above 1 for (near-)antipodal
    # points, which would make sqrt(1 - a) raise "math domain error". A plain
    # comparison is used rather than min() so that a NaN `a` is left untouched.
    if a > 1.0:
        a = 1.0
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return radius_km * c

# Add the pickup -> dropoff haversine distance, but only when every coordinate
# column is present in the frame.
# NOTE(review): the recorded describe() output shows a max above 16,000 km and
# one fewer non-null value than there are rows counted for is_peak - the
# coordinates probably still need NaN/outlier handling; confirm.
coord_cols = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
missing_coords = [name for name in coord_cols if name not in df.columns]

if missing_coords:
    print("Coordinate columns not found. Skipping distance calculation.")
else:
    def _trip_km(ride):
        """Haversine distance for one ride (a single DataFrame row)."""
        return haversine(
            ride['pickup_longitude'], ride['pickup_latitude'],
            ride['dropoff_longitude'], ride['dropoff_latitude'],
        )

    df['trip_distance_km'] = df.apply(_trip_km, axis=1)
    print(df['trip_distance_km'].describe())

count    199887.000000
mean         20.772470
std         382.094319
min           0.000000
25%           1.215511
50%           2.120992
75%           3.874109
max       16409.239135
Name: trip_distance_km, dtype: float64


In [11]:
# Example: one-hot encode the payment type into 'pay_*' dummy columns when the
# dataset has that column; otherwise say so instead of skipping silently.
if 'payment_type' in df.columns:
    df = pd.get_dummies(df, columns=['payment_type'], prefix='pay')
else:
    print("payment_type column not found. Skipping one-hot encoding.")

# A bare expression is only rendered when it is the cell's last top-level
# statement, so the preview lives outside the `if` (inside it, nothing was shown).
df.head()

In [12]:
# Persist the feature-enriched frame; index=False keeps the row index out of the file.
enhanced_csv = 'uber_enhanced.csv'
df.to_csv(enhanced_csv, index=False)
print("Enhanced dataset saved successfully!")

Enhanced dataset saved successfully!
