In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import datetime as dt

In [3]:
# Load dataset
data = pd.read_csv("uber.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [5]:
# Drop rows with missing dropoff coordinates
data = data.dropna(subset=['dropoff_longitude', 'dropoff_latitude'])

In [7]:
#Haversine distance function to calculate distance between two lat/long points
def haversine(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Radius of Earth in kilometers
    return c * r

In [9]:
# Calculate distance for each ride in the dataset
data['distance_km'] = data.apply(
    lambda row: haversine(
        row['pickup_latitude'], row['pickup_longitude'],
        row['dropoff_latitude'], row['dropoff_longitude']
    ), axis=1
)


In [10]:
# Convert pickup_datetime to datetime and extract features
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'])
data['hour'] = data['pickup_datetime'].dt.hour
data['day_of_week'] = data['pickup_datetime'].dt.dayofweek


In [13]:
# Prepare features and target variable
X = data[['distance_km', 'hour', 'day_of_week', 'passenger_count']]
y = data['fare_amount']


In [15]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Scale features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [20]:
#Train a Random Forest Regressor
model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, y_train)


In [22]:
# Predict on test set
y_pred = model.predict(X_test)

In [24]:
# Calculate performance metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 6.15888270328115


In [26]:
# Example Prediction
pickup = (-73.9614469999999, 40.693965)  # Example pickup location
dropoff = (-73.871195, 40.774297) # Example dropoff location
datetime_of_ride = dt.datetime(2024, 10, 12, 7, 4)

In [28]:
# Calculate distance and prepare other features for prediction
distance = haversine(pickup[0], pickup[1], dropoff[0], dropoff[1])
hour = datetime_of_ride.hour
day_of_week = datetime_of_ride.weekday()
passenger_count = 1 # Example passenger count

In [30]:
# Prepare input data for prediction
input_data = scaler.transform([[distance, hour, day_of_week, passenger_count]])



In [32]:
# Predict fare
predicted_fare = model.predict(input_data)
print(f"Predicted Fare: ${predicted_fare[0]:.2f}")

Predicted Fare: $23.93
