In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

df = pd.read_csv("uber.csv")

# Select and clean data
df = df[['fare_amount','pickup_datetime','pickup_longitude','pickup_latitude',
         'dropoff_longitude','dropoff_latitude','passenger_count']].dropna()
df = df[(df['fare_amount']>0) & (df['passenger_count']>0)]

df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['hour'] = df['pickup_datetime'].dt.hour
df['day'] = df['pickup_datetime'].dt.day
df['month'] = df['pickup_datetime'].dt.month

# Distance
def dist(lat1,lon1,lat2,lon2):
    lat1,lon1,lat2,lon2 = map(np.radians,[lat1,lon1,lat2,lon2])
    return 6371*2*np.arcsin(np.sqrt(np.sin((lat2-lat1)/2)**2 +
             np.cos(lat1)*np.cos(lat2)*np.sin((lon2-lon1)/2)**2))

df['distance'] = dist(df.pickup_latitude,df.pickup_longitude,
                      df.dropoff_latitude,df.dropoff_longitude)

X = df[['passenger_count','hour','day','month','distance']]
y = df['fare_amount']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

# Models
lr = LinearRegression().fit(X_train,y_train)
rf = RandomForestRegressor().fit(X_train,y_train)

# Evaluate
def score(model,name):
    pred = model.predict(X_test)
    print(f"\n{name} → R2:",round(r2_score(y_test,pred),3),
          "RMSE:",round(np.sqrt(mean_squared_error(y_test,pred)),3))

score(lr,"Linear Regression")
score(rf,"Random Forest")



Linear Regression → R2: 0.001 RMSE: 10.133

Random Forest → R2: 0.677 RMSE: 5.76
