In [11]:
!pip install pandas numpy scikit-learn



In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

# Load dataset (kept untouched in `uber_raw` so this cell can be re-run safely)
uber_raw = pd.read_csv('uber.csv')

# Preprocessing
# Built as a single non-mutating chain: every step returns a new frame, so no
# column is ever assigned onto a filtered slice (avoids SettingWithCopyWarning).
uber = (
    uber_raw
    .dropna()
    .loc[lambda df: df['fare_amount'] > 0]
    .assign(
        pickup_datetime=lambda df: pd.to_datetime(df['pickup_datetime'], errors='coerce')
    )
    # errors='coerce' turns unparseable timestamps into NaT *after* the first
    # dropna; drop them here so `hour` never contains NaN when the models are fit.
    .dropna(subset=['pickup_datetime'])
    .assign(
        hour=lambda df: df['pickup_datetime'].dt.hour,
        day=lambda df: df['pickup_datetime'].dt.dayofweek,
        # Straight-line (Euclidean) distance computed directly on the coordinate
        # columns, i.e. in coordinate units (presumably degrees), not kilometres.
        distance=lambda df: np.sqrt(
            (df['dropoff_longitude'] - df['pickup_longitude'])**2 +
            (df['dropoff_latitude'] - df['pickup_latitude'])**2
        ),
    )
)

# Outlier removal
# The distance threshold is in the same coordinate units as `distance` above.
uber = uber[(uber['fare_amount'] < 100) & (uber['distance'] < 5)]

# Correlation
print("Correlation Matrix:\n", uber[['fare_amount', 'distance', 'passenger_count']].corr())

# Train-test split
X = uber[['distance', 'passenger_count', 'hour']]
y = uber['fare_amount']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression
# The scaler is fit on the training split only and then applied to the test
# split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

# Random Forest
# Fit on the unscaled features.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Evaluation
# Metrics are the outer loop and models the inner loop, so lines are printed
# grouped by metric (R2, RMSE, MAE), each for both models.
predictions = [
    ("Linear Regression", y_pred_lr),
    ("Random Forest", y_pred_rf),
]
metrics = [
    ("R2", r2_score),
    ("RMSE", lambda actual, predicted: np.sqrt(mean_squared_error(actual, predicted))),
    ("MAE", mean_absolute_error),
]

print("\nModel Evaluation Results:")
for metric_name, metric_fn in metrics:
    for model_name, y_pred in predictions:
        # Pad the model name to 17 characters so the arrows line up.
        print(f"{model_name:<17} → {metric_name}:", round(metric_fn(y_test, y_pred), 4))


Correlation Matrix:
                  fare_amount  distance  passenger_count
fare_amount         1.000000  0.758895         0.012600
distance            0.758895  1.000000         0.005031
passenger_count     0.012600  0.005031         1.000000

Model Evaluation Results:
Linear Regression → R2: 0.6049
Random Forest     → R2: 0.7465
Linear Regression → RMSE: 5.9147
Random Forest     → RMSE: 4.7374
Linear Regression → MAE: 2.9632
Random Forest     → MAE: 2.5784
