In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Read the ride records from the CSV file under ./datasets into a DataFrame.
df = pd.read_csv("./datasets/uber_9_10.csv")

# Display the first five rows (the default for head) as a quick check of the load.
df.head(n=5)

In [None]:
# Structural overview: column names, dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly -- wrapping it in print() would append a stray "None" line.
df.info()

# Summary statistics for the numeric columns.
print(df.describe())

# Number of missing values in each column.
print(df.isnull().sum())

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna()

# Retain only rides whose pickup and dropoff coordinates lie strictly inside the
# longitude (-80, -70) / latitude (35, 45) box and whose fare is strictly between
# 0 and 100. All conditions are AND-ed into one mask, which selects the same rows
# as applying the individual filters one after another.
df = df[
    df['pickup_longitude'].gt(-80) & df['pickup_longitude'].lt(-70)
    & df['dropoff_longitude'].gt(-80) & df['dropoff_longitude'].lt(-70)
    & df['pickup_latitude'].gt(35) & df['pickup_latitude'].lt(45)
    & df['dropoff_latitude'].gt(35) & df['dropoff_latitude'].lt(45)
    & df['fare_amount'].gt(0) & df['fare_amount'].lt(100)
]
df.head()


In [None]:
# Parse the pickup timestamp; values that cannot be parsed become NaT instead of raising.
# Rows with an unparseable timestamp are then removed: the earlier dropna() ran before
# this coercion, so without this step NaT rows would carry NaN hour/day/month/year
# values into the feature matrix.
# NOTE(review): the calendar parts are taken from the timestamp as parsed; if the source
# column is in UTC they are not local time -- confirm before interpreting `hour`.
df = (
    df.assign(pickup_datetime=pd.to_datetime(df['pickup_datetime'], errors='coerce'))
      .dropna(subset=['pickup_datetime'])
      .assign(
          hour=lambda d: d['pickup_datetime'].dt.hour,
          day=lambda d: d['pickup_datetime'].dt.day,
          month=lambda d: d['pickup_datetime'].dt.month,
          year=lambda d: d['pickup_datetime'].dt.year,
      )
)

def haversine(lat1, lon1, lat2, lon2, radius_km=6371):
    """Return the great-circle distance between two points on a sphere.

    Works element-wise, so scalars, NumPy arrays and pandas Series are all accepted.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in decimal degrees.
    radius_km : float, optional
        Radius of the sphere. Defaults to 6371, the Earth radius in kilometres
        used by this notebook.

    Returns
    -------
    float or array-like
        Distance in the same unit as ``radius_km`` (kilometres by default).
    """
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    dphi = np.radians(lat2 - lat1)
    dlambda = np.radians(lon2 - lon1)
    a = np.sin(dphi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlambda / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) NaN; clamp it back.
    a = np.clip(a, 0.0, 1.0)
    return 2 * radius_km * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

# Trip length: haversine distance between the pickup and dropoff coordinates,
# computed for all rows in one call on the coordinate columns.
df['distance_km'] = haversine(
    lat1=df['pickup_latitude'],
    lon1=df['pickup_longitude'],
    lat2=df['dropoff_latitude'],
    lon2=df['dropoff_longitude'],
)

# Keep only rides with a strictly positive distance.
df = df[df['distance_km'].gt(0)]
df.head()

In [None]:
# Histogram of the fare values with a KDE overlay.
fig, ax = plt.subplots(figsize=(5, 4))
sns.histplot(df['fare_amount'], kde=True, ax=ax)
ax.set_title("Distribution of Fare Amounts")
plt.show()

# Fare plotted against trip distance.
fig, ax = plt.subplots(figsize=(5, 4))
sns.scatterplot(data=df, x='distance_km', y='fare_amount', ax=ax)
ax.set_title("Fare vs Distance")
plt.show()

# Fare distribution for each passenger count.
fig, ax = plt.subplots(figsize=(5, 4))
sns.boxplot(data=df, x='passenger_count', y='fare_amount', ax=ax)
ax.set_title("Passenger Count vs Fare")
plt.show()

In [None]:
# Predictors: raw coordinates, passenger count, haversine distance and the
# calendar parts of the pickup time. Target: the fare.
X = df[[
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
    'passenger_count', 'distance_km',
    'hour', 'day', 'month', 'year',
]]
y = df['fare_amount']

# Seeded 70/30 train/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Scaling parameters are learned from the training rows only and then applied
# to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline: linear regression on all standardized features.
lr = LinearRegression().fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)

baseline_r2 = r2_score(y_test, y_pred)
baseline_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("R² without PCA:", baseline_r2)
print("RMSE without PCA:", baseline_rmse)

In [None]:
# Project the standardized features onto five principal components. The
# projection is fitted on the training partition only and then applied to both.
# random_state pins the result when scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=5, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# 1-based x values so each point lines up with the number of components kept
# (plotting against the default 0-based index mislabels the axis by one).
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(6,4))
plt.plot(component_counts, cumulative_variance, marker='o')
plt.xticks(component_counts)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Variance Explained by PCA Components')
plt.grid(True)
plt.show()


print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total variance preserved:", pca.explained_variance_ratio_.sum())


In [None]:
# Same linear model, trained on the PCA-reduced features instead of the full set.
lr_pca = LinearRegression().fit(X_train_pca, y_train)
y_pred_pca = lr_pca.predict(X_test_pca)

# Score on the held-out partition with the same two metrics as the baseline.
pca_r2 = r2_score(y_test, y_pred_pca)
pca_rmse = np.sqrt(mean_squared_error(y_test, y_pred_pca))
print("R² with PCA:", pca_r2)
print("RMSE with PCA:", pca_rmse)


In [None]:
# One row per model: its label plus R² and RMSE on the test partition.
results = pd.DataFrame(
    [
        {
            "Model": label,
            "R2": r2_score(y_test, preds),
            "RMSE": np.sqrt(mean_squared_error(y_test, preds)),
        }
        for label, preds in (("Without PCA", y_pred), ("With PCA", y_pred_pca))
    ]
)

print(results)

# Side-by-side bars of the two R² scores.
plt.bar(x=results["Model"], height=results["R2"], color=['skyblue', 'orange'])
plt.ylabel("R² Score")
plt.title("Model Comparison")
plt.show()

Good afternoon, ma’am. This notebook predicts Uber fare amount using Linear Regression, and also compares performance with and without PCA. I start by importing the common libraries: pandas and numpy for data handling, matplotlib.pyplot and seaborn for charts, and from scikit-learn I bring LabelEncoder (not used later), StandardScaler for feature scaling, train_test_split to make train/test sets, PCA for dimensionality reduction, LinearRegression as the model, and the metrics r2_score and mean_squared_error to evaluate predictions. I load the dataset with pd.read_csv("./datasets/uber_9_10.csv"), quickly peek at the first rows using df.head(), and then print the dataset structure with df.info(), basic statistics with df.describe(), and a null count using df.isnull().sum(). To avoid errors from missing data, I drop any rows containing nulls using df.dropna().

Next, I perform geographic and target cleaning to remove obvious outliers. I keep only trips with pickup and dropoff longitudes between −80 and −70, and latitudes between 35 and 45 (roughly a bounding box around the study area), and I also keep only fares strictly between 0 and 100 dollars, dropping rows with zero, negative, or extreme values rather than clipping them. I convert the pickup_datetime text to a true datetime with pd.to_datetime(..., errors='coerce'), and from that timestamp I engineer useful time features: hour, day, month, and year, which often explain fare variation (rush hours, seasons, etc.).

To estimate trip distance, I define a haversine function. It assumes the Earth’s radius R = 6371 km and converts the two coordinate pairs to radians. Then it evaluates the standard haversine term a = sin²(Δφ/2) + cos φ1 · cos φ2 · sin²(Δλ/2) and returns the great-circle distance 2R · atan2(√a, √(1−a)). I apply this function to the coordinate columns in a single vectorized call to create distance_km, and I drop trips with distance ≤ 0, which are invalid. For quick exploration, I plot three visuals: a histogram of fare_amount to see its distribution, a scatterplot of distance_km vs fare_amount to confirm that fare grows with distance, and a boxplot of passenger_count vs fare_amount to check if group size influences price.

Now I assemble the feature matrix X and the target y. The features include raw coordinates for pickup and dropoff, passenger_count, the computed distance_km, and the time features hour, day, month, year. The target y is fare_amount. I split the data into training and testing sets with a 70/30 ratio using a fixed random_state=42 for reproducibility. The features are on very different scales, so I standardize them: this does not change the predictions of plain least-squares regression, but it makes the coefficients comparable and is needed for the PCA step later, because PCA is driven by variance. I fit StandardScaler on the training set and transform both train and test sets, which is the correct way to avoid data leakage.

First, I train a baseline Linear Regression on the scaled features. After fitting, I predict on the test set and report two metrics: R² (how much variance is explained by the model) and RMSE (root mean squared error, a measure of average prediction error in fare units). Next, I try PCA to reduce dimensionality and possibly denoise the inputs. I set PCA(n_components=5), fit it on the scaled training data, and transform both train and test. I also plot the cumulative explained variance curve to show how much total variance is captured as we add components, and print the per-component explained_variance_ratio_ as well as their sum to show how much information we’ve preserved. Then I train a second Linear Regression on the PCA-reduced features, make predictions, and again compute R² and RMSE.

Finally, I gather both runs into a small results DataFrame labeled “Without PCA” and “With PCA,” print it, and draw a simple bar chart comparing their R² scores. If PCA improves R² and/or lowers RMSE, it suggests that compressing the feature space to the most informative directions helped the linear model; if not, it means the original standardized features already captured the relationships well. In summary, this workflow: loads and cleans rides data, engineers time and distance features, visualizes distributions and relationships, trains a linear model, introduces PCA to test dimensionality reduction, evaluates both approaches using R² and RMSE, and then clearly compares which setup predicts Uber fares more accurately.