In [None]:
# Predict the price of an Uber ride from a given pickup point to the agreed drop-off location.
# Perform the following tasks:
# 1. Pre-process the dataset.
# 2. Identify outliers.
# 3. Check the correlation.
# 4. Implement linear regression and random forest regression models.
# 5. Evaluate the models and compare their respective scores, such as R2 and RMSE.

In [None]:
from google.colab import drive
# Make Google Drive available under /content/drive so the dataset can be read.
drive.mount(mountpoint='/content/drive')


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Location of the Uber fares CSV on the mounted drive.
path = '/content/drive/MyDrive/DSBDAL/uber.csv'

# Read the raw data into the working DataFrame used by every later cell.
ds = pd.read_csv(filepath_or_buffer=path)

In [None]:
# Show the dtype of every column in the loaded frame.
ds.dtypes

In [None]:

# Summary statistics of the numeric columns; a quick way to spot implausible min/max values.
ds.describe()

In [None]:
# Count the missing values in each column before cleaning.
null_counts = ds.isna().sum()
print(null_counts)

# Only the drop-off coordinates contain a missing value (a single one each),
# so the incomplete rows are discarded rather than imputed.
ds.dropna(axis=0, how='any', inplace=True)

In [None]:
# Fares tend to follow a daily rhythm (busier, pricier peak hours in the
# morning and evening; quieter late-night and early-morning hours), so only
# the hour of the day is kept from the timestamp stored in `key`.
ride_hour = pd.to_datetime(ds["key"]).dt.hour
ds["key"] = ride_hour.astype('float32')

# The column now holds the hour, so give it a matching name.
ds = ds.rename(columns={"key": "time"})

# `pickup_datetime` carries the same information as the original `key`,
# so it is redundant and can be removed.
ds.drop(columns=["pickup_datetime"], inplace=True)

In [None]:

# Remove the `Unnamed: 0` column from the frame.
ds.drop(columns=["Unnamed: 0"], inplace=True)

In [None]:
import math

# add distance between pickup location and dropoff location
# in the ds

def euclidean_distance(lat1, lon1, lat2, lon2):
    """Return the straight-line distance between two coordinate pairs.

    Works element-wise on plain numbers, NumPy arrays and pandas Series, so
    it can be used on single rows as well as on whole columns at once.

    Args:
        lat1: Latitude of the first point.
        lon1: Longitude of the first point.
        lat2: Latitude of the second point.
        lon2: Longitude of the second point.

    Returns:
        sqrt((lat2 - lat1)**2 + (lon2 - lon1)**2), expressed in the same
        units as the inputs. With raw latitude/longitude this is a distance
        in degrees, not a ground distance in kilometres.
    """
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    # np.hypot accepts array-like input and avoids overflow/underflow that
    # squaring the differences by hand can cause.
    return np.hypot(dlat, dlon)

# Walk the four coordinate columns in lockstep and compute one distance per
# ride; the resulting list is stored as the new `distance` column.
coordinate_rows = zip(
    ds['pickup_latitude'],
    ds['pickup_longitude'],
    ds['dropoff_latitude'],
    ds['dropoff_longitude'],
)
ds['distance'] = [
    euclidean_distance(lat1, lon1, lat2, lon2)
    for lat1, lon1, lat2, lon2 in coordinate_rows
]

In [None]:
# List the columns that remain after the pre-processing steps above.
ds.columns

In [None]:
# Plot fare against passenger count to expose implausible rider counts,
# then keep only the trips with fewer than 100 passengers.
sns.scatterplot(data=ds, x="passenger_count", y="fare_amount")
plausible_riders = ds["passenger_count"] < 100
ds = ds.loc[plausible_riders]

In [None]:
# BEFORE: distribution of fares prior to outlier removal.
# The working frame is `ds`; the previous reference to `df` was undefined
# and raised a NameError.
plt.figure(figsize=(10, 6))
sns.boxplot(ds['fare_amount'])
plt.title("Fare amount before outlier removal")
plt.show()

In [None]:
# Drop fare outliers with the 1.5 * IQR rule.
fare = ds['fare_amount']
q1, q3 = fare.quantile(0.25), fare.quantile(0.75)
iqr = q3 - q1

# Fences beyond which a fare is treated as an outlier.
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr

# Both fences are exclusive: a fare equal to a fence is removed as well.
ds = ds[fare.gt(lower) & fare.lt(upper)]


In [None]:
# AFTER: distribution of fares once the IQR filter has been applied.
# The working frame is `ds`; the previous reference to `df` was undefined
# and raised a NameError.
plt.figure(figsize=(10, 6))
sns.boxplot(ds['fare_amount'])
plt.title("Fare amount after outlier removal")
plt.show()

In [None]:
import matplotlib.pyplot as plt
# Pairwise Pearson correlation of all columns, drawn as an annotated heatmap.
correlation_matrix = ds.corr(method='pearson')
sns.heatmap(data=correlation_matrix, annot=True)
plt.show()
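# Correlation coefficients range from -1 to 1:
#  1 is a perfect positive correlation: when one variable increases, the other also increases
# -1 is a perfect negative correlation: when one variable increases, the other decreases
#  0 means no correlation: one variable cannot be used to predict the other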


##distance will mostly impact the target variable fare amount


In [None]:

from sklearn.model_selection import train_test_split

# Separate the target (`fare_amount`) from the feature columns, then hold
# out 20% of the rows for testing; the fixed seed keeps the split repeatable.
y = ds['fare_amount']
X = ds.drop(columns=['fare_amount'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:

from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split.
model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows; `y_pred` is read by the evaluation cell.
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import root_mean_squared_error

# Score the current predictions against the held-out targets.
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("R2 score: ", r2)
print("RMSE: ", rmse)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest regressor with 100 trees; the fixed seed makes the fit repeatable.
# Training is noticeably slower than the linear model (expect about two minutes).
model = RandomForestRegressor(random_state=101, n_estimators=100).fit(X_train, y_train)

# Predictions for the held-out rows; `y_pred` is read by the evaluation cell.
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import root_mean_squared_error

# R2 and RMSE are both used to evaluate the performance of regression models.
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

print("R2 score: ", r2)
print("RMSE: ", rmse)

R2 score:  0.7152077887150894
RMSE:  2.223690457075081
