### Loading Important Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Loading the dataset

In [2]:
data = pd.read_csv(r"C:\Users\nsaha\Downloads\uber_rides_data.xlsx - sample_train.csv")

### Reading the data

In [3]:
data.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


### What is the shape of given dataset?

In [4]:
data.shape

(200000, 8)

### How many integer columns(by default) are given in the dataset?

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ride_id            200000 non-null  int64  
 1   fare_amount        200000 non-null  float64
 2   pickup_datetime    200000 non-null  object 
 3   pickup_longitude   200000 non-null  float64
 4   pickup_latitude    200000 non-null  float64
 5   dropoff_longitude  199999 non-null  float64
 6   dropoff_latitude   199999 non-null  float64
 7   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 12.2+ MB


### How many missing values exists in 'dropoff_longitude' column?

In [6]:
data.isnull().sum()

ride_id              0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

In [7]:
data['dropoff_longitude'].isnull().sum()

1

### What is the data type of ' pickup_datetime' feature in your data?

In [8]:
column_data_type = data['pickup_datetime'].dtype
print(column_data_type)

object


### Which function can be used to remove null values from the dataframe?

### We cann't drop remove column here because we need column to perform so we can simply fill the value with 0

In [9]:
data['dropoff_longitude'].fillna(0, inplace=True)
data['dropoff_latitude'].fillna(0, inplace=True)

In [10]:
data.isnull().sum()

ride_id              0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

### Which of the following is the correct syntax to convert 'pickup_datetime' to datetime datatype?

In [11]:
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'])

In [12]:
data['pickup_datetime']

0        2015-05-07 19:52:06+00:00
1        2009-07-17 20:04:56+00:00
2        2009-08-24 21:45:00+00:00
3        2009-06-26 08:22:21+00:00
4        2014-08-28 17:47:00+00:00
                    ...           
199995   2012-10-28 10:49:00+00:00
199996   2014-03-14 01:09:00+00:00
199997   2009-06-29 00:42:00+00:00
199998   2015-05-20 14:56:25+00:00
199999   2010-05-15 04:08:00+00:00
Name: pickup_datetime, Length: 200000, dtype: datetime64[ns, UTC]

In [13]:
data['year'] = data['pickup_datetime'].dt.year
data['month'] = data['pickup_datetime'].dt.month
data['day'] = data['pickup_datetime'].dt.day

### What is the average fare amount?

In [14]:
average_fare = data['fare_amount'].mean()

In [15]:
print("The average fare amount is: ",average_fare)

The average fare amount is:  11.359955250000002


### Calculate distance between each pickup and dropoff points using Haversine formula. 

#### What is the median haversine distance between pickup and dropoff location according to the given dataset?

### What is the maximum haversine distance between pickup and dropoff location according to the given dataset?

### How many rides have 0.0 haversine distance between pickup and dropoff location according to the given dataset?

In [16]:

# Function to calculate Haversine distance
def haversine_distance(lat1, lon1, lat2, lon2):
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    r = 6371  # Radius of the Earth in kilometers
    return r * c

# Calculate Haversine distance for each row
data['haversine_distance'] = data.apply(lambda row: haversine_distance(row['pickup_latitude'], row['pickup_longitude'], row['dropoff_latitude'], row['dropoff_longitude']), axis=1)

# Find the median Haversine distance
median_distance = data['haversine_distance'].median()

print("Median Haversine Distance:", median_distance, "kilometers")

# Find the maximum Haversine distance
max_distance = data['haversine_distance'].max()

print("Maximum Haversine Distance:", max_distance, "kilometers")


# Count rides with 0.0 Haversine distance
zero_distance_rides = data[data['haversine_distance'] == 0.0]
count_zero_distance_rides = zero_distance_rides.shape[0]

print("Number of rides with 0.0 Haversine distance:", count_zero_distance_rides)

Median Haversine Distance: 2.121005261503789 kilometers
Maximum Haversine Distance: 16409.239135313168 kilometers
Number of rides with 0.0 Haversine distance: 5632


### What is the maximum 'fare_amount' for a ride?

In [17]:
max_fare = data['fare_amount'].max()
max_fare

499.0

### What is the mean 'fare_amount' for rides with 0 haversine distance?

In [18]:
mean_fare_amount_zero_distance = zero_distance_rides['fare_amount'].mean()
print("Mean 'fare_amount' for rides with 0.0 Haversine distance:", mean_fare_amount_zero_distance)

Mean 'fare_amount' for rides with 0.0 Haversine distance: 11.585317826704546


### What is the haversine distance between pickup and dropoff location for the costliest ride?

In [19]:
# Find the index of the row with the maximum 'fare_amount'
costliest_ride_index = data['fare_amount'].idxmax()

In [20]:
# Get the pickup and dropoff coordinates for the costliest ride
pickup_lat = data.at[costliest_ride_index, 'pickup_latitude']
pickup_lon = data.at[costliest_ride_index, 'pickup_longitude']
dropoff_lat = data.at[costliest_ride_index, 'dropoff_latitude']
dropoff_lon = data.at[costliest_ride_index, 'dropoff_longitude']

# Calculate the Haversine distance for the costliest ride
distance = haversine_distance(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)

print("Haversine distance for the costliest ride:", distance, "kilometers")

Haversine distance for the costliest ride: 0.0007899213191009994 kilometers


In [21]:
data.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,haversine_distance
0,24238194,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,2015,5,7,1.683323
1,27835199,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1,2009,7,17,2.45759
2,44984355,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1,2009,8,24,5.036377
3,25894730,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,2009,6,26,1.661683
4,17610152,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,2014,8,28,4.47545


### How many rides were recorded in the year 2014?

In [22]:
# Count the number of rides recorded in the year 2014
rides_in_2014 = data[data['year'] == 2014].shape[0]

print("Number of rides recorded in the year 2014:", rides_in_2014)

Number of rides recorded in the year 2014: 29968


### How many rides were recorded in the first quarter of 2014?

In [23]:
# Filter the DataFrame to select rides in the first quarter of 2014 (January to March)
q1_2014_rides = data[(data['pickup_datetime'] >= '2014-01-01') & (data['pickup_datetime'] <= '2014-03-31')]

# Count the number of rides in the first quarter of 2014
count_q1_2014_rides = q1_2014_rides.shape[0]

print("Number of rides recorded in the first quarter of 2014:", count_q1_2014_rides)

Number of rides recorded in the first quarter of 2014: 7617


### On which day of the week in September 2010, maximum rides were recorded ?

In [24]:
data['ride_week_day'] = data['pickup_datetime'].dt.day_name()

In [25]:
september_rides = data[data['pickup_datetime'].dt.month == 9]

In [26]:
daily_counts = september_rides['ride_week_day'].value_counts()

In [27]:
max_day = daily_counts.idxmax()
max_rides = daily_counts[max_day]

print(f"The day in September with the maximum rides is {max_day} with {max_rides} rides.")

The day in September with the maximum rides is Thursday with 2351 rides.


### Apply a Machine Learning Algorithm to predict the fare amount given following input features:

#### passenger_count, distance and ride_week_day. Perform a 70-30 split of data

In [28]:
data.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,haversine_distance,ride_week_day
0,24238194,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,2015,5,7,1.683323,Thursday
1,27835199,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1,2009,7,17,2.45759,Friday
2,44984355,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1,2009,8,24,5.036377,Monday
3,25894730,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,2009,6,26,1.661683,Friday
4,17610152,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,2014,8,28,4.47545,Thursday


#### Convert the ride_week_day categorical variable into numerical format. One common way to do this is by using one-hot encoding.

In [29]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
data['ride_week_day'] = label_encoder.fit_transform(data['ride_week_day'])

In [30]:
data.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,haversine_distance,ride_week_day
0,24238194,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,2015,5,7,1.683323,4
1,27835199,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1,2009,7,17,2.45759,0
2,44984355,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1,2009,8,24,5.036377,1
3,25894730,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,2009,6,26,1.661683,0
4,17610152,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,2014,8,28,4.47545,4


#### Split the Data into Training and Testing Sets: 

Split your dataset into training and testing sets with a 70-30 split. 

This can be done using scikit-learn's train_test_split function.

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
X = data[['passenger_count', 'haversine_distance', 'ride_week_day']]  # Features
y = data['fare_amount']               # Target variable


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#### Choose a Machine Learning Algorithm:
##### You can choose a regression algorithm like Linear Regression, Random Forest Regressor. Here, I'll use Linear Regression.

In [34]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [35]:
# Create a linear regression model
model = LinearRegression()

In [36]:
# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

In [37]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R2): {r2}')

# We can also access the coefficients and intercept of the linear regression model
coefficients = model.coef_
intercept = model.intercept_

print(f'Coefficients: {coefficients}')
print(f'Intercept: {intercept}')

Mean Squared Error (MSE): 103.87934491593914
Root Mean Squared Error (RMSE): 10.192121708257764
R-squared (R2): 0.00072748102082143
Coefficients: [ 0.06823667  0.00063852 -0.00592088]
Intercept: 11.249892158178197
