In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error

In [4]:
# Load the ride records from the CSV file "Uber.csv" (path is relative to the
# notebook's working directory) into the DataFrame used by the cells below.
pred=pd.read_csv("Uber.csv")

In [8]:
# Display the dataset dimensions as a (rows, columns) tuple.
pred.shape

(200000, 8)

In [11]:
# Collect the labels of the integer-typed columns and show how many there are.
col = pred.select_dtypes(include=['int']).columns
col.size

2

In [12]:
# Element-wise missing-value mask: True wherever a cell holds NaN/None.
pred.isna()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
199995,False,False,False,False,False,False,False,False
199996,False,False,False,False,False,False,False,False
199997,False,False,False,False,False,False,False,False
199998,False,False,False,False,False,False,False,False


In [14]:
# Count how many entries of the dropoff longitude column are missing.
miss = pred['dropoff_longitude'].isna().sum()
miss

1

In [17]:
# Look up the storage dtype of the pickup timestamp column and print it.
dtyp = pred.dtypes['pickup_datetime']
print(dtyp)

object


In [18]:
# Discard rides that have no recorded fare, then report the average fare.
pred = pred.dropna(subset=['fare_amount'])
avgf = pred['fare_amount'].mean()
print(avgf)

11.359955250000626


In [26]:
#function to calculate Haversine distance
def haversine(lat1, lon1, lat2, lon2, r=6371):
    """Return the great-circle (Haversine) distance between two points.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in decimal degrees.
        All four arguments must have the same shape (all scalars, or
        equally sized arrays), because they are stacked before conversion.
    r : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6371, the mean radius of the Earth in kilometers.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Distance(s) along the surface of the sphere, element-wise for
        array inputs. NaN inputs yield NaN.
    """
    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = np.radians([lat1, lon1, lat2, lon2])

    #haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2

    # Mathematically `a` always lies in [0, 1], but floating-point rounding can
    # push it marginally outside (e.g. for near-antipodal points), which would
    # turn sqrt/arcsin into NaN. Clipping removes only that rounding overshoot;
    # genuine NaN inputs still propagate as NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))

    #scale the central angle by the sphere radius to get the distance
    return c * r

#calculate the Haversine distance for every ride and store it in a new column.
#The column is named 'haversine_distance' because that is the name the later
#cells read; the whole column is computed in one vectorized call instead of a
#row-by-row apply, which is much faster on a dataset of this size.
pred['haversine_distance'] = haversine(
    pred['pickup_latitude'].to_numpy(),
    pred['pickup_longitude'].to_numpy(),
    pred['dropoff_latitude'].to_numpy(),
    pred['dropoff_longitude'].to_numpy(),
)

#calculate the median Haversine distance (NaN distances are skipped by median)
mhd = pred['haversine_distance'].median()

print("Median Haversine distance between pickup and dropoff locations: " + str(mhd) + " kilometers")


Median Haversine distance between pickup and dropoff locations: 2.1209923961833708 kilometers


In [27]:
# Largest value stored in the 'haversine_distance' column.
# NOTE(review): this column must already exist in `pred` when this cell runs;
# confirm that the cell computing the distances writes it under this exact name.
max_distance = pred['haversine_distance'].max()

# Print the maximum Haversine distance
print("Maximum Haversine Distance:", max_distance)

Maximum Haversine Distance: 16409.239135313164


In [29]:
# Select the rides whose stored Haversine distance is exactly 0.0
zero_distance_rides = pred.loc[pred['haversine_distance'].eq(0.0)]

# Average fare charged on those zero-distance rides
mfzero = zero_distance_rides['fare_amount'].mean()

# Report the result
print("Mean 'fare_amount' for rides with 0.0 Haversine Distance:", mfzero)

Mean 'fare_amount' for rides with 0.0 Haversine Distance: 11.585317826704578


In [30]:
# Highest fare recorded for any single ride in the dataset.
mfa = pred['fare_amount'].max()

# Print the maximum 'fare_amount'
print("Maximum 'fare_amount' for a ride:", mfa)

Maximum 'fare_amount' for a ride: 499.0


In [31]:
# Keep every row whose fare equals the overall maximum fare
costliest_ride = pred.loc[pred['fare_amount'].eq(pred['fare_amount'].max())]

# Feed the first such row's coordinates to haversine(), in the argument order
# it expects: pickup lat, pickup lon, dropoff lat, dropoff lon
costliest_ride_distance = haversine(
    *(
        costliest_ride[name].iat[0]
        for name in (
            'pickup_latitude',
            'pickup_longitude',
            'dropoff_latitude',
            'dropoff_longitude',
        )
    )
)

# Report the distance covered by the costliest ride
print("Haversine Distance for the Costliest Ride:", costliest_ride_distance)

Haversine Distance for the Costliest Ride: 0.0007899213191009994


In [32]:

# Parse the pickup timestamps into pandas datetimes
pred['pickup_datetime'] = pd.to_datetime(pred['pickup_datetime'])

# Derive a calendar-year column from the parsed timestamps
pred['pickup_year'] = pred['pickup_datetime'].dt.year

# Keep only the rides picked up in 2014 and count them
rides_in_2014 = pred.loc[pred['pickup_year'].eq(2014)]
num_rides_2014 = rides_in_2014.shape[0]

print("Number of rides recorded in the year 2014: " + str(num_rides_2014))


Number of rides recorded in the year 2014: 29968


In [33]:
# Ensure the pickup timestamps are datetimes so the .dt accessor is available
pred['pickup_datetime'] = pd.to_datetime(pred['pickup_datetime'])

# Rides picked up in the first quarter of 2014
rides_first_quarter_2014 = pred.loc[
    pred['pickup_datetime'].dt.year.eq(2014)
    & pred['pickup_datetime'].dt.quarter.eq(1)
]

# Number of rows that matched the filter
num_rides_first_quarter_2014 = rides_first_quarter_2014.shape[0]

print("Number of rides recorded in the first quarter of 2014: " + str(num_rides_first_quarter_2014))

Number of rides recorded in the first quarter of 2014: 7687


In [34]:
# Rides picked up during September 2010
rides_september_2010 = pred.loc[
    pred['pickup_datetime'].dt.year.eq(2010)
    & pred['pickup_datetime'].dt.month.eq(9)
]

# Tally the rides per weekday name
day_of_week_counts = (
    rides_september_2010['pickup_datetime']
    .dt.day_name()
    .value_counts()
)

# Weekday with the highest tally, and that tally
max_rides_day = day_of_week_counts.idxmax()
max_rides_count = day_of_week_counts.max()

print("On " + max_rides_day + " of September 2010, the maximum rides were recorded with a count of " + str(max_rides_count) + " rides.")


On Thursday of September 2010, the maximum rides were recorded with a count of 457 rides.


In [40]:
#working with null values
#numeric columns are imputed with their mean, all other columns with their most
#frequent value
numeric_columns = pred.select_dtypes(include=['number']).columns
categorical_columns = pred.select_dtypes(exclude=['number']).columns

#Assign the filled column back instead of calling fillna(inplace=True) on
#pred[column]: the inplace form operates on a chained selection, which raises a
#FutureWarning on pandas 2.x and leaves `pred` unchanged under Copy-on-Write.
for column in numeric_columns:
    pred[column] = pred[column].fillna(pred[column].mean())

for column in categorical_columns:
    modes = pred[column].mode()
    #mode() is empty when the column holds no non-null value; nothing to fill with
    if not modes.empty:
        pred[column] = pred[column].fillna(modes.iloc[0])

In [38]:
# Per-column count of values that are still missing
pred.isna().sum()

ride_id               0
fare_amount           0
pickup_datetime       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
passenger_count       0
haversine_distance    0
pickup_year           0
dtype: int64