In [1]:
import pandas as pd

df = pd.read_excel('uber_rides_data.xlsx')

df.head()


Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [2]:
shape = df.shape

print("Number of rows:", shape[0])
print("Number of columns:", shape[1])

Number of rows: 200000
Number of columns: 8


In [3]:

integer_columns = df.select_dtypes(include=['int64'])

num_integer_columns = len(integer_columns.columns)

print("Number of integer columns:", num_integer_columns)

Number of integer columns: 2


In [4]:
missing_values = df['dropoff_longitude'].isna().sum()

print("Number of missing values in 'dropoff_longitude' column:", missing_values)

Number of missing values in 'dropoff_longitude' column: 1


In [5]:
pickup_datetime_dtype = df['pickup_datetime'].dtype

print("Data type of 'pickup_datetime' feature:", pickup_datetime_dtype)

Data type of 'pickup_datetime' feature: object


In [6]:
df.dropna(subset=['fare_amount'], inplace=True)

In [8]:
average_fare_amount = df['fare_amount'].mean()

In [9]:
print("Average fare amount:", average_fare_amount)

Average fare amount: 11.359955250000002


In [10]:
import numpy as np

In [11]:
def haversine(lat1, lon1, lat2, lon2):
  
    R = 6371.0

    lat1_rad = np.radians(lat1)
    lon1_rad = np.radians(lon1)
    lat2_rad = np.radians(lat2)
    lon2_rad = np.radians(lon2)

    
    dlon = lon2_rad - lon1_rad
    dlat = lat2_rad - lat1_rad

    
    a = np.sin(dlat / 2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    
    distance = R * c
    return distance

In [12]:
df['distance_km'] = df.apply(lambda row: haversine(
    row['pickup_latitude'], row['pickup_longitude'],
    row['dropoff_latitude'], row['dropoff_longitude']
), axis=1)

In [13]:
print(df[['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude', 'distance_km']])

        pickup_latitude  pickup_longitude  dropoff_latitude  \
0             40.738354        -73.999817         40.723217   
1             40.728225        -73.994355         40.750325   
2             40.740770        -74.005043         40.772647   
3             40.790844        -73.976124         40.803349   
4             40.744085        -73.925023         40.761247   
...                 ...               ...               ...   
199995        40.739367        -73.987042         40.740297   
199996        40.736837        -73.984722         40.739620   
199997        40.756487        -73.986017         40.692588   
199998        40.725452        -73.997124         40.695415   
199999        40.720077        -73.984395         40.768793   

        dropoff_longitude  distance_km  
0              -73.999512     1.683323  
1              -73.994710     2.457590  
2              -73.962565     5.036377  
3              -73.965316     1.661683  
4              -73.973082     4.475450

In [14]:
median_distance = df['distance_km'].median()


print("Median Haversine distance between pickup and dropoff locations:", median_distance, "km")

Median Haversine distance between pickup and dropoff locations: 2.120992396182902 km


In [15]:
max_distance = df['distance_km'].max()

print("Maximum Haversine distance between pickup and dropoff locations:", max_distance, "km")

Maximum Haversine distance between pickup and dropoff locations: 16409.23913531317 km


In [16]:
zero_distance_rides = df[df['distance_km'] == 0.0]

num_zero_distance_rides = len(zero_distance_rides)

print("Number of rides with 0.0 Haversine distance between pickup and dropoff locations:", num_zero_distance_rides)

Number of rides with 0.0 Haversine distance between pickup and dropoff locations: 5632


In [17]:

zero_distance_rides = df[df['distance_km'] == 0.0]


mean_fare_amount_zero_distance = zero_distance_rides['fare_amount'].mean()

print("Mean 'fare_amount' for rides with 0.0 Haversine distance:", mean_fare_amount_zero_distance)

Mean 'fare_amount' for rides with 0.0 Haversine distance: 11.585317826704546


In [18]:
max_fare_amount = df['fare_amount'].max()

print("Maximum 'fare_amount' for a ride:", max_fare_amount)

Maximum 'fare_amount' for a ride: 499.0


In [19]:
costliest_ride = df[df['fare_amount'] == df['fare_amount'].max()]

pickup_lat = costliest_ride['pickup_latitude'].values[0]
pickup_lon = costliest_ride['pickup_longitude'].values[0]
dropoff_lat = costliest_ride['dropoff_latitude'].values[0]
dropoff_lon = costliest_ride['dropoff_longitude'].values[0]


def haversine(lat1, lon1, lat2, lon2):
    
    R = 6371.0

    lat1_rad = np.radians(lat1)
    lon1_rad = np.radians(lon1)
    lat2_rad = np.radians(lat2)
    lon2_rad = np.radians(lon2)

    dlon = lon2_rad - lon1_rad
    dlat = lat2_rad - lat1_rad

    a = np.sin(dlat / 2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    
    distance = R * c
    return distance

costliest_distance = haversine(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)

print("Haversine distance for the costliest ride:", costliest_distance, "km")

Haversine distance for the costliest ride: 0.0007899213191009994 km


In [22]:

df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

df['pickup_year'] = df['pickup_datetime'].dt.year

rides_2014 = df[df['pickup_year'] == 2014]

num_rides_2014 = len(rides_2014)

print("Number of rides recorded in the year 2014:", num_rides_2014)

Number of rides recorded in the year 2014: 29968


In [23]:
rides_first_quarter_2014 = df[(df['pickup_year'] == 2014) & (df['pickup_datetime'].dt.quarter == 1)]

num_rides_first_quarter_2014 = len(rides_first_quarter_2014)

print("Number of rides recorded in the first quarter of 2014:", num_rides_first_quarter_2014)

Number of rides recorded in the first quarter of 2014: 7687


In [25]:
df['pickup_month'] = df['pickup_datetime'].dt.month

rides_september_2010 = df[(df['pickup_year'] == 2010) & (df['pickup_month'] == 9)]

rides_september_2010['day_of_week'] = rides_september_2010['pickup_datetime'].dt.dayofweek

rides_by_day = rides_september_2010['day_of_week'].value_counts()

day_with_max_rides = rides_by_day.idxmax()


day_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
day_with_max_rides_name = day_names[day_with_max_rides]

print("Day of the week with maximum rides in September 2010:", day_with_max_rides_name)

Day of the week with maximum rides in September 2010: Thursday


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rides_september_2010['day_of_week'] = rides_september_2010['pickup_datetime'].dt.dayofweek


In [27]:

df['ride_week_day'] = df['pickup_datetime'].dt.dayofweek

In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score



X = df[['passenger_count', 'distance_km', 'ride_week_day']]
y = df['fare_amount']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [47]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')


X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

model = LinearRegression()
model.fit(X_train_imputed, y_train)

y_pred = model.predict(X_test_imputed)

In [48]:
from sklearn.metrics import r2_score

r2 = r2_score(y_test, y_pred)

print("R-squared (R2) Score:", r2)

R-squared (R2) Score: 0.0007624973420021774


In [49]:
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

imputer = SimpleImputer(strategy='mean')


X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)


tree_model = DecisionTreeRegressor()
tree_model.fit(X_train_imputed, y_train)


tree_y_pred = tree_model.predict(X_test_imputed)


tree_r2 = r2_score(y_test, tree_y_pred)


print("Decision Tree R-squared (R2) Score:", tree_r2)

Decision Tree R-squared (R2) Score: 0.48080029125400736


In [50]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score


imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)


rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_imputed, y_train)


rf_y_pred = rf_model.predict(X_test_imputed)


rf_r2 = r2_score(y_test, rf_y_pred)
print("Random Forest R-squared (R2) Score:", rf_r2)


Random Forest R-squared (R2) Score: 0.6213390060564012


In [51]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')


X_train_imputed = imputer.fit_transform(X_train)


X_test_imputed = imputer.transform(X_test)

In [52]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

k = 5  
knn_classifier = KNeighborsClassifier(n_neighbors=k)
knn_classifier.fit(X_train_imputed, y_train)

ValueError: Unknown label type: 'continuous'

In [53]:
n = df.shape[0]  

In [54]:
print(n)

200000


In [55]:
k = df.shape[1]

In [56]:
print(k)

12


In [57]:
r_squared_linear = 0.0007624973420021774
r_squared_decision_tree = 0.4808560096256872
r_squared_random_forest = 0.6213390060564012

n = 200000
k = 12


adjusted_r_squared_linear = 1 - ((1 - r_squared_linear) * ((n - 1) / (n - k - 1)))


adjusted_r_squared_decision_tree = 1 - ((1 - r_squared_decision_tree) * ((n - 1) / (n - k - 1)))


adjusted_r_squared_random_forest = 1 - ((1 - r_squared_random_forest) * ((n - 1) / (n - k - 1)))


print("Adjusted R-squared for Linear Regression:", adjusted_r_squared_linear)
print("Adjusted R-squared for Decision Tree:", adjusted_r_squared_decision_tree)
print("Adjusted R-squared for Random Forest:", adjusted_r_squared_random_forest)

Adjusted R-squared for Linear Regression: 0.0007025391945629789
Adjusted R-squared for Decision Tree: 0.48082485896147154
Adjusted R-squared for Random Forest: 0.6213162849198908
