In [32]:
import pandas as pd,numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,RobustScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

In [2]:
data = pd.read_csv(r"C:\Users\sreya\Downloads\uber_rides_data.xlsx - sample_train.csv")

In [3]:
data

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1
2,44984355,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1
3,25894730,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5
...,...,...,...,...,...,...,...,...
199995,42598914,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1
199996,16382965,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1
199997,27804658,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2
199998,20259894,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695416,1


## What is the shape of given dataset?


In [4]:
data.shape

(200000, 8)

## How many integer columns(by default) are given in the dataset?


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ride_id            200000 non-null  int64  
 1   fare_amount        200000 non-null  float64
 2   pickup_datetime    200000 non-null  object 
 3   pickup_longitude   200000 non-null  float64
 4   pickup_latitude    200000 non-null  float64
 5   dropoff_longitude  199999 non-null  float64
 6   dropoff_latitude   199999 non-null  float64
 7   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(1)
memory usage: 12.2+ MB


- Here we can see we have 2 integer columns by default

## How many missing values exists in 'dropoff_longitude' column?


In [6]:
data['dropoff_longitude'].isnull().sum()

1

## What is the data type of ' pickup_datetime' feature in your data?


In [7]:
data['pickup_datetime'].dtype

dtype('O')

## Which of the following is the correct syntax to convert 'pickup_datetime' to datetime datatype?

In [8]:
# we have 2 ways to convert into date time

#1
data['pickup_datetime'] = data['pickup_datetime'].astype('datetime64[ns]')

#2
#pd.to_datetime(df['pickup_datetime'])

In [9]:
## Which function can be used to remove null values from the dataframe?

In [10]:
data.dropna()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,7.5,2015-05-07 19:52:06,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,7.7,2009-07-17 20:04:56,-73.994355,40.728225,-73.994710,40.750325,1
2,44984355,12.9,2009-08-24 21:45:00,-74.005043,40.740770,-73.962565,40.772647,1
3,25894730,5.3,2009-06-26 08:22:21,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,16.0,2014-08-28 17:47:00,-73.925023,40.744085,-73.973082,40.761247,5
...,...,...,...,...,...,...,...,...
199995,42598914,3.0,2012-10-28 10:49:00,-73.987042,40.739367,-73.986525,40.740297,1
199996,16382965,7.5,2014-03-14 01:09:00,-73.984722,40.736837,-74.006672,40.739620,1
199997,27804658,30.9,2009-06-29 00:42:00,-73.986017,40.756487,-73.858957,40.692588,2
199998,20259894,14.5,2015-05-20 14:56:25,-73.997124,40.725452,-73.983215,40.695416,1


## What is the average fare amount?

In [11]:
data['fare_amount'].mean()

11.359955250000626

## Function to calculate Harversine formula 

In [12]:
# Function to calculate Haversine distance
def haversine(lat1, lon1, lat2, lon2):
    # Radius of the Earth in kilometers
    R = 6371

    # Convert latitude and longitude from degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = R * c

    return distance

# Calculate Haversine distances for each record
data['haversine_distance'] = data.apply(lambda row: haversine(row['pickup_latitude'], row['pickup_longitude'], 
                                                             row['dropoff_latitude'], row['dropoff_longitude']), axis=1)

In [13]:
data

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,haversine_distance
0,24238194,7.5,2015-05-07 19:52:06,-73.999817,40.738354,-73.999512,40.723217,1,1.683323
1,27835199,7.7,2009-07-17 20:04:56,-73.994355,40.728225,-73.994710,40.750325,1,2.457590
2,44984355,12.9,2009-08-24 21:45:00,-74.005043,40.740770,-73.962565,40.772647,1,5.036377
3,25894730,5.3,2009-06-26 08:22:21,-73.976124,40.790844,-73.965316,40.803349,3,1.661683
4,17610152,16.0,2014-08-28 17:47:00,-73.925023,40.744085,-73.973082,40.761247,5,4.475450
...,...,...,...,...,...,...,...,...,...
199995,42598914,3.0,2012-10-28 10:49:00,-73.987042,40.739367,-73.986525,40.740297,1,0.112210
199996,16382965,7.5,2014-03-14 01:09:00,-73.984722,40.736837,-74.006672,40.739620,1,1.875050
199997,27804658,30.9,2009-06-29 00:42:00,-73.986017,40.756487,-73.858957,40.692588,2,12.850319
199998,20259894,14.5,2015-05-20 14:56:25,-73.997124,40.725452,-73.983215,40.695416,1,3.539715


## Calculate distance between each pickup and dropoff points using Haversine formula. What is the median haversine distance between pickup and dropoff location according to the given dataset?

In [14]:
# Calculate the median Haversine distance
median_distance = data['haversine_distance'].median()
print("Median Haversine Distance:", median_distance,"kilometers")


Median Haversine Distance: 2.1209923961833708 kilometers


## What is the maximum haversine distance between pickup and dropoff location according to the given dataset?

In [15]:
# Calculate the maximum Haversine distance
maximum_distance = data['haversine_distance'].max()
print("Maximum Haversine Distance:", maximum_distance,"kilometers")


Maximum Haversine Distance: 16409.239135313168 kilometers


## How many rides have 0.0 haversine distance between pickup and dropoff location according to the given dataset?

In [16]:
distances = data['haversine_distance']

zero_distance = 0
for i in distances:
    if i == 0.0:
        zero_distance = zero_distance+1
print(zero_distance)

5632


## What is the mean 'fare_amount' for rides with 0 haversine distance?*Do you sense something fishy? Try to analyze, and give your expert opinion in Jupyter Notebook.

In [17]:
mean_fare_for_zero_distance = data[data['haversine_distance'] == 0]['fare_amount'].mean()
print("Mean 'fare_amount' for rides with 0 Haversine distance:", mean_fare_for_zero_distance)


Mean 'fare_amount' for rides with 0 Haversine distance: 11.585317826704578


In [18]:
# In my opinion these are the following thing which can happen for a 0 harvesian distance but have fare_amount.

*Data Entry Errors:*
It's possible that some records in the dataset contain data entry errors, where the pickup and dropoff locations are recorded as the same coordinates but should not be. These errors could be due to manual entry mistakes or system glitches.

Short or Invalid Rides: 
Rides with a Haversine distance of 0 might represent very short trips that are not accurately reflected in the 'fare_amount.' In some cases, these might be valid rides (e.g., walking distance), but the fare might be recorded incorrectly.

Fraud or Manipulation: 
In some cases, individuals might manipulate the system to generate fake or fraudulent rides with a Haversine distance of 0 to exploit fare calculations.

Data Collection Issues: 
The GPS data used to calculate Haversine distances might have inaccuracies, especially when GPS signals are weak or disrupted. This could lead to incorrect distances.

## What is the maximum 'fare_amount' for a ride?


In [19]:
print(' maximum fare_amount for a ride :',data['fare_amount'].max())

 maximum fare_amount for a ride : 499.0


## What is the haversine distance between pickup and dropoff location for the costliest ride?Do you sense something fishy? Try to analyze, and give your expert opinion in Jupyter Notebook.

In [20]:
ditance_of_costliest_fare = data[data['fare_amount'] == data['fare_amount'].max() ]['haversine_distance']
print("Haversine distance for costliest fare :", ditance_of_costliest_fare)

Haversine distance for costliest fare : 170081    0.00079
Name: haversine_distance, dtype: float64


Distance Validity: 
Check if the calculated Haversine distance for the costliest ride is within a reasonable range for your dataset and geographic area. Extremely long or short distances may indicate data quality issues.

Fare Amount vs. Distance: 
Compare the calculated distance with the 'fare_amount' for this ride. If the fare amount is significantly higher or lower than what you would expect for the calculated distance, it may indicate a fare miscalculation or data error.

Data Anomalies:
Investigate the entire record for the costliest ride to see if there are any other anomalies or outliers, such as unrealistic pickup/dropoff locations, timestamps, or passenger counts.

Geographic Context: 
Consider the geographic context of your dataset. Does the pickup and dropoff location make sense in the context of your area of study? If not, it might indicate a data error or fraud.

## How many rides were recorded in the year 2014?


In [21]:
# Filter the records for the year 2014
rides_in_2014 = data[data['pickup_datetime'].dt.year == 2014]

# Get the count of rides in 2014
num_rides_in_2014 = len(rides_in_2014)

print("Number of rides recorded in the year 2014:", num_rides_in_2014)

Number of rides recorded in the year 2014: 29968


## How many rides were recorded in the first quarter of 2014?


In [22]:
# Filter the records for the first quarter of 2014 (January 1, 2014, to March 31, 2014)
rides_in_first_quarter_2014 = data[
    (data['pickup_datetime'].dt.year == 2014) & 
    (data['pickup_datetime'].dt.month >= 1) & 
    (data['pickup_datetime'].dt.month <= 3)
]

# Get the count of rides in the first quarter of 2014
num_rides_in_first_quarter_2014 = len(rides_in_first_quarter_2014)

print("Number of rides recorded in the first quarter of 2014:", num_rides_in_first_quarter_2014)

Number of rides recorded in the first quarter of 2014: 7687


## On which day of the week in September 2010, maximum rides were recorded ?

In [23]:
# Filter the records for September 2010
rides_in_september_2010 = data[
    (data['pickup_datetime'].dt.year == 2010) &
    (data['pickup_datetime'].dt.month == 9)
]

# Extract the day of the week for each ride's pickup date (0 = Monday, 6 = Sunday)
rides_in_september_2010['day_of_week'] = rides_in_september_2010['pickup_datetime'].dt.dayofweek

# Count the occurrences of each day of the week
day_counts = rides_in_september_2010['day_of_week'].value_counts()

# Find the day of the week with the maximum rides
max_rides_day = day_counts.idxmax()

# Convert the numeric day of the week to a string representation
days_of_week = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
max_rides_day_name = days_of_week[max_rides_day]

print("On which day of the week in September 2010 the maximum rides were recorded:", max_rides_day_name)


On which day of the week in September 2010 the maximum rides were recorded: Thursday


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rides_in_september_2010['day_of_week'] = rides_in_september_2010['pickup_datetime'].dt.dayofweek


## Apply a Machine Learning Algorithm to predict the fare amount given following input features:passenger_count, distance and ride_week_day.Perform a 70-30 split of data.Which algorithm gives the least adjusted R square value?

In [24]:
# Create a new column 'day_of_week' that contains the day of the week
data['day_of_week'] = data['pickup_datetime'].dt.strftime('%A')
data.head()

Unnamed: 0,ride_id,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,haversine_distance,day_of_week
0,24238194,7.5,2015-05-07 19:52:06,-73.999817,40.738354,-73.999512,40.723217,1,1.683323,Thursday
1,27835199,7.7,2009-07-17 20:04:56,-73.994355,40.728225,-73.99471,40.750325,1,2.45759,Friday
2,44984355,12.9,2009-08-24 21:45:00,-74.005043,40.74077,-73.962565,40.772647,1,5.036377,Monday
3,25894730,5.3,2009-06-26 08:22:21,-73.976124,40.790844,-73.965316,40.803349,3,1.661683,Friday
4,17610152,16.0,2014-08-28 17:47:00,-73.925023,40.744085,-73.973082,40.761247,5,4.47545,Thursday


In [43]:
# Split the data into features (X) and the target variable (y)
cont_col = ['passenger_count', 'haversine_distance']
nom_col = [ 'day_of_week']

x = data[cont_col+nom_col]
y = data['fare_amount']

# Perform a 70-30 split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Preprocessing the data

preprocessor = ColumnTransformer(transformers = [
            ('cont_pipeline' , Pipeline([
               ( 'cont_scale', RobustScaler())
            ]), cont_col),
            
            ('nom_pipeline', Pipeline([
                ('nom_encode',OneHotEncoder())
            ]),nom_col)
],remainder = 'passthrough')

x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test)

In [44]:
# I got an error of "U have null values or infinite values in x_train that's why I am applying this"
x_train[np.isnan(x_train)] = 0
x_train[np.isinf(x_train)] = 1e9
x_train[np.isneginf(x_train)] = -1e9

In [45]:
# Initialize and train regression models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest Regression': RandomForestRegressor(),
    'KNN Regressor': KNeighborsRegressor(),
    'Decission Tree Regressor':DecisionTreeRegressor()
}

adjusted_r2_values = {}

for model_name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    
    r2 = r2_score(y_test, y_pred)
    n = len(y_test)
    p = x_test.shape[1]
    
    adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))
    adjusted_r2_values[model_name] = adjusted_r2

# Find the algorithm with the least adjusted R-squared value
least_adjusted_r2_algorithm = min(adjusted_r2_values, key=adjusted_r2_values.get)

print("Adjusted R-squared values:")
for model_name, adjusted_r2 in adjusted_r2_values.items():
    print(f"{model_name}: {adjusted_r2}")

print("Algorithm with the least adjusted R-squared value:", least_adjusted_r2_algorithm)


Adjusted R-squared values:
Linear Regression: 0.0010100757494996282
Random Forest Regression: 0.6196186717379977
KNN Regressor: 0.6323137478596443
Decission Tree Regressor: 0.46808411330126776
Algorithm with the least adjusted R-squared value: Linear Regression
