<a href="https://colab.research.google.com/github/Dhanush-adk/machine_learning/blob/main/project/Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics.pairwise import haversine_distances
from math import radians

In [2]:
# Read the fare-price CSV directly from the raw GitHub URL into a DataFrame.
url = 'https://raw.githubusercontent.com/Dhanush-adk/machine_learning/main/project/farepriceprediction.csv'
df = pd.read_csv(url)

# Display the first 5 rows of the DataFrame
df.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 8:22:21,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [3]:
# Remove the 'Unnamed: 0' column and preview the resulting frame.
df = df.drop(columns='Unnamed: 0')
df.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-05-07 19:52:06,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,2009-07-17 20:04:56,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,2009-08-24 21:45:00,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,2009-06-26 8:22:21,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,2014-08-28 17:47:00,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [4]:
# Show column dtypes and non-null counts so missing values stand out.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   key                200000 non-null  object 
 1   fare_amount        200000 non-null  float64
 2   pickup_datetime    200000 non-null  object 
 3   pickup_longitude   200000 non-null  float64
 4   pickup_latitude    200000 non-null  float64
 5   dropoff_longitude  199999 non-null  float64
 6   dropoff_latitude   199999 non-null  float64
 7   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 12.2+ MB


Columns 5 and 6 (`dropoff_longitude` and `dropoff_latitude`) each have 1 null value.

The rows containing null values are dropped here.

In [5]:
# Discard every row that has a missing value in any column, then re-check the counts.
df = df.dropna(axis=0, how='any')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199999 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   key                199999 non-null  object 
 1   fare_amount        199999 non-null  float64
 2   pickup_datetime    199999 non-null  object 
 3   pickup_longitude   199999 non-null  float64
 4   pickup_latitude    199999 non-null  float64
 5   dropoff_longitude  199999 non-null  float64
 6   dropoff_latitude   199999 non-null  float64
 7   passenger_count    199999 non-null  int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 13.7+ MB


In [6]:
# Summary statistics for the numeric columns, used to look for implausible values.
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,199999.0,199999.0,199999.0,199999.0,199999.0,199999.0
mean,11.359892,-72.527631,39.935881,-72.525292,39.92389,1.684543
std,9.90176,11.437815,7.720558,13.117408,6.794829,1.385995
min,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0
25%,6.0,-73.992065,40.734796,-73.991407,40.733823,1.0
50%,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0
75%,12.5,-73.967154,40.767158,-73.963659,40.768001,2.0
max,499.0,57.418457,1644.421482,1153.572603,872.697628,208.0


Based on the provided information, we can conclude that:


*   The presence of a negative minimum fare amount is unrealistic.
*  The maximum passenger count of 208 exceeds the plausible limit in the Uber system.

To address these issues, it is advisable to:


* Exclude data points where the fare amount is zero or negative.
* Eliminate data points where the passenger count exceeds 6, as Uber cannot accommodate more than six passengers.










In [7]:
# Keep rides with at most 6 passengers and a strictly positive fare.
# Both conditions are combined into one mask, and .copy() makes the result an
# independent frame so that columns added to it later do not trigger pandas'
# SettingWithCopyWarning.
# NOTE(review): rides with passenger_count == 0 still pass this filter — confirm that is intended.
df = df[(df['passenger_count'] <= 6) & (df['fare_amount'] > 0)].copy()

In [8]:
# Re-check the summary statistics after filtering on passenger_count and fare_amount.
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,199976.0,199976.0,199976.0,199976.0,199976.0,199976.0
mean,11.362584,-72.527837,39.935991,-72.526236,39.924406,1.683457
std,9.897088,11.437314,7.720481,13.115146,6.793455,1.306932
min,0.01,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0
25%,6.0,-73.992065,40.734794,-73.991407,40.733825,1.0
50%,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0
75%,12.5,-73.967155,40.767158,-73.96366,40.768001,2.0
max,499.0,57.418457,1644.421482,1153.572603,872.697628,6.0


In [9]:
def preprocess_data(data):
    """Derive calendar features from ``pickup_datetime`` and drop raw identifier columns.

    The input frame is left untouched; a new DataFrame is returned. This avoids
    mutating the caller's data and avoids assigning columns onto a frame that
    may itself be a filtered slice of another frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``key`` and ``pickup_datetime``; the latter
        must be parseable by ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        The remaining input columns followed by ``day``, ``hour``, ``weekday``,
        ``month`` and ``year``; ``key`` and ``pickup_datetime`` are removed.

    Raises
    ------
    KeyError
        If ``key`` or ``pickup_datetime`` is missing from ``data``.
    """
    # NOTE(review): the sample timestamps carry a "UTC" suffix, so hour/weekday
    # presumably reflect UTC rather than local pickup time — confirm this is wanted.
    pickup_time = pd.to_datetime(data['pickup_datetime'])

    # drop() and assign() both return new frames, so `data` is never modified.
    return data.drop(columns=['key', 'pickup_datetime']).assign(
        day=pickup_time.dt.day,
        hour=pickup_time.dt.hour,
        weekday=pickup_time.dt.weekday,  # Monday == 0
        month=pickup_time.dt.month,
        year=pickup_time.dt.year,
    )

In [10]:
def haversine_distance(row, earth_radius_km=6371.0):
    """Great-circle (haversine) distance in kilometres between pickup and drop-off.

    Works on a single row (e.g. when used with ``df.apply(..., axis=1)``) and,
    because only NumPy ufuncs are used, equally on a whole DataFrame, in which
    case the distances for all rows are computed in one vectorised pass.

    Parameters
    ----------
    row : mapping, pandas.Series or pandas.DataFrame
        Must provide ``pickup_latitude``, ``pickup_longitude``,
        ``dropoff_latitude`` and ``dropoff_longitude`` in degrees.
    earth_radius_km : float, optional
        Sphere radius used to convert the central angle to a distance.
        Defaults to 6371 km.

    Returns
    -------
    float or pandas.Series
        Distance in kilometres; a scalar for a single row, a Series for a frame.
    """
    pickup_lat = np.radians(row['pickup_latitude'])
    pickup_lon = np.radians(row['pickup_longitude'])
    dropoff_lat = np.radians(row['dropoff_latitude'])
    dropoff_lon = np.radians(row['dropoff_longitude'])

    # Haversine formula: hav = sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)
    sin_half_dlat = np.sin((dropoff_lat - pickup_lat) / 2.0)
    sin_half_dlon = np.sin((dropoff_lon - pickup_lon) / 2.0)
    hav = sin_half_dlat ** 2 + np.cos(pickup_lat) * np.cos(dropoff_lat) * sin_half_dlon ** 2

    # Rounding can push hav marginally above 1 for near-antipodal points,
    # which would make arcsin return NaN; clamp the upper bound only.
    hav = np.minimum(hav, 1.0)

    central_angle = 2.0 * np.arcsin(np.sqrt(hav))
    return central_angle * earth_radius_km

Calculate the great-circle (haversine) distance on a sphere between the pickup and drop-off points, based on their latitude and longitude values.

In [11]:
# Compute the trip distance for every row and attach it as a new 'distance' column.
# assign() returns a new frame, so no column is written onto a filtered slice
# (which would raise pandas' SettingWithCopyWarning).
df = df.assign(distance=df.apply(haversine_distance, axis=1))
df.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance
0,2015-05-07 19:52:06,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1,1.683323
1,2009-07-17 20:04:56,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1,2.45759
2,2009-08-24 21:45:00,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1,5.036377
3,2009-06-26 8:22:21,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3,1.661683
4,2014-08-28 17:47:00,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5,4.47545


In [12]:
# Summary statistics again, now including the new 'distance' column.
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance
count,199976.0,199976.0,199976.0,199976.0,199976.0,199976.0,199976.0
mean,11.362584,-72.527837,39.935991,-72.526236,39.924406,1.683457,20.770821
std,9.897088,11.437314,7.720481,13.115146,6.793455,1.306932,382.009478
min,0.01,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0,0.0
25%,6.0,-73.992065,40.734794,-73.991407,40.733825,1.0,1.215394
50%,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0,2.121116
75%,12.5,-73.967155,40.767158,-73.96366,40.768001,2.0,3.875248
max,499.0,57.418457,1644.421482,1153.572603,872.697628,6.0,16409.239135


In [13]:
# Replace the timestamp with day/hour/weekday/month/year features and drop 'key'.
df = preprocess_data(df)
df.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance,day,hour,weekday,month,year
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1,1.683323,7,19,3,5,2015
1,7.7,-73.994355,40.728225,-73.99471,40.750325,1,2.45759,17,20,4,7,2009
2,12.9,-74.005043,40.74077,-73.962565,40.772647,1,5.036377,24,21,0,8,2009
3,5.3,-73.976124,40.790844,-73.965316,40.803349,3,1.661683,26,8,4,6,2009
4,16.0,-73.925023,40.744085,-73.973082,40.761247,5,4.47545,28,17,3,8,2014


In [None]:
# Count rides below the 100 km cut-off versus rides at or above it.
# '>=' makes the two counts a complete partition of the rows, matching the
# '< 100' filter applied afterwards.
(df['distance'] < 100).sum(), (df['distance'] >= 100).sum()

(199507, 469)

A mere 0.2% of the rides have distances exceeding 100 kilometers. Consequently, it is reasonable to exclude data points with distances surpassing this threshold.

In [None]:
# Keep only rides shorter than 100 km.
is_short_trip = df['distance'] < 100
df = df.loc[is_short_trip]

In [None]:
# Write the preprocessed frame to a CSV file; index=False leaves out the row index.
df.to_csv("data_preprocessed.csv", index=False)