In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output area.
%matplotlib inline
# Use seaborn's 'whitegrid' style for every plot drawn after this cell.
sns.set_style('whitegrid')

In [3]:
# Read the raw trip records from disk into `df`, then preview the first rows.
raw_csv_path = "../data/raw/NYC.csv"
df = pd.read_csv(raw_csv_path)

df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [4]:
# Dimensions of the raw frame as a (rows, columns) tuple.
df.shape

(1458644, 11)

In [5]:
# Print each column's non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   id                  1458644 non-null  object 
 1   vendor_id           1458644 non-null  int64  
 2   pickup_datetime     1458644 non-null  object 
 3   dropoff_datetime    1458644 non-null  object 
 4   passenger_count     1458644 non-null  int64  
 5   pickup_longitude    1458644 non-null  float64
 6   pickup_latitude     1458644 non-null  float64
 7   dropoff_longitude   1458644 non-null  float64
 8   dropoff_latitude    1458644 non-null  float64
 9   store_and_fwd_flag  1458644 non-null  object 
 10  trip_duration       1458644 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 122.4+ MB


In [6]:
# Count missing values per column of the raw frame.
missing_mask = df.isna()
missing_mask.sum()

id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dtype: int64

In [7]:
# Convert both timestamp columns from strings to pandas datetimes.
for ts_col in ('pickup_datetime', 'dropoff_datetime'):
    df[ts_col] = pd.to_datetime(df[ts_col])

In [8]:
# Keep only plausible records, evaluated in a single pass: a trip must have a
# positive duration, carry at least one passenger, and both start and end
# inside a coarse latitude/longitude box (presumably the greater NYC area —
# confirm the bounds against the intended study region).
LAT_BOUNDS = (40.5, 41.0)
LON_BOUNDS = (-74.5, -73.5)

valid_trip = (
    (df['trip_duration'] > 0)
    & (df['passenger_count'] > 0)
    & df['pickup_latitude'].between(*LAT_BOUNDS)
    & df['pickup_longitude'].between(*LON_BOUNDS)
    & df['dropoff_latitude'].between(*LAT_BOUNDS)
    & df['dropoff_longitude'].between(*LON_BOUNDS)
)

# .copy() detaches the filtered rows from the unfiltered frame, so the column
# assignments made on `df` in later cells do not raise SettingWithCopyWarning.
df = df[valid_trip].copy()

In [9]:
# Derive calendar features from the pickup timestamp.
pickup_ts = df['pickup_datetime'].dt
df['pickup_hour'] = pickup_ts.hour
df['pickup_day'] = pickup_ts.day
df['pickup_month'] = pickup_ts.month
df['pickup_weekday'] = pickup_ts.day_name()

# 1 when the pickup falls on a Saturday or Sunday, otherwise 0.
weekend_names = ['Saturday', 'Sunday']
on_weekend = df['pickup_weekday'].isin(weekend_names)
df['is_weekend'] = on_weekend.astype(int)

In [10]:
from math import radians, sin, cos, sqrt, atan2

def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in km
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c

# Straight-line (great-circle) distance of each trip in km, from pickup to
# dropoff. Only the four coordinate columns are iterated, as plain tuples:
# a row-wise DataFrame.apply would build a Series spanning every column for
# each record, which is needlessly slow on a frame of this size.
coord_cols = [
    'pickup_latitude',
    'pickup_longitude',
    'dropoff_latitude',
    'dropoff_longitude',
]
df['trip_distance_km'] = [
    haversine(*coords)
    for coords in df[coord_cols].itertuples(index=False, name=None)
]

In [11]:
# Columns kept for the cleaned export, in output order.
export_columns = [
    'id',
    'vendor_id',
    'pickup_datetime',
    'pickup_hour',
    'pickup_weekday',
    'is_weekend',
    'passenger_count',
    'trip_distance_km',
    'trip_duration',
]
final_df = df[export_columns]

In [12]:
# Write the cleaned frame to disk without the index column.
# NOTE(review): the "TypeError: 'tuple' object is not callable" recorded after
# this cell does not look like something to_csv would raise; it may be
# leftover output from a removed cell (execution count 13 is missing).
# Confirm with Restart Kernel -> Run All.
cleaned_csv_path = "../data/cleaned/nyc_taxi_cleaned.csv"
final_df.to_csv(cleaned_csv_path, index=False)

TypeError: 'tuple' object is not callable

In [14]:
# Print each export column's non-null count and dtype, plus memory usage.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1458134 entries, 0 to 1458643
Data columns (total 9 columns):
 #   Column            Non-Null Count    Dtype         
---  ------            --------------    -----         
 0   id                1458134 non-null  object        
 1   vendor_id         1458134 non-null  int64         
 2   pickup_datetime   1458134 non-null  datetime64[ns]
 3   pickup_hour       1458134 non-null  int32         
 4   pickup_weekday    1458134 non-null  object        
 5   is_weekend        1458134 non-null  int64         
 6   passenger_count   1458134 non-null  int64         
 7   trip_distance_km  1458134 non-null  float64       
 8   trip_duration     1458134 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int32(1), int64(4), object(2)
memory usage: 105.7+ MB


In [15]:
# Count missing values per column of the export frame.
export_missing = final_df.isna()
export_missing.sum()

id                  0
vendor_id           0
pickup_datetime     0
pickup_hour         0
pickup_weekday      0
is_weekend          0
passenger_count     0
trip_distance_km    0
trip_duration       0
dtype: int64

In [16]:
# Display the export frame; the recorded output abbreviates it with an
# ellipsis row between the first and last records.
final_df

Unnamed: 0,id,vendor_id,pickup_datetime,pickup_hour,pickup_weekday,is_weekend,passenger_count,trip_distance_km,trip_duration
0,id2875421,2,2016-03-14 17:24:55,17,Monday,0,1,1.498521,455
1,id2377394,1,2016-06-12 00:43:35,0,Sunday,1,1,1.805507,663
2,id3858529,2,2016-01-19 11:35:24,11,Tuesday,0,1,6.385098,2124
3,id3504673,2,2016-04-06 19:32:31,19,Wednesday,0,1,1.485498,429
4,id2181028,2,2016-03-26 13:30:55,13,Saturday,1,1,1.188588,435
...,...,...,...,...,...,...,...,...,...
1458639,id2376096,2,2016-04-08 13:31:04,13,Friday,0,4,1.225080,778
1458640,id1049543,1,2016-01-10 07:35:15,7,Sunday,1,1,6.049836,655
1458641,id2304944,2,2016-04-22 06:57:41,6,Friday,0,1,7.824606,764
1458642,id2714485,1,2016-01-05 15:56:26,15,Tuesday,0,1,1.092564,373
