In [1]:
import pandas as pd
import datetime as dt
import json
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

In [2]:
# Load the complete training set from disk (no row limit is applied here).
# For a quick experiment on a sample, read_csv can instead be called with
# nrows=5000 and parse_dates=["pickup_datetime"].
train_path = 'train.csv'
train_df = pd.read_csv(filepath_or_buffer=train_path)

In [3]:
# Preview the first five raw rows to check the columns and value formats.
train_df.head(n=5)

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


## Preprocessing data

#### Cleaning of illogical data

##### Removing Data with NA values

In [4]:
# Keep only the rows that have a value in every column.
# A new frame is returned, so train_df itself is left untouched.
processed_train_df = train_df.dropna(axis=0, how='any')

##### Removing Data with FARE <= 0

In [5]:
# Keep only rides with a strictly positive fare
# (rows with a zero or negative fare_amount are dropped).
processed_train_df = processed_train_df.loc[
    lambda rides: rides["fare_amount"].gt(0)
]

##### Removing Data with Longitude OR Latitude == 0

In [6]:
# Keep only rides whose four coordinates are all non-zero: a 0 in any of the
# pickup/dropoff longitude/latitude columns causes the row to be dropped.
processed_train_df = processed_train_df[
    processed_train_df[
        ["pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude"]
    ].ne(0).all(axis=1)
]

##### Removing PASSENGER_COUNT < 1 or >= 10

In [7]:
# Keep only rides with strictly more than 0 and strictly fewer than 10 passengers
# (passenger counts <= 0 or >= 10 are dropped).
processed_train_df = processed_train_df.loc[
    lambda rides: rides["passenger_count"].gt(0) & rides["passenger_count"].lt(10)
]

In [8]:
# Inspect the column dtypes of the cleaned frame.
processed_train_df.dtypes

key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

In [9]:
# Sanity check: evaluates to True if any cell of the cleaned frame is still missing.
processed_train_df.isna().to_numpy().any()

False

In [10]:
# Report how many rows the cleaning steps removed and how many are left.
print("Number of rows dropped : ", len(train_df) - len(processed_train_df))
print("Number of rows remainingg: ", len(processed_train_df))

Number of rows dropped :  1299031
Number of rows remainingg:  54124825


In [11]:
# Preview the first twenty rows of the cleaned frame.
processed_train_df.head(n=20)

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1
5,2011-01-06 09:50:45.0000002,12.1,2011-01-06 09:50:45 UTC,-74.000964,40.73163,-73.972892,40.758233,1
6,2012-11-20 20:35:00.0000001,7.5,2012-11-20 20:35:00 UTC,-73.980002,40.751662,-73.973802,40.764842,1
7,2012-01-04 17:22:00.00000081,16.5,2012-01-04 17:22:00 UTC,-73.9513,40.774138,-73.990095,40.751048,1
8,2012-12-03 13:10:00.000000125,9.0,2012-12-03 13:10:00 UTC,-74.006462,40.726713,-73.993078,40.731628,1
9,2009-09-02 01:11:00.00000083,8.9,2009-09-02 01:11:00 UTC,-73.980658,40.733873,-73.99154,40.758138,2


In [None]:
# Parse the pickup timestamp strings (e.g. "2010-01-05 16:52:16 UTC") into pandas datetimes.
processed_train_df['pickup_datetime'] = processed_train_df['pickup_datetime'].pipe(pd.to_datetime)

In [None]:
# Derive the calendar features (date, time of day, day of week, weekend flag)
# from the parsed pickup timestamp; source strings look like "2010-01-05 16:52:16 UTC".
# The vectorized .dt accessor is used instead of per-row Python loops.
processed_train_df['date'] = processed_train_df['pickup_datetime'].dt.date
processed_train_df['time'] = processed_train_df['pickup_datetime'].dt.time
processed_train_df['day'] = processed_train_df['pickup_datetime'].dt.dayofweek
# Monday = 0, Sunday = 6, so the weekend is Saturday (5) and Sunday (6).
# The threshold must be 5: a threshold of 4 would also flag Friday as a weekend day.
processed_train_df['IsWeekend'] = processed_train_df['day'] >= 5
# The raw timestamp is no longer needed once the features above exist.
processed_train_df = processed_train_df.drop(columns='pickup_datetime')

In [None]:
# Preview the first five rows, now including the derived date/time columns.
processed_train_df.head(n=5)

In [None]:
# Flag rides that took place on a US federal holiday.
cal = calendar()
# Holidays are generated only for the span of dates present in the data.
holidays = cal.holidays(start=processed_train_df['date'].min(), end=processed_train_df['date'].max())
# 'date' holds plain datetime.date objects while `holidays` is a DatetimeIndex of
# Timestamps; the two never compare equal inside isin(), which would leave every
# row flagged False. Compare against the holidays as datetime.date objects instead.
processed_train_df['holiday'] = processed_train_df['date'].isin(holidays.date)
# Show only the rides that fall on a holiday.
date_holiday = processed_train_df[processed_train_df["holiday"]]
date_holiday