# CZ4041 - Course Project - Taxi Fare

# Exploratory Data Analysis

## Importing the Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sb

## Loading the Datasets

In [2]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

MemoryError: Unable to allocate 2.06 GiB for an array with shape (5, 55423856) and data type float64

In [None]:
#displaying the first 5 rows of train 
df_train.head()

In [None]:
#displaying the first 5 rows of test
df_test.head()

In [None]:
#shape of df_train
print("df_train shape:",df_train.shape)
#shape of df_test
print("df_test shape:", df_test.shape)

The test has one column lesser than train set as the test set does not contain the labels.

In [None]:
columns = df_train.columns
print("Columns:", list(columns))
print("Length of Columns:", len(columns))

## Data Cleaning

### Deleting the rows where pickup/dropoff_datetime, pickup/dropoff_longitude, pickup/dropoff_latitude are 0

In [None]:
df_train_copy = df_train.copy()
train_wo_zeros = df_train_copy[(df_train_copy.pickup_datetime != 0) & (df_train_copy.pickup_latitude != 0) & (df_train_copy.dropoff_latitude != 0) & (df_train_copy.pickup_longitude != 0) & (df_train_copy.dropoff_longitude != 0)]
train_wo_zeros

In [None]:
print("Shape of df initially:", df_train.shape)
print("Shape after removing rows with 4 0s:", train_wo_zeros.shape)
print("Number of rows with 4 0s:", df_train.shape[0] - train_wo_zeros.shape[0])

In [None]:
#Resetting the index, inplace to make sure the indexes are all correct
train_wo_zeros.reset_index(inplace=True)
train_wo_zeros.tail()

### Dealing with Null Values

In [None]:
#checking which columns have NaN values
train_wo_zeros.isnull().any()

Therefore, dropoff_longitude and dropoff_latitude have NaN in them.

In [None]:
nan_rows_lat = train_wo_zeros[train_wo_zeros['dropoff_latitude'].isnull()]
nan_rows_long = train_wo_zeros[train_wo_zeros['dropoff_longitude'].isnull()]

print("nan_rows_lat:", nan_rows_lat)
print("# of rows:", nan_rows_lat.shape[0])

print("nan_rows_long:", nan_rows_long)
print("# of rows:", nan_rows_long.shape[0])

In [None]:
nan_rows_lat_idx = list(nan_rows_lat.index)
nan_rows_long_idx = list(nan_rows_long.index)

In [None]:
#viewing one particular row to analyse it further
train_wo_zeros.iloc[nan_rows_lat_idx[0]]

In [None]:
#checking if both nan_rows_lat_idx and nan_rows_long_idx are the same
if nan_rows_lat_idx == nan_rows_long_idx:
    print("Both the Lists are identical! Therefore, just using one list will do!")
else:
    print("Both lists are not identical! Hence, we need to account for repeated row indices!")

The lists nan_rows_lat_idx and nan_rows_long_idx contain the indices of the rows that contain both dropoff_latitude and dropoff_longitude as NaN. There is no case where either one of them is a NaN and the other one if a valid value, this is because, form the test above, both the lists are identical, whoch means the row indices are exact same, hence, there exists only the case where either both of the column's vlaues are NaN or None of it. Hence, we will just be using the nan_rows_lat_idx to remove the rows that contain NaN.

In [None]:
#creating a copy of the existing df
train_wo_nan = train_wo_zeros.copy()

In [None]:
#dropping the rows with NaN
train_wo_nan.drop(axis=0, index=nan_rows_lat_idx,inplace=True)

#resetting the index
train_wo_nan.reset_index(inplace=True)

In [None]:
#Math to check 
NUM_NAN_ROWS = len(nan_rows_lat_idx)

print("Number of rows before dropping NaN rows:", train_wo_zeros.shape[0])

if train_wo_zeros.shape[0] - train_wo_nan.shape[0] == NUM_NAN_ROWS:
    print("Successfully dropped ", NUM_NAN_ROWS, "rows!")
    print("Number of rows in train_wo_nan:", train_wo_nan.shape[0])
else:
    print("Error! Dropped Wrong Number of Rows!")

In [None]:
#removing columns that were unintentionally added at the backend 
train_wo_nan.drop(['level_0','index'], axis = 1, inplace=True)

In [None]:
train_wo_nan

In [None]:
#cross-checking if there are any NaN values at all
train_wo_nan.isnull().any()

### Dealing with passenger_count == 0

In [None]:
#checking if there are any rows where the passenger_count == 0
(train_wo_nan['passenger_count'] == 0).any()

Since the output is True, it means that there are rows where the passenger_count == 0. Let's look deeper into this!

In [None]:
count = (train_wo_nan['passenger_count'] == 0).sum()
print("Number of 0s in passenger_count:", count)

In [None]:
zero_values = train_wo_nan[train_wo_nan['passenger_count'] == 0]
zero_values

In [None]:
#Obtaining the indices of rows where passenger_count == 0 and converting it into a list
zero_values_idx = list(zero_values.index)
zero_values_idx

In [None]:
pass_count_dropped = train_wo_nan.copy()

In [None]:
#dropping the rows
pass_count_dropped.drop(axis=0, index=zero_values_idx,inplace=True)

#resetting the index
pass_count_dropped.reset_index(inplace=True)

pass_count_dropped

In [None]:
NUM_PASS_COUNT_0 = len(zero_values_idx)
print("Number of rows before dropping passenger_count == 0 rows:", train_wo_nan.shape[0])

if (train_wo_nan.shape[0] - pass_count_dropped.shape[0] == NUM_PASS_COUNT_0):
    print("Successfully dropped ", NUM_PASS_COUNT_0, "rows!")
    print("Number of rows in pass_count_dropped:", pass_count_dropped.shape[0])
else:
    print("Error! Dropped Wrong Number of Rows!")

In [None]:
#Dropping unnecessary columns from the df 
pass_count_dropped.drop(['index'], axis = 1, inplace=True)
pass_count_dropped

In [None]:
#checking if passenger_count == 0 still exists
(pass_count_dropped['passenger_count'] == 0).any()

Since, the output is False, it means that there are no more rows where passenger_count == 0.

### Visualising Passenger Count

We would like to visualise the distribution of passenger_count to check if there are any logical discrepancies. According to this source, by law an NYC Taxi is allowed to carry only 4 passengers and 5 passengers, if it is a 5 seater car. Passenger Count of greater than 5 is only allowed if children aged 7 or above are seated on the laps of the adult passenger. 

Hence, if the maximum number of adults seated is 5, then the maximum number of passengers in a taxi will be 5 + 5 = 10, assuming that each adult seated has a child he/she is holding onto.

Hence, to ensure that this logic is not violated, we aim to visualise the distribution of the passenger_count variable and remove rows that contain values gretaer than 10.

Link: https://www.google.com/urlsa=t&rct=j&q=&esrc=s&source=web&cd=&cad=rja&uact=8&ved=2ahUKEwiew7PRucj2AhUjSWwGHZjiD_QQFnoECAsQAw&url=https%3A%2F%2Fwww1.nyc.gov%2Fsite%2Ftlc%2Fpassengers%2Fpassenger-frequently-askedquestions.page&usg=AOvVaw0lJJHgeBfD9Fi_4wir5Bgl

In [None]:
#extracting only the passeneger_count variable as a dataframe
passenger_count = pass_count_dropped[['passenger_count']]

print(type(passenger_count))

In [None]:
#analysing the inputs in the passenger_count column
passenger_count.value_counts()

In [None]:
#Plotting a Countplot
sb.set_theme(style="darkgrid")
ax = sb.countplot(x="passenger_count",data=passenger_count)

As we can see, majority of the records have only 1 passenger. However, there seem to be records that have 34, 129 and 208 passengers in the taxi. This is not logically possible, hence, this rows will be removed.

In [None]:
final_pass_count = pass_count_dropped.copy()

In [None]:
illogical_rows= final_pass_count[(final_pass_count['passenger_count'] == 34) | (final_pass_count['passenger_count'] == 129) | (final_pass_count['passenger_count'] == 208)]
illogical_rows

In [None]:
print("Number of illogical rows:", illogical_rows.shape[0])

In [None]:
illogical_rows_idx = list(illogical_rows.index)
NUM_ILLOGICAL_ROWS = len(illogical_rows_idx)
NUM_ILLOGICAL_ROWS

In [None]:
#dropping the rows
final_pass_count.drop(axis=0, index=illogical_rows_idx,inplace=True)

#resetting the index
final_pass_count.reset_index(inplace=True)

final_pass_count

In [None]:
print("Number of rows before dropping passenger_count == 34,129,208 rows:", pass_count_dropped.shape[0])

if (pass_count_dropped.shape[0] - final_pass_count.shape[0] == NUM_ILLOGICAL_ROWS):
    print("Successfully dropped ", NUM_ILLOGICAL_ROWS, "rows!")
    print("Number of rows in final_pass_count:", final_pass_count.shape[0])
else:
    print("Error! Dropped Wrong Number of Rows!")

In [None]:
#Dropping unnecessary columns from the df 
final_pass_count.drop(['index'], axis = 1, inplace=True)
final_pass_count

In [None]:
#Visualising passenger_count again to cross-check
final_pass_count['passenger_count'].value_counts()

In [None]:
#Plotting a Countplot
sb.set_theme(style="darkgrid")
ax = sb.countplot(x="passenger_count",data=final_pass_count)

### Cheking if the label has any Negative Values

In [None]:
final_fare_amt = final_pass_count.copy()

In [None]:
#checking if there is any negative value in fare_amt
(final_fare_amt['fare_amount'] < 0).any()

Therefore, there are negative values in the label, which are illogical.

In [None]:
negative_rows = final_fare_amt[final_fare_amt['fare_amount'] < 0]
negative_rows

In [None]:
negative_rows_idx = list(negative_rows.index)
negative_rows_idx

In [None]:
#looking at a specific row 
final_fare_amt.iloc[negative_rows_idx[0]]

In [None]:
#dropping the rows
final_fare_amt.drop(axis=0, index=negative_rows_idx,inplace=True)

#resetting the index
final_fare_amt.reset_index(inplace=True)

final_fare_amt

In [None]:
NUM_NEGATIVE_FARE = len(negative_rows_idx)
print("Number of rows before dropping fare_amt < 0:", final_pass_count.shape[0])

if (final_pass_count.shape[0] - final_fare_amt.shape[0] == NUM_NEGATIVE_FARE):
    print("Successfully dropped ", NUM_NEGATIVE_FARE, "rows!")
    print("Number of rows in final_fare_amount:", final_fare_amt.shape[0])
else:
    print("Error! Dropped Wrong Number of Rows!")

In [None]:
#Dropping unnecessary columns from the df 
final_fare_amt.drop(['index'], axis = 1, inplace=True)
final_fare_amt

In [None]:
#cross-checking if there is any negative value in fare_amt
(final_fare_amt['fare_amount'] < 0).any()

Things to do:
1. Need to create a binary variable called weekend - to check if its a weekend or not
2. Need to create a binary variable called holiday - to check if it is a public holiday or not
3. Need to code for the equation -> need to read up on what model can be used!