In [6]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from random import randrange, uniform

from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping
from keras import optimizers
from keras import regularizers

# Import libraries
import numpy as np
import pandas as pd
import csv
import re
import seaborn as sns
import matplotlib.pyplot as plt
import gc
import time
from math import radians, cos, sin, asin, sqrt
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon 


In [7]:
# helper function to compute the bearing angle from pickup point towards the destination point

def get_direction(lat1, lon1, lat2, lon2):
  """Compute the initial compass bearing from point 1 towards point 2.

  Parameters
  ----------
  lat1, lon1 : latitude / longitude of the start point, in decimal degrees.
  lat2, lon2 : latitude / longitude of the end point, in decimal degrees.
      Each argument may be a pandas Series, a numpy array or a scalar.

  Returns
  -------
  numpy array of bearings in degrees, normalized to [0, 360)
  (0 = north, 90 = east).
  """
  # All trigonometric inputs must be in radians; latitudes as well as the
  # longitude difference.
  lat1 = np.deg2rad(np.asarray(lat1, dtype=float))
  lat2 = np.deg2rad(np.asarray(lat2, dtype=float))
  diff_lon = np.deg2rad(np.asarray(lon2, dtype=float) - np.asarray(lon1, dtype=float))

  x = np.sin(diff_lon) * np.cos(lat2)
  y = np.cos(lat1) * np.sin(lat2) - (np.sin(lat1) * np.cos(lat2) * np.cos(diff_lon))

  # arctan2 yields values in [-pi, pi]; convert to degrees and shift the
  # negative half so the result is a compass bearing.
  initial_bearing = np.degrees(np.arctan2(x, y))
  direction = (initial_bearing + 360) % 360
  return direction


#(Source: https://www.igismap.com/haversine-formula-calculate-geographic-distance-earth/)
def haversine(lat1, lon1, lat2, lon2):
  """Great-circle distance between paired points, in kilometers.

  Each argument is a pandas Series of decimal degrees. The result is a numpy
  array of distances rounded to two decimals.
  """
  # Convert every coordinate column from degrees to radians.
  lon1_rad = np.deg2rad(lon1.to_numpy())
  lat1_rad = np.deg2rad(lat1.to_numpy())
  lon2_rad = np.deg2rad(lon2.to_numpy())
  lat2_rad = np.deg2rad(lat2.to_numpy())

  # Haversine formula.
  sin_half_dlat = np.sin((lat2_rad - lat1_rad)/2)
  sin_half_dlon = np.sin((lon2_rad - lon1_rad)/2)
  hav = sin_half_dlat**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * sin_half_dlon**2
  central_angle = 2 * np.arcsin(np.sqrt(hav))

  earth_radius_km = 6372.8  # Use 3956 for miles
  return np.around(central_angle * earth_radius_km, decimals=2)


# Pair two coordinate lists element-wise into a list of (x, y) tuples
def merge(list1, list2):
  """Pair list1[i] with list2[i] for every index of list1.

  Returns a list of 2-tuples with as many entries as list1 has.
  """
  pairs = []
  for idx in range(len(list1)):
    pairs.append((list1[idx], list2[idx]))
  return pairs


# Below function checks whether either endpoint (x1, y1) or (x2, y2) of a ride lies inside an airport polygon
def in_airport(x1,y1,x2,y2,airport):
  """Return 1 if either point (x1, y1) or (x2, y2) is inside the airport polygon.

  Parameters
  ----------
  x1, y1, x2, y2 : coordinates of the two points; each is converted with
      float().
  airport : either a sequence of (x, y) vertex tuples or an already
      constructed shapely Polygon (lets callers build the polygon once and
      reuse it across many calls).

  Returns
  -------
  1 if the polygon contains at least one of the two points, otherwise 0.
  """
  # Build the polygon once per call instead of once per tested point.
  polygon = airport if isinstance(airport, Polygon) else Polygon(airport)
  for x, y in ((x1, y1), (x2, y2)):
    if polygon.contains(Point(float(x), float(y))):
      return 1
  return 0

# Below function checks whether the point (x, y) is inside Manhattan
def in_manhattan(manhattan, x, y):
  """Return 1 when the point (x, y) is contained in the polygon built from
  the `manhattan` vertex list, otherwise 0."""
  location = Point(float(x), float(y))
  boundary = Polygon(manhattan)
  return 1 if boundary.contains(location) == 1 else 0

In [8]:
# Taking inflation into consideration
def inflation(infl, df1, df2):
  """Scale fares by the per-year factor looked up in `infl`.

  Parameters
  ----------
  infl : dict mapping a year to a multiplicative factor.
  df1 : fares to scale (a pandas Series aligned with df2, or a scalar).
  df2 : DataFrame with a 'year' column used for the lookup.

  Returns
  -------
  pandas Series equal to df2['year'].map(infl) * df1. Rows whose year is
  not a key of `infl` come out as NaN.
  """
  return df2['year'].map(infl) * df1
  
def inverse_inflation(infl, df1, df2):
  """Undo the scaling applied by a per-year factor table.

  Parameters
  ----------
  infl : dict mapping a year to a multiplicative factor.
  df1 : scaled fares (a pandas Series aligned with df2, or a scalar).
  df2 : DataFrame with a 'year' column used for the lookup.

  Returns
  -------
  pandas Series equal to df1 / df2['year'].map(infl). Rows whose year is
  not a key of `infl` come out as NaN.
  """
  return df1 / df2['year'].map(infl)

# Per-year fare multipliers, keyed by calendar year; passed to inflation() during preprocessing.
# Each value is the reciprocal of a cumulative factor relative to 2009, so multiplying a fare
# by infl[year] scales it to year-2009 values.
# The underlying data covers years up to 2015; the 2016-2021 factors were estimated
# on the same scale.
infl = {
    2009: 1/1.    ,
    2010: 1/1.0164,
    2011: 1/1.0485,
    2012: 1/1.0702,
    2013: 1/1.0859,
    2014: 1/1.1035,
    2015: 1/1.1048,
    2016: 1/1.1103,
    2017: 1/1.1275,
    2018: 1/1.1418,
    2019: 1/1.1501,
    2020: 1/1.1583,
    2021: 1/1.1705,
}


In [9]:
# Loading polygon coordinates from text files for the NYC airports and Manhattan

def _parse_polygon(line):
  """Turn one comma-separated line of numbers into a list of (x, y) tuples.

  Values at even positions become y and values at odd positions become x.
  NOTE(review): presumably the files store "lon,lat,lon,lat,..." so that x is
  the latitude expected by in_airport / in_manhattan; confirm against the files.
  """
  values = line.split(',')
  polygon_y = [ np.float64(v) for v in values[::2] ]
  polygon_x = [ np.float64(v) for v in values[1::2] ]
  return merge(polygon_x, polygon_y)


airports = []

jfk_coords = (40.639722, -73.778889)
lga_coords = (40.77725, -73.872611)
nwk_coords = (40.6925, -74.168611)

# One polygon is appended per non-blank line, in the order JFK, LaGuardia,
# Newark. The files are closed as soon as they have been read.
with open('JFK2.txt','r') as JFK, \
     open('LaGuardia2.txt','r') as LAGUARDIA, \
     open('Newark2.txt','r') as NEWARK:
  for airport in [JFK, LAGUARDIA, NEWARK]:
    for line in airport:
      if not line.strip():
        continue  # tolerate blank / trailing empty lines
      airports.append(_parse_polygon(line))

# Loading data coordinates from text file for Manhattan.
# If the file has several non-blank lines, the last one wins.
with open('Manhattan.txt','r') as manhattan_:
  for line in manhattan_:
    if not line.strip():
      continue
    manhattan = _parse_polygon(line)


In [None]:
# Preprocess each input csv file (about 1M rows, 8 columns) into more than 20 feature columns for the Neural Network

for i in range(0,56):
    file_name = "./input_files/input"+str(i)+".csv"
    training_pd = pd.read_csv(file_name, parse_dates=['pickup_datetime'])

    # --- Mark implausible raw values as NaN --------------------------------
    # Fares below 0 or above 30 become NaN so that they are replaced by the
    # column mean further down.
    bad_fare = (training_pd['fare_amount'] < 0) | (training_pd['fare_amount'] > 30)
    training_pd.loc[bad_fare, 'fare_amount'] = np.nan

    # Irregular passenger counts (> 8) become NaN as well.
    training_pd.loc[training_pd['passenger_count'] > 8, 'passenger_count'] = np.nan
    df = training_pd

    # --- IQR-based outlier masking on the coordinate columns ---------------
    coutliers = [ 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
    for col in coutliers:
        # nanpercentile ignores values that are already missing; plain
        # np.percentile would return NaN fences and mask nothing.
        q75, q25 = np.nanpercentile(df[col], [75, 25])
        iqr = q75 - q25

        # Inner fences
        minimum = q25 - (iqr*1.5)
        maximum = q75 + (iqr*1.5)

        df.loc[(df[col] < minimum) | (df[col] > maximum), col] = np.nan

    # Per-column count of missing values after masking
    missing_val = pd.DataFrame(df.isnull().sum())

    # --- Date/time features ------------------------------------------------
    pickup = df['pickup_datetime'].dt
    df['year'] = pickup.year
    df['month'] = pickup.month
    df['weekday'] = pickup.weekday
    df['weekend'] = df['weekday'].isin([5, 6]).astype(int)
    df['hour'] = pickup.hour
    df['minute'] = pickup.minute

    df['month_sin'] = np.sin((df['month'] - 1) * (2. * np.pi / 12))
    # NOTE(review): weekday is already 0-based (0..6), so the "- 1" here only
    # shifts the phase; kept unchanged so the features stay as before.
    df['weekday_sin'] = np.sin((df['weekday'] - 1) * (2. * np.pi / 7))
    df['hour_sin'] = np.sin((df['hour'] + df['minute'] / 60) * (2. * np.pi / 24))

    # --- Geometric features ------------------------------------------------
    # NOTE(review): these are derived BEFORE the mean imputation below, so rows
    # whose fare/coordinates were masked keep NaN in the derived columns.
    # Confirm whether imputation should happen first.

    # abs of delta of longitude and latitude pickup-dropoff
    df['delta_longitude'] = abs(df.pickup_longitude - df.dropoff_longitude)
    df['delta_latitude'] = abs(df.pickup_latitude - df.dropoff_latitude)

    # peak_hours, and inflated fare
    df['peak_hours'] = df['hour'].isin([18, 19, 20]).astype(int)
    df['inflated_fare'] = inflation(infl, df['fare_amount'], df)

    # Create columns 'direction', 'distance_km'
    df['direction'] = get_direction(df['pickup_latitude'], df['pickup_longitude'],
                                    df['dropoff_latitude'], df['dropoff_longitude'])

    df['distance_km'] = haversine(df['pickup_latitude'], df['pickup_longitude'],
                                  df['dropoff_latitude'], df['dropoff_longitude'])

    # Flag fares that begin/end at an airport (airports[0..2] = JFK, LGA, NWK)
    for airport_idx, airport_code in enumerate(['JFK', 'LGA', 'NWK']):
        airport_polygon = airports[airport_idx]
        df[airport_code] = df.apply(
            lambda x, polygon=airport_polygon: in_airport(
                x['dropoff_latitude'], x['dropoff_longitude'],
                x['pickup_latitude'], x['pickup_longitude'], polygon),
            axis=1)

    # Flag fares that begin/end in Manhattan
    df['dropoff_manhattan'] = df.apply(lambda x: in_manhattan(manhattan, x['dropoff_latitude'], x['dropoff_longitude']),axis=1)
    df['pickup_manhattan'] = df.apply(lambda x: in_manhattan(manhattan, x['pickup_latitude'], x['pickup_longitude']),axis=1)

    # --- Imputation --------------------------------------------------------
    # Numeric columns are imputed with the column mean.
    for col in ['fare_amount'] + coutliers:
        df[col] = df[col].fillna(df[col].mean())

    # Category-like variables are imputed with the mode. mode() returns a
    # Series (possibly with ties), so take its first entry explicitly.
    df['passenger_count'] = df['passenger_count'].fillna(df['passenger_count'].mode().iloc[0])

    # --- Row filters -------------------------------------------------------
    df = df[(df['fare_amount']>0) & (df['fare_amount']<200)]
    df = df[(df['passenger_count']>0) & (df['passenger_count']<=12)]
    # Drop expensive-but-short rides
    df = df[~((df['fare_amount']>70) & (df['distance_km']<5))]
    df = df[df['distance_km']!=0]

    df = df[(df['pickup_longitude']<-65) & (-85<df['pickup_longitude'])]
    df = df[(30<df['pickup_latitude']) & (df['pickup_latitude']<55)]

    df = df[(df['dropoff_longitude']<-65) & (-85<df['dropoff_longitude'])]
    df = df[(30<df['dropoff_latitude']) & (df['dropoff_latitude']<55)]

    final_data = df.drop(columns=['key','pickup_datetime'])

    # Overwrite (not append) so that re-running the cell does not duplicate
    # rows and header lines in the output file.
    name = './preprocessed_input/file'+str(i)+'.csv'
    final_data.to_csv(name, index=False)
