In [1]:
import pyodbc
import pandas as pd
import numpy as np
from datetime import datetime
import calendar
from datetime import timedelta
import datetime as dt

In [2]:
# sql query
query_str='''
select passenger_count 
      ,pickup_datetime  
      ,pickup_longitude
      ,pickup_latitude
      ,dropoff_longitude
      ,dropoff_latitude
      ,fare_amount 
      from [dbo].[nyctaxi_sample]
'''
servername = 'LNOR010941'
db = 'NYCTaxi_Sample'
conn_str = 'Driver=SQL Server;Server=' + servername + \
    ';Database=' + db + ';Trusted_Connection=True;'
cnxn = pyodbc.connect(conn_str)
#cursor = cnxn.cursor()

# Load query into dataframe
df=pd.read_sql(query_str,cnxn)

In [3]:
df=df.loc[df['fare_amount']>=0]
df['dropoff_latitude_n']=pd.to_numeric(df['dropoff_latitude'])
df['dropoff_longitude_n']=pd.to_numeric(df['dropoff_longitude'])
df['pickup_latitude_n']=pd.to_numeric(df['pickup_latitude'])
df['pickup_longitude_n']=pd.to_numeric(df['pickup_longitude'])
df=df.drop(columns=['dropoff_latitude','dropoff_longitude','pickup_latitude','pickup_longitude'])

In [4]:
boundary={'min_lng':-74.263242,
              'min_lat':40.573143,
              'max_lng':-72.986532, 
              'max_lat':41.709555}

#We will mark the outlier locations as 1 and remove them for further analysi
df.loc[~((df.pickup_longitude_n >= boundary['min_lng'] ) & (df.pickup_longitude_n <= boundary['max_lng']) &
            (df.pickup_latitude_n >= boundary['min_lat']) & (df.pickup_latitude_n <= boundary['max_lat']) &
            (df.dropoff_longitude_n >= boundary['min_lng']) & (df.dropoff_longitude_n <= boundary['max_lng']) &
            (df.dropoff_latitude_n >=boundary['min_lat']) & (df.dropoff_latitude_n <= boundary['max_lat'])),'is_outlier_loc']=1
df.loc[((df.pickup_longitude_n >= boundary['min_lng'] ) & (df.pickup_longitude_n <= boundary['max_lng']) &
            (df.pickup_latitude_n >= boundary['min_lat']) & (df.pickup_latitude_n <= boundary['max_lat']) &
            (df.dropoff_longitude_n >= boundary['min_lng']) & (df.dropoff_longitude_n <= boundary['max_lng']) &
            (df.dropoff_latitude_n >=boundary['min_lat']) & (df.dropoff_latitude_n <= boundary['max_lat'])),'is_outlier_loc']=0

print("Outlier vs Non Outlier Counts")
print(df['is_outlier_loc'].value_counts())

# Let us drop rows, where location is outlier
df=df.loc[df['is_outlier_loc']==0]
df.drop(['is_outlier_loc'],axis=1,inplace=True)

Outlier vs Non Outlier Counts
0.0    1698334
1.0       5603
Name: is_outlier_loc, dtype: int64


In [5]:
city_long_border = (-74.03, -73.75)
city_lat_border = (40.63, 40.85)

nyc_airports={'JFK':{'min_lng':-73.8352,
     'min_lat':40.6195,
     'max_lng':-73.7401, 
     'max_lat':40.6659},
              
    'EWR':{'min_lng':-74.1925,
            'min_lat':40.6700, 
            'max_lng':-74.1531, 
            'max_lat':40.7081

        },
    'LaGuardia':{'min_lng':-73.8895, 
                  'min_lat':40.7664, 
                  'max_lng':-73.8550, 
                  'max_lat':40.7931
        
    }

}
def isAirport(latitude,longitude):
    res=0
    for airport_name in nyc_airports:
        if latitude>=nyc_airports[airport_name]['min_lat'] and latitude<=nyc_airports[airport_name]['max_lat'] and longitude>=nyc_airports[airport_name]['min_lng'] and longitude<=nyc_airports[airport_name]['max_lng']:
            res=1
    return res

In [6]:
def coordinates2distance(lat1,lat2,long1,long2):
    R=6373.0 #radius of earth in km
    p = 0.017453292519943295 # Pi/180 to convert to radians
    lat1=lat1*p
    lat2=lat2*p
    long1=long1*p
    long2=long2*p
    dlon=(long2-long1)
    dlat=(lat2-lat1)
    a=(np.sin(dlat/2)**2)+(np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2)
    c=2*np.arctan2(np.sqrt(a),np.sqrt(1-a))
    return R*c

In [7]:
df['distance']=coordinates2distance(df['pickup_latitude_n'].values,
                                    df['dropoff_latitude_n'].values,
                                    df['pickup_longitude_n'].values,
                                    df['dropoff_longitude_n'].values)

In [8]:
lower_manhattan_boundary={'min_lng': -74.0194,
                          'min_lat':40.6997,
                          'max_lng':-73.9716,
                          'max_lat':40.7427}

def isLowerManhattan(lat,lng):
    if lat>=lower_manhattan_boundary['min_lat'] and lat<=lower_manhattan_boundary['max_lat'] and lng>=lower_manhattan_boundary['min_lng'] and lng<=lower_manhattan_boundary['max_lng']:
        return 1
    else:
        return 0

In [9]:
def encodeDays(day_of_week):
    day_dict={'Sunday':0,'Monday':1,'Tuesday':2,'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6}
    return day_dict[day_of_week]

In [10]:
lgr=(-73.8733, 40.7746)
jfk=(-73.7900, 40.6437)
ewr=(-74.1843, 40.6924)

In [11]:
df['pickup_datetime']=pd.to_datetime(df['pickup_datetime'],format='%Y-%m-%d %H:%M:%S UTC')
df['pickup_date']=  df['pickup_datetime'].dt.date
df['pickup_day']= df['pickup_datetime'].apply(lambda x:x.day)
df['pickup_hour']= df['pickup_datetime'].apply(lambda x:x.hour)
df['pickup_day_of_week']= df['pickup_datetime'].apply(lambda x:calendar.day_name[x.weekday()])
df['pickup_month']= df['pickup_datetime'].apply(lambda x:x.month)
df['pickup_year']= df['pickup_datetime'].apply(lambda x:x.year)
airportpickup=df.apply(lambda row:isAirport(row['pickup_latitude_n'],row['pickup_longitude_n']),axis=1)
airportdropoff=df.apply(lambda row:isAirport(row['dropoff_latitude_n'],row['dropoff_longitude_n']),axis=1)
df['is_airport']=airportpickup.combine(airportdropoff, lambda x1, x2: x1 if x1 > x2 else x2)
df['pickup_day_of_week']= df['pickup_day_of_week'].apply(lambda x:encodeDays(x))

df['pickup_distance_jfk']=coordinates2distance(df['pickup_latitude_n'].values,
                                    jfk[1],
                                    df['pickup_longitude_n'].values,
                                    jfk[0])

df['dropoff_distance_jfk']=coordinates2distance(df['dropoff_latitude_n'].values,
                                    jfk[1],
                                    df['dropoff_longitude_n'].values,
                                    jfk[0])

df['pickup_distance_ewr']=coordinates2distance(df['pickup_latitude_n'].values,
                                    ewr[1],
                                    df['pickup_longitude_n'].values,
                                    ewr[0])

df['dropoff_distance_ewr']=coordinates2distance(df['dropoff_latitude_n'].values,
                                    ewr[1],
                                    df['dropoff_longitude_n'].values,
                                    ewr[0])
df['pickup_distance_lgr']=coordinates2distance(df['pickup_latitude_n'].values,
                                    ewr[1],
                                    df['pickup_longitude_n'].values,
                                    ewr[0])

df['dropoff_distance_lgr']=coordinates2distance(df['dropoff_latitude_n'].values,
                                    lgr[1],
                                    df['dropoff_longitude_n'].values,
                                    lgr[0])
print("Shape of  df data",  df.shape)

In [14]:
df=df.drop(['pickup_datetime',
              'pickup_date'],axis=1)

In [None]:
df.head()

In [18]:
df.to_csv('nyc_data_clean_and_features_rev2.csv',index=False)

Unnamed: 0,passenger_count,fare_amount,dropoff_latitude_n,dropoff_longitude_n,pickup_latitude_n,pickup_longitude_n,distance,pickup_day,pickup_hour,pickup_day_of_week,pickup_month,pickup_year,is_airport,pickup_distance_jfk,dropoff_distance_jfk,pickup_distance_ewr,dropoff_distance_ewr,pickup_distance_lgr,dropoff_distance_lgr
0,1,5.5,40.754913,-74.002075,40.748764,-73.98864,1.322624,17,12,0,11,2013,0,20.425164,21.745211,17.645632,16.861508,17.645632,11.06746
1,1,5.5,40.763477,-73.977753,40.753613,-73.980888,1.128517,20,6,3,11,2013,0,20.213616,20.691607,18.449548,19.121147,18.449548,8.885643
2,1,5.5,40.757526,-73.979164,40.750877,-73.986847,0.982869,21,12,6,12,2013,0,20.437318,20.365351,17.87095,18.748058,17.87095,9.118316
3,1,5.5,40.736359,-73.988503,40.72953,-73.978096,1.160337,8,11,0,12,2013,0,18.515769,19.65978,17.869532,17.216362,17.869532,10.597714
4,1,5.5,40.780495,-73.981415,40.769955,-73.987923,1.294193,21,10,1,10,2013,0,21.811073,22.180383,18.665183,19.707963,18.665183,9.129984
