### Code for Preprocessing the Taxi Trip data for Neural Nets

In [3]:
# imports
import pandas as pd
import numpy as np
import datetime
import zipfile
import json
import re
import math
from mpl_toolkits.basemap import Basemap
from sklearn.preprocessing import LabelEncoder

In [2]:
# function to convert posix format to datetime format
# def datetime_parser(dt):
#     return datetime.date.fromtimestamp(float(dt))

#### Reading the data

In [4]:
#df=pd.read_csv('train.csv',nrows=100000,parse_dates=[5],date_parser=datetime_parser)
df = pd.read_csv(
    'train.csv',
    nrows=100,  # load only the first 100 rows of the file
)

In [5]:
# TIMESTAMP is stored as POSIX seconds; replace it with pandas datetimes
df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'], unit='s')

In [6]:
# show the column names of the loaded frame as a plain list
df.columns.tolist()

['TRIP_ID',
 'CALL_TYPE',
 'ORIGIN_CALL',
 'ORIGIN_STAND',
 'TAXI_ID',
 'TIMESTAMP',
 'DAY_TYPE',
 'MISSING_DATA',
 'POLYLINE']

In [7]:
# Basemap used to convert longitude/latitude into planar map coordinates
# (EPSG:3763) over the box lat 37..41.5, lon -9.5..-6.5
bm = Basemap(
    llcrnrlon=-9.5,
    llcrnrlat=37,
    urcrnrlon=-6.5,
    urcrnrlat=41.5,
    epsg=3763,
)
# number of 500-unit cells needed to span the map horizontally (x) and
# vertically (y); x is the row width used when numbering grid cells
x = math.ceil(bm.xmax / 500)
y = math.ceil(bm.ymax / 500)

In [8]:
# Projects the coordinate pairs of every trip onto a grid of 500-unit cells,
# reading the raw CSV one line at a time.
def grid(f, trip_path, trip_label, bm):
    """Fill ``trip_path`` and ``trip_label`` with the grid cells each trip visits.

    Each line of ``f`` is split on its first 8 commas: field 0 (with the
    surrounding double quotes stripped) is used as the trip id and field 8 is
    scanned for ``number,number`` pairs. Every pair is passed to ``bm`` and the
    projected point is assigned to a cell ``(col, row)`` with
    ``col = ceil(x / 500)`` and ``row = ceil(y / 500)``. The cell's label is
    ``cols_per_row * (row - 1) + col`` where ``cols_per_row = ceil(bm.xmax / 500)``.

    Args:
        f: Open text file (or any iterable of lines with a ``close()`` method).
            It is always closed before returning, even if parsing fails.
        trip_path: Dict updated in place; maps trip id -> list of ``(col, row)``.
        trip_label: Dict updated in place; maps trip id -> list of cell labels.
        bm: Callable ``bm(a, b) -> (x, y)`` exposing an ``xmax`` attribute
            (a Basemap in this notebook; the pair is presumably
            longitude, latitude -- confirm against the data).

    Notes:
        * Consecutive points falling in the same cell are recorded once.
        * Points whose column or row index is negative are skipped.
        * Lines with fewer than 9 comma-separated fields (e.g. blank lines)
          are ignored; a header row produces an entry with empty lists.
    """
    cell_size = 500
    # Raw string with an escaped dot; the fractional part is optional so that
    # integer-valued coordinates are parsed as numbers too.
    pair_re = re.compile(r"(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)")
    # Row width is derived from the projection that was passed in rather than
    # from a module-level variable.
    cols_per_row = math.ceil(bm.xmax / cell_size)

    try:
        for line in f:
            fields = line.split(",", 8)
            if len(fields) < 9:
                continue
            trip_id = fields[0].strip('"')

            cells = []
            labels = []
            for match in pair_re.finditer(fields[8]):
                px, py = bm(float(match.group(1)), float(match.group(2)))
                cell = (math.ceil(px / cell_size), math.ceil(py / cell_size))
                # A negative index on either axis is outside the grid.
                if cell[0] < 0 or cell[1] < 0:
                    continue
                # Collapse runs of points that stay inside one cell.
                if cells and cell == cells[-1]:
                    continue
                cells.append(cell)
                labels.append(cols_per_row * (cell[1] - 1) + cell[0])

            trip_path[trip_id] = cells
            trip_label[trip_id] = labels
    finally:
        f.close()

### Converting latitude and longitude to meters and to the grid cell they belong to

In [11]:
# Run grid() over sample.csv, collecting for every trip the cells it passes
# through (trip_path) and their labels (trip_label).
# grid() closes the file handle itself.
trip_path, trip_label = {}, {}
grid(open("sample.csv", mode='r'), trip_path, trip_label, bm)

In [12]:
# functions returning the last and the first grid label recorded for a trip
def final_dest(x):
    """Return the last grid label recorded for trip ``x``.

    ``x`` is converted to ``str`` once and that key is used for both the
    existence check and the read, so string and non-string ids behave the
    same. Returns ``None`` when the trip is not in the module-level
    ``trip_label`` dict or has no labels.
    """
    labels = trip_label.get(str(x))
    return labels[-1] if labels else None
    
def start_dest(x):
    """Return the first grid label recorded for trip ``x``.

    ``x`` is converted to ``str`` once and that key is used for both the
    existence check and the read, so string and non-string ids behave the
    same. Returns ``None`` when the trip is not in the module-level
    ``trip_label`` dict or has no labels.
    """
    labels = trip_label.get(str(x))
    return labels[0] if labels else None

##### Adding the first and last grid labels to the dataframe

In [13]:
# look up each trip's last and first grid label by its TRIP_ID (as a string)
df['Destination_grid'] = df['TRIP_ID'].apply(lambda trip_id: final_dest(str(trip_id)))
df['Starting_grid'] = df['TRIP_ID'].apply(lambda trip_id: start_dest(str(trip_id)))

##### Preprocessing the data

In [14]:
# Remove, in place, the columns that are not used in the analysis
df.drop(
    [
        'TIMESTAMP',
        'MISSING_DATA',
        'POLYLINE',
        'DAY_TYPE',
        'TRIP_ID',
        'ORIGIN_CALL',
        'ORIGIN_STAND',
    ],
    axis=1,
    inplace=True,
)

In [15]:
# preview the first five rows
df.head(n=5)

Unnamed: 0,CALL_TYPE,TAXI_ID,Destination_grid,Starting_grid
0,C,20000589,475800,474768
1,B,20000596,477862,476833
2,C,20000320,474252,474252
3,C,20000520,474770,475810
4,C,20000337,478893,478900


#### One-hot encoding the categorical variable

In [16]:
# one-hot encode CALL_TYPE: one indicator column per distinct value
w1 = pd.get_dummies(df['CALL_TYPE'])

# append the indicator columns to the original dataframe
df = pd.concat([df, w1], axis=1)

In [17]:
# CALL_TYPE is now represented by its indicator columns, so remove the
# original column and preview the result
df.drop('CALL_TYPE', axis=1, inplace=True)
df.head(n=5)

Unnamed: 0,TAXI_ID,Destination_grid,Starting_grid,A,B,C
0,20000589,475800,474768,0,0,1
1,20000596,477862,476833,0,1,0
2,20000320,474252,474252,0,0,1
3,20000520,474770,475810,0,0,1
4,20000337,478893,478900,0,0,1


In [23]:
# Replace the values of each column with integer codes.
# The same encoder object is re-fitted for every column, so once the loop
# finishes `le` only holds the TAXI_ID mapping (TAXI_ID must stay last: the
# later inverse_transform on TAXI_ID depends on it).
# CALL_TYPE, ORIGIN_CALL, ORIGIN_STAND and DAY_TYPE are not label-encoded here.
le = LabelEncoder()
for column in ('Destination_grid', 'Starting_grid', 'TAXI_ID'):
    df[column] = le.fit_transform(df[column].values)

In [24]:
# preview the label-encoded frame
df.head(n=5)

Unnamed: 0,TAXI_ID,Destination_grid,Starting_grid,A,B,C
0,46,21,4,0,0,1
1,47,52,27,0,1,0
2,25,6,1,0,0,1
3,40,10,23,0,0,1
4,26,59,49,0,0,1


In [20]:
# Save the first dataset used as neural-net input.
# DataFrame.to_csv does not accept an `inplace` argument (it writes a file and
# never modifies the frame), so the stray keyword is removed. The index is
# still written, as before.
df.to_csv('input_train1.csv')

In [89]:
# Reverse the label encoding of TAXI_ID (relies on `le` having been fitted
# on TAXI_ID last), then one-hot encode the restored ids
df['TAXI_ID'] = le.inverse_transform(df['TAXI_ID'])
w = pd.get_dummies(df['TAXI_ID'])

In [21]:
# append the TAXI_ID indicator columns, then remove the original TAXI_ID column
df = pd.concat([df, w], axis=1)
df.drop('TAXI_ID', axis=1, inplace=True)

In [None]:
# Save the second dataset used as neural-net input.
# DataFrame.to_csv does not accept an `inplace` argument (it writes a file and
# never modifies the frame), so the stray keyword is removed. The index is
# still written, as before.
df.to_csv('input_train2.csv')