In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
class Model(nn.Module):
    def __init__(self,cat_cols,cont_cols,layer_count,output_features,embed_size_list,p=0.5): 
        '''
        args:
        cat_cols is the tensor of all categorical values (pre-embedding)
        n_cont=number of continuous variables (for batch normalization)
        cont_cols is the tensor of all continuous values
        input_features - number of parameters of input
        layer_count - a tuple of number of nodes of each hidden layer
        output_features = number of outputs expected
        embed_size_list is list of embedding sizes for the categorical values
        p = basically, the % of nodes to be nullified during dropout layer
       
       Approach: In the constructor, create all the layers (Linear, ReLU, Batch and Dropout) for each hidden layer as per layer_count)
       and add them to sequential(). Fwd() will have all the data manipulation and final embedding
       
        '''
        super().__init__()
        #self.input_features=input_features
        self.output_features=output_features
        self.layer_count=layer_count
        self.embed_size_list=embed_size_list
        #Create embeddings from categorical columns as seen in the test case
        self.embeddings=[nn.Embedding(base_dim, target_dim) for base_dim, target_dim in embed_size_list]
#         print(self.embeddings)
        n_cont=cont_cols.shape[1]
        self.batch_norm=nn.BatchNorm1d(n_cont)
        self.dropout=nn.Dropout(p=0.5)
        #To create the layers we need to start with input sizes
        n_cont=cont_cols.shape[1]
        #We need to find the total number of columns in the data. 
        #cat_cols currently only has 4 columns which post embedding will go to 23 
        n_in=sum(nf for ni,nf in self.embed_size_list)+n_cont
#         print('n_in',n_in)
        self.layers=[]
        for l in self.layer_count:
            self.layers.append(nn.Linear(n_in,l))
            self.layers.append(nn.ReLU(inplace=True))
            self.layers.append(nn.Dropout(p))
            self.layers.append(nn.BatchNorm1d(l))            
            n_in=l
        self.layers.append(nn.Linear(self.layer_count[-1],self.output_features))
#         print(self.layers)
        self.final_layers=nn.Sequential(*self.layers)
#         print(self.final_layers)
        
    def forward(self,cat_cols, cont_cols):
        '''
        1. Create the embedding for cat and create one final input value for the forward path (only one time this gets created)
        2. Create the dropout for this input layer
        3. Create a batch norm for this layer\
        4. Pass this into the all the layers moving forward (starting with the first linear layer)
        '''
        #Creating the embedding for categorical columns
        embeds=[]
        for i,e in enumerate(self.embeddings):
#             print('e' ,e)
            embeds.append(e(cat_cols[:,i]))
        cat_final=torch.cat(embeds,axis=1)
#         print(cat_final.shape) #17 columns cos 12 for hours, 2 for AM/PM and 4 for days post embedding
#         print(cont_cols.shape) # 6 columns one for each of the 6 features
        
        #Batch normalize the continuous variables first
        cont_cols=self.batch_norm(cont_cols)
        cat_final=self.dropout(cat_final)
        self.X=torch.cat((cat_final, cont_cols),axis=1)
#         print('X shape', self.X.shape)
#         print('X dtype',self.X.dtype)
#         print('X class',type(self.X))
        #Dropout for the complete data set
        #self.X=self.dropout(self.X)
        self.X=self.final_layers(self.X)
        return self.X
            

        #Creating final data set with cat and cols

In [3]:
samp_model=torch.load('uber_model.pkl')
samp_model.eval()

Model(
  (batch_norm): BatchNorm1d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (final_layers): Sequential(
    (0): Linear(in_features=23, out_features=200, bias=True)
    (1): ReLU(inplace=True)
    (2): Dropout(p=0.4, inplace=False)
    (3): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): ReLU(inplace=True)
    (6): Dropout(p=0.4, inplace=False)
    (7): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): Linear(in_features=100, out_features=1, bias=True)
  )
)

In [4]:
#Creating the input data
#pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
samp_pickup_datetime='2010-04-15 16:00:00'
samp_pickup_longitude=-73.9
samp_pickup_latitude=40.5
samp_dropoff_longitude=-73.92
samp_dropoff_latitude=40.52
samp_passenger_count=2


In [5]:
def haversine_distance(df, lat1, long1, lat2, long2):
    """
    Calculates the haversine distance between 2 sets of GPS coordinates in df
    """
    r = 6371  # average radius of Earth in kilometers
       
    phi1 = np.radians(df[lat1])
    phi2 = np.radians(df[lat2])
    
    delta_phi = np.radians(df[lat2]-df[lat1])
    delta_lambda = np.radians(df[long2]-df[long1])
     
    a = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = (r * c) # in kilometers
    return d


In [6]:
#convert date time as before
dict1={'samp_pickup_datetime':'2010-04-15 16:00:00','samp_pickup_longitude':-73.9,'samp_pickup_latitude':40.5,
       'samp_dropoff_longitude':-73.92,'samp_dropoff_latitude':40.52,'samp_passenger_count':2
      }
samp_df=pd.DataFrame(dict1,columns=dict1.keys(),index=[0])

In [7]:
samp_df['samp_pickup_datetime']=pd.to_datetime(samp_df['samp_pickup_datetime'])
samp_df['distance']=haversine_distance(samp_df,'samp_pickup_latitude', 'samp_pickup_longitude', 'samp_dropoff_latitude', 'samp_dropoff_longitude')
samp_df['hours']=samp_df['samp_pickup_datetime'].dt.hour
samp_df['AM_PM']=np.where(samp_df['hours']<12,"AM","PM")
samp_df['weekday']=samp_df['samp_pickup_datetime'].dt.strftime('%a')

In [8]:
cats=['hours','weekday','AM_PM']
for cat in cats:
    samp_df[cat]=samp_df[cat].astype('category')
cat_cols=torch.tensor(np.stack([samp_df[cat].cat.codes for cat in cats],axis=1),dtype=torch.long)

In [9]:
conts=['samp_pickup_longitude',
       'samp_pickup_latitude', 'samp_dropoff_longitude', 'samp_dropoff_latitude',
       'samp_passenger_count', 'distance']
cont_cols=torch.tensor(np.stack([samp_df[col].values for col in conts],axis=1),dtype=torch.float)

In [10]:
with torch.no_grad():
    y=samp_model.forward(cat_cols,cont_cols)
print('Predicted fare is {:4.4g} '.format(y.item()))

Predicted fare is 17.3 
