In [55]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

In [56]:
df=pd.read_csv('../input/taxi-data11/NYCTaxiFares.csv')

In [57]:
df.head()

In [58]:
df.info()

In [59]:
df['fare_amount'].describe()

In [60]:
# Computes the distance travelled on each trip (haversine formula)
#https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points

def haversine(df, lat1,long1,lat2, long2):
    """
    Calculate the great circle distance in kilometers between two points
    on the earth (specified in decimal degrees).

    Parameters
    ----------
    df : pandas.DataFrame (or any mapping of column label -> array-like)
        Table holding the coordinate columns.
    lat1, long1 : column labels
        Latitude / longitude of the starting point.
    lat2, long2 : column labels
        Latitude / longitude of the end point.

    Returns
    -------
    Element-wise distances in kilometers, in whatever container the column
    arithmetic produces (a pandas Series for DataFrame input).
    """
    r = 6371  # Earth radius in kilometers used by the formula

    # convert decimal degrees to radians
    phi1 = np.radians(df[lat1])
    phi2 = np.radians(df[lat2])
    delta_phi = np.radians(df[lat2] - df[lat1])
    delta_lambda = np.radians(df[long2] - df[long1])

    # haversine formula
    a = np.sin(delta_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would turn sqrt(1 - a) into NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return r * c

In [61]:
df.columns

In [62]:
df['dist_km']=haversine(df,'pickup_latitude','pickup_longitude', 'dropoff_latitude','dropoff_longitude')

In [63]:
df.head()

In [64]:
df.info()

In [65]:
df['pickup_datetime']=pd.to_datetime(df.pickup_datetime)
df['pickup_datetime'][0]

In [66]:
df.head()

In [67]:
# UTC is 4 hours ahead of New York while daylight saving time (EDT) is in effect.
#https://24timezones.com/fr/difference/utc/new_york
# NOTE(review): a fixed 4-hour shift is only right for EDT dates; pickups that
# fall under EST would need 5 hours — confirm the date range of the data.
df['EDTdate']=df['pickup_datetime']-pd.Timedelta(hours=4)

In [68]:
df['Hour']=df.EDTdate.dt.hour

In [69]:
# Hour 12 (noon to 12:59) belongs to the afternoon, so only hours 0-11 are 'am'.
df['AM_PM']=np.where(df['Hour']<12,'am','pm')

In [70]:
df.head()

In [71]:
df['Weekday']=df.EDTdate.dt.strftime("%a")

In [72]:
df.head()

In [73]:
df.select_dtypes(exclude='object').columns

In [74]:
# Split the features into categorical and continuous groups. Hour, AM/PM and
# weekday are treated as categorical so that each gets its own embedding.
cats_cols=['Hour','AM_PM','Weekday']
# 'fare_class' is intentionally not a model input.
# NOTE(review): it looks like a label derived from fare_amount (the target), so
# feeding it to the model would leak the answer — confirm against the dataset.
count_cols=['pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'dist_km']
y_cols=['fare_amount']

In [75]:
# Pick the fare column before aggregating: averaging every column raises on the
# string columns under pandas >= 2.0 and does needless work on older versions.
df.groupby('Weekday')['fare_amount'].mean().plot()

In [76]:
# Pick the fare column before aggregating: averaging every column raises on the
# string columns under pandas >= 2.0 and does needless work on older versions.
df.groupby(['Hour','AM_PM'])['fare_amount'].mean().plot(kind='bar')

In [77]:
df.info()

In [78]:
# Convert the categorical feature columns to the pandas 'category' dtype
df[cats_cols]=df[cats_cols].astype('category')

In [79]:
df.info()

In [80]:
df['AM_PM'].cat.codes

In [81]:
# Pull each categorical column's integer codes out as a NumPy array so they can
# be stacked and turned into a tensor afterwards.
hr, ampm, wkdy = (df[name].cat.codes.values for name in ('Hour', 'AM_PM', 'Weekday'))

In [82]:
cats=np.stack([hr,ampm,wkdy],axis=1)

In [83]:
cats=torch.tensor(cats,dtype=torch.int64)

In [84]:
conts=np.stack([df[col].values for col in count_cols],axis=1)


In [85]:
conts=torch.tensor(conts,dtype=torch.float)

In [86]:
conts

In [87]:
y=torch.tensor(df[y_cols].values,dtype=torch.float)

In [88]:
cats.shape,y.shape,conts.shape

In [89]:
cat_size=[len(df[col].cat.categories) for col in cats_cols]

In [90]:
cat_size

In [91]:
emb_szs=[(size,min(50,(size+1)//2)) for size in cat_size]

In [92]:
emb_szs

In [93]:
catz=cats[:2]
catz

In [94]:
selfembeds=nn.ModuleList([nn.Embedding(ni,nf) for ni, nf in emb_szs])

In [95]:
selfembeds

In [96]:
# Look up every categorical column of the sample rows in its own embedding table.
embeddinggz = [emb(catz[:, col]) for col, emb in enumerate(selfembeds)]

In [97]:
embeddinggz

In [98]:
z=torch.cat(embeddinggz,1)

In [99]:
selfembdrop=nn.Dropout(0.4)
z=selfembdrop(z)

In [100]:
z

In [101]:
class TabularModel(nn.Module):
    """Feed-forward network for tabular data with categorical and continuous inputs.

    Every categorical column is looked up in its own embedding table. The
    embeddings are concatenated, passed through dropout, joined with the
    batch-normalised continuous features and fed through a stack of
    Linear -> ReLU -> BatchNorm1d -> Dropout blocks followed by a final Linear.

    Args:
        emb_szs: sequence of (number of categories, embedding width) pairs,
            one per categorical column, in column order.
        n_cont: number of continuous feature columns.
        out_sz: width of the output layer.
        layers: hidden layer widths, e.g. [100, 50, 25]. May be empty, in
            which case the inputs feed the output layer directly.
        p: dropout probability for the embeddings and for every hidden block.
    """

    def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.5):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
        self.emb_drop = nn.Dropout(p)
        self.bn_cont = nn.BatchNorm1d(n_cont)

        # Width of the first Linear layer: all embedding widths plus the continuous columns.
        n_in = sum(nf for _, nf in emb_szs) + n_cont

        layerlist = []
        for width in layers:
            layerlist += [
                nn.Linear(n_in, width),
                nn.ReLU(inplace=True),
                nn.BatchNorm1d(width),
                nn.Dropout(p),
            ]
            n_in = width
        # n_in is layers[-1] when hidden layers exist and the raw input width
        # otherwise, so an empty `layers` no longer raises IndexError.
        layerlist.append(nn.Linear(n_in, out_sz))

        self.layers = nn.Sequential(*layerlist)

    def forward(self, x_cat, x_cont):
        """Run a batch through the network.

        Args:
            x_cat: integer tensor indexed as x_cat[:, i], column i holding the
                category codes for the i-th embedding table.
            x_cont: tensor of continuous features with n_cont columns.

        Returns:
            The output of the final Linear layer.
        """
        embedded = [emb(x_cat[:, col]) for col, emb in enumerate(self.embeds)]
        x = self.emb_drop(torch.cat(embedded, 1))  # categorical embeddings

        x_cont = self.bn_cont(x_cont)              # continuous features
        x = torch.cat([x, x_cont], 1)              # join both feature groups
        return self.layers(x)

In [102]:
# Stand-alone walk-through of how the hidden stack is assembled:
# one Linear -> ReLU -> BatchNorm1d -> Dropout group per requested width.
p = 0.5
layers = [100, 50, 25]
n_in = 200

layerlist = []
for width in layers:
    layerlist.extend([
        nn.Linear(n_in, width),
        nn.ReLU(inplace=True),
        nn.BatchNorm1d(width),
        nn.Dropout(p),
    ])
    n_in = width  # the next group consumes this group's output

print(layerlist)
nn.Sequential(*layerlist)

In [103]:
torch.manual_seed(33)
model=TabularModel(emb_szs,conts.shape[1],1,[200,100],p=0.5)

In [104]:
model

In [105]:
criterion=nn.MSELoss()
optimizer=torch.optim.Adam(model.parameters(),lr=0.001)

In [109]:
# NOTE(review): the following cell assigns batch_size and test_size again with
# the same values, so this cell is redundant.
batch_size=70000
test_size=int(batch_size*0.2)

In [110]:
# Work on the first `batch_size` rows only: the leading 80% become the
# training set and the remaining 20% the test set.
batch_size = 70000
test_size = int(batch_size * .2)
split_at = batch_size - test_size

# No shuffling here — the rows are taken to be in random order already.
cat_train, cat_test = cats[:split_at], cats[split_at:batch_size]
con_train, con_test = conts[:split_at], conts[split_at:batch_size]
y_train, y_test = y[:split_at], y[split_at:batch_size]

In [111]:
import time

start_time = time.time()
epochs = 200

losses = []

# Make sure dropout / batch-norm are in training mode, even if an evaluation
# cell switched the model to eval mode before this cell is re-run.
model.train()

for i in range(1, epochs + 1):
    y_pred = model(cat_train, con_train)
    loss = torch.sqrt(criterion(y_pred, y_train))  # RMSE over the training rows
    # Store a detached copy: keeping the live tensor would keep every epoch's
    # autograd history reachable for as long as `losses` exists.
    losses.append(loss.detach())

    # a neat trick to save screen space: report only every 25th epoch
    if i % 25 == 1:
        print(f'epoch: {i:3}  loss: {loss.item():10.8f}')

    optimizer.zero_grad()   # reset gradients to 0
    loss.backward()
    optimizer.step()

print(f'epoch: {i:3}  loss: {loss.item():10.8f}') # print the last line
print(f'\nDuration: {time.time() - start_time:.0f} seconds') # print the time elapsed

In [122]:
# Turn the recorded loss tensors into plain floats and draw the training curve.
fi_los = [epoch_loss.item() for epoch_loss in losses]
plt.plot(range(epochs), fi_los)
plt.xlabel('Epoch')
plt.ylabel('RMSE Loss');

In [125]:
# Evaluate on the held-out rows. eval() switches dropout off and makes
# batch-norm use its running statistics; without it the test predictions are
# still randomly perturbed and depend on the statistics of the test batch.
model.eval()
with torch.no_grad():
    y_val = model(cat_test, con_test)
    loss = torch.sqrt(criterion(y_val, y_test))
print(f'RMSE: {loss.item():.8f}')

In [128]:
# Show the first ten test predictions next to the true fares.
print(f'{"PREDICTED":>12} {"ACTUAL":>8} {"DIFF":>8}')
for i in range(10):
    predicted = y_val[i].item()
    actual = y_test[i].item()
    diff = np.abs(predicted - actual)
    print(f'{i+1:2}. {predicted:8.4f} {actual:8.4f} {diff:8.4f}')

In [129]:
torch.save(model.state_dict(),'taxi.pt')