In [1]:
#!conda install seaborn

In [2]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns

ModuleNotFoundError: No module named 'torch'

### To-do
&nbsp;&nbsp; 1. Pull in the data (This includes distance calculation) <br>
&nbsp;&nbsp; 2. Run around with the categorical data and extract data from timestamped details <br>
&nbsp;&nbsp; 3. Do basic exploratory analysis <br>
&nbsp;&nbsp; 4. Create categorical data matrix and continuous data matrix <br>
&nbsp;&nbsp; 5. Create tensors and embedding sizes for categorical data <br>
&nbsp;&nbsp; 6. Create model, optimizer and criterion <br>
&nbsp;&nbsp; 7. Build train model <br>
&nbsp;&nbsp; 8. Run test <br>
&nbsp;&nbsp; 9. Save model, load model <br>
&nbsp;&nbsp;10. Create program to directly run data transformation and through model

#### 1. Pull in the data

In [None]:
df=pd.read_csv('..//Data//NYCTaxiFares.csv')

In [None]:
df.head()

In [None]:
def haversine_distance(df, lat1, long1, lat2, long2):
    """
    Great-circle (haversine) distance, in kilometers, between two GPS points per row of df.

    lat1/long1 and lat2/long2 are the names of the df columns holding the latitude and
    longitude (in degrees) of the first and second point respectively.
    Returns the row-wise distances (same index as df).
    """
    earth_radius_km = 6371  # average radius of Earth in kilometers

    start_lat = df[lat1]
    end_lat = df[lat2]
    start_lat_rad = np.radians(start_lat)
    end_lat_rad = np.radians(end_lat)

    # Half of the latitude / longitude differences, in radians
    half_dlat = np.radians(end_lat - start_lat) / 2
    half_dlong = np.radians(df[long2] - df[long1]) / 2

    # Haversine term and the central angle it corresponds to
    hav = np.sin(half_dlat)**2 + np.cos(start_lat_rad) * np.cos(end_lat_rad) * np.sin(half_dlong)**2
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))

    return earth_radius_km * central_angle

In [None]:
df['distance_km']=haversine_distance(df,'pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude')

In [None]:
df.head()

#### 2. Run around with the categorical data and extract data from timestamped details

In [None]:
df['pickup_datetime']=pd.to_datetime(df['pickup_datetime'])

In [None]:
df['pickup_datetime'][:5]

In [None]:
#Shift the pickup timestamps back by a fixed 4 hours to approximate New York local time.
#NOTE(review): this presumably assumes pickup_datetime is in UTC and that NYC is on daylight time (UTC-4),
#so despite its name the new column would hold EDT rather than EST - confirm against the data source.
df['date_timeEST']=df['pickup_datetime']-pd.Timedelta(hours=4)

In [None]:
#Creating various columns that are extracted data from the timestamp
df['hours']=df['date_timeEST'].dt.hour
df['weekday']=df['date_timeEST'].dt.strftime('%a')
df['AM_PM']=np.where(df['hours']<12,"AM","PM")

In [None]:
df.head()

#### 3. Data Exploration

In [None]:
df_cont=df.copy()
df_cont.drop(['pickup_datetime','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','fare_class','hours','weekday','AM_PM'],axis=1,inplace=True)

In [None]:
#correlation of all significant continuous variables
#Restrict to numeric columns: df_cont still carries the datetime column date_timeEST,
#which recent pandas versions refuse to correlate instead of silently dropping it
df_cont.select_dtypes(include='number').corr()

In [None]:
#Distribution of data of all continuous variables
#One histogram per column of df_cont, labelled with the column it shows
for cont in df_cont.columns:
    plt.hist(df_cont[cont])
    plt.xlabel(cont)
    plt.ylabel('Frequency')
    plt.title(cont)
    plt.show()

In [None]:
df_cat=df.copy()
df_cat.drop(['fare_amount','passenger_count','distance_km'],axis=1,inplace=True)

In [None]:
#Distribution of data of all remaining (non-continuous) columns kept in df_cat
#One histogram per column of df_cat, labelled with the column it shows
for cat in df_cat.columns:
    plt.hist(df_cat[cat])
    plt.xlabel(cat)
    plt.ylabel('Frequency')
    plt.title(cat)
    plt.show()

#### 4. Create categorical data matrix and continuous data matrix

In [None]:
df.columns

In [None]:
#Creating an index of all categorical columns in index
cats=['hours','weekday','AM_PM']

In [None]:
#To use [category dtype].cat.codes, convert all non continuous objects to 'category' codes
for cat in cats:
    df[cat]=df[cat].astype('category')

In [None]:
df.dtypes

In [None]:
#Creating a matrix of all categorical columns of all int index values for each category 
#e.g. weekdays will have 7 codes (0-6), hours will have 24 codes (0-23)
cat_cols=np.stack([df[cat].cat.codes for cat in cats],axis=1)

In [None]:
#creating index of continuous columns
conts=['pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'distance_km']

In [None]:
cont_cols=np.stack([df[col].values for col in conts],axis=1)

In [None]:
label=df['fare_amount'].values

In [None]:
print('category shape',cat_cols.shape)
print('continuous shape',cont_cols.shape)

#### 5.Create tensors and embedding sizes for categorical data

In [None]:
cat_tensor=torch.tensor(cat_cols,dtype=torch.long)
cont_tensor=torch.tensor(cont_cols,dtype=torch.float)
label_tensor=torch.tensor(label,dtype=torch.float)
print('category shape',cat_tensor.shape)
print('continuous shape',cont_tensor.shape)

##### 5a. Creating the embedding object
&nbsp;&nbsp; 1.  Creating embedding sizes : 
We need to create, for each categorical column, an embedding matrix of dimensions [original no. of categories, embedding size]. If we used only one-hot encoding, each vector would have as many dimensions as the column has unique values, which can become very large. Hence, we squeeze each column into a smaller, more reasonable number of dimensions.

In [None]:
cat_len=[df[col].nunique() for col in cats]

In [None]:
cat_len

In [None]:
embed_size=[(cat,min(50,int((cat+1)//2))) for cat in cat_len]

In [None]:
embed_size

In [None]:
cat_embeddings = [nn.Embedding(initial,target) for initial,target in embed_size]

##### 5b. Trying to create an embedding matrix for the sample to see how it goes

In [None]:
cat_sample=torch.tensor(cat_cols[:4,:],dtype=torch.long)

In [None]:
print(cat_sample)

In [None]:
cat_embeddings

In [None]:
#Attempting only for the hours data - to convert a coded 24-dimension feature into an embedding
embed_hour=cat_embeddings[0]
embed_hour

In [None]:
#test sample of hours data
#cat_sample is already a long tensor, so slice and copy it instead of re-wrapping it with torch.tensor(),
#which emits a UserWarning when handed an existing tensor
test_hour=cat_sample[:,0].clone().detach()

In [None]:
test_hour

In [None]:
print(embed_hour(test_hour))
#We see that a 24 dimension variable now has only 12, which is a victory!!

In [None]:
#Now attempting the same for all features in cat_sample
embeddings_test=[]
for i, embed in enumerate(cat_embeddings):
    embeddings_test.append(embed(cat_sample[:,i]))

In [None]:
for i,e in enumerate(embeddings_test):
    print(cats[i])
    print(e.shape)
    print(e)

###### SUCCESS ON SAMPLE!!

#### 6.Create model, optimizer and criterion

In [None]:
cont_tensor.shape

In [None]:
class Model(nn.Module):
    '''
    Feed-forward network for tabular data: embedded categorical columns are concatenated with
    batch-normalised continuous columns and passed through a stack of fully connected layers.
    '''
    def __init__(self,cat_cols,cont_cols,layer_count,output_features,embed_size_list,p=0.5):
        '''
        args:
        cat_cols - tensor of all categorical codes (pre-embedding); not read by the constructor,
                   kept in the signature so existing calls keep working
        cont_cols - tensor of all continuous values; only its second dimension
                    (the number of continuous features) is read here
        layer_count - sequence with the number of nodes of each hidden layer (may be empty)
        output_features - number of outputs expected
        embed_size_list - list of (number of categories, embedding dimension) pairs,
                          one pair per categorical column
        p - dropout probability, used both for the embedded categorical input and
            after every hidden layer

        Approach: the constructor creates all layers (Linear, ReLU, Dropout and BatchNorm for each
        hidden layer listed in layer_count, plus the output Linear) and wraps them in nn.Sequential.
        forward() embeds the categorical columns, normalises the continuous ones and runs the stack.
        '''
        super().__init__()
        self.output_features=output_features
        self.layer_count=layer_count
        self.embed_size_list=embed_size_list
        #nn.ModuleList (rather than a plain list) registers every embedding as a sub-module, so its
        #weights show up in model.parameters() (and get optimised) and in state_dict() (and get saved)
        self.embeddings=nn.ModuleList(
            [nn.Embedding(base_dim, target_dim) for base_dim, target_dim in embed_size_list]
        )
        n_cont=cont_cols.shape[1]
        self.batch_norm=nn.BatchNorm1d(n_cont)
        #Honour the p argument instead of a fixed 0.5
        self.dropout=nn.Dropout(p)
        #Width of the first linear layer = all embedding dimensions + continuous features,
        #e.g. 3 categorical columns embedded into 12+4+1=17 columns plus 6 continuous columns gives 23
        n_in=sum(target_dim for _,target_dim in self.embed_size_list)+n_cont
        self.layers=[]
        for n_out in self.layer_count:
            self.layers.append(nn.Linear(n_in,n_out))
            self.layers.append(nn.ReLU(inplace=True))
            self.layers.append(nn.Dropout(p))
            self.layers.append(nn.BatchNorm1d(n_out))
            n_in=n_out
        #n_in now holds the width of the last hidden layer (or of the input when layer_count is empty)
        self.layers.append(nn.Linear(n_in,self.output_features))
        self.final_layers=nn.Sequential(*self.layers)

    def forward(self,cat_cols, cont_cols):
        '''
        1. Embed every categorical column (column i goes through embedding i) and concatenate the results
        2. Batch normalise the continuous columns
        3. Apply dropout to the embedded categorical block
        4. Concatenate both blocks and pass them through all the layers (starting with the first linear layer)

        Returns the output of the final linear layer, one row per input row.
        '''
        #Creating the embedding for categorical columns
        embeds=[embed(cat_cols[:,i]) for i,embed in enumerate(self.embeddings)]
        #e.g. 17 columns: 12 for hours, 4 for weekdays and 1 for AM/PM post embedding
        cat_final=torch.cat(embeds,dim=1)

        #Batch normalize the continuous variables first
        cont_cols=self.batch_norm(cont_cols)
        cat_final=self.dropout(cat_final)
        combined=torch.cat((cat_final, cont_cols),dim=1)
        #The result is also kept on self.X, as before, for any code that inspects it after a forward pass
        self.X=self.final_layers(combined)
        return self.X
            

        #Creating final data set with cat and cols
          
        

In [None]:
torch.manual_seed(33)
model = Model(cat_tensor,cont_tensor,layer_count=[200,100],output_features=1,embed_size_list=embed_size,p=0.4)

In [None]:
#Testing if forward() works: smoke-test on a handful of rows only, without tracking gradients.
#eval() keeps this check from updating the batch-norm running statistics (which would otherwise
#see every row, including the ones later used for testing); train() restores the default mode.
model.eval()
with torch.no_grad():
    z = model(cat_tensor[:5],cont_tensor[:5])
model.train()
print(z)

In [None]:
criterion=nn.MSELoss()

In [None]:
optimizer=torch.optim.Adam(model.parameters(),lr=0.001)

#### 7.Create train, test split

The Kaggle data set says that the data is already shuffled, so we can split it by position. If the data were not shuffled, we would need to shuffle it before doing the train/test split.

In [None]:
label.shape

In [None]:
#Positional split: the first 80% of the rows are used for training, the last 20% for testing
n_samples=df.shape[0]
test_size=int(0.2*n_samples)
train_size=n_samples-test_size
print('test size',test_size)
print('train size',train_size)
#Slice at train_size so the training set really is the larger (80%) part, matching the sizes printed above
X_train_cont=cont_tensor[:train_size,:]
X_train_cat=cat_tensor[:train_size,:]
X_test_cont=cont_tensor[train_size:,:]
X_test_cat=cat_tensor[train_size:,:]
#Labels are reshaped to a column so they line up with the model's (rows, 1) output
label_train=label_tensor[:train_size].reshape(-1,1)
label_test=label_tensor[train_size:].reshape(-1,1)
print('X_train_cont',X_train_cont.shape)
print('X_train_cat',X_train_cat.shape)
print('X_test_cont',X_test_cont.shape)
print('X_test_cat',X_test_cat.shape)
print('label_train',label_train.shape)
print('label_test',label_test.shape)

#### 8. Building the train model

In [None]:
epochs=300
#Per-epoch training loss, stored as plain Python floats
losses3=[]
#Dropout and batch norm must be in training mode while fitting
model.train()
for e in range(1,epochs+1):
    #Full-batch forward pass over the training set
    y_pred=model(X_train_cat,X_train_cont)
    #RMSE: square root of the MSE criterion
    loss=torch.sqrt(criterion(y_pred,label_train))
    #.item() detaches the value from the autograd graph, so the graph of every epoch
    #is not kept alive by the list and the history can be plotted directly
    losses3.append(loss.item())
    if e%10==0:
        print(f' Loss at epoch {e} is {loss.item()}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    

In [None]:
#Plot the loss history recorded by the training loop (losses3 is the only history this notebook produces).
#float() accepts both plain numbers and single-element tensors, so the plot works either way.
loss_history=[float(loss_value) for loss_value in losses3]
plt.plot(range(1,len(loss_history)+1),loss_history,'b')
plt.xlabel('Epochs')
plt.ylabel('Losses')
plt.title('Training loss (RMSE) per epoch for lr=0.001')
plt.show()


#### 9. Running on test data

In [None]:
#Switch to evaluation mode so dropout is disabled and batch norm uses its running statistics;
#no_grad() alone only turns off gradient tracking
model.eval()
with torch.no_grad():
    y_preds=model(X_test_cat,X_test_cont)

In [None]:
loss_test=criterion(y_preds, label_test)
loss_test**0.5

#### 10. Analysing the error spread

In [None]:
error=(y_preds-label_test).numpy()

In [None]:
#Summary statistics of the test-set prediction error (prediction minus label)
print(pd.DataFrame(error).describe())
print('Mean of error is {:8.4f}'.format(error.mean() ))
print('Std dev of error is {:8.4f}'.format(error.std()))
print('Range of error is {:8.4f}'.format(error.max()-error.min()))
#Largest error and the sample at which it occurs
print('Max of error is {:8.4f} at sample {}'.format(error.max(), (error.argmax())))
#Smallest (most negative) error and the sample at which it occurs
print('Min of error is {:8.4f} at sample {}'.format(error.min(), (error.argmin())))
plt.hist(error)
plt.title('Distribution of the error variable for test case')
plt.show()

In [None]:
one_std_dev_p=error.mean()+error.std()
one_std_dev_n=error.mean()-error.std()
ranges=((error<=one_std_dev_p) & (error>=one_std_dev_n)).sum()

In [None]:
print('% of error within one standard deviation on either side is: {:8.3f}%'.format(ranges/len(error)*100))

#### 11. Saving the model

In [None]:
#Only save once the training loop has run for the full number of epochs;
#losses3 is the loss history filled by that loop
if len(losses3)==epochs:
    #Weights only: the recommended, portable format (needs the Model class to rebuild the network)
    torch.save(model.state_dict(),'uber_model_weights.pt')
    #Whole pickled model: loading it executes pickle, so only load this file from a trusted source
    torch.save(model,'uber_model.pkl')
else:
    print("You haven't trained this model! Only trained models should be saved!")

#### 12. Create program to directly run data transformation and through model

<b> Refer Nikhil_Full_ANN_prediction_operation.ipynb </b>