
**A deep neural network model to predict New York city taxi fare price**



In [None]:
# Initial Python environment setup...
import numpy as np # linear algebra
import pandas as pd # CSV file I/O (e.g. pd.read_csv)
import keras 
import matplotlib.pyplot as plt 
import tensorflow
from tensorflow import keras 
from keras.layers import Dense, Activation
from sklearn.model_selection import train_test_split
from keras.optimizers import RMSprop, Adagrad
import seaborn as sns
import os # reading the input files we have access to
print(os.listdir('../input'))

Section I - Importing the data into the environment. 

The training data set contains 55M rows, while the test data contains 10K rows. Importing such a large volume of data crashes the kernel, so the model presented here is trained on only the first 10M rows of the training file. It is assumed that this subset is representative of the entire population. It is reasonable to expect that the accuracy of the model could be improved further by training on all of the available data.

In [None]:
# Read only the leading 10M rows of the training file; the test file is read in full.
train = pd.read_csv('../input/train.csv', nrows=10_000_000)
test = pd.read_csv('../input/test.csv')
# Show the dtype pandas inferred for every training column.
train.dtypes

Section II - Processing the data
Any data set might contain missing values and those should be identified.  

In [None]:
# Report how many values are missing in each column, then discard every row
# that has at least one missing value.
print(train.isna().sum())
train = train.dropna(axis=0, how='any')

# Descriptive statistics (mean, min, max, ...) to help spot outliers.
train.describe()


In [None]:
# Passenger count: the raw maximum is 208, which is not physically plausible.
# Keep only rides carrying between 1 and 7 passengers (both bounds inclusive).
#
# Fare amount: negative fares are invalid and are removed. Very high fares can
# be legitimate (long waiting times, expensive cars), so the upper bound is a
# generous 250 rather than a tight cap.

train = train[train['fare_amount'].between(left=0, right=250)]
train = train[train['passenger_count'].between(left=1, right=7)]

# Only the last expression of a cell is displayed, so summarise once, after
# both filters have been applied.
train.describe()


Distance traveled in a ride should always be greater than zero. Eliminate all rides for which it is not.

In [None]:
# Absolute pickup-to-dropoff offset in latitude and in longitude.
train['abs_lat_diff'] = train['dropoff_latitude'].sub(train['pickup_latitude']).abs()
train['abs_lon_diff'] = train['dropoff_longitude'].sub(train['pickup_longitude']).abs()

In [None]:
print('Old size: %d' % len(train))
# A ride has moved if EITHER coordinate changed. Requiring both to change
# would also discard rides that run exactly along a parallel or a meridian,
# even though their distance is greater than zero.
moved = (train["abs_lat_diff"] > 0) | (train["abs_lon_diff"] > 0)
# Offsets of 5 degrees or more in either coordinate are treated as bad data.
plausible = (train["abs_lat_diff"] < 5) & (train["abs_lon_diff"] < 5)
train = train[moved & plausible]
print('New size: %d' % len(train))
                    

The plot above suggests that there might be a few rides where the pickup and drop-off locations are the same. Eliminate them.

Feature extraction


In [None]:
R = 6378  # sphere radius in kilometres used to convert the angle to a distance

def haversine_np(lon1, lat1, lon2, lat2, radius=R):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Arguments may be scalars or array-likes (e.g. pandas Series); array
    arguments must be of equal length.

    lon1, lat1 : longitude / latitude of the first point(s), in degrees
    lon2, lat2 : longitude / latitude of the second point(s), in degrees
    radius     : sphere radius in kilometres (defaults to the module-level R)

    Returns the distance in kilometres, computed elementwise.

    source: https://stackoverflow.com/a/29546836

    """
    # Convert latitude and longitude to radians
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # Find the differences
    dlon = lon2 - lon1
    dlat = lat2 - lat1

    # Apply the formula
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    # Floating-point rounding can push `a` marginally outside [0, 1], which
    # would make sqrt/arcsin return NaN; clamp it to the valid domain first.
    a = np.clip(a, 0.0, 1.0)
    # Calculate the angle (in radians)
    c = 2 * np.arcsin(np.sqrt(a))
    # Convert to kilometers
    return radius * c

train.head(6)

In [None]:
# Great-circle distance between pickup and dropoff for every training ride.
train['haversine'] = haversine_np(
    lon1=train['pickup_longitude'],
    lat1=train['pickup_latitude'],
    lon2=train['dropoff_longitude'],
    lat2=train['dropoff_latitude'],
)


In [None]:
train.head(6)

In [None]:
#train=train.drop("abs_lat_diff",1)
#train=train.drop("abs_lon_diff",1)
#train.head(6)


In [None]:



def add_time_features(df):
    """
    Derive calendar features from the 'pickup_datetime' column.

    The column is parsed with the format '%Y-%m-%d %H:%M:%S %Z' and the
    integer columns 'year', 'month', 'day', 'hour' and 'weekday'
    (Monday == 0) are added.

    NOTE: `df` itself is modified in place -- it receives the five new
    columns and its 'pickup_datetime' column is replaced by the parsed
    datetimes. The returned frame is a copy of `df` without
    'pickup_datetime'.
    """
    # Parse once and reuse the result; the vectorised .dt accessors avoid
    # calling a Python lambda on every single row.
    pickup = pd.to_datetime(df['pickup_datetime'], format='%Y-%m-%d %H:%M:%S %Z')
    df['pickup_datetime'] = pickup
    df['year'] = pickup.dt.year
    df['month'] = pickup.dt.month
    df['day'] = pickup.dt.day
    df['hour'] = pickup.dt.hour
    df['weekday'] = pickup.dt.weekday

    # The raw timestamp is not a model feature, so leave it out of the result.
    return df.drop('pickup_datetime', axis=1)

In [None]:

# add_time_features writes the new time columns into `train` itself. Its
# return value (the frame without 'pickup_datetime') is only displayed here,
# not assigned, so `train` still contains 'pickup_datetime' afterwards.
add_time_features(train)

In [None]:
# Drop the key column by name. A positional slice would silently remove the
# next column (fare_amount) if this cell were ever run a second time;
# errors='ignore' makes a re-run a no-op instead.
train = train.drop(columns=['key'], errors='ignore')

In [None]:
train.head(6)

In [None]:
# Fare amount against day of the week (0 = Monday ... 6 = Sunday).
plt.scatter(train["weekday"], train["fare_amount"])
plt.xlabel("weekday")
plt.ylabel("fare_amount")
plt.title("Fare amount by weekday")
plt.show()



In [None]:


# Pairwise correlations of the numeric columns only; non-numeric columns such
# as 'pickup_datetime' cannot be correlated and make DataFrame.corr() raise on
# recent pandas versions.
correlations = train.select_dtypes(include='number').corr()
plt.figure(figsize=(16, 16))
sns.heatmap(correlations, annot=True, fmt=".1f")
plt.show()

# Select the features and the target by name rather than by column position,
# so the selection does not depend on the exact column layout of `train`.
FEATURE_COLUMNS = ['abs_lat_diff', 'abs_lon_diff', 'haversine',
                   'year', 'month', 'day', 'hour', 'weekday']
X_train = train[FEATURE_COLUMNS]
Y_train = train['fare_amount']
Y_train.head(6)

In [None]:
X_train.head(6)

In [None]:
random_seed = 10
from tensorflow import keras
from keras.layers import Dense, Activation
from sklearn.model_selection import train_test_split
from keras.optimizers import RMSprop, SGD

# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train,
    Y_train,
    test_size=0.1,
    random_state=random_seed,
)

In [None]:
from keras import Sequential
from keras.layers import BatchNormalization, Dropout

# Fully connected regression network: five 64-unit and two 32-unit ReLU
# layers, batch normalisation after the second layer, and 50% dropout after
# every hidden layer from the second one onwards.
model = Sequential()
# Take the input width from the data instead of hard-coding it.
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
for units in (64, 64, 64, 32, 32):
    model.add(Dense(units, activation='relu'))
    model.add(Dropout(0.5))
# Linear output for regression: with a ReLU on the single output unit, a
# negative pre-activation gives a zero gradient and the network can get stuck
# predicting 0 for every ride.
model.add(Dense(1))



In [None]:
# This is a regression problem: report mean absolute error alongside the MSE
# loss. "accuracy" is a classification metric and carries no information for
# a continuous fare.
model.compile(optimizer='nadam', loss="mean_squared_error", metrics=["mae"])

In [None]:
# Train for 4 epochs in mini-batches of 512, passing the held-out split as validation data.
model.fit(
    X_train,
    Y_train,
    epochs=4,
    batch_size=512,
    validation_data=(X_val, Y_val),
)

In [None]:
# Reload the raw test file so the feature cells below start from unmodified data.
test = pd.read_csv('../input/test.csv')
test.head(6)

In [None]:
# The coordinate offsets must come from the test rows themselves. Computing
# them from `train` would align unrelated training rows onto `test` by index
# label (and leave NaN wherever a label is missing from `train`).
test['abs_lat_diff'] = (test['dropoff_latitude'] - test['pickup_latitude']).abs()
test['abs_lon_diff'] = (test['dropoff_longitude'] - test['pickup_longitude']).abs()

In [None]:
# Great-circle distance between pickup and dropoff for every test ride.
test['haversine'] = haversine_np(
    lon1=test['pickup_longitude'],
    lat1=test['pickup_latitude'],
    lon2=test['dropoff_longitude'],
    lat2=test['dropoff_latitude'],
)


In [None]:
# add_time_features writes the new time columns into `test` itself. Its return
# value is only displayed here, not assigned, so `test` still contains
# 'pickup_datetime' afterwards.
add_time_features(test)

In [None]:

test.head(6)

In [None]:
# Keep exactly the eight feature columns the model was trained on, selected by
# name so that re-running this cell cannot slice away further columns.
test = test[['abs_lat_diff', 'abs_lon_diff', 'haversine',
             'year', 'month', 'day', 'hour', 'weekday']]


In [None]:
test.head(6)
test.isnull().sum()

In [None]:
# Predicted fare for every row of the prepared test features.
Submissions = model.predict(test)

In [None]:
# Use the sample submission file as the template for the output.
submission = pd.read_csv('../input/sample_submission.csv')

In [None]:
# Store the model's predictions in the 'fare_amount' column.
submission['fare_amount'] = Submissions

In [None]:
# Write the submission without the index column, then preview the first ten rows.
submission.to_csv('submission_3.csv', index=False)
submission.head(10)