In [1]:
import numpy as np
import pandas as pd
import os 

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Load the data

In [None]:
test_dataset =  pd.read_csv('/content/drive/MyDrive/Newyork taxi fair prediction challenge/test.csv')
train_dataset =  pd.read_csv('/content/drive/MyDrive/Newyork taxi fair prediction challenge/train.csv', nrows= 20_000_000)
print(test_dataset.shape)
print(train_dataset.shape)

# Data preprocessing

## Handling missing values and outliers

In [None]:
# There are only 139 NaN in training data and no missing values in test data, so we are dropping rows from training data
train_dataset = train_dataset.drop(train_dataset[train_dataset.isnull().any(1)].index, axis = 0)

# There are 832 negative values in "fare_amount" column and we are removing those
train_dataset = train_dataset.drop(train_dataset[train_dataset['fare_amount'] < 0].index, axis=0)

# In training dataset there are 15 cases where "passenger_count" is 208 which doesn't look right, so removing, in test there is none
train_dataset = train_dataset.drop(train_dataset[train_dataset['passenger_count']==208].index, axis = 0)

# As latitude ranges between -90 and +90, so we are removing values outside this range(outliers)
train_dataset = train_dataset.drop(train_dataset[train_dataset['pickup_latitude'] < -90].index, axis=0)
train_dataset = train_dataset.drop(train_dataset[train_dataset['pickup_latitude'] > 90].index, axis=0)

# Like latitude, longitude also ranges between -180 and +180, so removing the rest
train_dataset = train_dataset.drop(train_dataset[train_dataset['pickup_longitude'] < -180].index, axis=0)
train_dataset = train_dataset.drop(train_dataset[train_dataset['pickup_longitude'] > 180].index, axis=0)

# Same operations for "dropoff_latitude" and "dropoff_longitude"
train_dataset = train_dataset.drop(train_dataset[train_dataset['dropoff_latitude'] < -90].index, axis=0)
train_dataset = train_dataset.drop(train_dataset[train_dataset['dropoff_latitude'] > 90].index, axis=0)
train_dataset = train_dataset.drop(train_dataset[train_dataset['dropoff_longitude'] < -180].index, axis=0)
train_dataset = train_dataset.drop(train_dataset[train_dataset['dropoff_longitude'] > 180].index, axis=0)

# Change the "key" and "pickup_datetime" column datatype to date-time from object
train_dataset['key'] = pd.to_datetime(train_dataset['key'], infer_datetime_format=True)
train_dataset['pickup_datetime']  = pd.to_datetime(train_dataset['pickup_datetime'], infer_datetime_format=True)
test_dataset['key'] = pd.to_datetime(test_dataset['key'], infer_datetime_format=True)
test_dataset['pickup_datetime']  = pd.to_datetime(test_dataset['pickup_datetime'], infer_datetime_format=True)

print("Final shape for training and test data after data cleaning: {} and {}".format(train_dataset.shape, test_dataset.shape))

## Save the cleaned data to load/use later

In [2]:
#train_dataset.to_csv('/content/drive/MyDrive/Newyork taxi fair prediction challenge/train_dataset_20M_after_cleaning.csv')
#test_dataset.to_csv('/content/drive/MyDrive/Newyork taxi fair prediction challenge/test_dataset_20M_after_cleaning.csv')

test_dataset =  pd.read_csv('/content/drive/MyDrive/Newyork taxi fair prediction challenge/test_dataset_20M_after_cleaning.csv', index_col=[0])
train_dataset =  pd.read_csv('/content/drive/MyDrive/Newyork taxi fair prediction challenge/train_dataset_20M_after_cleaning.csv', nrows= 20_000_000, index_col=[0])
print(train_dataset.shape, test_dataset.shape)

train_dataset['key'] = pd.to_datetime(train_dataset['key'], infer_datetime_format=True)
train_dataset['pickup_datetime']  = pd.to_datetime(train_dataset['pickup_datetime'], infer_datetime_format=True)
test_dataset['key'] = pd.to_datetime(test_dataset['key'], infer_datetime_format=True)
test_dataset['pickup_datetime']  = pd.to_datetime(test_dataset['pickup_datetime'], infer_datetime_format=True)

(19998006, 8) (9914, 7)


## Feature engineering

1. Extract "Year", "Month", "Date", "Day of Week" and "Hour" columns from "pickup_datetime"

2. Create a "Distance" column from pickup and dropoff, latitude and longitude

In [3]:
# Extract "Year", "Month", "Date", "Day of Week" and "Hour" columns from "pickup_datetime"
train_dataset['Year'] = train_dataset['pickup_datetime'].dt.year
train_dataset['Month'] = train_dataset['pickup_datetime'].dt.month
train_dataset['Date'] = train_dataset['pickup_datetime'].dt.day
train_dataset['Day of Week'] = train_dataset['pickup_datetime'].dt.dayofweek
train_dataset['Hour'] = train_dataset['pickup_datetime'].dt.hour

test_dataset['Year'] = test_dataset['pickup_datetime'].dt.year
test_dataset['Month'] = test_dataset['pickup_datetime'].dt.month
test_dataset['Date'] = test_dataset['pickup_datetime'].dt.day
test_dataset['Day of Week'] = test_dataset['pickup_datetime'].dt.dayofweek
test_dataset['Hour'] = test_dataset['pickup_datetime'].dt.hour

# Remove "key" and "pickup_datetime" as we have captured this info above
train_dataset = train_dataset.drop(['key','pickup_datetime'], axis = 1)
test_dataset = test_dataset.drop(['key','pickup_datetime'], axis = 1)

# Creating a "Distance" column is a logical choice as it will help to determine the fare
# We will use Haversine distance to calculate distance from pickup and dropoff, latitude and longitude
def calculate_distance(lat1, lon1, lat2, lon2, dataframe):

    # Convert degrees to radians
    lon1_radians = np.radians(dataframe[lon1])
    lon2_radians = np.radians(dataframe[lon2])
    lat1_radians = np.radians(dataframe[lat1])
    lat2_radians = np.radians(dataframe[lat2])

    # Haversine formula
    dlon = lon2_radians - lon1_radians
    dlat = lat2_radians - lat1_radians
    a = np.sin(dlat / 2)**2 + np.cos(lat1_radians) * np.cos(lat2_radians) * np.sin(dlon / 2)**2
    c = 2 * np.arcsin(np.sqrt(a))

    # calculate the result
    dataframe["Distance"]= (c * 6371)

calculate_distance('pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude', train_dataset)
calculate_distance('pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude', test_dataset)

print(train_dataset.head())
print(test_dataset.head())

   fare_amount  pickup_longitude  pickup_latitude  dropoff_longitude  dropoff_latitude  passenger_count  Year  Month  Date  Day of Week  Hour  Distance
0          4.5        -73.844311        40.721319         -73.841610         40.712278                1  2009      6    15            0    17  1.030764
1         16.9        -74.016048        40.711303         -73.979268         40.782004                1  2010      1     5            1    16  8.450134
2          5.7        -73.982738        40.761270         -73.991242         40.750562                2  2011      8    18            3     0  1.389525
3          7.7        -73.987130        40.733143         -73.991567         40.758092                1  2012      4    21            5     4  2.799270
4          5.3        -73.968095        40.768008         -73.956655         40.783762                1  2010      3     9            1     7  1.999157
   pickup_longitude  pickup_latitude  dropoff_longitude  dropoff_latitude  passenger_cou

## Seperate training labels

In [4]:
X_train = train_dataset.iloc[:, train_dataset.columns != 'fare_amount']
y_train = train_dataset['fare_amount'].values
X_test = test_dataset

# Train and test different models

## Using simple Linear Regressor

In [None]:
from sklearn.linear_model import LinearRegression

# Initialise, train and test the model
regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Save the results
submission = pd.read_csv('/content/drive/MyDrive/Newyork taxi fair prediction challenge/sample_submission.csv')
submission['fare_amount'] = y_pred
submission.to_csv('/content/drive/MyDrive/Newyork taxi fair prediction challenge/submission_lr_1.csv', index=False)
submission.head(10)

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,12.779795
1,2015-01-27 13:08:24.0000003,12.777705
2,2011-10-08 11:53:44.0000002,11.239468
3,2012-12-01 21:12:12.0000002,11.804325
4,2012-12-01 21:12:12.0000003,11.808343
5,2012-12-01 21:12:12.0000005,11.805898
6,2011-10-06 12:10:20.0000001,11.197601
7,2011-10-06 12:10:20.0000003,11.208312
8,2011-10-06 12:10:20.0000002,11.196773
9,2014-02-18 15:22:20.0000002,12.172163


## Using Polynomial Regressor

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Initialise, train and test the model
polynomial_features = PolynomialFeatures(degree = 3)
X_train_poly = polynomial_features.fit_transform(X_train)
regressor = LinearRegression()
regressor.fit(X_train_poly, y_train)
y_pred = regressor.predict(polynomial_features.fit_transform(X_test))

# Save the results
submission = pd.read_csv('/content/drive/MyDrive/Newyork taxi fair prediction challenge/sample_submission.csv')
submission['fare_amount'] = y_pred
submission.to_csv('/content/drive/MyDrive/Newyork taxi fair prediction challenge/submission_plr_2.csv', index=False)
submission.head(10)

## Using Support Vector Regressor(SVR)

In [None]:
from sklearn.svm import SVR

# Initialise, train and test the model
regressor = SVR(kernel = 'rbf')
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Save the results
submission = pd.read_csv('/content/drive/MyDrive/Newyork taxi fair prediction challenge/sample_submission.csv')
submission['fare_amount'] = y_pred
submission.to_csv('/content/drive/MyDrive/Newyork taxi fair prediction challenge/submission_svr_1.csv', index=False)
submission.head(10)

## Using Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Initialise, train and test the model
regressor_1 = RandomForestRegressor(n_estimators = 200, random_state = 42, max_samples = 0.05, max_features = 0.6)
regressor_1.fit(X_train, y_train)
y_pred = regressor_1.predict(X_test)

# Save the results
submission = pd.read_csv('/content/drive/MyDrive/Newyork taxi fair prediction challenge/sample_submission.csv')
submission['fare_amount'] = y_pred
submission.to_csv('/content/drive/MyDrive/Newyork taxi fair prediction challenge/submission_rf_3.csv', index=False)
submission.head(10)

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,10.5175
1,2015-01-27 13:08:24.0000003,10.345
2,2011-10-08 11:53:44.0000002,4.442
3,2012-12-01 21:12:12.0000002,8.7985
4,2012-12-01 21:12:12.0000003,16.047
5,2012-12-01 21:12:12.0000005,11.152
6,2011-10-06 12:10:20.0000001,5.0305
7,2011-10-06 12:10:20.0000003,48.98905
8,2011-10-06 12:10:20.0000002,11.81605
9,2014-02-18 15:22:20.0000002,6.21


## Using XGBoost

In [None]:
import xgboost

# Initialise, train and test the model
regressor_2 = xgboost.XGBRegressor(objective = "reg:squarederror",
                  n_estimators = 100, 
                  seed = 42,
                  eta = 0.1,
                  eval_metric = "rmse",
                  max_depth = 7,
                  verbose = False)
regressor_2.fit(X_train, y_train, verbose = False)
y_pred = regressor_2.predict(X_test, num_iteration = regressor_2.best_iteration_)

# Save the results
submission = pd.read_csv('/content/drive/MyDrive/Newyork taxi fair prediction challenge/sample_submission.csv')
submission['fare_amount'] = y_pred
submission.to_csv('/content/drive/MyDrive/Newyork taxi fair prediction challenge/submission_xgboost_2.csv', index=False)
submission.head(10)



Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,10.249455
1,2015-01-27 13:08:24.0000003,10.932947
2,2011-10-08 11:53:44.0000002,4.674202
3,2012-12-01 21:12:12.0000002,8.953639
4,2012-12-01 21:12:12.0000003,15.907702
5,2012-12-01 21:12:12.0000005,11.079843
6,2011-10-06 12:10:20.0000001,5.152694
7,2011-10-06 12:10:20.0000003,48.422676
8,2011-10-06 12:10:20.0000002,11.859878
9,2014-02-18 15:22:20.0000002,6.789435


## Using LightGBM

In [None]:
import lightgbm

# Initialise, train and test the model
regressor_3 = lightgbm.LGBMRegressor(boosting_type = "gbdt", 
                                     learning_rate = 0.1, 
                                     n_estimators = 200, 
                                     objective = "regression", 
                                     subsample = 0.1,
                                     random_state = 42,
                                     metric = "rmse",
                                     verbose = 0)
regressor_3.fit(X_train, y_train, verbose = False)
y_pred = regressor_3.predict(X_test)

# Save the results
submission = pd.read_csv('/content/drive/MyDrive/Newyork taxi fair prediction challenge/sample_submission.csv')
submission['fare_amount'] = y_pred
submission.to_csv('/content/drive/MyDrive/Newyork taxi fair prediction challenge/submission_lgbm_3.csv', index=False)
submission.head(10)

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,10.690266
1,2015-01-27 13:08:24.0000003,11.026318
2,2011-10-08 11:53:44.0000002,4.953806
3,2012-12-01 21:12:12.0000002,8.370679
4,2012-12-01 21:12:12.0000003,15.599452
5,2012-12-01 21:12:12.0000005,10.860681
6,2011-10-06 12:10:20.0000001,5.247845
7,2011-10-06 12:10:20.0000003,49.53
8,2011-10-06 12:10:20.0000002,11.316202
9,2014-02-18 15:22:20.0000002,6.671705


## Using Neural Network

In [None]:
import tensorflow as tf
from google.colab import files
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras import regularizers

print("Tensorflow version: {}\n".format(tf.__version__))

# Define the model
regressor=Sequential()
regressor.add(Dense(units = X_train.shape[1], activation='relu', input_dim = X_train.shape[1]))
regressor.add(Dense(units = 20, activation='relu'))
regressor.add(Dense(units = 20, activation='relu'))
regressor.add(Dense(units=1))
print(regressor.summary())

# Specify other hyper-parameters then train and test
regressor.compile(optimizer = "Adam", loss='mean_squared_error', metrics=['accuracy'])
regressor.fit(X_train, y_train, validation_split=0.0005, batch_size=10000, epochs=50, shuffle=True)
y_pred = regressor.predict(X_test)

# Save the results
submission = pd.read_csv('/content/drive/MyDrive/Newyork taxi fair prediction challenge/sample_submission.csv')
submission['fare_amount'] = y_pred
submission.to_csv('/content/drive/MyDrive/Newyork taxi fair prediction challenge/submission_nn_1.csv', index=False)
submission.head(10)

Tensorflow version: 2.7.0

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 11)                132       
                                                                 
 dense_9 (Dense)             (None, 20)                240       
                                                                 
 dense_10 (Dense)            (None, 20)                420       
                                                                 
 dense_11 (Dense)            (None, 1)                 21        
                                                                 
Total params: 813
Trainable params: 813
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,8.517562
1,2015-01-27 13:08:24.0000003,8.704256
2,2011-10-08 11:53:44.0000002,6.043541
3,2012-12-01 21:12:12.0000002,8.083719
4,2012-12-01 21:12:12.0000003,15.7506
5,2012-12-01 21:12:12.0000005,10.362946
6,2011-10-06 12:10:20.0000001,6.165036
7,2011-10-06 12:10:20.0000003,54.22485
8,2011-10-06 12:10:20.0000002,11.427979
9,2014-02-18 15:22:20.0000002,6.350249
