In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import warnings
import xgboost as xgb
import matplotlib.pyplot as plt

# Notebook-wide display settings: use the FiveThirtyEight plot theme and
# silence every warning category for the rest of the session.
plt.style.use('fivethirtyeight')
warnings.filterwarnings(action='ignore')
%matplotlib inline

In [None]:
%%time
# Read the first 10 million rows of the training CSV, then print a summary of the frame.
TRAIN_CSV = '../input/new-york-city-taxi-fare-prediction/train.csv'
N_ROWS = 10_000_000
df = pd.read_csv(TRAIN_CSV, nrows=N_ROWS)
df.info()

In [None]:
# Preview the first rows of the loaded data.
df.head()

# Data Preprocessing

In [None]:
# Convert the pickup timestamps from text to datetimes. The explicit format
# (date, time, then a literal " UTC" suffix) is passed to the parser directly.
PICKUP_FORMAT = '%Y-%m-%d %H:%M:%S UTC'
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], format=PICKUP_FORMAT)

In [None]:
# Remove, in place, every row that has at least one missing value,
# then print the frame summary again to confirm the result.
df.dropna(axis='index', how='any', inplace=True)
df.info()

In [None]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

# Train test dataset

In [None]:
from sklearn.model_selection import train_test_split

# Predictor columns (the raw timestamp is carried along) and the regression target.
FEATURE_COLUMNS = [
    'pickup_datetime',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
]
features = df[FEATURE_COLUMNS]
price = df['fare_amount']

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    price,
    test_size=0.3,
    random_state=42,
)
X_train.shape, X_test.shape

# Baseline model with raw features

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

def adjusted_r2_score(y_true, y_pred, X_test):
    """Return the adjusted R-squared of a set of predictions.

    Computes ``1 - (1 - R^2) * (n - 1) / (n - p - 1)`` where ``n`` is
    ``len(y_true)`` and ``p`` is ``X_test.shape[1]``.

    Parameters
    ----------
    y_true : sequence
        Observed target values.
    y_pred : sequence
        Predicted target values, aligned with ``y_true``.
    X_test : object with a 2-D ``shape``
        Feature matrix whose column count is used as the number of
        predictors. Pass exactly the columns the model was fitted on;
        extra columns inflate ``p`` and bias the result downwards.

    Returns
    -------
    float
        The adjusted R-squared.

    Raises
    ------
    ValueError
        If ``n - p - 1`` is not positive, in which case the statistic is
        undefined (the plain formula would divide by zero or flip sign).
    """
    n_samples = len(y_true)
    n_predictors = X_test.shape[1]
    residual_dof = n_samples - n_predictors - 1
    if residual_dof <= 0:
        raise ValueError(
            "adjusted R-squared needs more samples than predictors + 1 "
            f"(got {n_samples} samples and {n_predictors} predictors)"
        )
    r2 = r2_score(y_true=y_true, y_pred=y_pred)
    return 1 - (1 - r2) * (n_samples - 1) / residual_dof

In [None]:
%%time

# Baseline: gradient-boosted trees on the raw columns. The pickup timestamp
# is excluded from the model inputs, so build the model matrices once and
# reuse them for fitting, predicting and scoring.
X_train_model = X_train.drop(columns=['pickup_datetime'])
X_test_model = X_test.drop(columns=['pickup_datetime'])

# 'reg:squarederror' is the current name of the squared-error objective;
# 'reg:linear' is its deprecated alias.
xgr = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=50,
    max_depth=5,
    n_jobs=-1,
    random_state=42,
)
xgr.fit(X_train_model, y_train)

y_pred = xgr.predict(X_test_model)

rsq_baseline_xgb = r2_score(y_true=y_test, y_pred=y_pred)
# Score against the matrix the model actually saw, so the predictor count
# used by the adjustment does not include the dropped timestamp column.
adj_rsq_baseline_xgb = adjusted_r2_score(y_true=y_test, y_pred=y_pred, X_test=X_test_model)
rmse_baseline_xgb = mean_squared_error(y_true=y_test, y_pred=y_pred) ** 0.5
print('R-sq:', rsq_baseline_xgb)
print('Adj. R-sq:', adj_rsq_baseline_xgb)
print('RMSE:', rmse_baseline_xgb)

# Exploratory data analysis

In [None]:
# Summary statistics of the data, used below to spot implausible values.
df.describe()

**Visualize trip fare amount**

In [None]:
# Density of the fare amounts. `fill=True` replaces the deprecated `shade=True`.
ax = sns.kdeplot(df['fare_amount'].values, fill=True)
ax.set(title='Distribution of fare amount', xlabel='fare_amount');

**We can see that the values are positively skewed: some fares are very high, and there are some negative values too. We have to remove those outliers.**

**Visualizing pickup coordinates**

In [None]:
# Side-by-side densities of pickup latitude and longitude. Each panel only
# uses rows inside a fixed window (latitude 30..50, longitude -125..-65,
# bounds inclusive) so far-off values do not flatten the curves.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle("Distribution of trips across the US", fontsize=14)

lat_in_window = df['pickup_latitude'].between(30, 50)
# `fill=True` replaces the deprecated `shade=True`.
sns.kdeplot(df.loc[lat_in_window, 'pickup_latitude'].values, fill=True, ax=ax1)
ax1.set_title("Distribution of latitude")

long_in_window = df['pickup_longitude'].between(-125, -65)
sns.kdeplot(df.loc[long_in_window, 'pickup_longitude'].values, fill=True, ax=ax2)
ax2.set_title("Distribution of longitude");

**Visualization of passenger count**

In [None]:
# Density of the passenger counts. `fill=True` replaces the deprecated `shade=True`.
ax = sns.kdeplot(df['passenger_count'].values, fill=True)
ax.set(title='Distribution of passenger count', xlabel='passenger_count');

**In the above graph there are some values such as 200 passengers; those are outliers.**

**Fixed coordinates**

In [None]:
# Latitude/longitude limits (in degrees) of the bounding box applied to
# pickup and dropoff coordinates when filtering the data.
lat_long = dict(
    min_lat=30,
    max_lat=50,
    min_long=-125,
    max_long=-65,
)

**Things gained from the EDA:**

**Fare amount should be positive and < 1000**

**A typical taxi or cab can take at most 8 people and at least 1 person**

**Pickup and dropoff coordinates should be within the US bounding box**

# Filtering the data

In [None]:
# Keep only plausible trips: a fare between 0.01 and 1000, 1 to 8 passengers,
# and pickup/dropoff coordinates inside the `lat_long` bounding box (all
# bounds inclusive). The mask is deliberately not called `filter`, so the
# Python built-in of that name stays usable in later cells.
valid_trip = (
    df['fare_amount'].between(0.01, 1000)
    & df['passenger_count'].between(1, 8)
    & df['pickup_latitude'].between(lat_long['min_lat'], lat_long['max_lat'])
    & df['dropoff_latitude'].between(lat_long['min_lat'], lat_long['max_lat'])
    & df['pickup_longitude'].between(lat_long['min_long'], lat_long['max_long'])
    & df['dropoff_longitude'].between(lat_long['min_long'], lat_long['max_long'])
)

df = df[valid_trip]

# Rebuild the predictors/target and redo the 70/30 split on the filtered rows.
features = df[['pickup_datetime', 'pickup_longitude', 'pickup_latitude',
               'dropoff_longitude', 'dropoff_latitude', 'passenger_count']]
price = df['fare_amount']

X_train, X_test, y_train, y_test = train_test_split(features, price, test_size=0.3, random_state=42)
X_train.shape, X_test.shape

In [None]:
# Preview the first rows of the training features after filtering.
X_train.head()

**Now all the features are within the given range**

# Model after outlier removal

In [None]:
%%time

# Same model as the baseline, retrained on the filtered data. The pickup
# timestamp is excluded from the model inputs, so build the model matrices
# once and reuse them for fitting, predicting and scoring.
X_train_model = X_train.drop(columns=['pickup_datetime'])
X_test_model = X_test.drop(columns=['pickup_datetime'])

# 'reg:squarederror' is the current name of the squared-error objective;
# 'reg:linear' is its deprecated alias.
xgr = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=50,
    max_depth=5,
    n_jobs=-1,
    random_state=42,
)
xgr.fit(X_train_model, y_train)

y_pred = xgr.predict(X_test_model)

rsq_baseline2_xgb = r2_score(y_true=y_test, y_pred=y_pred)
# Score against the matrix the model actually saw, so the predictor count
# used by the adjustment does not include the dropped timestamp column.
adj_rsq_baseline2_xgb = adjusted_r2_score(y_true=y_test, y_pred=y_pred, X_test=X_test_model)
rmse_baseline2_xgb = mean_squared_error(y_true=y_test, y_pred=y_pred) ** 0.5
print('R-sq:', rsq_baseline2_xgb)
print('Adj. R-sq:', adj_rsq_baseline2_xgb)
print('RMSE:', rmse_baseline2_xgb)

**This model gives better metrics than the previous one**

# Manhattan distance as feature

**In addition to the raw latitude and longitude values, we will calculate the distance between the pickup and dropoff coordinates and use it as a feature**

In a plane with $p_1$ at $(x_1, y_1)$ and $p_2$ at $(x_2, y_2)$,

Manhattan distance: $M = |x_1 - x_2| + |y_1 - y_2|$

In [None]:
def manhattan(start_coord, end_coord):
    """Manhattan (L1) distance between two coordinate pairs.

    Each argument is a two-item ``(latitude, longitude)`` pair. The result is
    ``|lat_end - lat_start| + |long_end - long_start|``, expressed in the same
    units as the inputs. Because only subtraction and ``np.abs`` are used,
    the pair items may be scalars or NumPy/pandas arrays.
    """
    lat_a, long_a = start_coord
    lat_b, long_b = end_coord
    lat_gap = np.abs(lat_b - lat_a)
    long_gap = np.abs(long_b - long_a)
    return lat_gap + long_gap

In [None]:
# Add the pickup-to-dropoff Manhattan distance to both splits.
# Whole columns are passed to `manhattan`, so the subtraction and np.abs run
# once per split on entire arrays rather than once per row through
# DataFrame.apply(axis=1); the resulting values are the same.
for split in (X_train, X_test):
    split['manhattan_dist'] = manhattan(
        start_coord=(split['pickup_latitude'], split['pickup_longitude']),
        end_coord=(split['dropoff_latitude'], split['dropoff_longitude']),
    )
X_train.head()

In [None]:
%%time

# Same model again, now with the Manhattan-distance column included. The
# pickup timestamp is excluded from the model inputs, so build the model
# matrices once and reuse them for fitting, predicting and scoring.
X_train_model = X_train.drop(columns=['pickup_datetime'])
X_test_model = X_test.drop(columns=['pickup_datetime'])

# 'reg:squarederror' is the current name of the squared-error objective;
# 'reg:linear' is its deprecated alias.
xgr = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=50,
    max_depth=5,
    n_jobs=-1,
    random_state=42,
)
xgr.fit(X_train_model, y_train)

y_pred = xgr.predict(X_test_model)

rsq_manhattan_xgb = r2_score(y_true=y_test, y_pred=y_pred)
# Score against the matrix the model actually saw, so the predictor count
# used by the adjustment does not include the dropped timestamp column.
adj_rsq_manhattan_xgb = adjusted_r2_score(y_true=y_test, y_pred=y_pred, X_test=X_test_model)
rmse_manhattan_xgb = mean_squared_error(y_true=y_test, y_pred=y_pred) ** 0.5
print('R-sq:', rsq_manhattan_xgb)
print('Adj. R-sq:', adj_rsq_manhattan_xgb)
print('RMSE:', rmse_manhattan_xgb)

**Thus we experimented with the various features created, and the accuracy improved considerably**