In [None]:
import numpy as np
import pandas as pd 


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

import seaborn as sns
import matplotlib.pyplot as plt
import folium
from folium.plugins import HeatMap

# Data validation, cleaning & feature engineering

I have already inspected the dataset. To reduce its size in memory, I will load it without the `key` column, because that column appears to be only a unique identifier.

In [None]:
# Columns to read from train.csv; every other column in the file is skipped.
fields = [
    'pickup_datetime',
    'fare_amount',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
]

# Read only the first 1,000,000 rows and parse the pickup timestamp on load.
train = pd.read_csv(
    "/kaggle/input/new-york-city-taxi-fare-prediction/train.csv",
    usecols=fields,
    nrows=1_000_000,
    parse_dates=["pickup_datetime"],
    skipinitialspace=True,
)
print(f'{train.shape} shape')
train.head()

In [None]:
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called bare: wrapping it in print() would add a stray "None" line.
train.info()
# Summary statistics of the training frame (displayed as the cell output).
train.describe()

In [None]:
# Load the competition test set, parsing the pickup timestamp on load.
test = pd.read_csv(
    "/kaggle/input/new-york-city-taxi-fare-prediction/test.csv",
    parse_dates=["pickup_datetime"],
)
print(f'{test.shape} shape')
test.head()

In [None]:
# Column dtypes and non-null counts of the test frame.
test.info()

In [None]:
# Summary statistics of the test frame.
test.describe()

In [None]:
# Summary statistics of the training frame, for comparison with the test frame.
train.describe()

First, remove the `key` column from the test dataset as well.

In [None]:
# The identifier column is not a feature, so it is removed from the test frame.
test = test.drop('key', axis='columns')

In [None]:
# First rows of the training frame.
train.head()

In [None]:
# Bar chart of how many training rows there are for each passenger count.
sns.countplot(data=train, x='passenger_count')

As we can see above, some `fare_amount` values are negative, which is illogical. A passenger count of 208 is also unrealistic.
Let's take 6 as the maximum number of passengers.

In [None]:
# Display the training rows that report more than 6 passengers.
train.loc[train['passenger_count'] > 6]

In [None]:
# Enforce the stated maximum of 6 passengers: drop every row above that limit,
# not only rows whose count is exactly 208, so the cleaning no longer depends
# on which outlier values happen to be present in the loaded sample.
MAX_PASSENGERS = 6
train = train.drop(index=train.index[train['passenger_count'] > MAX_PASSENGERS])

In [None]:
# Remove the rows whose fare is negative, selected by their index labels.
train = train.drop(index=train.index[train['fare_amount'] < 0])

In general, we know that taxi prices are greatly affected by the time period. That's why it's important to split the datetime and extract more valuable variables.



In [None]:
def add_datetime_features(df):
    """Return a copy of ``df`` with the pickup timestamp split into parts.

    Appends the columns ``year``, ``month``, ``day``, ``hour`` and ``minute``
    (in that order), each read from ``df['pickup_datetime']`` through the
    ``.dt`` accessor. The input frame itself is not modified.
    """
    pickup = df['pickup_datetime'].dt
    return df.assign(
        year=pickup.year,
        month=pickup.month,
        day=pickup.day,
        hour=pickup.hour,
        minute=pickup.minute,
    )


# One helper for both frames guarantees train and test get identical features.
train = add_datetime_features(train)
test = add_datetime_features(test)

We don't need the Datetime column anymore, so let's remove it.

In [None]:
# The raw timestamp has been replaced by its extracted parts; remove it from both frames.
train = train.drop('pickup_datetime', axis='columns')
test = test.drop('pickup_datetime', axis='columns')

In [None]:
# Summary statistics of the training frame, now including the date/time columns.
train.describe()

In [None]:
variables = ['year', 'month', 'day', 'hour']

# One figure per time component: scatter plus fitted regression line against the fare.
for column in variables:
    plt.figure()
    axes = sns.regplot(data=train, x=column, y='fare_amount')
    axes.set(title=f'Regression plot of {column} and Fare amount')

The regression plots do not show any significant differences, so I will also use count plots.

In [None]:
variables = ['year', 'month', 'day', 'hour']

# One figure per time component: number of training rows for each value.
for column in variables:
    plt.figure()
    axes = sns.countplot(data=train, x=column)
    axes.set(title=f'Count plot of {column}')

It seems that we have full data for 2009–2014 and partial data for 2015. Most importantly, traffic is lowest between 3 and 5 AM and busiest between 6 and 7 PM.

In [None]:
# (rows, columns) of the training frame at this point.
train.shape

The summary statistics also contain unrealistic latitude/longitude values. We keep only coordinates that fall within the approximate bounds of New York:
> Longitude: 71° 47' 25" W to 79° 45' 54" W Latitude: 40° 29' 40" N to 45° 0' 42" N. 

In [None]:
# Summary statistics of the training frame; the coordinate min/max values are the ones of interest here.
train.describe()

In [None]:
# Keep only trips whose pickup AND dropoff both lie strictly inside the box
# 40 < latitude < 45 and -79 < longitude < -71.
lat_in_range = (
    (train['pickup_latitude'] > 40) & (train['pickup_latitude'] < 45)
    & (train['dropoff_latitude'] > 40) & (train['dropoff_latitude'] < 45)
)
lon_in_range = (
    (train['pickup_longitude'] > -79) & (train['pickup_longitude'] < -71)
    & (train['dropoff_longitude'] > -79) & (train['dropoff_longitude'] < -71)
)
train = train[lat_in_range & lon_in_range]


In [None]:
# Summary statistics after the coordinate filter.
train.describe()

In [None]:
# (rows, columns) of the training frame after the coordinate filter.
train.shape

In [None]:
# Base map centred at latitude 40.71, longitude -74.00.
ny_map = folium.Map(
    location=[40.71, -74.00],
    zoom_start=20,
    tiles='cartodbpositron',
)

# Heat layer built from every pickup (latitude, longitude) pair in the training frame.
pickup_points = train[['pickup_latitude', 'pickup_longitude']]
HeatMap(data=pickup_points, radius=10).add_to(ny_map)

ny_map

In [None]:
# Draw the dropoff heat layer on its own base map. Adding it to `ny_map` would
# stack it on top of the pickup layer already there (and add yet another copy
# each time the cell is re-run), so the rendered map would not show dropoffs alone.
dropoff_map = folium.Map(location=[40.71, -74.00], tiles='cartodbpositron', zoom_start=20)

HeatMap(data=train[['dropoff_latitude', 'dropoff_longitude']], radius=10).add_to(dropoff_map)

dropoff_map

These maps show that the Manhattan area is the busiest place.

# Modelling

In [None]:
# First rows of the cleaned training frame that goes into modelling.
train.head()

In [None]:
# First rows of the test frame, to compare its columns with the training frame.
test.head()

In [None]:
# Feature matrix: every training column except the target.
x = train.drop(columns='fare_amount')
# Target vector as a NumPy array.
y = train['fare_amount'].values
# The competition test frame is intentionally NOT bound to `x_test` here:
# `x_test` is the name train_test_split assigns to the validation features, so
# such a binding would be overwritten immediately and only obscure which data
# the reported RMSE is computed on. The test frame stays available as `test`.

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares fitted on the training split,
# then used to predict the held-out split.
linreg = LinearRegression()
linreg.fit(X=x_train, y=y_train)
y_pred = linreg.predict(X=x_test)

In [None]:
# Root mean squared error of the linear model on the held-out split.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print(rmse)

In [None]:
from xgboost import XGBRegressor

# XGBoost regressor with its default settings, fitted on the training split
# and used to predict the held-out split.
xg = XGBRegressor()
xg.fit(X=x_train, y=y_train)
xg_pred = xg.predict(X=x_test)

In [None]:
# Root mean squared error of the XGBoost model on the held-out split.
mse = mean_squared_error(y_true=y_test, y_pred=xg_pred)
rmse = np.sqrt(mse)
print(rmse)

Since XGBRegressor achieves the lower RMSE of the two models, we will use it for the final predictions.

In [None]:
# Disabled sanity check: held-out targets side by side with the XGBoost predictions.
# results = pd.DataFrame({'Actual': y_test, 'Predicted': xg_pred})
# print(results) 

In [None]:
# Predict fares for the competition test frame with the fitted XGBoost model.
pred = xg.predict(test)

In [None]:
# Start from the sample submission file and replace its fare column with the predictions.
submission = pd.read_csv('/kaggle/input/new-york-city-taxi-fare-prediction/sample_submission.csv')
submission = submission.assign(fare_amount=pred)
submission.to_csv('submission.csv', index=False)
submission.head()