In [0]:
import os
import pandas as pd
import numpy as np
from fancyimpute import KNN

In [145]:
# Read the training and test CSVs straight from the project's GitHub repository.
train_url = "https://raw.githubusercontent.com/Priyankk18k/CabPrediction/master/train_cab.csv"
test_url = "https://raw.githubusercontent.com/Priyankk18k/CabPrediction/master/test.csv"
train, test = pd.read_csv(train_url), pd.read_csv(test_url)
# Preview the first rows of the training data.
train.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1.0
1,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1.0
2,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2.0
3,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1.0
4,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1.0


In [146]:
# Summary statistics for every column, numeric and non-numeric alike (include='all').
train.describe(include='all')


Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,16043.0,16067,16067.0,16067.0,16067.0,16067.0,16012.0
unique,468.0,16021,,,,,
top,6.5,2013-01-11 12:28:00 UTC,,,,,
freq,759.0,2,,,,,
mean,,,-72.462787,39.914725,-72.462328,39.897906,2.62507
std,,,10.578384,6.826587,10.575062,6.187087,60.844122
min,,,-74.438233,-74.006893,-74.429332,-74.006377,0.0
25%,,,-73.992156,40.734927,-73.991182,40.734651,1.0
50%,,,-73.981698,40.752603,-73.980172,40.753567,1.0
75%,,,-73.966838,40.767381,-73.963643,40.768013,2.0


In [147]:
# Show each column's dtype, then the frame's (rows, columns) shape, one per line.
print(train.dtypes, train.shape, sep="\n")


fare_amount           object
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count      float64
dtype: object
(16067, 7)


In [0]:
# fare_amount and pickup_datetime are parsed from their raw form; errors='coerce' turns
# anything unparseable into NaN / NaT instead of raising.
train = train.assign(
    fare_amount=pd.to_numeric(train['fare_amount'], errors='coerce'),
    pickup_datetime=pd.to_datetime(train['pickup_datetime'], errors='coerce'),
)

# Data Cleaning

In [0]:
# Keep only rows whose coordinates are physically possible: |latitude| < 90 and
# |longitude| < 180. A missing coordinate fails the comparison, so such rows are dropped too.
valid_latitude = train['pickup_latitude'].abs().lt(90) & train['dropoff_latitude'].abs().lt(90)
valid_longitude = train['pickup_longitude'].abs().lt(180) & train['dropoff_longitude'].abs().lt(180)
train = train[valid_latitude & valid_longitude]


In [0]:
# Remove rows reporting more than 7 passengers; they are treated as outliers and dropped
# rather than imputed. Rows with a missing passenger_count are kept.
too_many_passengers = train['passenger_count'] > 7
train = train[~too_many_passengers]

In [0]:
# Discard fares above 100 or below 0. Missing fares are not removed by this step.
fare_out_of_range = (train['fare_amount'] > 100) | (train['fare_amount'] < 0)
train = train[~fare_out_of_range]

In [0]:
# Drop rows that lack either a fare or a pickup timestamp.
train = train.dropna(subset=['fare_amount', 'pickup_datetime'])

In [153]:
# Treat every literal 0 as a missing value so it can be imputed during missing value analysis.
# NOTE(review): replace() runs over the whole frame, so a fare_amount of 0 also becomes NaN
# and would later be imputed like a feature — confirm this is intended for the target.
train=train.replace(0,np.nan)
print(train.shape)

(16009, 7)


# Feature Engineering

In [0]:

# Break the pickup timestamp into calendar features, then discard the raw timestamp.
pickup_time = train['pickup_datetime'].dt
train = train.assign(
    year=pickup_time.year,
    month=pickup_time.month,
    weekday=pickup_time.weekday,
    hour=pickup_time.hour,
).drop(columns='pickup_datetime')


# Missing Value Analysis

In [155]:
# Per-column missing-value counts, and the same counts as a percentage of all rows.
null_mask = train.isnull()
null_counts = null_mask.sum()
row_counts = null_mask.count()
print(null_counts)
print(row_counts)

total = null_counts.sort_values(ascending=False)
percent = (null_counts / row_counts).sort_values(ascending=False) * 100
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
# Display the table with the least-affected columns first.
missing_data = missing_data.sort_values('Total', ascending=True)
missing_data

fare_amount            1
pickup_longitude     310
pickup_latitude      310
dropoff_longitude    309
dropoff_latitude     307
passenger_count      112
year                   0
month                  0
weekday                0
hour                   0
dtype: int64
fare_amount          16009
pickup_longitude     16009
pickup_latitude      16009
dropoff_longitude    16009
dropoff_latitude     16009
passenger_count      16009
year                 16009
month                16009
weekday              16009
hour                 16009
dtype: int64


Unnamed: 0,Total,Percent
hour,0,0.0
weekday,0,0.0
month,0,0.0
year,0,0.0
fare_amount,1,0.006246
passenger_count,112,0.699606
dropoff_latitude,307,1.917671
dropoff_longitude,309,1.930164
pickup_latitude,310,1.936411
pickup_longitude,310,1.936411


In [0]:
# Fill the missing values with a k=5 nearest-neighbour imputer (chosen by the author over
# mean/median imputation), then re-attach the original column names and index.
knn_imputer = KNN(k=5)
imputed_values = knn_imputer.fit_transform(train)
train = pd.DataFrame(imputed_values, columns=train.columns, index=train.index)

In [0]:

# Cast the passenger count and calendar columns to integers
# (astype(int) discards any fractional part rather than rounding).
integer_columns = ['passenger_count', 'year', 'weekday', 'hour', 'month']
train = train.astype({column: int for column in integer_columns})

In [0]:
# Absolute pickup-to-dropoff difference along each coordinate axis.
train['abs_longi'] = (train['pickup_longitude'] - train['dropoff_longitude']).abs()
train['abs_lat'] = (train['pickup_latitude'] - train['dropoff_latitude']).abs()

In [0]:

# Great-circle distance between longitude/latitude pairs via the haversine formula.
def great_circle_distance(lon1, lat1, lon2, lat2, radius_m=6371000):
    """Return the haversine distance, in kilometres, between two points on a sphere.

    Parameters
    ----------
    lon1, lat1 : float or array-like
        Longitude and latitude of the first point(s), in decimal degrees.
    lon2, lat2 : float or array-like
        Longitude and latitude of the second point(s), in decimal degrees.
    radius_m : float, optional
        Radius of the sphere in metres. Defaults to 6371000, the approximate
        mean radius of the Earth.

    Returns
    -------
    float or array-like
        Distance(s) in kilometres. Inputs are combined element-wise, so scalars,
        NumPy arrays and pandas Series are all accepted.
    """
    # Convert decimal degrees to radians.
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # Angular differences in radians.
    dis_lon = lon2 - lon1
    dis_lat = lat2 - lat1

    # Haversine term.
    a = np.sin(dis_lat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dis_lon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for nearly
    # antipodal points); clamp it so sqrt(1 - a) never receives a negative number.
    a = np.minimum(1.0, np.maximum(0.0, a))
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    # radius (m) * central angle (rad) = arc length in metres; convert to km.
    return radius_m * c / 1000
# Add the pickup-to-dropoff great-circle distance as a feature and preview the result.
train['distance'] = great_circle_distance(
    lon1=train['pickup_longitude'],
    lat1=train['pickup_latitude'],
    lon2=train['dropoff_longitude'],
    lat2=train['dropoff_latitude'],
)
train.head(10)

# Univariate Analysis

In [0]:
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

In [0]:
# Enlarge the default figure size (width 15, height 10) for the panel grid.
fig_size = plt.rcParams['figure.figsize']
fig_size[0], fig_size[1] = 15, 10

# One line plot per column, laid out left-to-right on a 3x4 grid.
line_plot_columns = [
    'fare_amount', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count',
    'pickup_latitude', 'pickup_longitude', 'abs_longi', 'abs_lat', 'distance',
]
for panel_number, column_name in enumerate(line_plot_columns, start=1):
    plt.subplot(3, 4, panel_number)
    plt.plot(train[column_name])
    plt.title(column_name)

plt.tight_layout()

# Bivariate Analysis

In [0]:
# 2x2 grid of bar plots: fare_amount against each calendar feature.
fig, axarr = plt.subplots(nrows=2, ncols=2, figsize=(15, 15))
plt.subplots_adjust(hspace=0.3)
(ax_year, ax_month), (ax_weekday, ax_hour) = axarr
sns.barplot(data=train, x='year', y='fare_amount', ax=ax_year)
sns.barplot(data=train, x='month', y='fare_amount', ax=ax_month)
sns.barplot(data=train, x='weekday', y='fare_amount', ax=ax_weekday)
sns.barplot(data=train, x='hour', y='fare_amount', ax=ax_hour)

In [0]:
# Scatter plot with a fitted regression line: fare_amount against each numeric feature.
sns.lmplot(data=train, x='distance', y='fare_amount')
sns.lmplot(data=train, x='abs_lat', y='fare_amount')
sns.lmplot(data=train, x='abs_longi', y='fare_amount')
sns.lmplot(data=train, x='passenger_count', y='fare_amount')

In [0]:
# Mean fare per pickup year, drawn as a bar chart.
year = train.pivot_table(values='fare_amount', index='year', aggfunc=['mean'])
fig, ax = plt.subplots(figsize=(10, 5))
year.plot.bar(legend=False, color='firebrick', ax=ax)
ax.set_title('year vs mean fare_amount')
ax.set_ylabel('fare_amount')
plt.show()

# Outlier Analysis

In [0]:
# Enlarge the default figure size (width 15, height 10) for the panel grid.
fig_size = plt.rcParams['figure.figsize']
fig_size[0], fig_size[1] = 15, 10

# One box plot per column, laid out left-to-right on a 3x4 grid, to expose outliers.
box_plot_columns = [
    'fare_amount', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count',
    'pickup_latitude', 'pickup_longitude', 'abs_longi', 'abs_lat', 'distance',
]
for panel_number, column_name in enumerate(box_plot_columns, start=1):
    plt.subplot(3, 4, panel_number)
    plt.boxplot(train[column_name])
    plt.title(column_name)

plt.tight_layout()

# Feature Selection

In [0]:
# Correlation heatmap of the engineered distance features, the fare and the passenger count.
sns.set(rc={'figure.figsize': (8, 5)})

colname = ['distance', 'abs_longi', 'abs_lat', 'fare_amount', 'passenger_count']
train['passenger_count'] = train['passenger_count'].astype(int)
heat_map = train[colname]
correlations = heat_map.corr()
# Fix the colour scale to the full [-1, 1] correlation range and print each coefficient.
sns.heatmap(correlations, annot=True, vmin=-1.00, vmax=1.00)

In [0]:
# Keep only plausible taxi rides: strictly between 0 km and 100 km.
valid_distance = (train['distance'] > 0) & (train['distance'] < 100)

# Two coordinate deltas (abs_longi of about 66.00839 and abs_lat of about 39.99361) were
# identified as noise in the visualisations. They are computed floats, so an exact-equality
# replace() would almost never match them; compare within the displayed precision instead.
noise_rows = (
    np.isclose(train['abs_longi'], 66.00839, rtol=0, atol=1e-5)
    | np.isclose(train['abs_lat'], 39.99361, rtol=0, atol=1e-5)
)

# Filter in one step and take a copy so later column assignments act on an independent
# frame; rows with a missing delta are still dropped, as before.
train = train[valid_distance & ~noise_rows].dropna(subset=['abs_longi', 'abs_lat']).copy()

# Modelling

In [0]:
from sklearn import metrics
# Helper that reports regression error metrics.
def performance(actual, predict):
    """Print MSE, RMSE, MAPE (in percent) and R-squared for a set of predictions.

    Parameters
    ----------
    actual : array-like
        Observed target values.
    predict : array-like
        Predicted values, in the same order as ``actual``.

    Notes
    -----
    MAPE divides by ``actual``, so a zero in ``actual`` yields an infinite or
    undefined MAPE. Nothing is returned; the metrics are only printed.
    """
    # Work on plain arrays so the element-wise MAPE pairs values by position; two pandas
    # objects with different indexes would otherwise be aligned by label and mispaired.
    actual = np.asarray(actual, dtype=float)
    predict = np.asarray(predict, dtype=float)

    mse = metrics.mean_squared_error(actual, predict)  # computed once, reused for RMSE
    print('MSE:', mse)
    print('RMSE:', np.sqrt(mse))
    print('MAPE:', np.mean(np.abs((actual - predict) / actual)) * 100)
    print('R-Sq:', metrics.r2_score(actual, predict))

In [0]:
## Sampling: split the cleaned data into training and hold-out parts with scikit-learn
from sklearn.model_selection import train_test_split,KFold, cross_val_score, cross_val_predict
import copy
# Re-create the target as a new last column named 'amount' and remove 'fare_amount'.
train = train.assign(amount=train['fare_amount'].copy()).drop(columns=['fare_amount'])

# Seeded random split: 80% for fitting, 20% held out.
X_train, x_test = train_test_split(train, test_size=0.20, random_state=100)
train.shape, x_test.shape, X_train.shape, test.shape

In [0]:
# Decision Tree
from sklearn.tree import DecisionTreeRegressor

# Depth-3 tree fitted on positional column 11 alone (expected to be `distance`; the 11:12
# slice keeps it two-dimensional) with positional column 12 as the target.
dt1 = DecisionTreeRegressor(max_depth=3, random_state=123)
dt1.fit(X_train.iloc[:, 11:12], X_train.iloc[:, 12])
prediction_dt1 = dt1.predict(x_test.iloc[:, 11:12])

# Error metrics on the hold-out split
performance(x_test.iloc[:, 12], prediction_dt1)
print('                           ')
print('Perdicted Vs Actual value: ')
prediction_dt1[1], x_test.iloc[1, 12]


In [0]:
# Score of dt1 on the training split (R-squared for scikit-learn regressors), as a percentage.
dt1.score(X_train.iloc[:,11:12],X_train.iloc[:,12])*100


In [0]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor

# 500-tree forest fitted on positional column 11 alone (expected to be `distance`)
# with positional column 12 as the target.
rf1 = RandomForestRegressor(n_estimators=500, random_state=126)
rf1.fit(X_train.iloc[:, 11:12], X_train.iloc[:, 12])
prediction_rf1 = rf1.predict(x_test.iloc[:, 11:12])

# Error metrics on the hold-out split
performance(x_test.iloc[:, 12], prediction_rf1)
print('                           ')
print('Perdicted Vs Actual value: ')
prediction_rf1[1], x_test.iloc[1, 12]

In [0]:
# Score of rf1 on the training split (R-squared for scikit-learn regressors), as a percentage.
rf1.score(X_train.iloc[:,11:12],X_train.iloc[:,12])*100


In [0]:
from sklearn.linear_model import LinearRegression
# Simple (single-feature) linear regression


# Ordinary least squares on positional column 11 alone (expected to be `distance`)
# with positional column 12 as the target.
ln1 = LinearRegression()
ln1.fit(X_train.iloc[:, 11:12], X_train.iloc[:, 12])
prediction_slr1 = ln1.predict(x_test.iloc[:, 11:12])

# Error metrics on the hold-out split
performance(x_test.iloc[:, 12], prediction_slr1)
print('                           ')
print('Perdicted Vs Actual value: ')
prediction_slr1[1], x_test.iloc[1, 12]

In [0]:
# Score of ln1 on the training split (R-squared for scikit-learn regressors).
ln1.score(X_train.iloc[:,11:12],X_train.iloc[:,12])


In [0]:
from sklearn import ensemble
# Gradient boosting on positional column 11 alone (expected to be `distance`) with
# positional column 12 as the target.
# `loss` is left at its default, which is squared error: the legacy alias 'ls' was
# deprecated in scikit-learn 1.0 and removed in 1.2, so passing it raises on current
# releases, while the default behaves the same on old and new versions.
clf = ensemble.GradientBoostingRegressor(n_estimators = 400, max_depth = 5, min_samples_split = 2,
          learning_rate = 0.1)
clf.fit(X_train.iloc[:,11:12],X_train.iloc[:,12])
prediction_one = clf.predict(x_test.iloc[:,11:12])

# Error metrics on the hold-out split
performance(x_test.iloc[:,12],prediction_one)
print('                           ')
print('Perdicted Vs Actual value: ')
prediction_one[1], x_test.iloc[1,12]



In [0]:
# Score of clf on the training split (R-squared for scikit-learn regressors).
clf.score(X_train.iloc[:,11:12],X_train.iloc[:,12])


In [0]:
from sklearn import linear_model
# Ridge model


# Ridge regression (alpha=0.5) on positional column 11 alone (expected to be `distance`)
# with positional column 12 as the target.
ln2 = linear_model.Ridge(alpha=0.5)
ln2.fit(X_train.iloc[:, 11:12], X_train.iloc[:, 12])
prediction_slr2 = ln2.predict(x_test.iloc[:, 11:12])

# Error metrics on the hold-out split
performance(x_test.iloc[:, 12], prediction_slr2)
print('                           ')
print('Perdicted Vs Actual value: ')
prediction_slr2[1], x_test.iloc[1, 12]

In [0]:
# Score of ln2 on the training split (R-squared for scikit-learn regressors).
ln2.score(X_train.iloc[:,11:12],X_train.iloc[:,12])


In [0]:
# Multi-feature models: positional columns 4 through 11 are the predictors and
# positional column 12 is the target.
dt2 = DecisionTreeRegressor(max_depth=5, random_state=123)
dt2.fit(X_train.iloc[:, 4:12], X_train.iloc[:, 12])
prediction_dt2 = dt2.predict(x_test.iloc[:, 4:12])

# Error metrics on the hold-out split
performance(x_test.iloc[:, 12], prediction_dt2)
print('                           ')
print('Perdicted Vs Actual value: ')
prediction_dt2[1], x_test.iloc[1, 12]

In [0]:
# Score of dt2 on the training split (R-squared for scikit-learn regressors).
dt2.score(X_train.iloc[:,4:12],X_train.iloc[:,12])


In [0]:
# 100-tree forest on positional columns 4 through 11, with positional column 12 as the target.
rf2 = RandomForestRegressor(n_estimators=100, random_state=126)
rf2.fit(X_train.iloc[:, 4:12], X_train.iloc[:, 12])
prediction_rf2 = rf2.predict(x_test.iloc[:, 4:12])

# Error metrics on the hold-out split
performance(x_test.iloc[:, 12], prediction_rf2)
print('                           ')
print('Perdicted Vs Actual value: ')
prediction_rf2[1], x_test.iloc[1, 12]

In [0]:
# Score of rf2 on the training split (R-squared for scikit-learn regressors).
rf2.score(X_train.iloc[:,4:12],X_train.iloc[:,12])


In [0]:
# Plot which features the multi-feature random forest (rf2) relies on most.
# The labels are the names of the same positional columns (4 through 11) rf2 was fitted on.
names = list(train.columns[4:12])

sns.barplot(x=names, y=rf2.feature_importances_)

plt.title('Feature Importance')
plt.xlabel('Features')
plt.ylabel('Importance')

# Now implementing with the test dataset

In [0]:
# Parse the pickup timestamp in the test set and drop rows where parsing failed (NaT).
test = (
    test
    .assign(pickup_datetime=pd.to_datetime(test['pickup_datetime'], errors='coerce'))
    .dropna(subset=['pickup_datetime'])
)

In [0]:
# Derive the same calendar features on the test set as were built for training.
test_pickup_time = test['pickup_datetime'].dt
test = test.assign(
    year=test_pickup_time.year,
    month=test_pickup_time.month,
    weekday=test_pickup_time.weekday,
    hour=test_pickup_time.hour,
)

In [0]:
# Absolute pickup-to-dropoff difference along each coordinate axis, for the test set.
test['abs_longi'] = (test['pickup_longitude'] - test['dropoff_longitude']).abs()
test['abs_lat'] = (test['pickup_latitude'] - test['dropoff_latitude']).abs()

In [0]:
# Great-circle pickup-to-dropoff distance for every test row.
test['distance'] = great_circle_distance(
    lon1=test['pickup_longitude'],
    lat1=test['pickup_latitude'],
    lon2=test['dropoff_longitude'],
    lat2=test['dropoff_latitude'],
)


In [0]:

# Remove the raw timestamp and coordinate columns, leaving only the derived features
# and passenger_count.
raw_columns = [
    'pickup_datetime',
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude',
]
test = test.drop(columns=raw_columns)

In [0]:
# Fit the final random forest on the training split, using positional columns 4 through 11
# as predictors and positional column 12 as the target.
feature_columns = list(X_train.columns[4:12])
rf = RandomForestRegressor(n_estimators = 100, random_state = 127).fit(X_train[feature_columns],X_train.iloc[:,12])

# Select the predictors by name rather than passing the whole frame: the prediction then
# does not depend on the column order of `test`, and the cell can be re-run after the
# 'amount' column has been added without feeding that column back into the model.
amount = rf.predict(test[feature_columns])

test['amount'] = amount

In [0]:
# Preview the first rows of the test frame.
test.head()

In [0]:
# Write the test frame to Predicted_Fare.csv in the working directory.
# NOTE(review): the DataFrame index is written as the first column (to_csv default) —
# confirm whether consumers of the file expect it.
test.to_csv("Predicted_Fare.csv")
