# Taxi Fare Prediction

## Importing Libraries

In [1]:
import numpy as np 
import pandas as pd  
import matplotlib.pyplot as plt 
%matplotlib inline 
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression

## Load and review data

In [2]:
df= pd.read_csv("Taxi.csv")

In [3]:
df.shape

(50000, 8)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   unique_id             50000 non-null  object 
 1   amount                50000 non-null  float64
 2   date_time_of_pickup   50000 non-null  object 
 3   longitude_of_pickup   50000 non-null  float64
 4   latitude_of_pickup    50000 non-null  float64
 5   longitude_of_dropoff  50000 non-null  float64
 6   latitude_of_dropoff   50000 non-null  float64
 7   no_of_passenger       50000 non-null  int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 3.1+ MB


In [5]:
df.sample(10)

Unnamed: 0,unique_id,amount,date_time_of_pickup,longitude_of_pickup,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger
16529,12:15.0,14.5,2014-10-11 10:12:15 UTC,-73.96209,40.758941,-73.983518,40.728929,1
14932,51:00.0,18.1,2009-11-26 01:51:00 UTC,-73.968355,40.762398,-73.861597,40.756105,1
22483,23:00.0,16.0,2014-12-03 22:23:00 UTC,-73.99048,40.74602,-74.016882,40.708932,2
1314,06:50.0,11.5,2012-09-20 20:06:50 UTC,0.0,0.0,0.0,0.0,1
11530,52:00.0,6.1,2010-12-07 13:52:00 UTC,-73.973722,40.760655,-73.965862,40.766157,1
40704,58:15.0,9.0,2014-09-12 16:58:15 UTC,-74.016044,40.715175,-74.008024,40.738902,2
31207,34:01.0,11.5,2014-02-16 01:34:01 UTC,-73.983673,40.73896,-74.009383,40.725881,1
11760,03:00.0,8.5,2014-08-15 07:03:00 UTC,-73.98318,40.73065,-73.99765,40.74661,6
36219,26:12.0,5.0,2013-01-30 22:26:12 UTC,0.0,0.0,0.0,0.0,1
2392,41:50.0,45.0,2011-12-09 07:41:50 UTC,-73.788794,40.641368,-74.012271,40.716938,1


In [6]:
df = df.drop(labels=['unique_id','date_time_of_pickup'], axis=1)
df.head()

Unnamed: 0,amount,longitude_of_pickup,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1


## Dealing with Missing Values

In [7]:
df.describe()

Unnamed: 0,amount,longitude_of_pickup,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,11.364171,-72.509756,39.933759,-72.504616,39.926251,1.66784
std,9.685557,10.39386,6.224857,10.40757,6.014737,1.289195
min,-5.0,-75.423848,-74.006893,-84.654241,-74.006377,0.0
25%,6.0,-73.992062,40.73488,-73.991152,40.734372,1.0
50%,8.5,-73.98184,40.752678,-73.980082,40.753372,1.0
75%,12.5,-73.967148,40.76736,-73.963584,40.768167,2.0
max,200.0,40.783472,401.083332,40.851027,43.41519,6.0


In [8]:
df.dtypes

amount                  float64
longitude_of_pickup     float64
latitude_of_pickup      float64
longitude_of_dropoff    float64
latitude_of_dropoff     float64
no_of_passenger           int64
dtype: object

In [9]:
df = df.replace(0, np.NaN)

In [10]:
df.head(15)

Unnamed: 0,amount,longitude_of_pickup,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1.0
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1.0
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2.0
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1.0
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1.0
5,12.1,-74.000964,40.73163,-73.972892,40.758233,1.0
6,7.5,-73.980002,40.751662,-73.973802,40.764842,1.0
7,16.5,-73.9513,40.774138,-73.990095,40.751048,1.0
8,9.0,-74.006462,40.726713,-73.993078,40.731628,1.0
9,8.9,-73.980658,40.733873,-73.99154,40.758138,2.0


In [11]:
df.mean()

amount                  11.364853
longitude_of_pickup    -73.921659
latitude_of_pickup      40.711346
longitude_of_dropoff   -73.920941
latitude_of_dropoff     40.703692
no_of_passenger          1.673362
dtype: float64

In [12]:
for columns in df:
    df[columns] = df[columns].fillna(df[columns].mean())
df.head(15)

Unnamed: 0,amount,longitude_of_pickup,latitude_of_pickup,longitude_of_dropoff,latitude_of_dropoff,no_of_passenger
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1.0
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1.0
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2.0
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1.0
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1.0
5,12.1,-74.000964,40.73163,-73.972892,40.758233,1.0
6,7.5,-73.980002,40.751662,-73.973802,40.764842,1.0
7,16.5,-73.9513,40.774138,-73.990095,40.751048,1.0
8,9.0,-74.006462,40.726713,-73.993078,40.731628,1.0
9,8.9,-73.980658,40.733873,-73.99154,40.758138,2.0


In [13]:
# df['longitude_of_pickup'] = df['longitude_of_pickup'].fillna(df['longitude_of_pickup'].mean())
# df['latitude_of_pickup'] = df['latitude_of_pickup'].fillna(df['latitude_of_pickup'].mean())
# df['longitude_of_dropoff'] = df['longitude_of_dropoff'].fillna(df['longitude_of_dropoff'].mean())
# df['latitude_of_dropoff'] = df['latitude_of_dropoff'].fillna(df['latitude_of_dropoff'].mean())
# df['amount'] = df['amount'].fillna(df['amount'].mean())
# df['no_of_passenger'] = df['no_of_passenger'].fillna(df['no_of_passenger'].mean())
# df.head(15)

## Normalization

In [14]:
df['amount']=(df['amount'] - df['amount'].min())/(df['amount'].max() - df['amount'].min())

## Removing outliers

In [15]:
def remove_outlier(df_in):
    for i in df_in.columns:
        q1 = df_in[i].quantile(0.25)
        q3 = df_in[i].quantile(0.75)
        iqr = q3-q1 #Interquartile range
        fence_low  = q1-1.5*iqr
        fence_high = q3+1.5*iqr
        df_out = df_in.loc[(df_in[i] > fence_low) & (df_in[i] < fence_high)]
    return df_out

In [16]:
df=remove_outlier(df)

In [17]:
df.shape

(44542, 6)

## Spliting data 

In [18]:
X = df.drop(['amount'], axis=1)
y = df[['amount']]

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,random_state=1)

## Fit linear model


In [20]:
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)

LinearRegression()

In [21]:
reg_model.score(X_train, y_train)

0.000662448956375572

In [22]:
reg_model.score(X_test, y_test)

-0.0013663471838476493

## Using Decision tree

In [23]:
from sklearn.tree import DecisionTreeRegressor

In [24]:
model=DecisionTreeRegressor(criterion="squared_error",max_depth=6)

In [25]:
model.fit(X_train,y_train)

DecisionTreeRegressor(max_depth=6)

In [26]:
model.score(X_train,y_train)

0.6545700916004173

In [27]:
model.score(X_test,y_test)

0.6150993475575564

## Using KNN

In [28]:
from sklearn.neighbors import KNeighborsRegressor

In [29]:
knn_model = KNeighborsRegressor(n_neighbors=8)

In [30]:
knn_model.fit(X_train,y_train)

KNeighborsRegressor(n_neighbors=8)

In [31]:
knn_model.score(X_train,y_train)

0.7851806993504471

In [32]:
knn_model.score(X_test,y_test)

0.7433010292421363

In [33]:
from lazypredict.Supervised import LazyClassifier, LazyRegressor

In [34]:
clf = LazyRegressor(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

 21%|█████████████████▊                                                                 | 9/42 [00:07<00:29,  1.12it/s]Exception ignored in: <generator object tqdm.__iter__ at 0x000002427293D2E0>
Traceback (most recent call last):
  File "C:\Users\shres\anaconda3\lib\site-packages\tqdm\std.py", line 1210, in __iter__
    self.close()
  File "C:\Users\shres\anaconda3\lib\site-packages\tqdm\std.py", line 1304, in close
    fp_write('')
  File "C:\Users\shres\anaconda3\lib\site-packages\tqdm\std.py", line 1301, in fp_write
    self.fp.write(_unicode(s))
  File "C:\Users\shres\anaconda3\lib\site-packages\tqdm\utils.py", line 145, in inner
    return func(*args, **kwargs)
  File "C:\Users\shres\anaconda3\lib\site-packages\ipykernel\iostream.py", line 555, in write
    self._schedule_flush()
  File "C:\Users\shres\anaconda3\lib\site-packages\ipykernel\iostream.py", line 461, in _schedule_flush
    self.pub_thread.schedule(_schedule_in_thread)
  File "C:\Users\shres\anaconda3\lib\site-package

KeyboardInterrupt: 

In [None]:
print(models)