In [138]:
import pandas as pd
import numpy as np

In [139]:
# Load the first million training rows, parsing the pickup timestamp on read,
# and the complete test file (its timestamp column is left as read).
train = pd.read_csv(
    "train.csv",
    nrows=1_000_000,
    parse_dates=['pickup_datetime'],
)
test = pd.read_csv("test.csv")

In [140]:
# Remove rows whose fare is above 200 or below 3.5.
# A missing fare compares False on both sides, so such rows are not removed here.
fare = train['fare_amount']
fare_out_of_range = (fare > 200) | (fare < 3.5)
train = train.drop(index=train.index[fare_out_of_range.to_numpy()])

In [141]:
# Discard trips whose pickup or dropoff point falls outside the latitude/longitude
# box this notebook uses for New York (longitude -74.03..-73.77, latitude 40.49..40.91).
# A missing coordinate compares False against both bounds, so such rows stay in for now.
nyc_bounds = {
    'dropoff_longitude': (-74.03, -73.77),
    'pickup_longitude': (-74.03, -73.77),
    'dropoff_latitude': (40.49, 40.91),
    'pickup_latitude': (40.49, 40.91),
}

outside_nyc = pd.Series(False, index=train.index)
for coord_col, (lower, upper) in nyc_bounds.items():
    outside_nyc |= (train[coord_col] > upper) | (train[coord_col] < lower)

train = train.drop(index=train.index[outside_nyc.to_numpy()])

In [142]:
# Remove rows whose passenger count is negative or greater than 6
# (a count of exactly 0 is kept).
riders = train['passenger_count']
bad_rider_count = (riders < 0) | (riders > 6)
train = train.drop(index=train.index[bad_rider_count.to_numpy()])

In [143]:
# Dropping entries with NaNs in Location
#
# NaN never compares equal to anything (not even itself), so a mask built with
# `== np.NaN` is all False and removes no rows; the `np.NaN` alias is also gone
# in NumPy 2.0. `dropna(subset=...)` performs the intended missing-value check.
train = train.dropna(
    subset=[
        'dropoff_latitude',
        'dropoff_longitude',
        'pickup_longitude',
        'pickup_latitude',
    ]
)


In [144]:
# Grouping the Pickup & Dropoff Coordinates
#
# Each new column holds one (latitude, longitude) tuple per trip. Zipping the two
# source columns produces the same tuples as a row-wise `apply(tuple, axis=1)`
# without constructing a Series per row, and it also works on an empty frame.
train['pickup_coords'] = list(zip(train['pickup_latitude'], train['pickup_longitude']))
train['dropoff_coords'] = list(zip(train['dropoff_latitude'], train['dropoff_longitude']))


In [145]:
# Calculating the Distance using Haversine Formula in Kilometers
import haversine as hs
from haversine import Unit

# Walk the two coordinate columns in lockstep instead of `DataFrame.apply(axis=1)`:
# the values are identical, but no per-row Series is built and an empty frame
# yields an empty column rather than an unusable DataFrame result.
train['haversine_distance'] = [
    hs.haversine(pickup, dropoff, unit=Unit.KILOMETERS)
    for pickup, dropoff in zip(train['pickup_coords'], train['dropoff_coords'])
]

In [146]:
# Split the pickup timestamp into calendar and clock features.
# Columns are added in a fixed order: year, month, week, day_of_year,
# day_of_month, day_of_week, hour, minute, second.
pickup_dt = train['pickup_datetime'].dt

train['year'] = pickup_dt.year - 2000  # stored as an offset from the year 2000
train['month'] = pickup_dt.month
train['week'] = pickup_dt.isocalendar().week

simple_parts = [
    ('day_of_year', 'dayofyear'),
    ('day_of_month', 'day'),
    ('day_of_week', 'weekday'),
    ('hour', 'hour'),
    ('minute', 'minute'),
    ('second', 'second'),
]
for feature_name, dt_attr in simple_parts:
    train[feature_name] = getattr(pickup_dt, dt_attr)

In [147]:
# Remove trips whose pickup and dropoff coordinate tuples are identical.
no_movement = train['pickup_coords'] == train['dropoff_coords']
train = train.drop(index=train.index[no_movement.to_numpy()])

In [148]:
# Drop the row key, the raw timestamp and the temporary coordinate tuples.
train = train.drop(
    columns=['key', 'pickup_datetime', 'pickup_coords', 'dropoff_coords']
)

In [149]:
# Drop test rows with a missing pickup or dropoff coordinate.
# A comparison with `== np.NaN` is always False (NaN is not equal to itself) and
# the `np.NaN` alias no longer exists in NumPy 2.0, so use `dropna` instead.
test = test.dropna(
    subset=[
        'dropoff_latitude',
        'dropoff_longitude',
        'pickup_longitude',
        'pickup_latitude',
    ]
)

# Pair each trip's coordinates as (latitude, longitude) tuples. Zipping the columns
# gives the same tuples as a row-wise `apply(tuple, axis=1)` without building a
# Series per row, and it also works when the frame is empty.
test['pickup_coords'] = list(zip(test['pickup_latitude'], test['pickup_longitude']))
test['dropoff_coords'] = list(zip(test['dropoff_latitude'], test['dropoff_longitude']))

# Haversine distance in kilometres between pickup and dropoff, computed by walking
# the two tuple columns in lockstep. `hs` and `Unit` are imported in an earlier cell.
test['haversine_distance'] = [
    hs.haversine(pickup, dropoff, unit=Unit.KILOMETERS)
    for pickup, dropoff in zip(test['pickup_coords'], test['dropoff_coords'])
]

# The test CSV is read without date parsing, so convert the timestamp first,
# then derive the calendar and clock features in the order
# year, month, week, day_of_year, day_of_month, day_of_week, hour, minute, second.
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'])
test_pickup_dt = test['pickup_datetime'].dt

test['year'] = test_pickup_dt.year - 2000  # stored as an offset from the year 2000
test['month'] = test_pickup_dt.month
test['week'] = test_pickup_dt.isocalendar().week

test_simple_parts = [
    ('day_of_year', 'dayofyear'),
    ('day_of_month', 'day'),
    ('day_of_week', 'weekday'),
    ('hour', 'hour'),
    ('minute', 'minute'),
    ('second', 'second'),
]
for feature_name, dt_attr in test_simple_parts:
    test[feature_name] = getattr(test_pickup_dt, dt_attr)


In [150]:
# Drop the row key, the raw timestamp and the temporary coordinate tuples.
test = test.drop(
    columns=['key', 'pickup_datetime', 'pickup_coords', 'dropoff_coords']
)

In [151]:
# Train Test Split
from sklearn.model_selection import train_test_split

# Remove any remaining rows that contain a missing value in any column.
train = train.dropna()

# Features are every column except the fare; the target is kept as a
# single-column DataFrame.
target_col = 'fare_amount'
X = train.drop(columns=[target_col])
y = train[[target_col]]

# The prepared test frame is used as-is (same object, not a copy).
X_Final_test = test

# Shuffled 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [157]:
# Model Training
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit with an intercept term on the training split.
regressor = LinearRegression(fit_intercept=True).fit(X_train, y_train)
regressor

In [158]:
# Predict fares for the prepared test frame with the fitted regressor.
y_Final_test = regressor.predict(X=X_Final_test)

In [159]:
import pickle

# Serialise the fitted regressor to disk in the current working directory.
MODEL_PATH = 'my_model.pkl'
with open(MODEL_PATH, mode='wb') as model_file:
    pickle.dump(regressor, model_file)


In [155]:
# Display the dtype of every feature column in the training split.
X_train.dtypes

pickup_longitude      float64
pickup_latitude       float64
dropoff_longitude     float64
dropoff_latitude      float64
passenger_count         int64
haversine_distance    float64
year                    int32
month                   int32
week                   UInt32
day_of_year             int32
day_of_month            int32
day_of_week             int32
hour                    int32
minute                  int32
second                  int32
dtype: object

In [156]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the training features.
X_train.describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,haversine_distance,year,month,week,day_of_year,day_of_month,day_of_week,hour,minute,second
count,759650.0,759650.0,759650.0,759650.0,759650.0,759650.0,759650.0,759650.0,759650.0,759650.0,759650.0,759650.0,759650.0,759650.0,759650.0
mean,-73.975701,40.750948,-73.974496,40.751444,1.687223,3.341757,11.754317,6.274012,25.481888,175.435133,15.696867,3.039849,13.521257,29.567382,15.937695
std,0.033576,0.026493,0.0317,0.03033,1.309085,3.478244,1.864864,3.438791,14.955824,104.866051,8.683474,1.948914,6.507264,17.320148,19.455184
min,-74.029592,40.492546,-74.029952,40.490235,0.0,8.4e-05,9.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,-73.992278,40.736662,-73.991507,40.735805,1.0,1.307609,10.0,3.0,13.0,85.0,8.0,1.0,9.0,15.0,0.0
50%,-73.982122,40.75342,-73.980593,40.753977,1.0,2.198878,12.0,6.0,24.0,167.0,16.0,3.0,14.0,30.0,4.0
75%,-73.968598,40.767494,-73.965571,40.768427,2.0,3.951732,13.0,9.0,39.0,268.0,23.0,5.0,19.0,45.0,32.0
max,-73.770115,40.909623,-73.770015,40.909836,6.0,31.455661,15.0,12.0,53.0,366.0,31.0,6.0,23.0,59.0,59.0
