In [1]:
!python -V

Python 3.9.12


In [2]:
import pandas as pd

In [2]:
!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-8.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[K     |████████████████████████████████| 29.4 MB 25.7 MB/s eta 0:00:01[K     |████████████████████████████████| 29.4 MB 25.7 MB/s 
Installing collected packages: pyarrow
Successfully installed pyarrow-8.0.0


In [1]:
# importing packages
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [77]:
# loading dataset
df = pd.read_parquet('./data/fhv_tripdata_2021-01.parquet')

In [78]:
df.shape

(1154112, 7)

In [79]:
df.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [80]:
# creating the duration column
df['duration'] = df.dropOff_datetime - df.pickup_datetime
df.duration = df.duration.apply(lambda td: td.total_seconds() / 60) #turning the duration into minutes
df.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,17.0
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,17.0
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013,110.0
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037,15.216667


In [81]:
# getting the mean of the duration
duration_mean = df.duration.mean()
duration_mean

19.1672240937939

In [14]:
# checking duration distribution 
#sns.histplot(data=df, x="duration")

In [82]:
# getting records where duration is between 1 & 60 mins
df2 = df[(df.duration >= 1) & (df.duration <= 60)]
df2.shape

(1109826, 8)

In [83]:
df2.dtypes

dispatching_base_num              object
pickup_datetime           datetime64[ns]
dropOff_datetime          datetime64[ns]
PUlocationID                     float64
DOlocationID                     float64
SR_Flag                           object
Affiliated_base_number            object
duration                         float64
dtype: object

In [84]:
#checking for missing values
df2.isnull().sum()

dispatching_base_num            0
pickup_datetime                 0
dropOff_datetime                0
PUlocationID               927008
DOlocationID               147907
SR_Flag                   1109826
Affiliated_base_number        773
duration                        0
dtype: int64

In [85]:
# filling missing values
df2[['PUlocationID', 'DOlocationID']].fillna(value = '-1', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[['PUlocationID', 'DOlocationID']].fillna(value = '-1', inplace=True)


In [86]:
df2.isnull().sum()

dispatching_base_num            0
pickup_datetime                 0
dropOff_datetime                0
PUlocationID               927008
DOlocationID               147907
SR_Flag                   1109826
Affiliated_base_number        773
duration                        0
dtype: int64

In [87]:
#fraction of missing values of pickup location after filling NANs
(927008/1109826)*100

83.52732770722618

In [88]:
# turning pickup and dropoff ids to categorical
categorical = ['PUlocationID', 'DOlocationID']
df2[categorical] = df2[categorical].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[categorical] = df2[categorical].astype(str)


In [89]:
train_dicts = df2[categorical].to_dict(orient='records') #creating a dictionary

dv = DictVectorizer() #calling dictvectorizer
X_train = dv.fit_transform(train_dicts) #onehot encoding

target = 'duration'
y_train = df2[target].values

In [90]:
#getting Xtrian shape
X_train.shape

(1109826, 525)

In [91]:
lr = LinearRegression() #linear regression
lr.fit(X_train, y_train) #fitting model 

y_pred = lr.predict(X_train) #predicting model

mean_squared_error(y_train, y_pred, squared=False)

10.528519107203781

In [98]:
# downloading feb dataset
df3 = pd.read_parquet('./data/fhv_tripdata_2021-02.parquet')

In [99]:
df3.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00013,2021-02-01 00:01:00,2021-02-01 01:33:00,,,,B00014
1,B00021,2021-02-01 00:55:40,2021-02-01 01:06:20,173.0,82.0,,B00021
2,B00021,2021-02-01 00:14:03,2021-02-01 00:28:37,173.0,56.0,,B00021
3,B00021,2021-02-01 00:27:48,2021-02-01 00:35:45,82.0,129.0,,B00021
4,B00037,2021-02-01 00:12:50,2021-02-01 00:26:38,,225.0,,B00037


In [100]:
df3['duration'] = df3.dropOff_datetime - df3.pickup_datetime
df3.duration = df3.duration.apply(lambda td: td.total_seconds() / 60) #turning the duration into minutes
# getting records where duration is between 1 & 60 mins
df3 = df3[(df3.duration >= 1) & (df3.duration <= 60)]
df3.shape

(990113, 8)

In [101]:
df3.isnull().sum()

dispatching_base_num           0
pickup_datetime                0
dropOff_datetime               0
PUlocationID              848661
DOlocationID              134760
SR_Flag                   990113
Affiliated_base_number         0
duration                       0
dtype: int64

In [102]:
#filling missing values
df3[['PUlocationID', 'DOlocationID']].fillna(value = '-1', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3[['PUlocationID', 'DOlocationID']].fillna(value = '-1', inplace=True)


In [103]:
df3.isnull().sum()

dispatching_base_num           0
pickup_datetime                0
dropOff_datetime               0
PUlocationID              848661
DOlocationID              134760
SR_Flag                   990113
Affiliated_base_number         0
duration                       0
dtype: int64

In [104]:
df3[categorical] = df3[categorical].astype(str)

In [105]:
test_dicts = df3[categorical].to_dict(orient='records') #creating a dictionary
X_test = dv.transform(test_dicts) #onehot encoding
target = 'duration'
y_test = df3[target].values

In [106]:
pred = lr.predict(X_test) #predicting model

mean_squared_error(y_test, pred, squared=False)

11.014283141597323