In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error


%matplotlib inline

In [2]:
df1 = pd.read_parquet("fhv_tripdata_2021-01.parquet")

### Q1. Downloading the data

In [3]:
df1.shape

(1154112, 7)

In [4]:
df1.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [5]:
df1.dtypes

dispatching_base_num              object
pickup_datetime           datetime64[ns]
dropOff_datetime          datetime64[ns]
PUlocationID                     float64
DOlocationID                     float64
SR_Flag                           object
Affiliated_base_number            object
dtype: object

### Q2. Computing duration

In [6]:
df1["duration"] = df1["dropOff_datetime"] - df1["pickup_datetime"]

In [7]:
df1["duration"] = df1["duration"].apply(lambda td: td.total_seconds() / 60)

In [8]:
# Average duration in Jan 2021 FHV (in minutes)
df1["duration"].mean()

19.1672240937939

### Q3. Missing values

In [9]:
df1 = df1[(df1["duration"] >= 1) & (df1["duration"] <=60)]

In [10]:
df1.shape

(1109826, 8)

In [11]:
old_records = 1154112 
new_records = 1109826

# number of rows dropped
old_records - new_records

44286

In [12]:
# fraction of missing values for the pickup location ID
(df1.PUlocationID.isnull().sum() / df1.shape[0]) * 100

83.52732770722618

In [13]:
# fill missing values
features = ["PUlocationID", "DOlocationID"]
df1[features] = df1[features].fillna(-1)

# change object type to string
df1[features] = df1[features].astype(str)

### Q4. One-hot encoding

In [14]:
train_dicts = df1[["PUlocationID", "DOlocationID"]].to_dict(orient='records')

In [15]:
train_dicts

[{'PUlocationID': '-1.0', 'DOlocationID': '-1.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '-1.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '61.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '71.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '91.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '39.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '37.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '39.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '89.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '177.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '225.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '63.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '67.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '22.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '61.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '14.0'},
 {'PUlocationID': '-1.0', 'DO

In [16]:
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [17]:
# Dimensionality after OHE
X_train.shape[1]

525

### Q5. Training a model

In [18]:
target = 'duration'
y_train = df1[target].values

In [19]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [20]:
y_pred = model.predict(X_train)

In [21]:
mean_squared_error(y_train, y_pred, squared=False)

10.528519107211306

### Q6. Evaluating the model

In [22]:
df2 = pd.read_parquet("fhv_tripdata_2021-02.parquet")
df2["duration"] = df2["dropOff_datetime"] - df2["pickup_datetime"]
df2["duration"] = df2["duration"].apply(lambda td: td.total_seconds() / 60)
df2 = df2[(df2["duration"] >= 1) & (df2["duration"] <=60)]

df2[features] = df2[features].fillna(-1)
# change object type to string
df2[features] = df2[features].astype(str)

val_dicts = df2[features].to_dict(orient='records')

X_val = dv.transform(val_dicts)
y_val = df2[target].values

y_pred2 = model.predict(X_val)

mean_squared_error(y_val, y_pred2, squared=False)

11.01428319227392