In [1]:
import pandas as pd

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
import pickle

In [4]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

## Q1. Downloading the data

In [5]:
# Load the January 2021 FHV trip records and report how many rows were read.
train_path = 'data/fhv_tripdata_2021-01.parquet'
df = pd.read_parquet(train_path)
total_rows = len(df)
print(total_rows)

1154112


## Q2. Computing duration

In [6]:
# Parse both timestamp columns. Bracket assignment is used instead of
# attribute assignment so it is unambiguous that a column is being written.
for ts_col in ('dropOff_datetime', 'pickup_datetime'):
    df[ts_col] = pd.to_datetime(df[ts_col])

# Trip length in minutes. The vectorised `.dt.total_seconds()` accessor
# replaces a per-row Python lambda and yields the same float values
# (missing timestamps still produce NaN).
df['duration'] = (df['dropOff_datetime'] - df['pickup_datetime']).dt.total_seconds() / 60

In [7]:
# Summarise only the `duration` column; describing the whole frame would
# compute statistics for every other column just to discard them.
# `count` is the number of non-null durations before any filtering and is
# kept as a float, exactly as `describe()` reports it.
duration_summary = df['duration'].describe()
total_duration_row_count, average_duration = duration_summary[['count', 'mean']]
print(average_duration)

19.167224093791006


## Data preparation

In [8]:
# Keep trips lasting between 1 and 60 minutes (both bounds inclusive).
# `.copy()` makes the result an independent frame, so the column assignments
# in later cells do not trigger SettingWithCopyWarning.
df = df[df['duration'].between(1, 60)].copy()

In [9]:
# Non-null duration count after the 1-60 minute filter. Only the `duration`
# column is summarised rather than the whole frame.
updated_duration_row_count = df['duration'].describe()['count']

# Number of records removed by the filter.
print(total_duration_row_count - updated_duration_row_count)

44286.0


## Q3. Missing values

In [10]:
# Non-null pickup-location count BEFORE missing values are filled.
# Summarise just this column instead of describing the entire frame.
total_PUlocation_row_count = df['PUlocationID'].describe()['count']

In [11]:
# Replace missing pickup / drop-off location IDs with the sentinel -1.
for location_col in ('PUlocationID', 'DOlocationID'):
    df[location_col] = df[location_col].fillna(-1)

In [12]:
# Non-null pickup-location count AFTER filling, i.e. every remaining row.
# Summarise just this column instead of describing the entire frame.
updated_PUlocation_row_count = df['PUlocationID'].describe()['count']

In [13]:
# Share of rows whose pickup location was missing before the fill:
# 100 minus the percentage of rows that originally had a value.
known_pct = total_PUlocation_row_count / updated_PUlocation_row_count * 100
missing_pct = round(100 - known_pct, 4)
print(missing_pct, "%", sep='')

83.5273%


## Q4. One-hot encoding

In [14]:
# Columns treated as categorical features; casting them to strings makes
# DictVectorizer one-hot encode them rather than treat them as numbers.
categorical = ['PUlocationID', 'DOlocationID']
df[categorical] = df[categorical].astype(str)

# One {column: value} dict per trip, the input format DictVectorizer expects.
train_features = df[categorical]
train_dicts = train_features.to_dict(orient='records')

In [15]:
# Learn the feature vocabulary from the training records and encode them in
# a single pass; `dv` is kept so other data can be encoded the same way.
dv = DictVectorizer()
X_train = dv.fit_transform(X=train_dicts)

In [16]:
# Number of columns in the encoded training matrix.
n_features = X_train.shape[1]
print(n_features)

525


## Q5. Training a model

In [17]:
# Regression target: trip duration in minutes, as a plain NumPy array.
target = 'duration'
y_train = df[target].to_numpy()

In [18]:
# Fit an ordinary least-squares model on the training matrix
# (`fit` returns the estimator itself, so construction and fit are chained).
lr = LinearRegression().fit(X_train, y_train)

# In-sample predictions, used for the training error.
y_pred = lr.predict(X_train)

In [19]:
# RMSE on the training data. The `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly; this
# form works on every scikit-learn version.
mean_squared_error(y_train, y_pred) ** 0.5

10.52851938900075

## Q6. Evaluating the model

In [20]:
# Load the February 2021 FHV trip records, used as the validation set.
val_path = 'data/fhv_tripdata_2021-02.parquet'
df_val = pd.read_parquet(val_path)

In [21]:
# Parse the validation set's timestamp columns into datetimes.
for ts_col in ('dropOff_datetime', 'pickup_datetime'):
    df_val[ts_col] = pd.to_datetime(df_val[ts_col])

In [22]:
# Validation trip length in minutes. The vectorised `.dt.total_seconds()`
# accessor replaces a per-row Python lambda and yields the same float values
# (missing timestamps still produce NaN).
df_val['duration'] = (
    df_val['dropOff_datetime'] - df_val['pickup_datetime']
).dt.total_seconds() / 60

In [23]:
# Apply the same 1-60 minute (inclusive) duration filter as the training set.
# `.copy()` makes the result an independent frame, so the column assignments
# in later cells do not trigger SettingWithCopyWarning.
df_val = df_val[df_val['duration'].between(1, 60)].copy()

In [24]:
# Replace missing validation location IDs with the sentinel -1,
# as was done for the training frame.
for location_col in ('PUlocationID', 'DOlocationID'):
    df_val[location_col] = df_val[location_col].fillna(-1)

In [25]:
# Cast the categorical columns to strings so they are encoded the same way
# as the training features.
df_val[categorical] = df_val[categorical].astype(str)

# One {column: value} dict per validation trip.
val_features = df_val[categorical]
val_dicts = val_features.to_dict(orient='records')

In [26]:
# Encode with the vectorizer already fitted on the training data
# (`transform`, not `fit_transform`) so both matrices share the same columns.
X_val = dv.transform(X=val_dicts)

# Validation target as a plain NumPy array.
y_val = df_val[target].to_numpy()

In [27]:
# Predict validation durations with the model fitted on the training set.
# Note: this rebinds `y_pred`, replacing the training-set predictions.
y_pred = lr.predict(X=X_val)

In [28]:
# RMSE on the validation data. The `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly; this
# form works on every scikit-learn version.
mean_squared_error(y_val, y_pred) ** 0.5

11.014286408853847