In [73]:
import pandas as pd

In [74]:
import seaborn as sns
import matplotlib.pyplot as plt

In [75]:
from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [76]:
# Load the January 2021 FHV trip records; this frame is the training data.
train_path = '../data/fhv_tripdata_2021-01.parquet'
df_train = pd.read_parquet(train_path)

In [77]:
# Q1. Read the data for January. How many records are there?
# count() reports the number of non-null values per column, so the columns
# without missing values show the total number of rows.
non_null_counts = df_train.count()
non_null_counts # 1154112

dispatching_base_num      1154112
pickup_datetime           1154112
dropOff_datetime          1154112
PUlocationID               195845
DOlocationID               991892
SR_Flag                         0
Affiliated_base_number    1153227
dtype: int64

In [78]:
# Q2. What's the average trip duration in January?
# Duration in minutes. Dividing the timedelta column by a one-minute Timedelta
# is a single vectorized operation, instead of calling a Python lambda per row.
df_train['duration'] = (
    (df_train['dropOff_datetime'] - df_train['pickup_datetime'])
    / pd.Timedelta(minutes=1)
)
df_train['duration'].mean() # 19.16

19.1672240937939

In [79]:
# Q2. How many records did you drop?
# Keep only trips lasting from 1 to 60 minutes; both bounds are inclusive.
before = df_train['duration'].count()
df_train = df_train[df_train['duration'].between(1, 60)]
after = df_train['duration'].count()
before - after # 44286

44286

In [80]:
# Q3. What's the fraction of missing values for the pickup location ID?
# I.e. the fraction of "-1"s after you filled the NAs.
# Note: fillna is applied to every column of the frame, not only the location IDs.
df_train = df_train.fillna(value=-1)
df_train['PUlocationID'].eq(-1).mean() # 83%

0.8352732770722617

In [81]:
# Q4. What's the dimensionality of this matrix? (The number of columns).
categorical = ['PUlocationID', 'DOlocationID']

# Cast the location IDs to strings so that each distinct ID is treated as a
# category when the records are vectorized.
df_train[categorical] = df_train[categorical].astype(str)
train_dicts = df_train[categorical].to_dict(orient='records')

# Fit the vectorizer on the training records; the fitted `dv` is reused later.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_train # 525

<1109826x525 sparse matrix of type '<class 'numpy.float64'>'
	with 2219652 stored elements in Compressed Sparse Row format>

In [82]:
# Q5. What's the RMSE on train?
target = 'duration'
y_train = df_train[target].values

# Fit a plain linear regression on the vectorized location features and
# predict on the same training matrix.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)

# RMSE is computed as the square root of the MSE. The `squared=False` keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, so this form works
# on both old and new releases.
mean_squared_error(y_train, y_pred) ** 0.5 # 10.52

10.52851910722287

In [84]:
# Q6. What's the RMSE on validation?
# February 2021 is the validation set; it goes through the same preparation
# steps as the training data.
df_val = pd.read_parquet('../data/fhv_tripdata_2021-02.parquet')

# Duration in minutes. Dividing the timedelta column by a one-minute Timedelta
# is vectorized, instead of calling a Python lambda per row.
df_val['duration'] = (
    (df_val['dropOff_datetime'] - df_val['pickup_datetime'])
    / pd.Timedelta(minutes=1)
)
# Keep only trips lasting from 1 to 60 minutes (inclusive).
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)]

df_val = df_val.fillna(-1)

df_val[categorical] = df_val[categorical].astype(str)
val_dicts = df_val[categorical].to_dict(orient='records')
# transform (not fit_transform): reuse the vectorizer fitted on the training
# data so the validation matrix has the same columns.
X_val = dv.transform(val_dicts)

y_val = df_val['duration'].values
y_pred = lr.predict(X_val)

# Square root of the MSE; the `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_val, y_pred) ** 0.5 # 11.01

11.014283229248326