In [1]:
# Download the January and February 2021 FHV trip-data parquet files.
# `-O` sets the local filename each download is written to (overwriting any
# existing file of that name in the current working directory).
!wget https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-01.parquet -O jan21.parquet
!wget https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-02.parquet -O feb21.parquet


--2022-05-19 21:04:44--  https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-01.parquet
Resolving nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)... 52.216.93.171
Connecting to nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)|52.216.93.171|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11886281 (11M) [binary/octet-stream]
Saving to: ‘jan21.parquet’


2022-05-19 21:04:45 (18.0 MB/s) - ‘jan21.parquet’ saved [11886281/11886281]

--2022-05-19 21:04:45--  https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-02.parquet
Resolving nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)... 52.216.93.171
Connecting to nyc-tlc.s3.amazonaws.com (nyc-tlc.s3.amazonaws.com)|52.216.93.171|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10645466 (10M) [binary/octet-stream]
Saving to: ‘feb21.parquet’


2022-05-19 21:04:46 (16.2 MB/s) - ‘feb21.parquet’ saved [10645466/10645466]



In [2]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator

In [3]:
def read_parquet(filename: str) -> pd.DataFrame:
  """Load a trip-data parquet file and prepare it for duration modelling.

  The function:
    1. reads ``filename`` and prints the raw shape and column index;
    2. adds a ``duration`` column (minutes between ``pickup_datetime`` and
       ``dropOff_datetime``) and prints its mean over *all* rows;
    3. keeps only rows with ``1 <= duration <= 60``;
    4. fills missing ``PUlocationID`` / ``DOlocationID`` with ``-1`` and
       prints the fraction of kept rows whose pickup ID was missing;
    5. casts both location ID columns to ``str``.

  Args:
    filename: Path to a parquet file containing ``pickup_datetime``,
      ``dropOff_datetime``, ``PUlocationID`` and ``DOlocationID`` columns.

  Returns:
    A new, filtered DataFrame with the extra ``duration`` column and
    string-typed location ID columns.
  """
  df = pd.read_parquet(filename)
  print(f'DataFrame shape - {df.shape}', df.columns, sep='\n')

  # Vectorised timedelta -> minutes conversion; avoids calling a Python
  # lambda once per row.
  elapsed = df['dropOff_datetime'] - df['pickup_datetime']
  df['duration'] = elapsed.dt.total_seconds() / 60
  print(f'Mean duration - {df.duration.mean()}')

  # Explicit copy: the column assignments below must write to an independent
  # frame, not to a slice of the unfiltered one (chained-assignment warning).
  df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

  df['PUlocationID'] = df['PUlocationID'].fillna(-1)
  df['DOlocationID'] = df['DOlocationID'].fillna(-1)

  # Guard the ratio so a file with no rows inside the duration window
  # reports nan instead of raising ZeroDivisionError.
  n_rows = len(df)
  n_missing_pickup = int((df['PUlocationID'] == -1).sum())
  na_fraction = n_missing_pickup / n_rows if n_rows else float('nan')
  print(f'Fraction of NA PUlocationID = {na_fraction}')

  categorical = ['PUlocationID', 'DOlocationID']
  df[categorical] = df[categorical].astype(str)
  return df

In [4]:
def train_and_val(model: BaseEstimator, train_df: pd.DataFrame, val_df: pd.DataFrame = None):
  """Fit ``model`` on one-hot encoded features and print train/validation RMSE.

  The feature columns and the target column are taken from the module-level
  names ``features`` and ``target``, which must be defined before this
  function is called.

  Args:
    model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
      in place.
    train_df: Frame containing the ``features`` columns and the ``target``
      column used for fitting.
    val_df: Optional frame with the same columns. When given, it is encoded
      with the vectorizer fitted on ``train_df`` and its RMSE is printed too.

  Returns:
    None. Results are reported via ``print``.
  """
  from math import sqrt  # local import keeps the notebook's import cell unchanged

  def rmse(y_true, y_pred) -> float:
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the
    # `squared` argument is deprecated and removed in newer scikit-learn.
    return sqrt(mean_squared_error(y_true, y_pred))

  train_dicts = train_df[features].to_dict(orient='records')
  dv = DictVectorizer()
  X_train = dv.fit_transform(train_dicts)
  print(f'Train feature shape - {X_train.shape}')
  y_train = train_df[target].values
  model.fit(X_train, y_train)
  print(f'Train RMSE - {rmse(y_train, model.predict(X_train))}')
  if val_df is not None:
    val_dicts = val_df[features].to_dict(orient='records')
    # transform only: validation rows reuse the encoding learned on train.
    X_val = dv.transform(val_dicts)
    y_val = val_df[target].values
    print(f'Validation RMSE - {rmse(y_val, model.predict(X_val))}')
 

In [5]:
# January 2021 becomes the training frame, February 2021 the validation frame.
# Unpacking consumes the map in order, so the files are read January first.
train_df, val_df = map(read_parquet, ('./jan21.parquet', './feb21.parquet'))


DataFrame shape - (1154112, 7)
Index(['dispatching_base_num', 'pickup_datetime', 'dropOff_datetime',
       'PUlocationID', 'DOlocationID', 'SR_Flag', 'Affiliated_base_number'],
      dtype='object')
Mean duration - 19.1672240937939
Fraction of NA PUlocationID = 0.8352732770722617
DataFrame shape - (1037692, 7)
Index(['dispatching_base_num', 'pickup_datetime', 'dropOff_datetime',
       'PUlocationID', 'DOlocationID', 'SR_Flag', 'Affiliated_base_number'],
      dtype='object')
Mean duration - 20.70698622520125
Fraction of NA PUlocationID = 0.8571354986754037


In [6]:
# Module-level settings read by train_and_val: the column to predict and the
# columns to encode as model inputs.
target = 'duration'
features = ['PUlocationID', 'DOlocationID']

# Baseline: ordinary least squares fitted on the training frame and scored on
# both the training and validation frames.
lr = LinearRegression()
train_and_val(model=lr, train_df=train_df, val_df=val_df)

Train feature shape - (1109826, 525)
Train RMSE - 10.528519107210744
Validation RMSE - 11.014283196111764
