In [1]:
import pandas as pd

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [4]:
def read_dataframe(filename, categorical=('PUlocationID', 'DOlocationID'),
                   min_duration=1, max_duration=60):
    """Load a trip-record parquet file and prepare it for modelling.

    Steps, in order:
      1. read the parquet file with ``pd.read_parquet``;
      2. add a ``duration`` column holding
         ``dropOff_datetime - pickup_datetime`` expressed in minutes;
      3. keep only rows whose duration lies in
         ``[min_duration, max_duration]`` (both bounds inclusive);
      4. replace every remaining missing value with ``-1``;
      5. cast the ``categorical`` columns to ``str``.

    Parameters
    ----------
    filename : str or path-like
        Location of the parquet file; passed straight to ``pd.read_parquet``.
    categorical : iterable of str, optional
        Columns to convert to strings. Defaults to the pickup / drop-off
        location id columns.
    min_duration, max_duration : number, optional
        Inclusive bounds, in minutes, for the duration filter.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with the extra ``duration`` column.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    elapsed = df['dropOff_datetime'] - df['pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    # Rows with an undefined duration compare False on both sides and are dropped.
    in_range = (df.duration >= min_duration) & (df.duration <= max_duration)
    df = df[in_range]

    # NOTE: this fills missing values in *every* column, not only the
    # categorical ones, so a missing location id becomes the sentinel -1.
    df = df.fillna(-1)

    columns = list(categorical)
    df[columns] = df[columns].astype(str)

    return df

In [5]:
# January 2021 is the training month, February 2021 the validation month.
df_train, df_val = (
    read_dataframe(parquet_path)
    for parquet_path in (
        '../data/fhv_tripdata_2021-01.parquet',
        '../data/fhv_tripdata_2021-02.parquet',
    )
)

In [7]:
# Columns fed to the vectorizer.
categorical = ['PUlocationID', 'DOlocationID']

# One {column: value} dict per row, for each split.
train_dicts = df_train[categorical].to_dict(orient='records')
val_dicts = df_val[categorical].to_dict(orient='records')

# Fit the vectorizer on the training rows only, then reuse the fitted
# mapping for the validation rows so both matrices share the same columns.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [8]:
# Regression target: the `duration` column of each split, as plain arrays.
target = 'duration'
y_train, y_val = (split[target].values for split in (df_train, df_val))

In [9]:
# Fit a plain linear regression on the one-hot location features and
# report the RMSE on the training data.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE gives the same RMSE for a 1-D target
# and works on every scikit-learn version.
mean_squared_error(y_train, y_pred) ** 0.5 # 10.52

10.52851910722287

In [11]:
# RMSE of the fitted model on the validation data.
y_pred = lr.predict(X_val)
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# the square root of the MSE is the same RMSE for a 1-D target.
mean_squared_error(y_val, y_pred) ** 0.5 # 11.01

11.014283229248326