In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error

In [2]:
# Load the January 2021 trip file (path is relative to the notebook's working directory).
df_jan = pd.read_parquet(path='./fhv_tripdata_2021-01.parquet')

### Q1

In [3]:
# Row count of the unfiltered January frame; the outlier cell further down
# subtracts the filtered length from this to report how many rows were dropped.
n_records = df_jan.shape[0]
print('January dataframe records:',n_records)

January dataframe records: 1154112


### Q2

In [4]:
# Trip duration in minutes. The timestamp difference is a timedelta Series, so
# the vectorized .dt accessor converts it in one pass instead of calling a
# Python lambda once per row.
df_jan['duration'] = (df_jan.dropOff_datetime - df_jan.pickup_datetime).dt.total_seconds() / 60

In [5]:
# Mean trip duration in minutes, computed before any outlier filtering.
print('Average duration:',df_jan['duration'].mean())

Average duration: 19.167224093791006


In [6]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not trigger pandas' SettingWithCopyWarning.
df_jan = df_jan[(df_jan.duration >= 1) & (df_jan.duration <= 60)].copy()

In [7]:
# Rows removed by the duration filter: original row count minus what is left.
print(f'Dropped {n_records - df_jan.shape[0]} outliers')

Dropped 44286 outliers


### Q3

In [8]:
# Replace missing pickup / drop-off location IDs with the sentinel -1 so that
# "unknown location" becomes an ordinary category value.
df_jan[['PUlocationID', 'DOlocationID']] = df_jan[['PUlocationID', 'DOlocationID']].fillna(-1)

In [9]:
# Share of rows whose pickup location carries the -1 "missing" sentinel:
# count the matching rows with a boolean mask and divide by the total row count.
print('Fraction of missing values for PUlocationID:',
      (df_jan['PUlocationID'] == -1).sum() / len(df_jan))

Fraction of missing values for PUlocationID: 0.8352732770722617


### Q4

In [10]:
# Columns used as categorical features. They are cast to strings so that
# DictVectorizer one-hot encodes them rather than treating them as numbers.
categorical = ['PUlocationID', 'DOlocationID']
df_jan[categorical] = df_jan[categorical].astype({name: str for name in categorical})

In [11]:
# Build the training feature matrix: every row becomes a {column: value} dict,
# and the vectorizer is fitted on those dicts and applied in one step.
dv = DictVectorizer()
train_dicts = df_jan[categorical].to_dict(orient='records')

X_train = dv.fit_transform(train_dicts)

print('Dimensionality X_train :',X_train.shape)

Dimensionality X_train : (1109826, 525)


### Q5

In [12]:
# Fit a plain linear regression on the one-hot encoded location features and
# score it on the same data it was trained on.
target = 'duration'
y_train = df_jan[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# NOTE: the printed value is the ROOT mean squared error (in minutes), despite
# the label. The square root is taken explicitly because the `squared=False`
# keyword of mean_squared_error was deprecated in scikit-learn 1.4 and later
# removed, so passing it fails on current releases.
print('Mean Squared Error on Train:',np.sqrt(mean_squared_error(y_train, y_pred)))

Mean Squared Error on Train: 10.52851938406329


### Q6

In [13]:
def read_dataframe(filename):
    """Load a trip file and prepare it for the duration model.

    Steps:
      1. Read the file (``.csv`` or ``.parquet``, chosen by extension).
      2. Add a ``duration`` column: ``dropOff_datetime - pickup_datetime`` in minutes.
      3. Keep only rows whose duration is between 1 and 60 minutes (inclusive).
      4. Fill missing ``PUlocationID`` / ``DOlocationID`` with -1 and cast both
         columns to strings so they can be used as categorical features.

    Parameters
    ----------
    filename : str
        Path to the input file. Must end in ``.csv`` or ``.parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with the extra ``duration`` column and string-typed
        location ID columns.

    Raises
    ------
    ValueError
        If ``filename`` has neither a ``.csv`` nor a ``.parquet`` extension.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        # CSV stores timestamps as text; parse the two columns the duration
        # is actually computed from below.
        df['dropOff_datetime'] = pd.to_datetime(df['dropOff_datetime'])
        df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Fail with a clear message instead of hitting an unbound `df` below.
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected '.csv' or '.parquet'"
        )

    # Vectorized timedelta -> minutes conversion.
    df['duration'] = (df['dropOff_datetime'] - df['pickup_datetime']).dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the original frame so the column
    # assignments below do not raise SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PUlocationID', 'DOlocationID']

    # -1 marks "location unknown"; the string cast turns the IDs into categories.
    df[categorical] = df[categorical].fillna(-1).astype(str)

    return df

In [14]:
# February 2021 trips, loaded and preprocessed by read_dataframe; used as the validation set.
df_feb = read_dataframe(filename='./fhv_tripdata_2021-02.parquet')

In [15]:
# Encode the validation rows with the vectorizer fitted on January:
# transform only, no refit, so the feature columns match X_train.
val_dicts = df_feb[categorical].to_dict('records')
X_val = dv.transform(val_dicts)

In [16]:
# Score the January-trained model on the February validation set.
preds = lr.predict(X_val)
actual = df_feb['duration'].values

# NOTE: the printed value is the ROOT mean squared error (in minutes), despite
# the label. The square root is taken explicitly because the `squared=False`
# keyword of mean_squared_error was deprecated in scikit-learn 1.4 and later
# removed, so passing it fails on current releases.
print('Mean Squared Error on Validation:',np.sqrt(mean_squared_error(actual, preds)))

Mean Squared Error on Validation: 11.014286963512422
