In [38]:
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer

import warnings
from matplotlib_inline import backend_inline
from pandas.errors import SettingWithCopyWarning

# Silence pandas' SettingWithCopyWarning for the rest of the session.
warnings.simplefilter("ignore", SettingWithCopyWarning)
# Render inline matplotlib figures as SVG.
backend_inline.set_matplotlib_formats('svg')

In [39]:
# Load the January 2023 yellow-taxi trip records and report the column count.
df = pd.read_parquet('../data/yellow_tripdata_2023-01.parquet')
df.shape[1]

19

## Q1. Downloading the data
Read the data for January. How many columns are there?
* 16
* 17
* 18
* 19 ✔

In [40]:
# Trip duration in minutes: drop-off minus pick-up, converted from seconds.
df['duration'] = (
    df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
).dt.total_seconds() / 60
# Standard deviation of the duration over all (unfiltered) trips.
df['duration'].std()

42.59435124195458

## Q2. Computing duration
What's the standard deviation of the trip durations in January?
* 32.59
* 42.59 ✔
* 52.59
* 62.59

In [41]:
# Row count before filtering, used as the denominator below.
df_size = len(df)
# Keep trips lasting between 1 and 60 minutes (both bounds inclusive).
df_new = df[df['duration'].between(1, 60)].copy()
# Fraction of the original records that survive the filter.
len(df_new) / df_size

0.9812202822125979

## Q3. Dropping outliers
What fraction of the records is left after you dropped the outliers?

* 90%
* 92%
* 95%
* 98% ✔

In [42]:
# List the column names of the filtered frame.
df_new.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee', 'duration'],
      dtype='object')

In [59]:
# Columns used as categorical features for one-hot encoding.
categorical = ['PULocationID', 'DOLocationID']

# Normalise the IDs to integer-formatted strings (missing values become '-1').
# The validation preprocessing also casts through 'int' before 'str'; doing the
# same here keeps the feature names learned by the DictVectorizer identical to
# the ones produced for validation. Without the int cast, a float-typed ID
# column would yield '161.0' instead of '161' and the encodings would not match.
df_new[categorical] = df_new[categorical].fillna(-1).astype('int').astype('str')

In [60]:
# Count missing values per column of the filtered frame.
df_new.isna().sum()

VendorID                     0
tpep_pickup_datetime         0
tpep_dropoff_datetime        0
passenger_count          71105
trip_distance                0
RatecodeID               71105
store_and_fwd_flag       71105
PULocationID                 0
DOLocationID                 0
payment_type                 0
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tolls_amount                 0
improvement_surcharge        0
total_amount                 0
congestion_surcharge     71105
airport_fee              71105
duration                     0
dtype: int64

In [61]:
# Convert the categorical columns into one dict per row, the input format
# passed to the DictVectorizer; preview the first ten records.
train_dicts = df_new.loc[:, categorical].to_dict(orient='records')
train_dicts[0:10]


[{'PULocationID': '161', 'DOLocationID': '141'},
 {'PULocationID': '43', 'DOLocationID': '237'},
 {'PULocationID': '48', 'DOLocationID': '238'},
 {'PULocationID': '138', 'DOLocationID': '7'},
 {'PULocationID': '107', 'DOLocationID': '79'},
 {'PULocationID': '161', 'DOLocationID': '137'},
 {'PULocationID': '239', 'DOLocationID': '143'},
 {'PULocationID': '142', 'DOLocationID': '200'},
 {'PULocationID': '164', 'DOLocationID': '236'},
 {'PULocationID': '141', 'DOLocationID': '107'}]

In [62]:
# Fit the vectorizer on the training dicts and encode them in one pass.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
# Number of distinct features learned (one entry per feature in the vocabulary).
len(dv.vocabulary_)


515

In [63]:
# Shape of the encoded training matrix.
X_train.shape

(3009173, 515)

## Q4. One-hot encoding
What's the dimensionality of this matrix (number of columns)?
* 2
* 155
* 345
* 515 ✔
* 715

In [65]:
# Regression target: trip duration in minutes, as a NumPy array.
y_train = df_new['duration'].to_numpy()


In [77]:
# Fit a plain linear regression on the one-hot encoded training features and
# predict on the same data to obtain the training error.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)
# RMSE is the square root of the MSE. The `squared=False` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6, so take the root here to
# keep the cell working on both old and new versions.
mean_squared_error(y_train, y_pred) ** 0.5




7.6492619241381785

In [None]:
## Q5. Training a model

What's the RMSE on train?

* 3.64
* 7.64 ✔
* 11.64
* 16.64

In [74]:
# Load the February 2023 trips as the validation set, using the same duration
# computation and 1-60 minute filter as for the January data.
df_val = pd.read_parquet('../data/yellow_tripdata_2023-02.parquet')
df_val['duration'] = (
    df_val['tpep_dropoff_datetime'] - df_val['tpep_pickup_datetime']
).dt.total_seconds() / 60
df_val = df_val[df_val['duration'].between(1, 60)].copy()

# Encode the location IDs as integer-formatted strings; missing values become '-1'.
df_val[categorical] = df_val[categorical].fillna(-1).astype('int').astype('str')



In [75]:
# Encode the validation rows with the vectorizer fitted on the training data
# (transform only, no refit) and predict with the trained model.
val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_pred = lr.predict(X_val)
y_val = df_val['duration'].values
# RMSE is the square root of the MSE. The `squared=False` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6, so take the root here to
# keep the cell working on both old and new versions.
mean_squared_error(y_val, y_pred) ** 0.5






7.811818380595618


## Q6. Evaluating the model
What's the RMSE on validation?

* 3.81
* 7.81 ✔
* 11.81
* 16.81