In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression


from sklearn.metrics import mean_squared_error

import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

In [2]:
# Yellow-taxi trip records for January and February 2022, one parquet file per month
jan_path = './data/yellow_tripdata_2022-01.parquet'
feb_path = './data/yellow_tripdata_2022-02.parquet'

df_jan = pd.read_parquet(jan_path)
df_feb = pd.read_parquet(feb_path)

In [2]:
# def read_dataframe(filename):
#     if filename.endswith('.csv'):
#         df = pd.read_csv(filename)
        
#     elif filename.endswith('.parquet'):
#         df = pd.read_parquet(filename)

#     df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
#     df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

#     df = df[(df.duration >= 1) & (df.duration <= 60)]

#     categorical = ['PULocationID', 'DOLocationID']
#     df[categorical] = df[categorical].astype(str)
    
#     return df

## Question 1
* Read the data for January. How many columns are there?


Answer:

D) 19

In [3]:
# The column count is the second entry of the frame's (rows, columns) shape
num_columns = df_jan.shape[1]

print("Number of columns:", num_columns)

Number of columns: 19


## Question 2

* Now let's compute the duration variable. It should contain the duration of a ride in minutes.

* What's the standard deviation of the trip durations in January?

Answer:

B) 46.45

In [4]:
# Ride duration in minutes. The vectorised `.dt` accessor converts the whole
# timedelta column in one call instead of running a Python lambda per row, and
# matches how the helper functions further down compute the same quantity.
df_jan['duration'] = (
    df_jan['tpep_dropoff_datetime'] - df_jan['tpep_pickup_datetime']
).dt.total_seconds() / 60

In [5]:
# Standard deviation of the January `duration` column
df_jan['duration'].std()

46.44530513776499

## Question 3

Next, we need to check the distribution of the duration variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

What fraction of the records is left after you drop the outliers?

Answer:
D) 98%

In [7]:
# Keep rides lasting 1 to 60 minutes; `between` includes both endpoints by default
within_range = df_jan['duration'].between(1, 60)
new_df = df_jan[within_range]

# Share of the original rows that survive the filter
frac_df = len(new_df) / len(df_jan)

print("Fraction of Records Left:", frac_df * 100)

Fraction of Records Left: 98.27547930522405


## Question 4
Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

* Turn the dataframe into a list of dictionaries
* Fit a dictionary vectorizer
* Get a feature matrix from it


What's the dimensionality of this matrix (number of columns)?

Answer:

A) 515

In [8]:
cats = ['PULocationID', 'DOLocationID']
target = ['duration']

# DictVectorizer one-hot encodes string values but passes numbers through as a
# single numeric feature, so the location IDs have to be strings.
# `astype` with a column mapping returns a new frame, which avoids assigning
# columns into the filtered slice of `df_jan` (the pattern pandas reports as
# SettingWithCopyWarning).
new_df = new_df.astype({col: str for col in cats})

# One {column: value} dictionary per ride
train_id = new_df[cats].to_dict('records')

# Learn the category vocabulary and build the feature matrix in one step
dv = DictVectorizer()
X_train = dv.fit_transform(train_id)

# Number of encoded feature columns
num_columns = X_train.shape[1]
print("Dimensionality of the feature matrix:", num_columns)

Dimensionality of the feature matrix: 515


## Question 5
Now let's use the feature matrix from the previous step to train a model.

* Train a plain linear regression model with default parameters
* Calculate the RMSE of the model on the training data

What's the RMSE on train?

Answer: 

A) 6.99

In [10]:
# Target vector: ride duration in minutes for the filtered January rides
y_train = new_df['duration'].values

# Plain least-squares regression with default parameters
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE gives the same RMSE on every version.
rmse = mean_squared_error(y_train, y_pred) ** 0.5
print("RMSE on train:", rmse)

RMSE on train: 6.986191072971965


## Question 6

Now let's apply this model to the validation dataset (February 2022).

What's the RMSE on validation?

Answer:

A) 7.79

In [11]:
def _prepare_trips(df):
    """Build model inputs from raw trip records without modifying ``df``.

    Computes the ride duration in minutes from the pickup/dropoff timestamps,
    keeps rides lasting between 1 and 60 minutes (inclusive) and returns the
    pickup/dropoff location IDs of those rides as string-valued dictionaries.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    tuple
        ``(feature_dicts, durations)``: a list with one dict per kept ride and
        an array with the matching durations in minutes.
    """
    categorical_cols = ['PULocationID', 'DOLocationID']

    duration = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60
    in_range = duration.between(1, 60)

    # IDs are cast to str so DictVectorizer treats them as categories.
    feature_dicts = df.loc[in_range, categorical_cols].astype(str).to_dict('records')

    return feature_dicts, duration[in_range].values


def process_and_fit(df):
    """Fit a DictVectorizer and a linear regression on the given trips.

    Unlike the earlier version, the input frame is left untouched: no
    ``duration`` column is added to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw trip records used for training.

    Returns
    -------
    tuple
        ``(lr, dv)``: the fitted ``LinearRegression`` and the fitted
        ``DictVectorizer``.
    """
    feature_dicts, y_train = _prepare_trips(df)

    dv = DictVectorizer()
    X_train = dv.fit_transform(feature_dicts)

    lr = LinearRegression()
    lr.fit(X_train, y_train)

    return lr, dv


def transform_and_evaluate(df, model, dv):
    """Score a fitted model on the given trips and return the RMSE.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw trip records to evaluate on; not modified.
    model
        Fitted estimator exposing ``predict``.
    dv
        Fitted vectorizer exposing ``transform``.

    Returns
    -------
    float
        Root mean squared error of the predicted durations, in minutes.
    """
    feature_dicts, y_eval = _prepare_trips(df)

    X_eval = dv.transform(feature_dicts)
    y_preds = model.predict(X_eval)

    # `squared=False` was removed in scikit-learn 1.6; the square root of the
    # MSE is the same value and works on every version.
    return mean_squared_error(y_eval, y_preds) ** 0.5


def train_and_evaluate(df_train, df_eval):
    """Fit on ``df_train``, evaluate on ``df_eval`` and return the RMSE."""
    model, dv = process_and_fit(df_train)
    return transform_and_evaluate(df_eval, model, dv)

In [12]:
# Fit on the January trips, then score that fitted model on the February trips
model, dv = process_and_fit(df_jan)
rmse = transform_and_evaluate(df_feb, model=model, dv=dv)

In [13]:
# NOTE(review): the label says "validation train", but `rmse` was computed by
# transform_and_evaluate on df_feb, i.e. it is the validation score.
print("RMSE on validation train:", rmse)

RMSE on validation train: 7.786409044614996
