In [53]:
from typing import Tuple, List, Dict, Any

import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [2]:
## Fetch data
# Download the January and February 2023 yellow-taxi trip files into ~/data.
# `-c` resumes a partial download instead of restarting it (a file that is
# already complete is left alone), and `-P` sets the target directory.
!wget -cP ~/data https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
!wget -cP ~/data https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet

--2024-05-14 10:13:21--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 13.32.23.113, 13.32.23.129, 13.32.23.142, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|13.32.23.113|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.

--2024-05-14 10:13:21--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 13.32.23.129, 13.32.23.142, 13.32.23.113, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|13.32.23.129|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [47]:
# Raw January and February 2023 trip records, read from the parquet files
# downloaded into ~/data by the wget cell.
yellow_jan = pd.read_parquet('~/data/yellow_tripdata_2023-01.parquet')
yellow_feb = pd.read_parquet('~/data/yellow_tripdata_2023-02.parquet')

## Q1: How many columns are in the January data?

In [4]:
# Number of columns in the raw January frame.
yellow_jan.shape[1]

19

## Q2: Standard deviation of trip duration in January

In [5]:
def calculate_duration(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``duration`` column holding each trip's length in minutes.

    The duration is ``tpep_dropoff_datetime - tpep_pickup_datetime`` expressed
    as fractional minutes.

    Args:
        df: Trip records with ``tpep_pickup_datetime`` and
            ``tpep_dropoff_datetime`` datetime columns.

    Returns:
        The same ``df`` object, modified in place with the new ``duration``
        column, so the result can be reassigned or chained.
    """
    elapsed = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    # Use the vectorised ``.dt`` accessor rather than a per-row Python lambda:
    # it yields the same minutes without one Python call per trip, which
    # matters on a full month of records.
    df['duration'] = elapsed.dt.total_seconds() / 60
    return df

# Add the trip duration (in minutes) to the January data.
yellow_jan = calculate_duration(yellow_jan)

In [6]:
# Standard deviation of the January trip durations in minutes, computed before
# any outlier filtering (pandas uses the sample estimate, ddof=1, by default).
yellow_jan.duration.std()

42.594351241920904

## Q3: remove outliers

In [7]:
# Inclusive bounds, in minutes, on the trip durations kept by drop_outliers.
MIN_DURATION = 1
MAX_DURATION = 60

In [8]:
# Row count before dropping outliers; used as the denominator of the
# retained-records fraction.
before = len(yellow_jan)

In [9]:
def drop_outliers(df, min_duration: int, max_duration: int) -> pd.DataFrame:
    """Keep only the trips whose duration lies within the given bounds.

    Both bounds are inclusive. Rows with a missing ``duration`` fail the range
    check and are therefore dropped as well.

    Args:
        df: Trip records that already carry a ``duration`` column.
        min_duration: Smallest duration to keep.
        max_duration: Largest duration to keep.

    Returns:
        The rows of ``df`` whose duration is within the bounds, with their
        original index labels.
    """
    within_bounds = df.duration.between(min_duration, max_duration)
    return df.loc[within_bounds]

In [10]:
# Keep only the January trips lasting between MIN_DURATION and MAX_DURATION
# minutes (inclusive).
yellow_jan = drop_outliers(yellow_jan, MIN_DURATION, MAX_DURATION)

In [11]:
# Row count after dropping outliers.
after = len(yellow_jan)

In [12]:
# Fraction of the January records that remain after outlier removal.
after / before

0.9812202822125979

## Q4: One-hot encoding

In [13]:
# Pickup (PU) and drop-off (DO) location ID columns, used as the categorical
# features for the model.
CATEGORICAL_COLS = ['PULocationID', 'DOLocationID']

In [15]:
# Cast the location IDs to strings so DictVectorizer one-hot encodes them;
# numeric values would be passed through as a single numeric feature instead.
# Rebinding to the frame returned by ``astype`` avoids assigning columns into
# the filtered frame produced by ``drop_outliers``.
yellow_jan = yellow_jan.astype({col: str for col in CATEGORICAL_COLS})

In [16]:
# Vectorizer that turns per-trip feature dicts into an encoded feature matrix.
dv = DictVectorizer()

In [17]:
# One {column: value} dict per trip, then fit the vectorizer on those dicts and
# encode them into the training matrix in a single step.
train_dicts = yellow_jan[CATEGORICAL_COLS].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

In [18]:
# (number of trips, number of encoded feature columns)
X_train.shape

(3009173, 515)

## Q5: Linear Regression

In [57]:
def load_data(df_path: str) -> pd.DataFrame:
    """Read one month of trip records and prepare it for modelling.

    The parquet file at ``df_path`` is loaded, a ``duration`` column is added
    by ``calculate_duration``, and trips whose duration falls outside
    ``MIN_DURATION``..``MAX_DURATION`` are removed by ``drop_outliers``.

    Args:
        df_path: Location of the parquet file to read.

    Returns:
        The filtered trip records including the ``duration`` column.
    """
    return (
        pd.read_parquet(df_path)
        .pipe(calculate_duration)
        .pipe(drop_outliers, MIN_DURATION, MAX_DURATION)
    )

def get_features(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Build the per-trip feature dicts consumed by the vectorizer.

    Only the ``CATEGORICAL_COLS`` columns are used, and their values are cast
    to strings before being converted to one dict per row.

    Args:
        df: Trip records containing the ``CATEGORICAL_COLS`` columns.

    Returns:
        A list with one ``{column: value}`` dict per row of ``df``.
    """
    categorical = df[CATEGORICAL_COLS].astype(str)
    return categorical.to_dict(orient='records')

def train(df: pd.DataFrame) -> Tuple[LinearRegression, DictVectorizer]:
    """Fit a linear regression that predicts trip duration.

    A fresh ``DictVectorizer`` is fitted on the feature dicts from
    ``get_features`` and the regression is trained against ``df.duration``.

    Args:
        df: Trip records with the feature columns and a ``duration`` column.

    Returns:
        The fitted regression and the fitted vectorizer, in that order.
    """
    vectorizer = DictVectorizer()
    features = vectorizer.fit_transform(get_features(df))
    target = df.duration.to_numpy()

    # ``fit`` returns the estimator itself, so construction and fitting chain.
    model = LinearRegression().fit(features, target)
    return model, vectorizer

def evaluate(df: pd.DataFrame, reg: LinearRegression, dv: DictVectorizer) -> float:
    """Compute the RMSE of ``reg``'s duration predictions on ``df``.

    The already fitted ``dv`` is only applied (``transform``), never refitted,
    so ``df`` is encoded with the feature space the model was trained on.

    Args:
        df: Trip records with the feature columns and a ``duration`` column.
        reg: Fitted regression model.
        dv: Fitted vectorizer matching ``reg``.

    Returns:
        Root mean squared error between the actual and predicted durations.
    """
    features = dv.transform(get_features(df))
    predicted = reg.predict(features)
    actual = df.duration.to_numpy()
    return root_mean_squared_error(actual, predicted)


In [58]:
# Rebuild the January frame from the raw file via load_data (duration computed,
# outliers dropped).
yellow_jan = load_data('~/data/yellow_tripdata_2023-01.parquet')

In [59]:
# Fit the model on January; `dv` is rebound to the vectorizer fitted in train().
reg, dv = train(yellow_jan)

In [60]:
# RMSE (in minutes) on the same January data the model was trained on.
evaluate(yellow_jan, reg, dv)

7.6492619633678824

## Q6: Evaluation

In [62]:
# Score the January-trained model on February, prepared the same way and
# encoded with the vectorizer fitted on January.
yellow_feb = load_data('~/data/yellow_tripdata_2023-02.parquet')
evaluate(yellow_feb, reg, dv)

7.81181893596011