In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [2]:
## Fetch data
# Download the January and February 2023 yellow_tripdata parquet files into ~/data.
# wget flags: -c continues a partial download (and does nothing when the file is
# already complete, as in the recorded output below); -P sets the target directory.
!wget -cP ~/data https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
!wget -cP ~/data https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet

--2024-05-14 10:02:06--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 13.32.23.25, 13.32.23.113, 13.32.23.129, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|13.32.23.25|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.

--2024-05-14 10:02:06--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 13.32.23.25, 13.32.23.142, 13.32.23.129, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|13.32.23.25|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [3]:
# Read both monthly trip files from ~/data into DataFrames (January first, then February).
yellow_jan, yellow_feb = map(
    pd.read_parquet,
    (
        '~/data/yellow_tripdata_2023-01.parquet',
        '~/data/yellow_tripdata_2023-02.parquet',
    ),
)

## Q1: How many columns are in the January data?

In [4]:
# Number of columns: second entry of the frame's (rows, columns) shape.
yellow_jan.shape[1]

19

## Q2: Standard deviation of trip duration in January

In [5]:
# Trip duration in minutes: drop-off timestamp minus pick-up timestamp.
# The `.dt` accessor converts the whole timedelta column in one vectorised call,
# instead of invoking a Python lambda once per row through `.apply`.
yellow_jan['duration'] = (
    yellow_jan.tpep_dropoff_datetime - yellow_jan.tpep_pickup_datetime
).dt.total_seconds() / 60

In [6]:
# Standard deviation of the trip duration (minutes) before any outlier filtering.
yellow_jan['duration'].std()

42.594351241920904

## Q3: remove outliers

In [7]:
# Inclusive bounds, in minutes, for a trip to be kept; anything outside is an outlier.
MIN_DURATION, MAX_DURATION = 1, 60

In [8]:
# Row count prior to outlier removal (first entry of the frame's shape).
before = yellow_jan.shape[0]

In [9]:
def drop_outliers(df, min_duration: float, max_duration: float) -> pd.DataFrame:
    """Keep only the rows whose ``duration`` lies in ``[min_duration, max_duration]``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a numeric ``duration`` column.
    min_duration, max_duration : float
        Inclusive lower and upper bounds, in the same unit as ``df.duration``.

    Returns
    -------
    pd.DataFrame
        An independent copy of the matching rows, keeping their original index
        labels. ``df`` itself is not modified. Rows with a missing duration are
        dropped, because comparisons against NaN evaluate to False.
    """
    in_range = (df.duration >= min_duration) & (df.duration <= max_duration)
    # .copy() detaches the result from `df`, so callers can assign columns on it
    # without pandas emitting SettingWithCopyWarning.
    return df[in_range].copy()

In [10]:
# Restrict the January data to trips within the configured duration bounds.
yellow_jan = drop_outliers(
    yellow_jan,
    min_duration=MIN_DURATION,
    max_duration=MAX_DURATION,
)

In [11]:
# Row count once the outliers have been removed.
after = yellow_jan.shape[0]

In [12]:
# Fraction of the original rows that remain after dropping outliers.
after / before

0.9812202822125979

## Q4: One-hot encoding

In [13]:
# Columns used as categorical features for the model.
categorical = [
    'PULocationID',
    'DOLocationID',
]

In [14]:
# Cast the location IDs to strings so that DictVectorizer one-hot encodes them.
# Building a new frame with `astype` (instead of assigning columns onto the
# filtered frame) avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
yellow_jan = yellow_jan.astype({column: str for column in categorical})

In [15]:
# Vectorizer that turns feature dicts into a (sparse, by default) feature matrix.
dv = DictVectorizer(sparse=True)

In [16]:
# One dict per trip holding only the categorical columns, e.g. {'PULocationID': ..., 'DOLocationID': ...}.
train_dicts = yellow_jan.loc[:, categorical].to_dict(orient='records')
# Learn the feature vocabulary from the training dicts and encode them in one pass.
X_train = dv.fit_transform(train_dicts)

In [17]:
# (number of trips, number of one-hot feature columns) of the encoded training matrix.
X_train.shape

(3009173, 515)

## Q5: Linear Regression

In [18]:
# Plain least-squares linear model with an intercept term (the library default).
reg = LinearRegression(fit_intercept=True)

In [19]:
# Regression target: trip duration in minutes, as a NumPy array.
y_train = yellow_jan['duration'].to_numpy()

In [20]:
# Fit on the January features; the fitted estimator returned by `fit` is bound back to `reg`.
reg = reg.fit(X=X_train, y=y_train)

In [21]:
# Predictions on the same data the model was trained on.
y_pred = reg.predict(X=X_train)

In [22]:
# RMSE on the training data itself (no held-out set is involved in this cell).
root_mean_squared_error(y_true=y_train, y_pred=y_pred)

7.6492619633678824