In [None]:
import pandas as pd
# Parquet I/O below relies on pyarrow; if it is missing, run: %pip install pyarrow

### Q1. Downloading the data
# Read the January and February 2023 yellow-taxi trip files directly from their
# URLs; the number of columns in the January frame is reported as the Q1 answer.
df_jan, df_feb = (
    pd.read_parquet(parquet_url)
    for parquet_url in (
        "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet",
        "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet",
    )
)
print(f"Q1. Downloading the data -> {df_jan.shape[1]}")

### Q2. Computing duration
# Trip duration as a Timedelta column: drop-off timestamp minus pick-up timestamp.
df_jan["duration"] = df_jan["tpep_dropoff_datetime"] - df_jan["tpep_pickup_datetime"]
df_feb["duration"] = df_feb["tpep_dropoff_datetime"] - df_feb["tpep_pickup_datetime"]

# Standard deviation of the January durations, in minutes.
# Timedelta.seconds is only the whole-seconds component (it ignores the days part
# and any sub-second precision), so total_seconds() is used to convert the full value.
print(f"Q2. Computing duration -> {df_jan['duration'].std().total_seconds() / 60}")

### Q3. Dropping outliers
# Duration in minutes, computed with the vectorised .dt accessor.
# total_seconds() keeps the day component and the sign of the Timedelta; the
# .seconds attribute does not, so a trip longer than 24 h (or one with a negative
# duration) would wrap into the 0-1440 minute range and could pass the filter below.
df_jan["duration_min"] = df_jan["duration"].dt.total_seconds() / 60
df_feb["duration_min"] = df_feb["duration"].dt.total_seconds() / 60

# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
# .copy() yields independent frames, so columns can be assigned on them later
# without pandas treating them as slices of df_jan / df_feb.
df_jan_without_duration_outliers = df_jan[df_jan["duration_min"].between(1, 60, inclusive="both")].copy()
df_feb_without_duration_outliers = df_feb[df_feb["duration_min"].between(1, 60, inclusive="both")].copy()

# Fraction of January records that survive the filter.
print(f"Q3. Dropping outliers -> {len(df_jan_without_duration_outliers) / len(df_jan)}")

### Q4. One-hot encoding
import re  # not used by the visible cells; kept in case other cells rely on it

import numpy as np
from sklearn.feature_extraction import DictVectorizer

# Pick-up / drop-off location IDs are the only features fed to the vectorizer.
LOCATION_COLUMNS = ["PULocationID", "DOLocationID"]

# DictVectorizer one-hot encodes string values (numeric values would be passed
# through as numbers), so the IDs are cast to str before building the records.
df_jan_without_duration_outliers['PULocationID'] = df_jan_without_duration_outliers['PULocationID'].astype(str)
df_jan_without_duration_outliers['DOLocationID'] = df_jan_without_duration_outliers['DOLocationID'].astype(str)
df_jan_PU_DO_IDS = df_jan_without_duration_outliers[LOCATION_COLUMNS]
df_jan_dict = df_jan_PU_DO_IDS.to_dict('records')

df_feb_without_duration_outliers['PULocationID'] = df_feb_without_duration_outliers['PULocationID'].astype(str)
df_feb_without_duration_outliers['DOLocationID'] = df_feb_without_duration_outliers['DOLocationID'].astype(str)
df_feb_PU_DO_IDS = df_feb_without_duration_outliers[LOCATION_COLUMNS]
df_feb_dict = df_feb_PU_DO_IDS.to_dict('records')

# Fit the encoder on the training month (January) only.
dict_jan_vectorizer = DictVectorizer(dtype=np.uint8, sparse=False)
df_jan_vectorized = dict_jan_vectorizer.fit_transform(df_jan_dict)
df_jan_feature_map = pd.DataFrame(df_jan_vectorized, columns=dict_jan_vectorizer.get_feature_names_out())

# Encode the validation month (February) with the SAME fitted encoder so that both
# matrices share one feature space and column order. transform() ignores location
# IDs that were not seen during fit, instead of creating extra columns for them.
dict_feb_vectorizer = dict_jan_vectorizer
df_feb_vectorized = dict_feb_vectorizer.transform(df_feb_dict)
df_feb_feature_map = pd.DataFrame(df_feb_vectorized, columns=dict_feb_vectorizer.get_feature_names_out())

print(f"Q4. One-hot encoding -> {len(df_jan_feature_map.columns)}")

### Q5. Training a model. RMSE on train
### Q6. Evaluating the model. RMSE on validation
# January: downcast the target (duration in minutes) to float32, store it back on
# the filtered trips, and attach it positionally to the feature map as "duration".
jan_target = df_jan_without_duration_outliers["duration_min"].astype(np.float32)
df_jan_without_duration_outliers["duration_min"] = jan_target
df_jan_feature_map["duration"] = jan_target.to_numpy()

# February: same treatment.
feb_target = df_feb_without_duration_outliers["duration_min"].astype(np.float32)
df_feb_without_duration_outliers["duration_min"] = feb_target
df_feb_feature_map["duration"] = feb_target.to_numpy()

# Persist both feature maps as gzip-compressed parquet files for the next cell.
for feature_map, parquet_path in (
    (df_jan_feature_map, 'df_jan_feature_map.parquet.gzip'),
    (df_feb_feature_map, "df_feb_feature_map.parquet.gzip"),
):
    feature_map.to_parquet(parquet_path, compression='gzip')

In [None]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import root_mean_squared_error

# Reload the feature maps written by the previous cell.
df_jan_feature_map = pd.read_parquet('df_jan_feature_map.parquet.gzip')
df_feb_feature_map = pd.read_parquet("df_feb_feature_map.parquet.gzip")

jan_cols = df_jan_feature_map.columns.values
feb_cols = df_feb_feature_map.columns.values

# A column present in only one month is added to the other month as an all-zero
# uint8 column, so both frames end up with the same set of columns.
include_feb_columns = list(set(jan_cols).difference(feb_cols))
for jan_only_col in include_feb_columns:
    df_feb_feature_map[jan_only_col] = np.zeros(len(df_feb_feature_map), dtype=np.uint8)

include_jan_columns = list(set(feb_cols).difference(jan_cols))
for feb_only_col in include_jan_columns:
    df_jan_feature_map[feb_only_col] = np.zeros(len(df_jan_feature_map), dtype=np.uint8)

# Sort the columns by name so both frames share the same column order.
df_jan_feature_map = df_jan_feature_map.sort_index(axis=1)
df_feb_feature_map = df_feb_feature_map.sort_index(axis=1)

# The feature/target split below is positional: every column except the last is a
# feature. That only holds while "duration" is the final column after sorting, so
# check it explicitly instead of silently training on the wrong target.
assert df_jan_feature_map.columns[-1] == "duration", "expected 'duration' to be the last column"

# Features: all one-hot columns, converted to CSR for the regression.
X_jan_train = df_jan_feature_map.iloc[:, :-1].values
X_jan_train_sparse = sparse.csr_matrix(X_jan_train)

# Target: trip duration, selected by name.
y_jan_train = df_jan_feature_map["duration"].values

# Fit on January and report the RMSE on the same (training) data.
reg_jan = LinearRegression().fit(X_jan_train_sparse, y_jan_train)
y_jan_pred = reg_jan.predict(X_jan_train_sparse)
print(f"Q5. Training a model. RMSE on train -> {root_mean_squared_error(y_jan_train, y_jan_pred)}")

# The model was fitted on a plain matrix, so February must present exactly the same
# columns in the same order as January, with "duration" last for the positional
# feature slice. Verify both instead of assuming them.
assert df_feb_feature_map.columns[-1] == "duration", "expected 'duration' to be the last column"
assert df_feb_feature_map.columns.equals(df_jan_feature_map.columns), "January/February columns differ"

# Features: all one-hot columns, converted to CSR as for training.
X_feb_test = df_feb_feature_map.iloc[:, :-1].values
X_feb_test_sparse = sparse.csr_matrix(X_feb_test)

# Target: trip duration, selected by name.
y_feb_test = df_feb_feature_map["duration"].values

# Score the January-trained model on February and report the validation RMSE.
y_feb_pred = reg_jan.predict(X_feb_test_sparse)
print(f"Q6. Evaluating the model. RMSE on validation -> {root_mean_squared_error(y_feb_test, y_feb_pred)}")