In [45]:
import pandas as pd
import requests
import io
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
import pickle

In [39]:
# Monthly yellow-taxi trip files to download (January and February 2023).
urls = [
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet",
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet"
]

# Seconds to wait for the server (to connect, and between chunks of data).
# Without a timeout, requests.get can block forever on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 60

dataframes = []
for url in urls:
    print(f"Reading from {url}")
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()

    # Parse the downloaded bytes in memory. The per-file frame gets its own
    # name so it is not confused with the combined `df` built below.
    month_df = pd.read_parquet(io.BytesIO(response.content), engine="pyarrow")
    # Lower-case the column names so later cells can refer to them uniformly.
    month_df.columns = month_df.columns.str.lower()
    dataframes.append(month_df)

# Stack the monthly frames into one and make sure both timestamps are datetimes.
df = pd.concat(dataframes, ignore_index=True)
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])


Reading from https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
Reading from https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet


In [40]:
# Sanity check: (rows, columns) of the combined DataFrame.
df.shape

(5980721, 19)

In [41]:
# Trip duration in minutes: drop-off minus pick-up, converted from a
# timedelta to seconds and then to minutes.
df['duration_minutes'] = (
    df['tpep_dropoff_datetime']
    .sub(df['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)

In [42]:
# Spread of trip durations for trips picked up in January, before any
# outlier filtering.
std_duration = df.loc[
    df['tpep_pickup_datetime'].dt.month.eq(1), 'duration_minutes'
].std()
print(f"Standard deviation of trip duration in January 2023: {std_duration:.2f}")

Standard deviation of trip duration in January 2023: 42.59


In [43]:
# Trips whose pickup month is January.
# NOTE(review): the mask checks the month only, so a January pickup from any
# other year would also match — confirm that is acceptable for this data.
january_mask = df['tpep_pickup_datetime'].dt.month.eq(1)
df_january = df.loc[january_mask].copy()

# Drop duration outliers: keep trips lasting between 1 and 60 minutes, inclusive.
df_january = df_january.loc[df_january['duration_minutes'].between(1, 60)]

# Share of January trips that survive the duration filter.
fraction_remaining = len(df_january) / int(january_mask.sum())
print(f"Fraction remaining: {fraction_remaining:.2%}")


Fraction remaining: 98.12%


In [None]:

# NOTE(review): this cell repeats the encoding cell that follows it; one of
# the two can probably be dropped.

# Pick the pickup/drop-off location IDs and cast them to strings. astype
# already returns a new frame with the same column names, so no explicit
# copy or relabelling is needed.
df_encoded = df_january[['pulocationid', 'dolocationid']].astype(str)

# One dictionary per trip
dicts = df_encoded.to_dict(orient='records')

# Fit the vectorizer and build the training feature matrix
dv = DictVectorizer()
X_train = dv.fit_transform(dicts)

print(f"Feature matrix shape: {X_train.shape}")


Fraction remaining: 0.9807


In [44]:
# Build the training features from the pickup and drop-off location IDs.
# The IDs are cast to strings so that DictVectorizer one-hot encodes them
# as categories instead of passing them through as numbers.
df_encoded = df_january[['pulocationid', 'dolocationid']].astype(str)
dicts = df_encoded.to_dict(orient='records')

# Fit on the January records; the fitted `dv` is reused for validation.
dv = DictVectorizer()
X_train = dv.fit_transform(dicts)

print(f"Feature matrix shape: {X_train.shape}")

Feature matrix shape: (3009145, 515)


In [46]:
# Regression target: trip duration in minutes for the filtered January trips.
y_train = df_january['duration_minutes'].to_numpy()

# Fit an ordinary least-squares model (fit returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# In-sample error: RMSE of the predictions on the training data.
y_pred = model.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_train, y_pred))
print(f"Train RMSE: {rmse:.2f}")

# Persist the fitted vectorizer together with the model, as one tuple, so
# both can be restored with a single load.
with open('model.bin', 'wb') as model_file:
    pickle.dump((dv, model), model_file)

Train RMSE: 7.65


In [None]:
# Restore the (vectorizer, model) tuple from model.bin.
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load a model.bin that this notebook produced.
with open('model.bin', 'rb') as model_file:
    dv, model = pickle.load(model_file)

# February trips, restricted to durations between 1 and 60 minutes, inclusive.
df_feb = df.loc[df['tpep_pickup_datetime'].dt.month.eq(2)].copy()
df_feb = df_feb.loc[df_feb['duration_minutes'].between(1, 60)]

# Encode the location IDs with the already-fitted vectorizer (transform
# only, no refit), so the columns line up with what the model was fitted on.
df_val_encoded = df_feb[['pulocationid', 'dolocationid']].astype(str)
dicts_val = df_val_encoded.to_dict(orient='records')
X_val = dv.transform(dicts_val)

# RMSE of the model's predictions on the February trips.
y_val = df_feb['duration_minutes'].to_numpy()
y_pred = model.predict(X_val)
rmse_val = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse_val:.2f}")


Validation RMSE: 7.81
