In [1]:
import pandas as pd

In [2]:
taxi_jan = pd.read_parquet("data/yellow_tripdata_2023-01.parquet")

In [3]:
taxi_jan.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [4]:
num_cols = len(taxi_jan.columns)
print(f"Number of columns in the New York Taxi January data: {num_cols}")

Number of columns in the New York Taxi January data: 19


In [5]:
taxi_jan["duration"] = (taxi_jan["tpep_dropoff_datetime"] - taxi_jan["tpep_pickup_datetime"]).dt.total_seconds() / 60

In [6]:
std_taxi_jan_duration = taxi_jan["duration"].std()
print(f"The standard deviation of the trips duration in January: {std_taxi_jan_duration}")

The standard deviation of the trips duration in January: 42.59435124195458


In [7]:
original_records = len(taxi_jan)

# Keep only the records where the duration was between 1 and 60 minutes (inclusive)
taxi_jan = taxi_jan[(taxi_jan["duration"] >= 1) & (taxi_jan["duration"] <= 60)]
kept_records = len(taxi_jan)

print(f"Fraction of the records left after you dropped the outliers: {kept_records / original_records * 100}")

Fraction of the records left after you dropped the outliers: 98.1220282212598


In [8]:
taxi_jan[["PULocationID", "DOLocationID"]] = taxi_jan[["PULocationID", "DOLocationID"]].astype(str)

In [9]:
taxi_jan = taxi_jan[["PULocationID", "DOLocationID", "duration"]]

In [10]:
X_train = taxi_jan.drop("duration", axis=1)
y_train = taxi_jan["duration"]

In [11]:
train_dicts = X_train.to_dict(orient="records")

In [12]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [13]:
print(f"Number of columns of the matrix X: {X_train.shape[1]}")

Number of columns of the matrix X: 515


In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

y_pred = lin_reg.predict(X_train)
train_rmse = root_mean_squared_error(y_train, y_pred)
print(f"Train RMSE: {train_rmse}")

Train RMSE: 7.649261929201487


In [15]:
taxi_feb = pd.read_parquet("data/yellow_tripdata_2023-02.parquet")
taxi_feb["duration"] = (taxi_feb["tpep_dropoff_datetime"] - taxi_feb["tpep_pickup_datetime"]).dt.total_seconds() / 60
taxi_feb = taxi_feb[(taxi_feb["duration"] >= 1) & (taxi_feb["duration"] <= 60)]
taxi_feb = taxi_feb[["PULocationID", "DOLocationID", "duration"]]

X_test = taxi_feb.drop("duration", axis=1)
y_test = taxi_feb["duration"]

test_dicts = X_test.to_dict(orient="records")
X_test = dv.transform(test_dicts)

y_pred = lin_reg.predict(X_test)
test_rmse = root_mean_squared_error(y_test, y_pred)
print(f"Test RMSE: {test_rmse}")

Test RMSE: 13.322283504831706
