In [23]:
import pyarrow.parquet as pq
import pandas as pd
from sqlalchemy import create_engine
from time import time

DATA_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet"
DB_URL = "postgresql://root:root@localhost:5432/ny_taxi"

engine = create_engine(DB_URL)

In [10]:
trips = pd.read_parquet(DATA_URL)
trips.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [6]:
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7f19e67199c0>

In [7]:
print(pd.io.sql.get_schema(trips, name='yellow_taxi_date', con=engine))


CREATE TABLE yellow_taxi_date (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count FLOAT(53), 
	trip_distance FLOAT(53), 
	"RatecodeID" FLOAT(53), 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53), 
	airport_fee FLOAT(53)
)




In [12]:
trips.head(n=0).to_sql(name='yellow_taxi_data', con=engine, if_exists='replace')

0

In [24]:
print("Inserting chuncks...")

batch_size = 10000
total_rows = len(trips)
num_batches = total_rows // batch_size + 1

for i in range(num_batches):
    t_start = time()
    start = i * batch_size
    end = min((i + 1) * batch_size, total_rows)

    batch_data = trips.iloc[start:end]

    batch_data.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')
    t_end = time()

    print(f'...Inserted {end} lines, took {t_end - t_start}s')


Inserting chuncks...
...Inserted 10000 lines, took -3.191936492919922s
...Inserted 20000 lines, took -2.6458539962768555s
...Inserted 30000 lines, took -2.444611072540283s
...Inserted 40000 lines, took -2.7511343955993652s
...Inserted 50000 lines, took -2.884680986404419s
...Inserted 60000 lines, took -3.549940347671509s
...Inserted 70000 lines, took -4.088824033737183s
...Inserted 80000 lines, took -3.391531467437744s
...Inserted 90000 lines, took -3.0700459480285645s
...Inserted 100000 lines, took -4.164892911911011s
...Inserted 110000 lines, took -4.342038631439209s
...Inserted 120000 lines, took -4.597356557846069s
...Inserted 130000 lines, took -5.284945964813232s
...Inserted 140000 lines, took -4.746995449066162s
...Inserted 150000 lines, took -4.692772150039673s
...Inserted 160000 lines, took -5.268951416015625s
...Inserted 170000 lines, took -3.315607786178589s
...Inserted 180000 lines, took -4.1458234786987305s
...Inserted 190000 lines, took -2.733144760131836s
...Inserted 200