In [1]:
import pandas as pd
from sqlalchemy import create_engine
from time import time

DATA_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet"
DB_URL = "postgresql://root:root@localhost:5432/ny_taxi"

engine = create_engine(DB_URL)
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7f761406f7f0>

In [2]:
trips = pd.read_parquet(DATA_URL)
trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[ns]
 2   tpep_dropoff_datetime  datetime64[ns]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [3]:
trips.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [4]:
print(pd.io.sql.get_schema(trips, name='yellow_taxi_data', con=engine))


CREATE TABLE yellow_taxi_data (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count FLOAT(53), 
	trip_distance FLOAT(53), 
	"RatecodeID" FLOAT(53), 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53), 
	airport_fee FLOAT(53)
)




In [None]:
trips.head(n=0).to_sql(name='yellow_taxi_data', con=engine, if_exists='replace') #insert the columns names in database

In [6]:
print("Inserting chuncks...")

batch_size = 10000
total_rows = len(trips)
num_batches = total_rows // batch_size + 1

start = time()

for i in range(num_batches):
    t_start = time()
    start = i * batch_size
    end = min((i + 1) * batch_size, total_rows)

    batch_data = trips.iloc[start:end]

    batch_data.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')
    t_end = time()

    print('inserted another % chunk, took %.3f second' % (end, t_end - t_start))

final = time()
print(f'Done. Process time: {final - start} seconds')

Inserting chuncks...
...Inserted 10000 lines, took 1.330118179321289s
...Inserted 20000 lines, took 1.9294378757476807s
...Inserted 30000 lines, took 1.5324962139129639s
...Inserted 40000 lines, took 1.758413553237915s
...Inserted 50000 lines, took 1.4920721054077148s
...Inserted 60000 lines, took 1.3539330959320068s
...Inserted 70000 lines, took 1.9001877307891846s
...Inserted 80000 lines, took 1.553354263305664s
...Inserted 90000 lines, took 1.277716875076294s
...Inserted 100000 lines, took 2.6942191123962402s
...Inserted 110000 lines, took 1.4658644199371338s
...Inserted 120000 lines, took 2.545239210128784s
...Inserted 130000 lines, took 3.086322546005249s
...Inserted 140000 lines, took 2.008918285369873s
...Inserted 150000 lines, took 1.9032797813415527s
...Inserted 160000 lines, took 2.333380937576294s
...Inserted 170000 lines, took 2.6177570819854736s
...Inserted 180000 lines, took 2.0898594856262207s
...Inserted 190000 lines, took 1.7590129375457764s
...Inserted 200000 lines, t

In [16]:
query = """
SELECT * FROM yellow_taxi_data LIMIT 5
"""

pd.read_sql(query, con=engine)

Unnamed: 0,index,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
