# Ingest NY yellow taxi trip data into Postgres

In [1]:
import pandas as pd

In [2]:
# Load the yellow-taxi trip records from the local parquet file using the pyarrow engine.
df = pd.read_parquet(
    path="yellow_trip_2021.parquet",
    engine="pyarrow",
)

In [3]:
# Show the loaded DataFrame as the cell output.
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,1.0,2.10,1.0,N,142,43,2,8.00,3.00,0.5,0.00,0.00,0.3,11.80,2.5,
1,1,2021-01-01 00:51:20,2021-01-01 00:52:19,1.0,0.20,1.0,N,238,151,2,3.00,0.50,0.5,0.00,0.00,0.3,4.30,0.0,
2,1,2021-01-01 00:43:30,2021-01-01 01:11:06,1.0,14.70,1.0,N,132,165,1,42.00,0.50,0.5,8.65,0.00,0.3,51.95,0.0,
3,1,2021-01-01 00:15:48,2021-01-01 00:31:01,0.0,10.60,1.0,N,138,132,1,29.00,0.50,0.5,6.05,0.00,0.3,36.35,0.0,
4,2,2021-01-01 00:31:49,2021-01-01 00:48:21,1.0,4.94,1.0,N,68,33,1,16.50,0.50,0.5,4.06,0.00,0.3,24.36,2.5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1369764,2,2021-01-31 23:03:00,2021-01-31 23:33:00,,8.89,,,229,181,0,27.78,0.00,0.5,7.46,0.00,0.3,38.54,,
1369765,2,2021-01-31 23:29:00,2021-01-31 23:51:00,,7.43,,,41,70,0,32.58,0.00,0.5,0.00,6.12,0.3,39.50,,
1369766,2,2021-01-31 23:25:00,2021-01-31 23:38:00,,6.26,,,74,137,0,16.85,0.00,0.5,3.90,0.00,0.3,24.05,,
1369767,6,2021-01-31 23:01:06,2021-02-01 00:02:03,,19.70,,,265,188,0,53.68,0.00,0.5,0.00,0.00,0.3,54.48,,


Generate DDL statements to create table in Postgres

Connect to Postgres with SQLAlchemy

In [5]:
from sqlalchemy import create_engine

In [6]:
import os

# Connection URL for the ny_taxi database. Inside the docker network the host
# must be the `postgres` service name instead of localhost. Set NY_TAXI_DB_URL
# to override the default, e.g. to keep real credentials out of the notebook.
engine = create_engine(
    os.environ.get("NY_TAXI_DB_URL", "postgresql://root:root@postgres:5432/ny_taxi")
)

# Open a connection once so an unreachable database fails here, and close it
# again right away instead of leaving it open for the rest of the session.
with engine.connect():
    pass

<sqlalchemy.engine.base.Connection at 0x7fc0ac4ff7d0>

In [8]:
# Build the CREATE TABLE statement pandas derives from `df` for this engine, then print it.
create_table_sql = pd.io.sql.get_schema(frame=df, name="yellow_taxi_data", con=engine)
print(create_table_sql)


CREATE TABLE yellow_taxi_data (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count FLOAT(53), 
	trip_distance FLOAT(53), 
	"RatecodeID" FLOAT(53), 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53), 
	airport_fee FLOAT(53)
)




Create an empty table from the column headers of `df`; the columns will then be visible in the `yellow_taxi_data` table of the `ny_taxi` database.

In [10]:
# Write a zero-row slice of `df`: this (re)creates the table with the columns but no data.
df.iloc[:0].to_sql(
    name='yellow_taxi_data',
    con=engine,
    if_exists='replace',
)

0

Now push all rows into the database in chunks

In [26]:
import pyarrow.parquet as pq
from time import time

# Number of rows requested per batch when iterating over the parquet file.
chunksize = 100_000
# Handle on the parquet file; the ingest loop reads it batch by batch via iter_batches.
file = pq.ParquetFile('yellow_trip_2021.parquet')

In [28]:
%%time
for i, table in enumerate(file.iter_batches(batch_size=chunksize)):
    start = time()
    df = table.to_pandas()
    df.to_sql('yellow_taxi_data', con=engine, if_exists='append')
    end = time()
    print(f'successfully pushing {len(df)} data into database... take {round(end - start, 2)} second')

successfully pushing 100000 data into database... take 18.33 second
successfully pushing 100000 data into database... take 20.08 second
successfully pushing 100000 data into database... take 19.47 second
successfully pushing 100000 data into database... take 18.94 second
successfully pushing 100000 data into database... take 18.78 second
successfully pushing 100000 data into database... take 18.06 second
successfully pushing 100000 data into database... take 18.15 second
successfully pushing 100000 data into database... take 22.76 second
successfully pushing 100000 data into database... take 19.6 second
successfully pushing 100000 data into database... take 19.37 second
successfully pushing 100000 data into database... take 18.24 second
successfully pushing 100000 data into database... take 18.09 second
successfully pushing 100000 data into database... take 17.91 second
successfully pushing 69769 data into database... take 11.67 second
CPU times: user 2min 44s, sys: 1.13 s, total: 2min

# Ingest Taxi Zone Maps and Lookup data into Postgres

In [2]:
# Fetch the taxi zone lookup table directly from the remote CSV.
zone_df = pd.read_csv(
    filepath_or_buffer="https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv",
)

In [7]:
# (Re)create the `zones` table from the lookup DataFrame.
zone_df.to_sql(
    con=engine,
    name="zones",
    if_exists='replace',
)

265