# Reading Parquet Files


In [28]:
import pandas as pd
import pyarrow.parquet as pq
import dask.dataframe as dd

In [2]:
%%time
# Read the whole yellow_tripdata_2021-01 Parquet file into a single in-memory
# DataFrame; the %%time cell magic reports how long the cell takes to run.
df = pd.read_parquet("./yellow_tripdata_2021-01.parquet")
# Display (row count, column count) as the cell result.
df.shape

CPU times: user 667 ms, sys: 268 ms, total: 935 ms
Wall time: 1.33 s


(1369769, 19)

In [3]:
# Replace every column label with its lower-case form (the labels are mutated
# on the existing frame), then preview the first rows.
df.columns = list(map(str.lower, df.columns))
df.head()

Unnamed: 0,vendorid,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,ratecodeid,store_and_fwd_flag,pulocationid,dolocationid,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,1.0,2.1,1.0,N,142,43,2,8.0,3.0,0.5,0.0,0.0,0.3,11.8,2.5,
1,1,2021-01-01 00:51:20,2021-01-01 00:52:19,1.0,0.2,1.0,N,238,151,2,3.0,0.5,0.5,0.0,0.0,0.3,4.3,0.0,
2,1,2021-01-01 00:43:30,2021-01-01 01:11:06,1.0,14.7,1.0,N,132,165,1,42.0,0.5,0.5,8.65,0.0,0.3,51.95,0.0,
3,1,2021-01-01 00:15:48,2021-01-01 00:31:01,0.0,10.6,1.0,N,138,132,1,29.0,0.5,0.5,6.05,0.0,0.3,36.35,0.0,
4,2,2021-01-01 00:31:49,2021-01-01 00:48:21,1.0,4.94,1.0,N,68,33,1,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5,


In [4]:

# Build the CREATE TABLE statement pandas derives from `df` for a table named
# "taxi_data". No connection is passed here, so the statement uses the generic
# column types visible in the output (INTEGER, REAL, TEXT, TIMESTAMP).
data_schema = pd.io.sql.get_schema(df, "taxi_data",)
print(data_schema)

CREATE TABLE "taxi_data" (
"vendorid" INTEGER,
  "tpep_pickup_datetime" TIMESTAMP,
  "tpep_dropoff_datetime" TIMESTAMP,
  "passenger_count" REAL,
  "trip_distance" REAL,
  "ratecodeid" REAL,
  "store_and_fwd_flag" TEXT,
  "pulocationid" INTEGER,
  "dolocationid" INTEGER,
  "payment_type" INTEGER,
  "fare_amount" REAL,
  "extra" REAL,
  "mta_tax" REAL,
  "tip_amount" REAL,
  "tolls_amount" REAL,
  "improvement_surcharge" REAL,
  "total_amount" REAL,
  "congestion_surcharge" REAL,
  "airport_fee" REAL
)


In [5]:
# Show the pandas dtype of every column, for comparison with the SQL column
# types printed by the schema cells.
df.dtypes

vendorid                          int64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
ratecodeid                      float64
store_and_fwd_flag               object
pulocationid                      int64
dolocationid                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
dtype: object

In [6]:
import os

from sqlalchemy import create_engine

# Connection URL for the target Postgres database. It can be overridden through
# the NY_TAXI_DB_URL environment variable so real credentials do not have to be
# committed with the notebook; the default keeps the local development setup.
db_url = os.environ.get("NY_TAXI_DB_URL", "postgresql://root:root@localhost:5432/ny_taxi")
engine = create_engine(db_url)

# Fail fast if the database is unreachable. The context manager closes the
# probe connection again instead of leaving it open for the kernel's lifetime.
with engine.connect():
    pass

# With `con=engine` the CREATE TABLE statement is rendered for the engine's
# dialect (Postgres types such as BIGINT / FLOAT(53) in the output).
data_schema = pd.io.sql.get_schema(df, "yellow_taxi_data_parquet_2021", con=engine)
print(data_schema)


CREATE TABLE yellow_taxi_data_parquet_2021 (
	vendorid BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count FLOAT(53), 
	trip_distance FLOAT(53), 
	ratecodeid FLOAT(53), 
	store_and_fwd_flag TEXT, 
	pulocationid BIGINT, 
	dolocationid BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53), 
	airport_fee FLOAT(53)
)




## Create table yellow_taxi_data_parquet_2021 (drop and recreate it if it already exists)

In [8]:
# %time df.to_sql(name="yellow_taxi_data_parquet_2021", con=engine, if_exists="replace")

## Insert data into the table, appending to it if it already exists

In [34]:
# Open the Parquet file with pyarrow and iterate over it in record batches of
# up to 100,000 rows instead of loading the whole file at once.
pfile = pq.ParquetFile("./yellow_tripdata_2021-01.parquet")
table_iter = pfile.iter_batches(batch_size=100_000)
# Pull the first batch now; later cells keep consuming `table_iter` from here.
table = next(table_iter)
df = table.to_pandas()
# Write only the header (zero rows) so the table is (re)created empty with this
# batch's column layout; if_exists="replace" drops an existing table first.
# NOTE: to_sql defaults to index=True, so the DataFrame index is stored as an
# extra column in the table.
# NOTE(review): unlike the DataFrame loaded with pd.read_parquet earlier, these
# column labels are not lower-cased before being written - confirm that is intended.
df.head(0).to_sql(name="yellow_taxi_data_parquet_2021", con=engine, if_exists="replace")


# for table in pfile.iter_batches(batch_size=100_000):
#     df = table.to_pandas()
#     print (df.shape)
#     print(df.head())
#     break

In [35]:
### Append the rows currently held in `df` to the table; if_exists="append"
### adds to an existing table instead of replacing it. %time reports the cost.
%time df.to_sql(name="yellow_taxi_data_parquet_2021", con=engine, if_exists="append")

CPU times: user 15 s, sys: 3.46 s, total: 18.4 s
Wall time: 3min 20s


## Inserting the rest of the data using a loop

In [36]:
from time import time

# Drain the batches still left in `table_iter` and append each one to the
# Postgres table, reporting how long every chunk took.
#
# A plain `for` loop stops cleanly when the iterator is exhausted. The previous
# `while True` / `except Exception` form also swallowed real failures (lost
# connection, schema mismatch, ...) and still printed "Finished inserting data";
# such errors now propagate and stop the cell.
for table in table_iter:
    # The timer covers the pandas conversion and the insert of this batch.
    t_start = time()

    df = table.to_pandas()
    df.to_sql(name="yellow_taxi_data_parquet_2021", con=engine, if_exists="append")

    t_end = time()

    print(f"Inserted another chunk of size {len(df)}: took {t_end - t_start:.3f} seconds")

print("Finished inserting data")

Inserted another chunk of size 100000: took 686.767 seconds
Inserted another chunk of size 100000: took 523.307 seconds
Inserted another chunk of size 100000: took 431.707 seconds
Inserted another chunk of size 100000: took 298.314 seconds
Inserted another chunk of size 100000: took 302.448 seconds
Inserted another chunk of size 100000: took 445.260 seconds
Inserted another chunk of size 100000: took 264.606 seconds
Inserted another chunk of size 100000: took 262.404 seconds
Inserted another chunk of size 100000: took 264.087 seconds
Inserted another chunk of size 100000: took 258.781 seconds
Inserted another chunk of size 100000: took 238.951 seconds
Inserted another chunk of size 100000: took 247.990 seconds
Inserted another chunk of size 69769: took 177.091 seconds

Finished inserting data


In [44]:
import os
import subprocess
from urllib.parse import urlparse

# Download the source Parquet file with wget into the current directory.
# NOTE(review): other cells read ./yellow_tripdata_2021-01.parquet, so on a
# fresh kernel this cell has to run before them.
url = "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2021-01.parquet"

# Derive the local file name from the URL *path* only, so a query string or
# fragment can never end up in the file name.
filename = os.path.basename(urlparse(url).path)

# Pass the arguments as a list (no shell involved): characters such as `&`, `?`
# or spaces in the URL cannot be interpreted by a shell.
# `status` is wget's exit code; 0 means the download succeeded.
status = subprocess.run(["wget", url, "-O", filename], check=False).returncode
status

0