In [1]:
import os
from time import time
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

In [2]:
# Relative path of the directory that holds the downloaded CSV files.
DATA_DIR = "data"

In [None]:
!wget https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2021-01.csv --directory-prefix=data

In [None]:
!wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv --directory-prefix=data

In [5]:
# Read the taxi zone lookup CSV from DATA_DIR into a DataFrame.
zones_csv_path = os.path.join(DATA_DIR, 'taxi+_zone_lookup.csv')
zones_df = pd.read_csv(zones_csv_path)

In [6]:
# Display the zone lookup table; the rendered output elides the middle rows.
zones_df

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone
...,...,...,...,...
260,261,Manhattan,World Trade Center,Yellow Zone
261,262,Manhattan,Yorkville East,Yellow Zone
262,263,Manhattan,Yorkville West,Yellow Zone
263,264,Unknown,NV,


In [7]:
# Connection settings are read from the standard libpq environment variables so
# that real credentials never have to be stored in the notebook. When a variable
# is unset, the fallback reproduces the original local development URL
# (root:root@localhost:5432/ny_taxi).
pg_user = os.environ.get('PGUSER', 'root')
pg_password = os.environ.get('PGPASSWORD', 'root')
pg_host = os.environ.get('PGHOST', 'localhost')
pg_port = os.environ.get('PGPORT', '5432')
pg_database = os.environ.get('PGDATABASE', 'ny_taxi')

# User and password are URL-escaped: characters such as '@', ':' or '/' would
# otherwise be parsed as part of the URL structure.
engine = create_engine(
    'postgresql://%s:%s@%s:%s/%s' % (
        quote_plus(pg_user),
        quote_plus(pg_password),
        pg_host,
        pg_port,
        pg_database,
    )
)

In [9]:
# Preview the CREATE TABLE statement pandas derives for the zone lookup frame
# on this engine.
zones_ddl = pd.io.sql.get_schema(zones_df, name='yellow_taxi_zones', con=engine)
print(zones_ddl)


CREATE TABLE yellow_taxi_zones (
	"LocationID" BIGINT, 
	"Borough" TEXT, 
	"Zone" TEXT, 
	service_zone TEXT
)




In [10]:
%time zones_df.to_sql(name='yellow_taxi_zones', con=engine, if_exists='replace')

Wall time: 1.54 s


In [11]:
# Stream the trip records in 100,000-row chunks instead of loading the whole
# file into memory at once.
# store_and_fwd_flag is pinned to str so pandas does not have to infer its type
# piece by piece, which can produce mixed types and a DtypeWarning part-way
# through the ingest. Missing values remain NaN.
# NOTE(review): the column name is taken from the TLC yellow-taxi schema —
# confirm it is the column the warning refers to.
df_iter = pd.read_csv(
    os.path.join(DATA_DIR, 'yellow_tripdata_2021-01.csv'),
    iterator=True,
    chunksize=100000,
    dtype={'store_and_fwd_flag': str},
)

In [12]:
# Pull the first chunk from the reader. This advances df_iter, so re-running
# the cell yields the following chunk rather than the same one.
trips_df = next(df_iter)

In [13]:
# Size of the first chunk as (rows, columns).
trips_df.shape

(100000, 18)

In [14]:
# Convert the pickup/dropoff columns of the first chunk to datetime values
# before the frame is used to create and fill the database table.
for ts_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    trips_df[ts_col] = pd.to_datetime(trips_df[ts_col])

In [15]:
# (Re)create the target table from the column layout alone: a zero-row slice
# keeps the columns and dtypes but inserts no data.
empty_trips = trips_df.head(n=0)
empty_trips.to_sql(name='yellow_taxi_data', con=engine, if_exists='replace')

In [16]:
# Append the rows of the first chunk to the existing yellow_taxi_data table.
trips_df.to_sql(
    con=engine,
    name='yellow_taxi_data',
    if_exists='append',
)

In [17]:
# Ingest every remaining chunk from the reader: convert the two timestamp
# columns, append the rows to yellow_taxi_data and report how long each chunk
# took (conversion plus insert).
for chunk in df_iter:
    started_at = time()

    for ts_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
        chunk[ts_col] = pd.to_datetime(chunk[ts_col])
    chunk.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')

    elapsed = time() - started_at
    print('Inserted another chunk, took %.3f second' % elapsed)

Inserted another chunk, took 32.929 second
Inserted another chunk, took 16.429 second
Inserted another chunk, took 15.810 second
Inserted another chunk, took 15.814 second
Inserted another chunk, took 15.787 second
Inserted another chunk, took 15.955 second
Inserted another chunk, took 16.606 second
Inserted another chunk, took 17.031 second
Inserted another chunk, took 15.891 second
Inserted another chunk, took 16.551 second
Inserted another chunk, took 17.255 second


  if (await self.run_code(code, result,  async_=asy)):


Inserted another chunk, took 16.090 second
Inserted another chunk, took 10.271 second
