### Explore the CSV file

In [1]:
import os
import pathlib
from urllib.parse import quote_plus

import pandas as pd

In [2]:
# Read green_tripdata_2019-01.csv from the local "data" directory and show the
# resulting frame as the cell output.
csv_path = pathlib.Path("data") / "green_tripdata_2019-01.csv"
data = pd.read_csv(csv_path)
data

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2018-12-21 15:17:29,2018-12-21 15:18:57,N,1,264,264,5,0.00,3.0,0.5,0.5,0.00,0.0,,0.3,4.30,2,1,
1,2,2019-01-01 00:10:16,2019-01-01 00:16:32,N,1,97,49,2,0.86,6.0,0.5,0.5,0.00,0.0,,0.3,7.30,2,1,
2,2,2019-01-01 00:27:11,2019-01-01 00:31:38,N,1,49,189,2,0.66,4.5,0.5,0.5,0.00,0.0,,0.3,5.80,1,1,
3,2,2019-01-01 00:46:20,2019-01-01 01:04:54,N,1,189,17,2,2.68,13.5,0.5,0.5,2.96,0.0,,0.3,19.71,1,1,
4,2,2019-01-01 00:19:06,2019-01-01 00:39:43,N,1,82,258,1,4.53,18.0,0.5,0.5,0.00,0.0,,0.3,19.30,2,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
630913,2,2019-01-31 23:08:27,2019-01-31 23:22:59,N,1,255,226,1,3.33,13.0,0.5,0.5,2.14,0.0,,0.3,18.39,1,1,0.0
630914,2,2019-01-31 23:21:26,2019-01-31 23:23:05,N,1,75,151,1,0.72,4.0,0.5,0.5,1.06,0.0,,0.3,6.36,1,1,0.0
630915,2,2019-01-31 23:30:05,2019-01-31 23:36:14,N,1,75,238,1,1.75,7.0,0.5,0.5,0.00,0.0,,0.3,8.30,1,1,0.0
630916,2,2019-01-31 23:59:58,2019-02-01 00:04:18,N,1,74,74,1,0.57,5.0,0.5,0.5,1.00,0.0,,0.3,7.30,1,1,0.0


### Convert some columns and export SQL schema

In [3]:
# read_csv loads both timestamp columns as plain strings; convert them to
# datetimes so the generated SQL schema types them as timestamps.
data["lpep_pickup_datetime"] = pd.to_datetime(data["lpep_pickup_datetime"])
data["lpep_dropoff_datetime"] = pd.to_datetime(data["lpep_dropoff_datetime"])

In [4]:
# Build the CREATE TABLE statement pandas derives from the frame when no
# database connection is supplied, then print it.
generic_ddl = pd.io.sql.get_schema(data, "green_taxi_data")
print(generic_ddl)

CREATE TABLE "green_taxi_data" (
"VendorID" INTEGER,
  "lpep_pickup_datetime" TIMESTAMP,
  "lpep_dropoff_datetime" TIMESTAMP,
  "store_and_fwd_flag" TEXT,
  "RatecodeID" INTEGER,
  "PULocationID" INTEGER,
  "DOLocationID" INTEGER,
  "passenger_count" INTEGER,
  "trip_distance" REAL,
  "fare_amount" REAL,
  "extra" REAL,
  "mta_tax" REAL,
  "tip_amount" REAL,
  "tolls_amount" REAL,
  "ehail_fee" REAL,
  "improvement_surcharge" REAL,
  "total_amount" REAL,
  "payment_type" INTEGER,
  "trip_type" INTEGER,
  "congestion_surcharge" REAL
)


### Connect to the PostgreSQL database

In [5]:
from sqlalchemy import create_engine

In [6]:
# Build the connection URL from the standard libpq environment variables so
# credentials do not have to be written into the notebook. Every fallback
# reproduces the original local URL (root:root@localhost:5432/ny_taxi), so the
# notebook behaves exactly as before when none of the variables is set.
pg_user = os.environ.get("PGUSER", "root")
pg_password = os.environ.get("PGPASSWORD", "root")
pg_host = os.environ.get("PGHOST", "localhost")
pg_port = os.environ.get("PGPORT", "5432")
pg_database = os.environ.get("PGDATABASE", "ny_taxi")

# User and password are URL-escaped so characters such as "@" or "/" in a real
# password cannot break the URL parsing.
engine = create_engine(
    f"postgresql://{quote_plus(pg_user)}:{quote_plus(pg_password)}"
    f"@{pg_host}:{pg_port}/{pg_database}"
)

### Get SQL schema for PostgreSQL

In [7]:
# Same schema request as before, but passing the engine so pandas emits the
# DDL in the dialect of the connected database.
postgres_ddl = pd.io.sql.get_schema(data, "green_taxi_data", con=engine)
print(postgres_ddl)


CREATE TABLE green_taxi_data (
	"VendorID" BIGINT, 
	lpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	lpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	store_and_fwd_flag TEXT, 
	"RatecodeID" BIGINT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	ehail_fee FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	payment_type BIGINT, 
	trip_type BIGINT, 
	congestion_surcharge FLOAT(53)
)




### Push the CSV to the PostgreSQL database

#### Create the table using the column names of the dataset

In [8]:
# Write a zero-row slice of the frame: this (re)creates the table with the
# frame's columns and dtypes without inserting any data yet.
header_only = data.iloc[:0]
header_only.to_sql(name="green_taxi_data", con=engine, if_exists="replace")

0

In [9]:
import time

In [10]:
# Re-open the CSV as a chunked reader so the rows can be inserted 100,000 at a
# time. The reader is single-pass: re-run this cell before re-running the
# insert loop.
trips_csv = pathlib.Path("data") / "green_tripdata_2019-01.csv"
df_iter = pd.read_csv(trips_csv, iterator=True, chunksize=100_000)

In [11]:
for chunk in df_iter:
    started = time.time()

    # Each chunk comes straight from the CSV, so its timestamp columns are
    # still strings and must be parsed before being appended to the table.
    for column in ("lpep_pickup_datetime", "lpep_dropoff_datetime"):
        chunk[column] = pd.to_datetime(chunk[column])

    chunk.to_sql(name="green_taxi_data", con=engine, if_exists="append")

    elapsed = time.time() - started
    print(f"inserted another chunk ({len(chunk)})... took {elapsed} secondes")

inserted another chunk (100000)... took 9.785968780517578 secondes
inserted another chunk (100000)... took 9.545292377471924 secondes
inserted another chunk (100000)... took 9.453013896942139 secondes
inserted another chunk (100000)... took 9.855462312698364 secondes
inserted another chunk (100000)... took 9.358324527740479 secondes
inserted another chunk (100000)... took 9.338106393814087 secondes
inserted another chunk (30918)... took 2.9756436347961426 secondes


### Also push an extra table

In [12]:
# Load the zone lookup CSV from the local "data" directory and preview the
# first rows.
zones_path = pathlib.Path("data") / "taxi+_zone_lookup.csv"
df_zones = pd.read_csv(zones_path)
df_zones.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [13]:
# Write the whole lookup frame in one call, replacing any existing "zones"
# table.
df_zones.to_sql(
    name="zones",
    con=engine,
    if_exists="replace",
)

265

#### Check the DB content

In [14]:
# Open a connection only to confirm the database is reachable. The `with`
# block returns it to the pool straight away; a bare `engine.connect()` would
# leave the connection open, kept alive by the notebook's output history.
with engine.connect() as connection:
    print(connection)

<sqlalchemy.engine.base.Connection at 0x7ff777070520>

In [15]:
# List every table that lives outside PostgreSQL's own system schemas.
query = """
SELECT *
FROM pg_catalog.pg_tables
WHERE schemaname != 'pg_catalog'
AND schemaname != 'information_schema'
"""

pd.read_sql(sql=query, con=engine)

Unnamed: 0,schemaname,tablename,tableowner,tablespace,hasindexes,hasrules,hastriggers,rowsecurity
0,public,green_taxi_data,root,,True,False,False,False
1,public,zones,root,,True,False,False,False


In [16]:
# Count the rows now stored in the trips table.
query = """
SELECT COUNT(1) FROM green_taxi_data;
"""

pd.read_sql(sql=query, con=engine)

Unnamed: 0,count
0,630918
