In [1]:
import pandas as pd
from datetime import datetime

In [2]:
pd.__version__

'1.4.4'

In [4]:
df = pd.read_csv('green_tripdata_2019-01.csv')

In [5]:
df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)

In [6]:
df.columns
df.size

12618360

In [7]:
from sqlalchemy import create_engine

In [10]:
engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')

In [11]:
engine.connect()

<sqlalchemy.engine.base.Connection at 0x7ff3242b9d30>

In [12]:
print(pd.io.sql.get_schema(df, name='green_trip_data', con=engine))


CREATE TABLE green_trip_data (
	"VendorID" BIGINT, 
	lpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	lpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	store_and_fwd_flag TEXT, 
	"RatecodeID" BIGINT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	ehail_fee FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	payment_type BIGINT, 
	trip_type BIGINT, 
	congestion_surcharge FLOAT(53)
)




In [13]:
df_iter = pd.read_csv('green_tripdata_2019-01.csv', iterator=True, chunksize=100000)

In [18]:
df = next(df_iter)

In [19]:
max(df.lpep_dropoff_datetime)

'2019-01-12 07:24:54'

In [20]:
df.size

2000000

In [21]:
df.lpep_pickup_datetime  = pd.to_datetime(df.lpep_pickup_datetime).dt.date
df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime).dt.date

In [22]:
df.head(0).to_sql(name='green_trip_data', con=engine, if_exists='replace')

0

In [24]:
%time df.to_sql(name='green_trip_data', con=engine, if_exists='append')

CPU times: user 4.22 s, sys: 91.9 ms, total: 4.31 s
Wall time: 9.5 s


1000

In [120]:
#"Question4: Largest trip for each day"
with engine.connect() as con:
    #res = con.execute('''select count(1) from green_trip_data where (EXTRACT(MONTH FROM lpep_pickup_datetime) = '01' and EXTRACT(DAY FROM lpep_pickup_datetime) = '15.0') AND (EXTRACT(MONTH FROM lpep_dropoff_datetime) = '01' AND EXTRACT(DAY FROM lpep_dropoff_datetime) = '15.0') ''')
    res = con.execute('''select TO_CHAR(lpep_pickup_datetime,'yyyy:mm:dd') from green_trip_data where trip_distance = (select max(trip_distance) from green_trip_data); ''')

    for data in res:
        print(data)



('2019:01:15',)


In [108]:
#Question 5: "The number of passengers"
with engine.connect() as con:
    res = con.execute('''select count(*) from green_trip_data where passenger_count  = 2 and TO_CHAR(lpep_pickup_datetime, 'yyyy:mm:dd')='2019:01:01' ''')
    res2 = con.execute('''select count(*) from green_trip_data where passenger_count  = 3 and TO_CHAR(lpep_pickup_datetime, 'yyyy:mm:dd')='2019:01:01' ''')
    
    for data in res:
        print("2: ", data)
    for data in res2:
        print("3: ", data)



2:  (1282,)
3:  (254,)


In [109]:
#"Question3: Count records"

with engine.connect() as con:
    res = con.execute('''select count(1) from green_trip_data where TO_CHAR(lpep_pickup_datetime, 'yyyy:mm:dd')='2019:01:15'  and TO_CHAR(lpep_dropoff_datetime, 'yyyy:mm:dd')='2019:01:15' ''')

    for data in res:
        print(data)



(20530,)


In [110]:
#"Question6: Largest tip for astoria zone"

with engine.connect() as con:

    res = con.execute('''select 
                    zdo."Zone" as "new_zone",
                    tip_amount
                from green_trip_data t,
                    zones zpc,
                    zones zdo
                where t."PULocationID"=zpc."LocationID" AND
                t."DOLocationID"=zdo."LocationID" AND zpc."Zone"='Astoria' order by tip_amount DESC LIMIT 1
                ; ''')
    for data in res:
        print(data)

('Long Island City/Queens Plaza', 88.0)


In [26]:
from time import time

In [27]:
while True: 
    t_start = time()

    df = next(df_iter)

    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    
    df.to_sql(name='green_taxi_data', con=engine, if_exists='append')

    t_end = time()

    print('inserted another chunk, took %.3f second' % (t_end - t_start))

inserted another chunk, took 9.775 second
inserted another chunk, took 9.756 second
inserted another chunk, took 9.691 second
inserted another chunk, took 9.849 second
inserted another chunk, took 3.070 second


StopIteration: 

In [83]:
df_zones = pd.read_csv('taxi+_zone_lookup.csv')

In [84]:
df_zones.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [18]:
df_zones.to_sql(name='zones', con=engine, if_exists='replace')

265

In [50]:
engine.get_execution_options

<bound method Engine.get_execution_options of Engine(postgresql://root:***@localhost:5432/ny_taxi)>