In [3]:
import os
import re
from pathlib import Path
from time import time

import pandas as pd
from sqlalchemy import create_engine, inspect, Engine


In [4]:
DATA_DIR = (Path().resolve() / "data/")
DATA_FILES = list(DATA_DIR.walk())[0][2]
DATA_CHUNK_SIZE = 100_000
DEST_TABLE = "yellow_taxi_data"

In [7]:
# Input CSV files, both expected to live directly inside DATA_DIR.
green_tripdata = DATA_DIR.joinpath("green_tripdata_2019-10.csv")
taxi_zone = DATA_DIR.joinpath("taxi_zone_lookup.csv")


In [10]:
def camel_to_snake(name: str) -> str:
    """Convert a CamelCase identifier to snake_case.

    Runs of capitals are treated as a single acronym, so ``"VendorID"``
    becomes ``"vendor_id"`` and ``"PULocationID"`` becomes
    ``"pu_location_id"`` rather than being split letter by letter.
    Names that are already snake_case are returned unchanged (lower-cased).

    Args:
        name: Identifier to convert.

    Returns:
        The lower-case, underscore-separated form of ``name``.
    """
    # Boundary between a lower-case letter/digit and the capital that follows
    # it: "VendorID" -> "Vendor_ID".
    with_word_breaks = re.sub(r'(?<=[a-z0-9])(?=[A-Z])', '_', name)
    # Boundary between the end of an acronym and the next capitalised word:
    # "PULocation" -> "PU_Location".
    with_acronym_breaks = re.sub(
        r'(?<=[A-Z])(?=[A-Z][a-z])', '_', with_word_breaks
    )
    return with_acronym_breaks.lower()

def db_table_exists(table_name: str, engine: Engine) -> bool:
    """Check whether the given table exists in the database.

    Args:
        table_name: Name of the table to look up.
        engine: SQLAlchemy engine connected to the target database.

    Returns:
        Whatever the engine's inspector reports from ``has_table`` for
        ``table_name``.
    """
    return inspect(engine).has_table(table_name)

In [11]:
# The connection URL is read from the NY_TAXI_DB_URL environment variable so
# that real credentials never have to be committed with the notebook. The
# fallback is the local development database this notebook was written against.
engine = create_engine(
    os.environ.get(
        "NY_TAXI_DB_URL",
        "postgresql://postgres:postgres@localhost:5433/ny_taxi",
    )
)

## Upload green_tripdata_2019-10.csv

In [21]:
# Point the shared upload settings at the green-taxi trips file, then load a
# 10-row sample to inspect the columns.
DEST_TABLE, fn = "green_tripdata", green_tripdata
df = pd.read_csv(fn, nrows=10)
df.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2019-10-01 00:26:02,2019-10-01 00:39:58,N,1,112,196,1,5.88,18.0,0.5,0.5,0.0,0,,0.3,19.3,2,1,0
1,1,2019-10-01 00:18:11,2019-10-01 00:22:38,N,1,43,263,1,0.8,5.0,3.25,0.5,0.0,0,,0.3,9.05,2,1,0
2,1,2019-10-01 00:09:31,2019-10-01 00:24:47,N,1,255,228,2,7.5,21.5,0.5,0.5,0.0,0,,0.3,22.8,2,1,0
3,1,2019-10-01 00:37:40,2019-10-01 00:41:49,N,1,181,181,1,0.9,5.5,0.5,0.5,0.0,0,,0.3,6.8,2,1,0
4,2,2019-10-01 00:08:13,2019-10-01 00:17:56,N,1,97,188,1,2.52,10.0,0.5,0.5,2.26,0,,0.3,13.56,1,1,0


In [23]:
# Re-read the 10-row sample, this time parsing the pickup / drop-off columns
# as datetimes instead of plain strings.
fn = green_tripdata
dates_field = [
    "lpep_pickup_datetime",
    "lpep_dropoff_datetime",
]
df = pd.read_csv(fn, parse_dates=dates_field, nrows=10)

In [24]:
# CSV column name -> database column name, listed in the order the columns
# appear in the file. Only the first block actually changes a name; the
# remaining entries map a column to itself.
new_columns_name = {
    # renamed to snake_case
    'VendorID': 'vendor_id',
    'RatecodeID': 'ratecode_id',
    'PULocationID': 'pu_location_id',
    'DOLocationID': 'do_location_id',
    # already snake_case
    'lpep_pickup_datetime': 'lpep_pickup_datetime',
    'lpep_dropoff_datetime': 'lpep_dropoff_datetime',
    'store_and_fwd_flag': 'store_and_fwd_flag',
    'passenger_count': 'passenger_count',
    'trip_distance': 'trip_distance',
    'fare_amount': 'fare_amount',
    'extra': 'extra',
    'mta_tax': 'mta_tax',
    'tip_amount': 'tip_amount',
    'tolls_amount': 'tolls_amount',
    'improvement_surcharge': 'improvement_surcharge',
    'total_amount': 'total_amount',
    'payment_type': 'payment_type',
    'congestion_surcharge': 'congestion_surcharge',
}
df = df.rename(new_columns_name, axis="columns")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   vendor_id              10 non-null     int64         
 1   lpep_pickup_datetime   10 non-null     datetime64[ns]
 2   lpep_dropoff_datetime  10 non-null     datetime64[ns]
 3   store_and_fwd_flag     10 non-null     object        
 4   ratecode_id            10 non-null     int64         
 5   pu_location_id         10 non-null     int64         
 6   do_location_id         10 non-null     int64         
 7   passenger_count        10 non-null     int64         
 8   trip_distance          10 non-null     float64       
 9   fare_amount            10 non-null     float64       
 10  extra                  10 non-null     float64       
 11  mta_tax                10 non-null     float64       
 12  tip_amount             10 non-null     float64       
 13  tolls_am

In [20]:
# Preview the CREATE TABLE statement pandas would generate for the current
# sample frame and destination table.
schema_ddl = pd.io.sql.get_schema(df, name=DEST_TABLE, con=engine)
print(schema_ddl)


CREATE TABLE yellow_taxi_data (
	"VendorID" BIGINT, 
	lpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	lpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	store_and_fwd_flag TEXT, 
	"RatecodeID" BIGINT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount BIGINT, 
	ehail_fee FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	payment_type BIGINT, 
	trip_type BIGINT, 
	congestion_surcharge BIGINT
)




In [25]:
# Does the destination table already exist before the upload starts?
db_table_exists(DEST_TABLE, engine)

False

In [26]:
# Stream the whole CSV into DEST_TABLE, DATA_CHUNK_SIZE rows at a time.
df_iter = pd.read_csv(
    fn,
    parse_dates=dates_field,
    iterator=True,
    chunksize=DATA_CHUNK_SIZE,
)
for i, df_chunk in enumerate(df_iter):
    t0 = time()
    df_chunk = df_chunk.rename(columns=new_columns_name)
    # The first chunk (re)creates the table and every later chunk appends.
    # Column types are therefore inferred from a full chunk of real data, not
    # from the 10-row preview, which infers integer types for monetary columns
    # such as tolls_amount and congestion_surcharge. This also removes the
    # per-chunk "does the table exist?" round-trip, which could never be true
    # once the table had been created.
    df_chunk.to_sql(
        con=engine,
        name=DEST_TABLE,
        if_exists="replace" if i == 0 else "append",
    )
    t1 = time()
    print(f"insert chunk #{i+1:03d}, size {len(df_chunk):,d}, took for {t1-t0:.3f} sec.")
    
        
    

insert chunk #001, size 100,000, took for 15.148 sec.
insert chunk #002, size 100,000, took for 14.794 sec.
insert chunk #003, size 100,000, took for 15.025 sec.


  for i, df_chunk in enumerate(df_iter):


insert chunk #004, size 100,000, took for 15.074 sec.
insert chunk #005, size 76,386, took for 10.018 sec.


## Upload taxi_zone_lookup.csv

In [32]:
# Switch the shared upload settings over to the taxi-zone lookup file and
# load a 10-row sample to inspect the columns.
DEST_TABLE, fn = "taxi_zone", taxi_zone
DATA_CHUNK_SIZE = 100  # overrides the chunk size set in the configuration cell
dates_field = []  # nothing is parsed as a datetime for this file
df = pd.read_csv(fn, nrows=10)
df.head()

Unnamed: 0,LocationID,Borough,Zone,service_zone
0,1,EWR,Newark Airport,EWR
1,2,Queens,Jamaica Bay,Boro Zone
2,3,Bronx,Allerton/Pelham Gardens,Boro Zone
3,4,Manhattan,Alphabet City,Yellow Zone
4,5,Staten Island,Arden Heights,Boro Zone


In [33]:
# CSV column name -> database column name for the taxi-zone lookup file.
new_columns_name = {
    # renamed to snake_case
    'LocationID': 'location_id',
    'Borough': 'borough',
    'Zone': 'zone',
    # already snake_case
    'service_zone': 'service_zone',
}
df = df.rename(new_columns_name, axis="columns")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   location_id   10 non-null     int64 
 1   borough       10 non-null     object
 2   zone          10 non-null     object
 3   service_zone  10 non-null     object
dtypes: int64(1), object(3)
memory usage: 452.0+ bytes


In [34]:
# Stream the whole CSV into DEST_TABLE, DATA_CHUNK_SIZE rows at a time.
df_iter = pd.read_csv(
    fn,
    parse_dates=dates_field,
    iterator=True,
    chunksize=DATA_CHUNK_SIZE,
)
for i, df_chunk in enumerate(df_iter):
    t0 = time()
    df_chunk = df_chunk.rename(columns=new_columns_name)
    # The first chunk (re)creates the table and every later chunk appends.
    # Column types are therefore inferred from a full chunk of real data, not
    # from the 10-row preview. This also removes the per-chunk "does the table
    # exist?" round-trip, which could never be true once the table had been
    # created.
    df_chunk.to_sql(
        con=engine,
        name=DEST_TABLE,
        if_exists="replace" if i == 0 else "append",
    )
    t1 = time()
    print(f"insert chunk #{i+1:03d}, size {len(df_chunk):,d}, took for {t1-t0:.3f} sec.")
    
        

insert chunk #001, size 100, took for 0.049 sec.
insert chunk #002, size 100, took for 0.036 sec.
insert chunk #003, size 65, took for 0.040 sec.
