In [79]:
import os
import re
from pathlib import Path
from time import time

import pandas as pd
from sqlalchemy import create_engine, inspect, Engine


In [78]:
# Directory with the yellow-taxi CSV exports, resolved relative to the parent of the
# current working directory.
DATA_DIR = Path().resolve().parent / "data/csv/yellow/"
# Names (not full paths) of the CSV files directly inside DATA_DIR. Sorting makes
# DATA_FILES[0] deterministic instead of depending on OS listing order, and the
# "*.csv" filter keeps stray non-data files out. The list is empty when the directory
# is missing or holds no CSV files.
DATA_FILES = sorted(p.name for p in DATA_DIR.glob("*.csv") if p.is_file())
# Number of CSV rows read and inserted per batch.
DATA_CHUNK_SIZE = 100_000
# Name of the destination database table.
DEST_TABLE = "yellow_taxi_data"

In [70]:
# Show the first discovered file name; this is the file used for the sample read below.
DATA_FILES[0]

'yellow_tripdata_2019-01.csv'

In [72]:
def camel_to_snake(name: str) -> str:
    """Convert a CamelCase identifier to snake_case.

    Runs of capitals are treated as one acronym, so ``VendorID`` becomes
    ``vendor_id`` and ``PULocationID`` becomes ``pu_location_id`` rather than
    being split letter by letter. Names that are already snake_case are
    returned unchanged (apart from lower-casing).

    Args:
        name: Identifier to convert.

    Returns:
        The lower-cased identifier with underscores inserted at word boundaries.
    """
    # Insert "_" at two kinds of boundary:
    #   1. lower-case letter or digit followed by a capital  (vendor|ID)
    #   2. capital followed by a capital that starts a word  (PU|Location)
    boundary = r'(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])'
    return re.sub(boundary, '_', name).lower()

def db_table_exists(table_name: str, engine: Engine) -> bool:
    """Return whether ``table_name`` exists in the database behind ``engine``.

    Args:
        table_name: Name of the table to look for.
        engine: SQLAlchemy engine used to inspect the database.

    Returns:
        The result of the inspector's ``has_table`` check.
    """
    return inspect(engine).has_table(table_name)

In [None]:
# The connection URL is taken from the NY_TAXI_DB_URL environment variable so that
# credentials do not have to live in the notebook; the fallback is the original
# local development database URL.
engine = create_engine(
    os.environ.get("NY_TAXI_DB_URL", "postgresql://root:root@localhost:5432/ny_taxi")
)

In [60]:
# Read a ten-row sample of the first data file, parsing both trip timestamps as
# datetimes, to inspect the column names and inferred dtypes.
fn = DATA_DIR.joinpath(DATA_FILES[0])
df = pd.read_csv(
    fn,
    nrows=10,
    parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"],
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   VendorID               10 non-null     int64         
 1   tpep_pickup_datetime   10 non-null     datetime64[ns]
 2   tpep_dropoff_datetime  10 non-null     datetime64[ns]
 3   passenger_count        10 non-null     int64         
 4   trip_distance          10 non-null     float64       
 5   RatecodeID             10 non-null     int64         
 6   store_and_fwd_flag     10 non-null     object        
 7   PULocationID           10 non-null     int64         
 8   DOLocationID           10 non-null     int64         
 9   payment_type           10 non-null     int64         
 10  fare_amount            10 non-null     float64       
 11  extra                  10 non-null     float64       
 12  mta_tax                10 non-null     float64       
 13  tip_amou

In [66]:
# Source CSV header -> snake_case column name used in the database table.
# Headers that are already snake_case map to themselves.
new_columns_name = {
    "VendorID": "vendor_id",
    "tpep_pickup_datetime": "tpep_pickup_datetime",
    "tpep_dropoff_datetime": "tpep_dropoff_datetime",
    "passenger_count": "passenger_count",
    "trip_distance": "trip_distance",
    "RatecodeID": "ratecode_id",
    "store_and_fwd_flag": "store_and_fwd_flag",
    "PULocationID": "pu_location_id",
    "DOLocationID": "do_location_id",
    "payment_type": "payment_type",
    "fare_amount": "fare_amount",
    "extra": "extra",
    "mta_tax": "mta_tax",
    "tip_amount": "tip_amount",
    "tolls_amount": "tolls_amount",
    "improvement_surcharge": "improvement_surcharge",
    "total_amount": "total_amount",
    "congestion_surcharge": "congestion_surcharge",
}

# Apply the mapping to the sample frame and show the resulting columns and dtypes.
df = df.rename(columns=new_columns_name)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 0 to 9
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   vendor_id              10 non-null     int64         
 1   tpep_pickup_datetime   10 non-null     datetime64[ns]
 2   tpep_dropoff_datetime  10 non-null     datetime64[ns]
 3   passenger_count        10 non-null     int64         
 4   trip_distance          10 non-null     float64       
 5   ratecode_id            10 non-null     int64         
 6   store_and_fwd_flag     10 non-null     object        
 7   pu_location_id         10 non-null     int64         
 8   do_location_id         10 non-null     int64         
 9   payment_type           10 non-null     int64         
 10  fare_amount            10 non-null     float64       
 11  extra                  10 non-null     float64       
 12  mta_tax                10 non-null     float64       
 13  tip_amount   

In [67]:
# Preview the CREATE TABLE statement pandas would generate for the sample frame
# on this engine.
ddl = pd.io.sql.get_schema(df, name=DEST_TABLE, con=engine)
print(ddl)


CREATE TABLE yellow_taxi_data (
	vendor_id BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	ratecode_id BIGINT, 
	store_and_fwd_flag TEXT, 
	pu_location_id BIGINT, 
	do_location_id BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53)
)




In [73]:
# Check whether the destination table is already present in the database.
db_table_exists(DEST_TABLE, engine)

False

In [77]:
# Create the destination table empty: a zero-row slice of the sample frame carries
# only the column names and dtypes. if_exists="replace" drops any table already
# stored under this name first.
# NOTE(review): to_sql writes the DataFrame index as an extra column by default —
# confirm that the resulting extra column is intended.
empty_frame = df.head(0)
empty_frame.to_sql(name=DEST_TABLE, con=engine, if_exists="replace")

0

In [108]:
# Stream the CSV in DATA_CHUNK_SIZE-row batches and append each batch to DEST_TABLE.
# The reader is used as a context manager so the underlying file handle is closed
# even if an insert fails part-way through.
# Note: every run appends, so re-running this cell inserts the same rows again.
with pd.read_csv(
    fn,
    parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"],
    iterator=True,
    chunksize=DATA_CHUNK_SIZE,
) as df_iter:
    for i, df_chunk in enumerate(df_iter):
        t0 = time()
        df_chunk = df_chunk.rename(columns=new_columns_name)
        # if_exists="append" creates the table when it is missing, so a per-chunk
        # existence check (an extra database round-trip for every batch) is not needed.
        df_chunk.to_sql(con=engine, name=DEST_TABLE, if_exists="append")
        t1 = time()
        print(f"insert chunk #{i+1:03d}, size {len(df_chunk):,d}, took for {t1-t0:.3f} sec.")
    
        
    

insert chunk #001, size 100,000, took for 5.447 sec.
insert chunk #002, size 100,000, took for 5.247 sec.
insert chunk #003, size 100,000, took for 5.265 sec.
insert chunk #004, size 100,000, took for 5.351 sec.
insert chunk #005, size 100,000, took for 5.303 sec.
insert chunk #006, size 100,000, took for 5.289 sec.
insert chunk #007, size 100,000, took for 5.329 sec.
insert chunk #008, size 100,000, took for 5.284 sec.
insert chunk #009, size 100,000, took for 5.371 sec.
insert chunk #010, size 100,000, took for 5.652 sec.
insert chunk #011, size 100,000, took for 5.445 sec.
insert chunk #012, size 100,000, took for 5.037 sec.
insert chunk #013, size 100,000, took for 5.331 sec.
insert chunk #014, size 100,000, took for 5.334 sec.
insert chunk #015, size 100,000, took for 5.307 sec.
insert chunk #016, size 100,000, took for 5.318 sec.
insert chunk #017, size 100,000, took for 5.302 sec.
insert chunk #018, size 100,000, took for 5.388 sec.
insert chunk #019, size 100,000, took for 5.32

In [107]:
# Scratch check of the ",d" format spec (thousands separators), the same spec used
# for the chunk size in the progress message of the load loop.
print(f"{123213213:,d} d ")

123,213,213 d 
