In [1]:
# import modules
import os

import pandas as pd
from sqlalchemy import create_engine

In [2]:
# check version of pandas module
pd.__version__

'2.0.3'

In [3]:
# sample only the first 100 rows so the file can be inspected quickly
df = pd.read_csv(
    'yellow_tripdata_2021-01.csv',
    nrows=100,
)

In [4]:
# display data
# Render the 100-row sample loaded above; as the last expression in the cell,
# the notebook shows it as a table.
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1,2021-01-01 00:30:10,2021-01-01 00:36:12,1,2.10,1,N,142,43,2,8.0,3.0,0.5,0.00,0.0,0.3,11.80,2.5
1,1,2021-01-01 00:51:20,2021-01-01 00:52:19,1,0.20,1,N,238,151,2,3.0,0.5,0.5,0.00,0.0,0.3,4.30,0.0
2,1,2021-01-01 00:43:30,2021-01-01 01:11:06,1,14.70,1,N,132,165,1,42.0,0.5,0.5,8.65,0.0,0.3,51.95,0.0
3,1,2021-01-01 00:15:48,2021-01-01 00:31:01,0,10.60,1,N,138,132,1,29.0,0.5,0.5,6.05,0.0,0.3,36.35,0.0
4,2,2021-01-01 00:31:49,2021-01-01 00:48:21,1,4.94,1,N,68,33,1,16.5,0.5,0.5,4.06,0.0,0.3,24.36,2.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2,2021-01-01 00:12:41,2021-01-01 00:26:47,1,4.13,1,N,161,226,1,14.5,0.5,0.5,3.66,0.0,0.3,21.96,2.5
96,2,2021-01-01 00:23:29,2021-01-01 00:35:03,2,4.12,1,N,162,74,2,13.5,0.5,0.5,0.00,0.0,0.3,17.30,2.5
97,2,2021-01-01 00:46:17,2021-01-01 00:54:25,2,2.22,1,N,144,170,1,9.0,0.5,0.5,2.56,0.0,0.3,15.36,2.5
98,2,2021-01-01 00:28:16,2021-01-01 00:51:44,1,7.11,1,N,264,264,2,23.5,0.5,0.5,0.00,0.0,0.3,24.80,0.0


In [5]:
# display information on dataframe
# Prints the index range plus each column's non-null count and dtype, which is
# how the object-typed (string) datetime columns are spotted.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   VendorID               100 non-null    int64  
 1   tpep_pickup_datetime   100 non-null    object 
 2   tpep_dropoff_datetime  100 non-null    object 
 3   passenger_count        100 non-null    int64  
 4   trip_distance          100 non-null    float64
 5   RatecodeID             100 non-null    int64  
 6   store_and_fwd_flag     100 non-null    object 
 7   PULocationID           100 non-null    int64  
 8   DOLocationID           100 non-null    int64  
 9   payment_type           100 non-null    int64  
 10  fare_amount            100 non-null    float64
 11  extra                  100 non-null    float64
 12  mta_tax                100 non-null    float64
 13  tip_amount             100 non-null    float64
 14  tolls_amount           100 non-null    float64
 15  improve

The data type of `tpep_pickup_datetime` and `tpep_dropoff_datetime` should be datetime, but both columns were read as `object` (strings), so they need to be converted.

In [6]:
# parse the dropoff and pickup timestamps, which were read in as strings
for column in ('tpep_dropoff_datetime', 'tpep_pickup_datetime'):
    df[column] = pd.to_datetime(df[column])

In [7]:
# build the CREATE TABLE statement pandas derives from the dataframe, then show it
schema_ddl = pd.io.sql.get_schema(df, name='yellow_taxi_2021-01_data')
print(schema_ddl)

CREATE TABLE "yellow_taxi_2021-01_data" (
"VendorID" INTEGER,
  "tpep_pickup_datetime" TIMESTAMP,
  "tpep_dropoff_datetime" TIMESTAMP,
  "passenger_count" INTEGER,
  "trip_distance" REAL,
  "RatecodeID" INTEGER,
  "store_and_fwd_flag" TEXT,
  "PULocationID" INTEGER,
  "DOLocationID" INTEGER,
  "payment_type" INTEGER,
  "fare_amount" REAL,
  "extra" REAL,
  "mta_tax" REAL,
  "tip_amount" REAL,
  "tolls_amount" REAL,
  "improvement_surcharge" REAL,
  "total_amount" REAL,
  "congestion_surcharge" REAL
)


In [8]:
# create postgres engine
# Connection details are read from environment variables so real credentials
# do not have to be written into the notebook; each fallback reproduces the
# original local development value, so the cell still runs unchanged when the
# variables are not set.
# NOTE(review): values containing URL-special characters (e.g. '@', '/', ':')
# would need to be URL-encoded before being placed in the connection string.
pg_user = os.environ.get('POSTGRES_USER', 'outis')
pg_password = os.environ.get('POSTGRES_PASSWORD', 'outis')
pg_host = os.environ.get('POSTGRES_HOST', 'localhost')
pg_port = os.environ.get('POSTGRES_PORT', '5432')
pg_db = os.environ.get('POSTGRES_DB', 'ny_taxi')

engine = create_engine(
    f'postgresql://{pg_user}:{pg_password}@{pg_host}:{pg_port}/{pg_db}'
)

NB: I have already created a database in postgres with the name `ny_taxi`. The command I used was (with `user` and `password` standing in for the credentials used in the connection string above):
```bash
docker run -it \
    -e POSTGRES_USER="user" \
    -e POSTGRES_PASSWORD="password" \
    -e POSTGRES_DB="ny_taxi" \
    -v $(pwd)/ny_taxi_postgres_data:/var/lib/postgresql/data \
    -p 5432:5432 \
    postgres:13
```


In [9]:
# connect to engine
# Open a connection only long enough to confirm the database is reachable;
# the with-block closes it again instead of leaving an unused connection open
# for the rest of the session.
with engine.connect() as connection:
    print(connection)

<sqlalchemy.engine.base.Connection at 0x7f4eb1705010>

In [10]:
# regenerate the CREATE TABLE statement, this time handing get_schema the
# database engine as its connection
engine_schema_ddl = pd.io.sql.get_schema(
    df,
    name='yellow_taxi_2021-01_data',
    con=engine,
)
print(engine_schema_ddl)


CREATE TABLE "yellow_taxi_2021-01_data" (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	"RatecodeID" BIGINT, 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53)
)




In [11]:
# reopen the full CSV as a reader that hands back 100,000-row chunks
df_iterator = pd.read_csv(
    'yellow_tripdata_2021-01.csv',
    iterator=True,
    chunksize=100_000,
)

In [12]:
# inspect the object returned by read_csv: because a chunksize was given, it
# is a reader that yields chunks rather than a DataFrame
df_iterator

<pandas.io.parsers.readers.TextFileReader at 0x7f4eb1318890>

In [13]:
# extract first chunk in iterator
# next() advances the reader by one chunk, so any later iteration over
# df_iterator resumes from the second chunk.
df = next(df_iterator)
# number of rows in this chunk (matches the chunksize of 100,000)
len(df)

100000

In [14]:
# parse both timestamp columns of the chunk into datetimes
dropoff_times = pd.to_datetime(df['tpep_dropoff_datetime'])
pickup_times = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = dropoff_times
df['tpep_pickup_datetime'] = pickup_times

In [15]:
# define the table from a zero-row slice of the chunk, so only the columns are
# created; if_exists="replace" recreates the table when it is already there
header_only = df.head(0)
header_only.to_sql(name="yellow_taxi_2021_01_data", con=engine, if_exists="replace")

0

To verify table creation, run `\dt` in pgcli.


After creating the table in the database, we add the first 100,000 rows of data.

In [16]:
# add first 100000 rows of data to table
# if_exists='append' inserts into the existing table instead of recreating it.
# NOTE(review): the value displayed after this cell is whatever to_sql
# returns; it does not have to equal the number of rows written, so confirm
# the row count with a query against the table.
df.to_sql(name='yellow_taxi_2021_01_data',con=engine,if_exists='append')

1000

To verify that the rows were added, run 
```sql
 SELECT COUNT(1) FROM yellow_taxi_2021_01_data
```
 in pgcli.

In [17]:
# add remaining chunks of data to table
# Iterating the reader directly replaces the manual while/next()/StopIteration
# handling: the for loop ends by itself once the reader is exhausted. The
# first chunk was already consumed and inserted earlier, so this starts at the
# second chunk.
for df in df_iterator:
    # parse both timestamp columns before the chunk is written
    for column in ('tpep_dropoff_datetime', 'tpep_pickup_datetime'):
        df[column] = pd.to_datetime(df[column])

    df.to_sql(name='yellow_taxi_2021_01_data',con=engine,if_exists='append')

    print('rows inserted successfully...')

# reached only after every remaining chunk has been inserted without error
print("Successfully added all rows")

rows inserted successfully...
rows inserted successfully...
rows inserted successfully...
rows inserted successfully...
rows inserted successfully...
rows inserted successfully...
rows inserted successfully...
rows inserted successfully...
rows inserted successfully...
rows inserted successfully...
rows inserted successfully...


  df = next(df_iterator)


rows inserted successfully...
rows inserted successfully...
Successfully added all rows
