# EDA and Docker Prep
## yellow_taxi_data
Using Google Colab

## Exploratory Data Analysis

### Import Packages

In [3]:
import pandas as pd

Check version

In [100]:
pd.__version__

'1.4.2'

### Explore Data

In [101]:
df = pd.read_csv(r'C:\git\data-engineering-zoomcamp\week_1\2_docker_sql\yellow_tripdata_2021-01.csv', nrows = 100)

Convert Pandas dataframe to SQL DDL

In [102]:
print(pd.io.sql.get_schema(df, name = 'yellow_taxi_data'))

CREATE TABLE "yellow_taxi_data" (
"VendorID" INTEGER,
  "tpep_pickup_datetime" TEXT,
  "tpep_dropoff_datetime" TEXT,
  "passenger_count" INTEGER,
  "trip_distance" REAL,
  "RatecodeID" INTEGER,
  "store_and_fwd_flag" TEXT,
  "PULocationID" INTEGER,
  "DOLocationID" INTEGER,
  "payment_type" INTEGER,
  "fare_amount" REAL,
  "extra" REAL,
  "mta_tax" REAL,
  "tip_amount" REAL,
  "tolls_amount" REAL,
  "improvement_surcharge" REAL,
  "total_amount" REAL,
  "congestion_surcharge" REAL
)


Note data types:  
  "tpep_pickup_datetime" TEXT  
  "tpep_dropoff_datetime" TEXT  
These must be converted to TIMESTAP data types

In [103]:
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_pickup_datetime)

In [104]:
print(pd.io.sql.get_schema(df, name = 'yellow_taxi_data'))

CREATE TABLE "yellow_taxi_data" (
"VendorID" INTEGER,
  "tpep_pickup_datetime" TIMESTAMP,
  "tpep_dropoff_datetime" TIMESTAMP,
  "passenger_count" INTEGER,
  "trip_distance" REAL,
  "RatecodeID" INTEGER,
  "store_and_fwd_flag" TEXT,
  "PULocationID" INTEGER,
  "DOLocationID" INTEGER,
  "payment_type" INTEGER,
  "fare_amount" REAL,
  "extra" REAL,
  "mta_tax" REAL,
  "tip_amount" REAL,
  "tolls_amount" REAL,
  "improvement_surcharge" REAL,
  "total_amount" REAL,
  "congestion_surcharge" REAL
)


## Docker Prep

### Import Packages

In [5]:
from sqlalchemy import create_engine

### Connect to postgres engine

Connect to Engine:
{type of database}://{user}:{password}@{host}:{port}/{database name}

In [6]:
engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')

In [7]:
engine.connect()

<sqlalchemy.engine.base.Connection at 0x25222a87940>

### Create postgres DDL

Create definition in postgres

In [108]:
print(pd.io.sql.get_schema(df, name = 'yellow_taxi_data', con = engine))


CREATE TABLE yellow_taxi_data (
	"VendorID" BIGINT, 
	tpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, 
	tpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, 
	passenger_count BIGINT, 
	trip_distance FLOAT(53), 
	"RatecodeID" BIGINT, 
	store_and_fwd_flag TEXT, 
	"PULocationID" BIGINT, 
	"DOLocationID" BIGINT, 
	payment_type BIGINT, 
	fare_amount FLOAT(53), 
	extra FLOAT(53), 
	mta_tax FLOAT(53), 
	tip_amount FLOAT(53), 
	tolls_amount FLOAT(53), 
	improvement_surcharge FLOAT(53), 
	total_amount FLOAT(53), 
	congestion_surcharge FLOAT(53)
)




### Create batch iterator

There are too many rows in the csv to be uploaded at once.
Using iterator with chunk size 100,000.

In [118]:
df_iter = pd.read_csv(r'C:\git\data-engineering-zoomcamp\week_1\2_docker_sql\yellow_tripdata_2021-01.csv', iterator = True, chunksize = 100000, low_memory = False)

return next element in iterator

In [119]:
df = next(df_iter)

Check iteration size.

In [120]:
len(df)

100000

Convert string timestampe to datetime as above.

In [121]:
df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_pickup_datetime)

Create a table with only the headers.
If the table 'yellow_taxi_data' already exists then drop and replace.

In [122]:
df.head(n=0)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge


In [123]:
df.head(n=0).to_sql(name = 'yellow_taxi_data', con = engine, if_exists = 'replace')

0

If table exists append each chunk of the iterator.

In [124]:
df.to_sql(name = 'yellow_taxi_data', con = engine, if_exists = 'append')

1000

Create loop inserting each iteration with time tracking.

In [125]:
from time import time

In [126]:
while True:
    t_start = time()
    df = next(df_iter)
    df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    df.to_sql(name = 'yellow_taxi_data', con = engine, if_exists = 'append')
    t_end = time()
    print('Chunk inserted. Processing time: %.3f second' % (t_end - t_start))

Chunk inserted. Processing time: 8.513 second
Chunk inserted. Processing time: 8.430 second
Chunk inserted. Processing time: 8.945 second
Chunk inserted. Processing time: 8.731 second
Chunk inserted. Processing time: 9.240 second
Chunk inserted. Processing time: 8.395 second
Chunk inserted. Processing time: 9.114 second
Chunk inserted. Processing time: 8.397 second
Chunk inserted. Processing time: 7.770 second
Chunk inserted. Processing time: 8.472 second
Chunk inserted. Processing time: 8.777 second
Chunk inserted. Processing time: 10.532 second
Chunk inserted. Processing time: 5.650 second


StopIteration: 

add the zone lookup data.

In [8]:
!wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv

--2023-01-21 08:29:06--  https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.84.157, 52.217.142.104, 52.217.164.0, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.84.157|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12322 (12K) [application/octet-stream]
Saving to: 'taxi+_zone_lookup.csv.1'

     0K .......... ..                                         100% 1.59M=0.007s

2023-01-21 08:29:06 (1.59 MB/s) - 'taxi+_zone_lookup.csv.1' saved [12322/12322]



In [9]:
df_zones = pd.read_csv('taxi+_zone_lookup.csv')

In [10]:
df_zones.to_sql(name='zones', con=engine, if_exists='replace')

265