In [1]:
import configparser
import os
from pathlib import Path
from pyspark.sql import SparkSession

In [2]:
# CONFIG
# Load the AWS credentials from the local `dl.cfg` file.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail early with a clear message instead of a
# confusing NoSectionError on the lookups below.
if not config.read('dl.cfg'):
    raise FileNotFoundError("Could not read configuration file 'dl.cfg'")

KEY = config.get('AWS', 'AWS_ACCESS_KEY_ID')
SECRET = config.get('AWS', 'AWS_SECRET_ACCESS_KEY')

# Location of the tables to inspect (currently a local directory).
# Alternative kept for reference: config.get('S3', 'DEST_S3_BUCKET')
output_data = './data/outputs'

# Export the credentials as environment variables for this process.
os.environ['AWS_ACCESS_KEY_ID'] = KEY
os.environ['AWS_SECRET_ACCESS_KEY'] = SECRET

In [3]:
# Build (or reuse) a Hive-enabled Spark session. The hadoop-aws 2.7.0 artifact
# is requested through `spark.jars.packages`.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")
    .enableHiveSupport()
    .getOrCreate()
)

In [4]:
# Root directory that the cells below iterate over, reading each sub-directory
# as one parquet table.
# NOTE(review): despite the name, this is a pathlib.Path, which addresses the
# local filesystem only -- confirm before pointing `output_data` at an S3 URI.
s3_bucket = Path(output_data)

In [5]:
# Print the schema of every parquet table found under the output directory.
for file_dir in s3_bucket.iterdir():
    # Only sub-directories are read as tables; loose files are skipped.
    if not file_dir.is_dir():
        continue
    path = str(file_dir)
    df = spark.read.parquet(path)
    # `Path.name` is the last path component on every platform, unlike
    # splitting the string on '/'.
    print("Table: " + file_dir.name)
    # printSchema() writes to stdout and returns None, so there is nothing
    # worth binding to a variable.
    df.printSchema()

Table: city_code
root
 |-- city_code: string (nullable = true)
 |-- city: string (nullable = true)

Table: d_temperature
root
 |-- dt: date (nullable = true)
 |-- avg_temp: string (nullable = true)
 |-- avg_temp_uncertnty: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)

Table: d_citizen
root
 |-- cic_id: double (nullable = true)
 |-- citizen_country: double (nullable = true)
 |-- residence_country: double (nullable = true)
 |-- birth_year: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- ins_num: string (nullable = true)
 |-- immi_citizen_id: long (nullable = true)

Table: d_airline
root
 |-- cic_id: double (nullable = true)
 |-- airline: string (nullable = true)
 |-- admin_num: double (nullable = true)
 |-- flight_number: string (nullable = true)
 |-- visa_type: string (nullable = true)
 |-- immi_airline_id: long (nullable = true)

Table: f

In [6]:
# Data-quality check: every parquet table under the output directory must
# contain at least one record. Stops at the first empty table.
for file_dir in s3_bucket.iterdir():
    # Only sub-directories are read as tables; loose files are skipped.
    if not file_dir.is_dir():
        continue
    path = str(file_dir)
    # `Path.name` is the last path component on every platform, unlike
    # splitting the string on '/'.
    table_name = file_dir.name
    df = spark.read.parquet(path)
    record_num = df.count()
    if record_num <= 0:
        # Name the offending table so the failure is actionable.
        raise ValueError(f"Table {table_name} is empty!")
    print(f"Table: {table_name} is not empty: total {record_num} records.")


Table: city_code is not empty: total 660 records.
Table: d_temperature is not empty: total 687004 records.
Table: d_citizen is not empty: total 3096313 records.
Table: d_airline is not empty: total 3096313 records.
Table: f_immigration is not empty: total 3096313 records.
Table: d_demog_statistics is not empty: total 596 records.
Table: country_code is not empty: total 235 records.
Table: state_code is not empty: total 55 records.
