In [11]:
import os
import sys
import pandas as pd
import logging
from google.cloud import bigquery
from hashlib import md5
from typing import List
import uuid
from pandas.io.json import json_normalize

In [12]:
# Load the raw ticket export: newline-delimited JSON (lines=True), one ticket
# per line. The 'airline', 'origin', 'destination' and 'passenger' columns each
# hold a nested dict that is flattened into its own frame below.
df = pd.read_json('./data/tickets.json', lines=True)

# d_airlines source: flatten all nested airline dicts with a single
# json_normalize call (normalizing row by row and concatenating the one-row
# frames is needlessly slow), then give the generic column names an
# airline_ prefix.
d_airline_df = pd.json_normalize(df['airline'].tolist()).rename(
    columns={
        'name': 'airline_name',
        'icao': 'airline_icao',
        'iata': 'airline_iata',
        'country': 'airline_country',
    }
)

# d_airports source: an airport can show up as an origin, a destination, or
# both, so stack the two flattened frames and keep one row per distinct airport.
#  - pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
#  - duplicates are dropped while 'iata' is still a regular column, because
#    drop_duplicates ignores the index and would otherwise leave the airport
#    code out of the comparison.
#  - the code is then renamed to 'airport_iata' and moved into the index.
d_origin_airport_df = pd.json_normalize(df['origin'].tolist())
d_destination_df = pd.json_normalize(df['destination'].tolist())
d_airports_df = (
    pd.concat([d_destination_df, d_origin_airport_df])
    .drop_duplicates()
    .rename(columns={'iata': 'airport_iata'})
    .set_index('airport_iata')
)

# d_passengers source: flatten the nested passenger dicts the same way.
d_passenger_df = pd.json_normalize(df['passenger'].tolist())

# Placeholder start_date / end_date columns (a single space each); the
# passengers load cell assigns the real values before loading.
d_passenger_df["start_date"] = " "
d_passenger_df["end_date"] = " "


In [13]:
# **** SETUP ****

# change to match your filesystem
DATA_DIR = "./data/"
DEFAULT_SOURCE_FILE = os.path.join(DATA_DIR, "airport_source.csv")
# change to match your gcloud project
PROJECT_NAME = "deb-01-372116"
DATASET_NAME = "airline_ticket_processor"


# **** TABLE SCHEMAS ****

# One entry per dimension table: the BigQuery table name (without project and
# dataset) and the column schema handed to the load job.
# A dataframe index is only written to the table when it is named in the schema.
TABLE_METADATA = {
    'd_airlines': {
        'table_name': 'd_airlines',
        'schema': [
            bigquery.SchemaField('airline_iata', 'STRING', mode='REQUIRED'),
            bigquery.SchemaField('airline_name', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('airline_icao', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('callsign', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('airline_country', 'STRING', mode='NULLABLE'),
        ],
    },
    'd_airports': {
        'table_name': 'd_airports',
        'schema': [
            bigquery.SchemaField('airport_iata', 'STRING', mode='REQUIRED'),
            bigquery.SchemaField('name', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('city', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('country', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('icao', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('latitude', 'FLOAT', mode='NULLABLE'),
            bigquery.SchemaField('longitude', 'FLOAT', mode='NULLABLE'),
            bigquery.SchemaField('altitude', 'FLOAT', mode='NULLABLE'),
            bigquery.SchemaField('tz_timezone', 'STRING', mode='NULLABLE'),
        ],
    },
    'd_passengers': {
        'table_name': 'd_passengers',
        'schema': [
            bigquery.SchemaField('passenger_sk', 'STRING', mode='REQUIRED'),
            bigquery.SchemaField('first_name', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('last_name', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('gender', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('birth_date', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('email', 'STRING', mode='REQUIRED'),
            # 'street' is part of the passengers frame that gets loaded, so it
            # is declared explicitly instead of leaving its type to be inferred.
            bigquery.SchemaField('street', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('city', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('state', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('zip', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('start_date', 'STRING', mode='NULLABLE'),
            bigquery.SchemaField('end_date', 'STRING', mode='NULLABLE'),
        ],
    },
}


# **** SETUP LOGGING ****
# Configure logging once for the notebook: records go to stdout, the base
# threshold is INFO, and each record is rendered as
# [LEVEL][timestamp][module:line] : message
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format='[%(levelname)-5s][%(asctime)s][%(module)s:%(lineno)04d] : %(message)s',
)
# `logger` is the handle the remaining cells log through; its level is lowered
# to DEBUG right after it is fetched so logger.debug(...) calls are emitted.
# NOTE(review): whether the name 'root' resolves to the actual root logger or to
# a separate logger called "root" depends on the Python version - confirm.
logger: logging.Logger = logging.getLogger('root')
logger.setLevel(logging.DEBUG)


# **** BIGQUERY CLIENT ****
logger.debug("Creating bigquery client")
# constructed without arguments, so project and credentials are not set here
client = bigquery.Client()

logger.info("Setup Completed")

[DEBUG][2023-01-10 14:09:05,364][4051764568:0072] : Creating bigquery client
[INFO ][2023-01-10 14:09:05,369][4051764568:0075] : Setup Completed


In [14]:
# Make sure the target dataset exists before any table is loaded.
# The dataset id is "<project>.<dataset>"; exists_ok=True makes create_dataset
# a no-op error-wise when the dataset is already there.
dataset_id = ".".join((PROJECT_NAME, DATASET_NAME))
dataset = bigquery.Dataset(dataset_id)
dataset.location = "US"
dataset = client.create_dataset(dataset, exists_ok=True)

logger.info("Created airline_ticket_processor: {}".format(dataset.full_dataset_id))

[INFO ][2023-01-10 14:09:06,100][2378520198:0007] : Created airline_ticket_processor: deb-01-372116:airline_ticket_processor


In [15]:
#define the load_table function
def load_table(
    df: pd.DataFrame, 
    client: bigquery.Client, 
    table_name: str, 
    schema: List[bigquery.SchemaField], 
    create_disposition: str = 'CREATE_IF_NEEDED', 
    write_disposition: str = 'WRITE_TRUNCATE'
    ) -> None:
    """Load a dataframe into a BigQuery table and wait for the job to finish.

    Args:
        df (pd.DataFrame): dataframe to load
        client (bigquery.Client): bigquery client
        table_name (str): full table name including project and dataset id,
            i.e. three dot-separated parts
        schema (List[bigquery.SchemaField]): table schema with data types
        create_disposition (str, optional): create table disposition. Defaults to 'CREATE_IF_NEEDED'.
        write_disposition (str, optional): overwrite table disposition. Defaults to 'WRITE_TRUNCATE'.

    Raises:
        AssertionError: if `table_name` does not consist of exactly three
            dot-separated parts.
    """
    # *** run some checks ***
    # the table name must be fully qualified (project.dataset.table), i.e. it
    # must contain exactly two dots
    assert len(table_name.split('.')) == 3, f"Table name must be a full bigquery table name including project and dataset id: '{table_name}'"
    # setup bigquery load job:
    #  create table if needed, replace rows, define the table schema
    job_config = bigquery.LoadJobConfig(
        create_disposition=create_disposition,
        write_disposition=write_disposition,
        schema=schema
    )
    logger.info(f"loading table: '{table_name}'")
    job = client.load_table_from_dataframe(df, destination=table_name, job_config=job_config)
    job.result()        # wait for the job to finish
    # get the resulting table (used for its fully-qualified id in the log line)
    table = client.get_table(table_name)
    # Report the rows written by THIS job. table.num_rows is the table's total
    # row count, which overstates the load when rows are appended to a table
    # that already holds data (write_disposition='WRITE_APPEND').
    logger.info(f"loaded {job.output_rows} rows into {table.full_table_id}")

In [16]:
# prepare airline df to be loaded
logger.debug("getting unique airlines...")

# columns that make up the airline dimension, in output order
cols = ['airline_iata', 'airline_name', 'airline_icao', 'callsign', 'airline_country']
# Grouping on every output column collapses repeated airlines into one row
# each; reset_index turns the group keys back into columns and .loc keeps just
# those columns. With groupby's default settings the result is sorted by the
# key columns and rows with a missing value in any key column are left out.
# NOTE(review): confirm that leaving out airlines with a missing attribute is
# intended, given that most of these columns are NULLABLE in the table schema.
df = d_airline_df.groupby(cols).all().reset_index().loc[:, cols]

logger.info(f"airlines dim - found {len(df.index)} rows")

#----------------load d_airlines---------------------
# get table name and schema from our TABLE_METADATA config param
table_name = f"{PROJECT_NAME}.{DATASET_NAME}.{TABLE_METADATA['d_airlines']['table_name']}"
schema = TABLE_METADATA['d_airlines']['schema']
# load dataframe
load_table(df, client, table_name, schema)

logger.info("loaded airlines dim")
display(df.head())

[DEBUG][2023-01-10 14:09:06,185][3186691139:0004] : getting uniques products...
[INFO ][2023-01-10 14:09:06,193][3186691139:0012] : products dim - found 48 rows
[INFO ][2023-01-10 14:09:06,194][1547112885:0030] : loading table: 'deb-01-372116.airline_ticket_processor.d_airlines'
[INFO ][2023-01-10 14:09:10,533][1547112885:0035] : loaded 48 rows into deb-01-372116:airline_ticket_processor.d_airlines
[INFO ][2023-01-10 14:09:10,534][3186691139:0021] : loaded products dim


Unnamed: 0,airline_iata,airline_name,airline_icao,callsign,airline_country
0,3U,Sichuan Airlines,CSC,SI CHUAN,China
1,7C,Jeju Air,JJA,JEJU AIR,Republic of Korea
2,9K,Cape Air,KAP,CAIR,United States
3,9S,Spring Airlines,CQH,AIR SPRING,China
4,AA,American Airlines,AAL,AMERICAN,United States


In [17]:
#----------------load d_airports---------------------
logger.debug("getting uniques airports...")

# columns of the airport dimension, in output order
cols = [
    'airport_iata',
    'name',
    'city',
    'country',
    'icao',
    'latitude',
    'longitude',
    'altitude',
    'tz_timezone',
]
# Group on every output column to get one row per distinct airport, then turn
# the group keys back into ordinary columns and keep only those columns.
df = d_airports_df.groupby(cols).all().reset_index()[cols]
logger.info("generated iata codes")

logger.info("airports dim - found {} rows".format(len(df.index)))

# Resolve the fully-qualified table name and its schema from TABLE_METADATA,
# then hand the frame to load_table.
airports_meta = TABLE_METADATA['d_airports']
table_name = ".".join([PROJECT_NAME, DATASET_NAME, airports_meta['table_name']])
schema = airports_meta['schema']
load_table(df, client, table_name, schema)

logger.info("loaded airports dim")
display(df.head())

[DEBUG][2023-01-10 14:09:10,576][537061453:0003] : getting uniques airports...
[INFO ][2023-01-10 14:09:10,586][537061453:0010] : generated iata codes
[INFO ][2023-01-10 14:09:10,587][537061453:0012] : airports dim - found 386 rows
[INFO ][2023-01-10 14:09:10,588][1547112885:0030] : loading table: 'deb-01-372116.airline_ticket_processor.d_airports'
[INFO ][2023-01-10 14:09:14,073][1547112885:0035] : loaded 386 rows into deb-01-372116:airline_ticket_processor.d_airports
[INFO ][2023-01-10 14:09:14,074][537061453:0021] : loaded airports dim


Unnamed: 0,airport_iata,name,city,country,icao,latitude,longitude,altitude,tz_timezone
0,ABQ,Albuquerque International Sunport,Albuquerque,United States,KABQ,35.04,-106.61,5355.0,America/Denver
1,ACE,Lanzarote Airport,Arrecife,Spain,GCRR,28.95,-13.61,46.0,Atlantic/Canary
2,ADZ,Gustavo Rojas Pinilla International Airport,San Andres Island,Colombia,SKSP,12.58,-81.71,19.0,America/Bogota
3,AEP,Jorge Newbery Airpark,Buenos Aires,Argentina,SABE,-34.56,-58.42,18.0,America/Buenos_Aires
4,AER,Sochi International Airport,Sochi,Russia,URSS,43.45,39.96,89.0,Europe/Moscow


In [18]:
#----------------load d_passengers---------------------
logger.debug("getting unique passengers...")

# Surrogate keys: one random UUID per distinct email, so every row of the same
# passenger shares a key. The keys are built once in a dict and applied with a
# single map() instead of re-scanning the whole frame for every email.
# The column is written onto d_passenger_df itself, as before.
# NOTE(review): uuid4 is random, so each run of this cell issues new keys -
# confirm nothing persisted elsewhere refers to keys from a previous run.
sk_by_email = {email: uuid.uuid4() for email in d_passenger_df['email'].dropna().unique()}
d_passenger_df['passenger_sk'] = d_passenger_df['email'].map(sk_by_email)

df = (
    d_passenger_df
    .astype({"passenger_sk": str})
    # set start_date to the ticket date
    # NOTE(review): end_date is the literal string 'None', not a null value -
    # confirm this is what consumers of the table expect.
    .assign(start_date='2022-03-21', end_date='None')
)

# columns of the passenger dimension, in output order
cols = ['passenger_sk','email','first_name','last_name', 'gender', 'birth_date', 'street', 'city', 'state','zip','start_date','end_date']
# Group on every output column to get one row per distinct passenger record,
# then turn the group keys back into columns and keep only those columns.
# With groupby's default settings rows with a missing value in any of these
# columns are left out.
df = df.groupby(cols).all().reset_index().loc[:, cols]


logger.info(f"passengers dim - found {len(df.index)} rows")

# load table into bigquery
# get table name and schema from our TABLE_METADATA config param
table_name = f"{PROJECT_NAME}.{DATASET_NAME}.{TABLE_METADATA['d_passengers']['table_name']}"
schema = TABLE_METADATA['d_passengers']['schema']
# load dataframe
load_table(df, client, table_name, schema)

logger.info("loaded passengers dim")

display(df.head())

[DEBUG][2023-01-10 14:09:14,121][3198413335:0004] : getting unique passengers...
[INFO ][2023-01-10 14:09:14,168][3198413335:0020] : passengers dim - found 32 rows
[INFO ][2023-01-10 14:09:14,169][1547112885:0030] : loading table: 'deb-01-372116.airline_ticket_processor.d_passengers'
[INFO ][2023-01-10 14:09:17,648][1547112885:0035] : loaded 32 rows into deb-01-372116:airline_ticket_processor.d_passengers
[INFO ][2023-01-10 14:09:17,649][3198413335:0029] : loaded passengers dim


Unnamed: 0,passenger_sk,email,first_name,last_name,gender,birth_date,street,city,state,zip,start_date,end_date
0,0beb14f7-9853-496e-98d3-b5b495e0fd65,rachel.duffy.60@hotmail.com,Rachel,Duffy,NB,1960-05-14,18868 Dougherty Drive,Snowchester,OK,73022,2022-03-21,
1,0d7cf6d6-9230-40be-9883-fbef16d00906,seth.thompson.68@yahoo.com,Seth,Thompson,M,1968-05-02,22455 Higgins Junction Apt. 042,New Keith,OR,97405,2022-03-21,
2,165cd827-e219-43aa-ba5d-04c592a5dcd4,autumn.morse.60@hotmail.com,Autumn,Morse,F,1960-01-18,6984 Price Shoals,Erictown,HI,96818,2022-03-21,
3,27b2d924-dcb8-4890-b1a2-164cd8b867fc,danielle.henderson.70@hotmail.com,Danielle,Henderson,NB,1970-08-11,7389 Alec Squares Suite 508,Port Jonathan,NM,87320,2022-03-21,
4,2948e7df-9ff2-458c-95f6-0e0c7dc80d34,tony.hoffman.03@gmail.com,Tony,Hoffman,NB,2003-03-18,4706 Amy Roads Apt. 206,Stewartborough,AZ,85030,2022-03-21,
