In [54]:
import os
import sys
import pandas as pd
import logging
from google.cloud import bigquery
from hashlib import md5
from typing import List
import uuid

In [55]:
# Build the flat source csv (and per-dimension csvs) from ./data/tickets.json,
# a newline-delimited JSON file in which every record nests the airline,
# origin, destination and passenger objects.
df = pd.read_json('./data/tickets.json', lines=True)

# d_airlines: expand the nested `airline` dicts into one column per key.
# A single json_normalize call over all records replaces the former
# per-row normalize + concat, which built one tiny frame per ticket.
d_airline_df = pd.json_normalize(df['airline'].tolist())
d_airline_df = d_airline_df.rename(columns={
    'name': 'airline_name',
    'icao': 'airline_icao',
    'iata': 'airline_iata',
    'country': 'airline_country',
})
d_airline_df.to_csv('./data/d_airlines.csv', mode='w')

# d_origin_airports: expand the nested `origin` dicts and prefix the columns
d_origin_airport_df = pd.json_normalize(df['origin'])
d_origin_airport_df = d_origin_airport_df.rename(columns={
    'name': 'origin_airport_name',
    'icao': 'origin_icao',
    'iata': 'origin_airport_iata',
    'country': 'origin_airport_country',
    'city': 'origin_airport_city',
    'latitude': 'origin_lat',
    'longitude': 'origin_long',
    'altitude': 'origin_altitude',
    'tz_timezone': 'origin_timezone',
})
d_origin_airport_df.to_csv('./data/d_origin_airports.csv', mode='w')

# destination airports: expand the nested `destination` dicts and prefix the columns.
# NOTE(review): unlike the other dimensions, no standalone csv is written for
# destinations; the frame is only merged into the source csv below.
d_destination_df = pd.json_normalize(df['destination'])
d_destination_df = d_destination_df.rename(columns={
    'name': 'dest_airport_name',
    'icao': 'dest_icao',
    'iata': 'dest_airport_iata',
    'country': 'dest_airport_country',
    'city': 'dest_airport_city',
    'latitude': 'dest_lat',
    'longitude': 'dest_long',
    'altitude': 'dest_altitude',
    'tz_timezone': 'dest_timezone',
})

# d_passengers: expand the nested `passenger` dicts into one column per key
d_passenger_df = pd.json_normalize(df['passenger'].tolist())
# placeholder SCD2 validity columns; the passenger load cell overwrites them
d_passenger_df["start_date"] = " "
d_passenger_df["end_date"] = " "
d_passenger_df.to_csv('./data/SCD2_d_passengers.csv', mode='w')

# f_tickets: the ticket-level (fact) columns of the raw records
f_tickets_df = df[['eticket_num', 'confirmation', 'ticket_date', 'price', 'seat', 'status']]

# Merge everything back into one wide frame. All frames share the row index of
# `df` (one row per ticket), so they are joined positionally on the index.
source_df = pd.merge(d_airline_df, d_origin_airport_df, left_index=True, right_index=True)
source_df = pd.merge(source_df, d_passenger_df, left_index=True, right_index=True)
source_df = pd.merge(source_df, d_destination_df, left_index=True, right_index=True)
source_df = pd.merge(source_df, f_tickets_df, left_index=True, right_index=True)
# key of the origin/destination pair, built from both iata codes: "<origin>-<destination>"
source_df["org_iata_dest_iata"] = source_df['origin_airport_iata'].astype(str) + "-" + source_df["dest_airport_iata"]
source_df.to_csv('./data/airport_source.csv')


In [56]:
import os
import sys
import pandas as pd
import logging
from google.cloud import bigquery
from hashlib import md5
from typing import List


# **** SETUP ****

# BigQuery target -- change both values to match your gcloud project
PROJECT_NAME = "deb-01-372116"
DATASET_NAME = "airline_ticket_processor"

# Local data location -- change to match your filesystem
DATA_DIR = "./data/"
# flattened ticket source file that the load cells read by default
DEFAULT_SOURCE_FILE = os.path.join(DATA_DIR, "airport_source.csv")


# **** TABLE SCHEMAS ****

# Per-table load configuration, keyed by a short table key:
#   'table_name' - bare BigQuery table name (project and dataset are prepended by the load cells)
#   'schema'     - explicit column types / modes handed to the BigQuery load job
# Note kept from the original: the dataframe index is only written when it is
# named in the schema, so none of these tables receive an index column.
TABLE_METADATA = {
    'd_airlines': {
        'table_name': 'd_airlines',
        'schema': [
            bigquery.SchemaField('airline_iata', 'string', mode='REQUIRED'),
            bigquery.SchemaField('airline_name', 'string', mode='NULLABLE'),
            bigquery.SchemaField('airline_icao', 'string', mode='NULLABLE'),
            bigquery.SchemaField('callsign', 'string', mode='NULLABLE'),
            bigquery.SchemaField('airline_country', 'string', mode='NULLABLE'),
        ],
    },
    'd_airports': {
        'table_name': 'd_airports',
        'schema': [
            # key of the origin/destination pair: "<origin iata>-<destination iata>"
            bigquery.SchemaField('org_iata_dest_iata', 'string', mode='REQUIRED'),
            bigquery.SchemaField('origin_airport_iata', 'string', mode='NULLABLE'),
            bigquery.SchemaField('dest_airport_iata', 'string', mode='NULLABLE'),
            bigquery.SchemaField('origin_airport_name', 'string', mode='NULLABLE'),
            bigquery.SchemaField('origin_airport_city', 'string', mode='NULLABLE'),
            bigquery.SchemaField('origin_airport_country', 'string', mode='NULLABLE'),
            bigquery.SchemaField('origin_icao', 'string', mode='NULLABLE'),
            bigquery.SchemaField('origin_lat', 'FLOAT', mode='NULLABLE'),
            bigquery.SchemaField('origin_long', 'FLOAT', mode='NULLABLE'),
            bigquery.SchemaField('origin_altitude', 'FLOAT', mode='NULLABLE'),
            bigquery.SchemaField('origin_timezone', 'string', mode='NULLABLE'),
            bigquery.SchemaField('dest_airport_name', 'string', mode='NULLABLE'),
            bigquery.SchemaField('dest_airport_city', 'string', mode='NULLABLE'),
            bigquery.SchemaField('dest_airport_country', 'string', mode='NULLABLE'),
            bigquery.SchemaField('dest_icao', 'string', mode='NULLABLE'),
            bigquery.SchemaField('dest_lat', 'FLOAT', mode='NULLABLE'),
            bigquery.SchemaField('dest_long', 'FLOAT', mode='NULLABLE'),
            bigquery.SchemaField('dest_altitude', 'FLOAT', mode='NULLABLE'),
            bigquery.SchemaField('dest_timezone', 'string', mode='NULLABLE'),
        ],
    },
    'SCD2_d_passengers': {
        'table_name': 'SCD2_d_passengers',
        'schema': [
            bigquery.SchemaField('UUID_SK', 'string', mode='REQUIRED'),
            bigquery.SchemaField('first_name', 'string', mode='NULLABLE'),
            bigquery.SchemaField('last_name', 'string', mode='NULLABLE'),
            bigquery.SchemaField('gender', 'string', mode='NULLABLE'),
            bigquery.SchemaField('birth_date', 'string', mode='NULLABLE'),
            bigquery.SchemaField('email', 'string', mode='REQUIRED'),
            # `street` is part of the passenger frame that gets loaded; declare it
            # explicitly so its type is not left to inference from the dataframe
            bigquery.SchemaField('street', 'string', mode='NULLABLE'),
            bigquery.SchemaField('city', 'string', mode='NULLABLE'),
            bigquery.SchemaField('state', 'string', mode='NULLABLE'),
            bigquery.SchemaField('zip', 'INTEGER', mode='NULLABLE'),
            bigquery.SchemaField('start_date', 'string', mode='NULLABLE'),
            bigquery.SchemaField('end_date', 'string', mode='NULLABLE'),
        ],
    },
}


# **** SETUP LOGGING ****
# Configure the root logger: bracketed level / timestamp / module:line prefix,
# written to stdout so the messages show up in the notebook cell output.
logging.basicConfig(
    format='[%(levelname)-5s][%(asctime)s][%(module)s:%(lineno)04d] : %(message)s',
    level=logging.INFO,
    stream=sys.stdout
)
# Alias the root logger as `logger`. Calling getLogger() without a name always
# returns the root logger; getLogger('root') only does so on Python 3.9+ and
# returns an unrelated child logger named "root" on older versions.
logger: logging.Logger = logging.getLogger()
logger.setLevel(logging.DEBUG)      # programmatically lower the level from INFO to DEBUG


# **** BIGQUERY CLIENT ****
# Instantiate the BigQuery client that the cells below share.
logger.debug("Creating bigquery client")
client = bigquery.Client()

logger.info("Setup Completed")

[DEBUG][2023-01-06 16:03:44,310][4055345799:0091] : Creating bigquery client
[INFO ][2023-01-06 16:03:44,314][4055345799:0094] : Setup Completed


In [57]:
# Make sure the target dataset exists (created in the US location when missing).
dataset_id = ".".join((PROJECT_NAME, DATASET_NAME))
dataset = bigquery.Dataset(dataset_id)
dataset.location = "US"
# exists_ok=True: an already existing dataset is returned instead of raising
dataset = client.create_dataset(dataset, exists_ok=True)

logger.info(f"Created airline_ticket_processor: {dataset.full_dataset_id}")

[INFO ][2023-01-06 16:03:45,755][2378520198:0007] : Created airline_ticket_processor: deb-01-372116:airline_ticket_processor


In [58]:
# Load the flattened ticket source csv; the dimension cells below all read from it.
filename = DEFAULT_SOURCE_FILE
logger.debug(f"attempting to process: {filename}")

# *** always perform checks first ***
# check if the file exists
assert os.path.exists(filename), f"Data file does not exists: '{filename}'"
# cheap size pre-check: a file this small cannot hold a header plus a data row.
# It does NOT prove that data rows exist (the header of this file alone is
# longer than 78 bytes), so the row count is verified again after reading.
assert os.path.getsize(filename) > 78, f"Data file size incorrect; does not seem to contain data: '{filename}'"

# load into dataframe
df = pd.read_csv(filename, header=0)
# a header-only file passes the size check above; stop before empty tables get loaded
assert not df.empty, f"Data file size incorrect; does not seem to contain data: '{filename}'"
logger.info(f"loaded {len(df.index)} rows from: {filename}")

# assign & remember the source dataframe for the dimension cells
source_df = df

[DEBUG][2023-01-06 16:03:45,826][1238745669:0004] : attempting to process: ./data/airport_source.csv
[INFO ][2023-01-06 16:03:45,899][1238745669:0014] : loaded 4096 rows from: ./data/airport_source.csv


In [59]:
# Prepare the airlines dimension: one row per distinct airline found in the source data.
df = source_df

logger.debug(f"getting unique airlines...")

# columns that make up the airlines dimension
cols = ['airline_iata', 'airline_name', 'airline_icao', 'callsign', 'airline_country']
# Keep one row per distinct combination of the dimension columns.
# drop_duplicates is used instead of groupby(cols): groupby silently discards
# every row that has a null in ANY key column, which would lose airlines whose
# NULLABLE attributes are missing. Only rows without the REQUIRED iata code are
# dropped here. Sorting keeps the row order that groupby used to produce.
df = (
    df.loc[:, cols]
    .dropna(subset=['airline_iata'])
    .drop_duplicates()
    .sort_values(cols)
    .reset_index(drop=True)
)

logger.info(f"airlines dim - found {len(df.index)} rows")
display(df.head())

[DEBUG][2023-01-06 16:03:45,957][3447633031:0004] : getting uniques products...
[INFO ][2023-01-06 16:03:46,038][3447633031:0012] : products dim - found 48 rows


Unnamed: 0,airline_iata,airline_name,airline_icao,callsign,airline_country
0,3U,Sichuan Airlines,CSC,SI CHUAN,China
1,7C,Jeju Air,JJA,JEJU AIR,Republic of Korea
2,9K,Cape Air,KAP,CAIR,United States
3,9S,Spring Airlines,CQH,AIR SPRING,China
4,AA,American Airlines,AAL,AMERICAN,United States


In [60]:
#define the load_table function
def load_table(
    df: pd.DataFrame, 
    client: bigquery.Client, 
    table_name: str, 
    schema: List[bigquery.SchemaField], 
    create_disposition: str = 'CREATE_IF_NEEDED', 
    write_disposition: str = 'WRITE_TRUNCATE'
    ) -> None:
    """Upload a dataframe to a BigQuery table and wait for the load job to finish.

    Args:
        df (pd.DataFrame): rows to upload
        client (bigquery.Client): client used to submit the load job
        table_name (str): fully qualified name: ``project.dataset.table``
        schema (List[bigquery.SchemaField]): column definitions for the load job
        create_disposition (str, optional): what to do when the table is missing.
            Defaults to 'CREATE_IF_NEEDED'.
        write_disposition (str, optional): what to do with existing rows.
            Defaults to 'WRITE_TRUNCATE'.

    Raises:
        AssertionError: if `table_name` is not made of exactly three dot-separated parts.
    """
    # a fully qualified table name contains exactly two dots: project.dataset.table
    dot_count = table_name.count('.')
    assert dot_count == 2, f"Table name must be a full bigquery table name including project and dataset id: '{table_name}'"

    # describe the load job: table creation policy, overwrite policy and column schema
    load_options = {
        'create_disposition': create_disposition,
        'write_disposition': write_disposition,
        'schema': schema,
    }
    load_config = bigquery.LoadJobConfig(**load_options)

    logger.info(f"loading table: '{table_name}'")
    load_job = client.load_table_from_dataframe(df, destination=table_name, job_config=load_config)
    # block until the job has finished
    load_job.result()

    # read the table back to report how many rows it now holds
    loaded_table = client.get_table(table_name)
    logger.info(f"loaded {loaded_table.num_rows} rows into {loaded_table.full_table_id}")

In [61]:
##----------------load d_airlines---------------------
# Uploads `df`, which must still hold the airlines dimension prepared in the previous cell.
# get table name and schema from our TABLE_METADATA config param
table_name = f"{PROJECT_NAME}.{DATASET_NAME}.{TABLE_METADATA['d_airlines']['table_name']}"
schema = TABLE_METADATA['d_airlines']['schema']
# load dataframe
load_table(df, client, table_name, schema)

logger.info(f"loaded airlines dim")

[INFO ][2023-01-06 16:03:46,231][1547112885:0030] : loading table: 'deb-01-372116.airline_ticket_processor.d_airlines'
[INFO ][2023-01-06 16:03:49,759][1547112885:0035] : loaded 48 rows into deb-01-372116:airline_ticket_processor.d_airlines
[INFO ][2023-01-06 16:03:49,761][3048314498:0008] : loaded products dim


In [62]:
#----------------load d_airports---------------------
# Build and upload the airports dimension: one row per distinct
# origin/destination pair together with the attributes of both airports.
df = source_df

logger.debug(f"getting uniques airports...")

# columns that make up the airports dimension
cols = ['org_iata_dest_iata','origin_airport_iata', 'dest_airport_iata', 'origin_airport_name', 'origin_airport_city', 'origin_airport_country', 'origin_icao', 'origin_lat','origin_long','origin_altitude','origin_timezone','dest_airport_name', 'dest_airport_city', 'dest_airport_country', 'dest_icao', 'dest_lat','dest_long','dest_altitude','dest_timezone']
# Keep one row per distinct combination of the dimension columns.
# drop_duplicates is used instead of groupby(cols): groupby silently discards
# every row that has a null in ANY key column, which would lose pairs whose
# NULLABLE attributes (e.g. an altitude) are missing. Only rows without the
# REQUIRED key are dropped here. Sorting keeps the row order groupby produced.
df = (
    df.loc[:, cols]
    .dropna(subset=['org_iata_dest_iata'])
    .drop_duplicates()
    .sort_values(cols)
    .reset_index(drop=True)
)
logger.info(f"generated iata codes")

logger.info(f"airports dim - found {len(df.index)} rows")

#load table into bigquery
# get table name and schema from our TABLE_METADATA config param
table_name = f"{PROJECT_NAME}.{DATASET_NAME}.{TABLE_METADATA['d_airports']['table_name']}"
schema = TABLE_METADATA['d_airports']['schema']
# load dataframe
load_table(df, client, table_name, schema)

logger.info(f"loaded airports dim")
display(df.head())

[DEBUG][2023-01-06 16:03:49,823][2379349505:0004] : getting uniques airports...
[INFO ][2023-01-06 16:03:49,897][2379349505:0011] : generated iata codes
[INFO ][2023-01-06 16:03:49,898][2379349505:0013] : airports dim - found 3817 rows
[INFO ][2023-01-06 16:03:49,900][1547112885:0030] : loading table: 'deb-01-372116.airline_ticket_processor.d_airports'
[INFO ][2023-01-06 16:03:55,124][1547112885:0035] : loaded 3817 rows into deb-01-372116:airline_ticket_processor.d_airports
[INFO ][2023-01-06 16:03:55,125][2379349505:0022] : loaded airports dim


Unnamed: 0,org_iata_dest_iata,origin_airport_iata,dest_airport_iata,origin_airport_name,origin_airport_city,origin_airport_country,origin_icao,origin_lat,origin_long,origin_altitude,origin_timezone,dest_airport_name,dest_airport_city,dest_airport_country,dest_icao,dest_lat,dest_long,dest_altitude,dest_timezone
0,ABQ-BOM,ABQ,BOM,Albuquerque International Sunport,Albuquerque,United States,KABQ,35.04,-106.61,5355.0,America/Denver,Chhatrapati Shivaji International Airport,Mumbai,India,VABB,19.09,72.87,39.0,Asia/Calcutta
1,ABQ-HKG,ABQ,HKG,Albuquerque International Sunport,Albuquerque,United States,KABQ,35.04,-106.61,5355.0,America/Denver,Hong Kong International Airport,Hong Kong,Hong Kong,VHHH,22.31,113.92,28.0,Asia/Hong_Kong
2,ABQ-JUL,ABQ,JUL,Albuquerque International Sunport,Albuquerque,United States,KABQ,35.04,-106.61,5355.0,America/Denver,Inca Manco Capac International Airport,Juliaca,Peru,SPJL,-15.47,-70.16,12552.0,America/Lima
3,ABQ-KUL,ABQ,KUL,Albuquerque International Sunport,Albuquerque,United States,KABQ,35.04,-106.61,5355.0,America/Denver,Kuala Lumpur International Airport,Kuala Lumpur,Malaysia,WMKK,2.75,101.71,69.0,Asia/Kuala_Lumpur
4,ABQ-MAD,ABQ,MAD,Albuquerque International Sunport,Albuquerque,United States,KABQ,35.04,-106.61,5355.0,America/Denver,Adolfo Suárez Madrid–Barajas Airport,Madrid,Spain,LEMD,40.47,-3.56,1998.0,Europe/Madrid


In [63]:
#----------------load SCD2_d_passengers---------------------
# Build and upload the passengers dimension.
# Work on a copy: columns are added below, and assigning them through an alias
# of source_df would silently modify the shared source frame for every other cell.
# Rows without an email are dropped up front because `email` is REQUIRED.
df = source_df.dropna(subset=['email']).copy()

logger.debug(f"getting unique passengers...")

# create one random UUID per distinct email and map it onto the rows in a single
# vectorised pass (instead of one boolean-mask assignment per email)
uuid_by_email = {email: str(uuid.uuid4()) for email in df['email'].unique()}
df['UUID_SK'] = df['email'].map(uuid_by_email)
df = df.astype({"UUID_SK": str})
# NOTE(review): the key is per email, so it repeats on every start_date row of
# the same passenger; confirm whether a per-row surrogate key is expected.

# set start-date to the ticket date; `ticket_date` is already a column of the
# source frame, so no frame from another cell is needed
df = df.assign(start_date=df['ticket_date'])
# NOTE(review): this stores the literal string 'None', not a NULL; confirm what
# consumers of end_date expect before changing it.
df = df.assign(end_date='None')
# columns that make up the passengers dimension
cols = ['UUID_SK','email','first_name','last_name', 'gender', 'birth_date', 'street', 'city', 'state','zip','start_date','end_date']
# Keep one row per distinct combination of the dimension columns.
# drop_duplicates is used instead of groupby(cols): groupby silently discards
# every row that has a null in ANY key column, which would lose passengers whose
# NULLABLE attributes are missing. Sorting keeps the row order groupby produced.
df = (
    df.loc[:, cols]
    .drop_duplicates()
    .sort_values(cols)
    .reset_index(drop=True)
)

logger.info(f"passengers dim - found {len(df.index)} rows")

# load table into bigquery
# get table name and schema from our TABLE_METADATA config param
table_name = f"{PROJECT_NAME}.{DATASET_NAME}.{TABLE_METADATA['SCD2_d_passengers']['table_name']}"
schema = TABLE_METADATA['SCD2_d_passengers']['schema']
# load dataframe
load_table(df, client, table_name, schema)

logger.info(f"loaded passengers dim")

display(df.head())

[DEBUG][2023-01-06 16:03:55,186][756748291:0004] : getting unique passengers...
[INFO ][2023-01-06 16:03:55,290][756748291:0019] : passengers dim - found 128 rows
[INFO ][2023-01-06 16:03:55,292][1547112885:0030] : loading table: 'deb-01-372116.airline_ticket_processor.SCD2_d_passengers'
[INFO ][2023-01-06 16:03:58,878][1547112885:0035] : loaded 128 rows into deb-01-372116:airline_ticket_processor.SCD2_d_passengers
[INFO ][2023-01-06 16:03:58,879][756748291:0028] : loaded passengers dim


Unnamed: 0,UUID_SK,email,first_name,last_name,gender,birth_date,street,city,state,zip,start_date,end_date
0,06484a3e-d210-46df-9c6c-7b8407382f82,seth.thompson.68@yahoo.com,Seth,Thompson,M,1968-05-02,22455 Higgins Junction Apt. 042,New Keith,OR,97405,2022-03-21,
1,06484a3e-d210-46df-9c6c-7b8407382f82,seth.thompson.68@yahoo.com,Seth,Thompson,M,1968-05-02,22455 Higgins Junction Apt. 042,New Keith,OR,97405,2022-03-22,
2,06484a3e-d210-46df-9c6c-7b8407382f82,seth.thompson.68@yahoo.com,Seth,Thompson,M,1968-05-02,22455 Higgins Junction Apt. 042,New Keith,OR,97405,2022-03-23,
3,06484a3e-d210-46df-9c6c-7b8407382f82,seth.thompson.68@yahoo.com,Seth,Thompson,M,1968-05-02,22455 Higgins Junction Apt. 042,New Keith,OR,97405,2022-03-24,
4,082fb705-d8f1-489a-acdc-3da240f5d4ae,sean.murray.53@yahoo.com,Sean,Murray,M,1953-02-11,809 Erika Valley Apt. 634,Petersenfort,NY,13184,2022-03-21,
