In [1]:
import boto3
from botocore.exceptions import ClientError


def get_secret():
    """Fetch the source-database credentials from AWS Secrets Manager.

    Reads the secret named ``DataSource_PostgresDB_Credentials`` in the
    ``eu-west-2`` region.

    Returns:
        The ``SecretString`` field of the Secrets Manager response.

    Raises:
        botocore.exceptions.ClientError: Propagated unchanged when the
            ``get_secret_value`` call fails.
    """
    # Build a Secrets Manager client from a fresh boto3 session.
    secrets_client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name="eu-west-2",
    )

    try:
        response = secrets_client.get_secret_value(
            SecretId="DataSource_PostgresDB_Credentials"
        )
    except ClientError:
        # Nothing is recovered here; the AWS error goes straight to the caller.
        # The possible error codes are listed at
        # https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html
        raise

    return response['SecretString']

In [2]:
# Smoke-test get_secret(): report success or failure without showing the value.
try:
    result = get_secret()
except Exception as e:
    print(f"Error retrieving secret: {str(e)}")
else:
    # Be careful not to print the actual secret in a production environment
    print("Secret retrieved successfully")

Secret retrieved successfully


In [3]:
import boto3
from botocore.exceptions import ClientError
from pg8000.native import Connection, Error
import pandas as pd
import json

def get_secret():
    """Return the source-database secret string from AWS Secrets Manager.

    The secret ``DataSource_PostgresDB_Credentials`` is read from the
    ``eu-west-2`` region through a client created on a new boto3 session.

    Returns:
        The ``SecretString`` value of the ``get_secret_value`` response.

    Raises:
        botocore.exceptions.ClientError: Re-raised as-is if the lookup fails.
    """
    aws_session = boto3.session.Session()
    secrets_manager = aws_session.client(
        service_name='secretsmanager',
        region_name="eu-west-2",
    )

    try:
        lookup = secrets_manager.get_secret_value(
            SecretId="DataSource_PostgresDB_Credentials"
        )
    except ClientError:
        # No local handling: the caller decides what to do with AWS errors.
        raise

    return lookup['SecretString']

def create_connection():
    """Open a pg8000 connection using the credentials held in the secret.

    The string returned by ``get_secret()`` is parsed as JSON and must
    contain the keys ``user``, ``password``, ``host``, ``database`` and
    ``port``.

    Returns:
        The ``Connection`` built from those five values. Closing it is the
        caller's responsibility.

    Raises:
        KeyError: If one of the expected keys is missing from the secret.
    """
    credentials = json.loads(get_secret())

    # Look the fields up in a fixed order so that a missing key is always
    # reported as the first absent one in this sequence.
    connection_kwargs = {
        field: credentials[field]
        for field in ('user', 'password', 'host', 'database', 'port')
    }
    return Connection(**connection_kwargs)

In [4]:
# Preview up to ten rows of the `sales_order` table.
conn = create_connection()

try:
    query = 'SELECT * FROM sales_order LIMIT 10;'
    results = conn.run(query)

    # conn.columns holds one metadata dict per result column; keep its 'name'.
    column_names = [desc['name'] for desc in conn.columns]
    df_sales_order = pd.DataFrame(results, columns=column_names)
    print("Table: sales_order")
    # Render the frame as plain text without the index column.
    print(df_sales_order.to_string(index=False))
finally:
    # Close the connection even if the query or the DataFrame build fails.
    conn.close()

Table: sales_order
 sales_order_id              created_at            last_updated  design_id  staff_id  counterparty_id  units_sold unit_price  currency_id agreed_delivery_date agreed_payment_date  agreed_delivery_location_id
              2 2022-11-03 14:20:52.186 2022-11-03 14:20:52.186          3        19                8       42972       3.94            2           2022-11-07          2022-11-08                            8
              3 2022-11-03 14:20:52.188 2022-11-03 14:20:52.188          4        10                4       65839       2.91            3           2022-11-06          2022-11-07                           19
              4 2022-11-03 14:20:52.188 2022-11-03 14:20:52.188          4        10               16       32069       3.89            2           2022-11-05          2022-11-07                           15
              5 2022-11-03 14:20:52.186 2022-11-03 14:20:52.186          7        18                4       49659       2.41            3           2

In [5]:
# Preview up to ten rows of the `design` table.
conn = create_connection()

try:
    query = 'SELECT * FROM design LIMIT 10;'
    results = conn.run(query)

    # conn.columns holds one metadata dict per result column; keep its 'name'.
    column_names = [desc['name'] for desc in conn.columns]
    df_design = pd.DataFrame(results, columns=column_names)
    print("Table: design")
    # Render the frame as plain text without the index column.
    print(df_design.to_string(index=False))
finally:
    # Close the connection even if the query or the DataFrame build fails.
    conn.close()

Table: design
 design_id              created_at design_name file_location                   file_name            last_updated
         8 2022-11-03 14:20:49.962      Wooden          /usr   wooden-20220717-npgz.json 2022-11-03 14:20:49.962
        51 2023-01-12 18:50:09.935      Bronze      /private   bronze-20221024-4dds.json 2023-01-12 18:50:09.935
        69 2023-02-07 17:31:10.093      Bronze   /lost+found   bronze-20230102-r904.json 2023-02-07 17:31:10.093
        16 2022-11-22 15:02:10.226        Soft       /System     soft-20211001-cjaz.json 2022-11-22 15:02:10.226
        54 2023-01-16 09:14:09.775     Plastic    /usr/ports  plastic-20221206-bw3l.json 2023-01-16 09:14:09.775
        55 2023-01-19 08:10:10.138    Concrete  /opt/include concrete-20210614-04nd.json 2023-01-19 08:10:10.138
        10 2022-11-03 14:20:49.962        Soft    /usr/share     soft-20220201-hzz1.json 2022-11-03 14:20:49.962
        57 2023-01-19 10:37:09.965      Cotton /etc/periodic   cotton-20220527-vn4

In [6]:
# Preview up to ten rows of the `currency` table.
conn = create_connection()

try:
    query = 'SELECT * FROM currency LIMIT 10;'
    results = conn.run(query)

    # conn.columns holds one metadata dict per result column; keep its 'name'.
    column_names = [desc['name'] for desc in conn.columns]
    df_currency = pd.DataFrame(results, columns=column_names)
    print("Table: currency")
    # Render the frame as plain text without the index column.
    print(df_currency.to_string(index=False))
finally:
    # Close the connection even if the query or the DataFrame build fails.
    conn.close()

Table: currency
 currency_id currency_code              created_at            last_updated
           1           GBP 2022-11-03 14:20:49.962 2022-11-03 14:20:49.962
           2           USD 2022-11-03 14:20:49.962 2022-11-03 14:20:49.962
           3           EUR 2022-11-03 14:20:49.962 2022-11-03 14:20:49.962


In [7]:
# Preview up to ten rows of the `staff` table.
# NOTE(review): the saved output of this cell lists staff names and e-mail
# addresses; clear it before sharing the notebook if the data is real.
conn = create_connection()

try:
    query = 'SELECT * FROM staff LIMIT 10;'
    results = conn.run(query)

    # conn.columns holds one metadata dict per result column; keep its 'name'.
    column_names = [desc['name'] for desc in conn.columns]
    df_staff = pd.DataFrame(results, columns=column_names)
    print("Table: staff")
    # Render the frame as plain text without the index column.
    print(df_staff.to_string(index=False))
finally:
    # Close the connection even if the query or the DataFrame build fails.
    conn.close()

Table: staff
 staff_id first_name last_name  department_id                       email_address              created_at            last_updated
        1    Jeremie    Franey              2    jeremie.franey@terrifictotes.com 2022-11-03 14:20:51.563 2022-11-03 14:20:51.563
        2      Deron     Beier              6       deron.beier@terrifictotes.com 2022-11-03 14:20:51.563 2022-11-03 14:20:51.563
        3   Jeanette    Erdman              6   jeanette.erdman@terrifictotes.com 2022-11-03 14:20:51.563 2022-11-03 14:20:51.563
        4        Ana    Glover              3        ana.glover@terrifictotes.com 2022-11-03 14:20:51.563 2022-11-03 14:20:51.563
        5  Magdalena     Zieme              8   magdalena.zieme@terrifictotes.com 2022-11-03 14:20:51.563 2022-11-03 14:20:51.563
        6      Korey   Kreiger              3     korey.kreiger@terrifictotes.com 2022-11-03 14:20:51.563 2022-11-03 14:20:51.563
        7    Raphael    Rippin              2    raphael.rippin@terrifictotes

In [8]:
# Preview up to ten rows of the `counterparty` table.
conn = create_connection()

try:
    query = 'SELECT * FROM counterparty LIMIT 10;'
    results = conn.run(query)

    # conn.columns holds one metadata dict per result column; keep its 'name'.
    column_names = [desc['name'] for desc in conn.columns]
    df_counterparty = pd.DataFrame(results, columns=column_names)
    print("Table: counterparty")
    # Render the frame as plain text without the index column.
    print(df_counterparty.to_string(index=False))
finally:
    # Close the connection even if the query or the DataFrame build fails.
    conn.close()

Table: counterparty
 counterparty_id        counterparty_legal_name  legal_address_id commercial_contact        delivery_contact              created_at            last_updated
               1                 Fahey and Sons                15        Micheal Toy Mrs. Lucy Runolfsdottir 2022-11-03 14:20:51.563 2022-11-03 14:20:51.563
               2    Leannon, Predovic and Morar                28      Melba Sanford           Jean Hane III 2022-11-03 14:20:51.563 2022-11-03 14:20:51.563
               3                  Armstrong Inc                 2          Jane Wiza            Myra Kovacek 2022-11-03 14:20:51.563 2022-11-03 14:20:51.563
               4                     Kohler Inc                29        Taylor Haag       Alfredo Cassin II 2022-11-03 14:20:51.563 2022-11-03 14:20:51.563
               5    Frami, Yundt and Macejkovic                22     Homer Mitchell         Ivan Balistreri 2022-11-03 14:20:51.563 2022-11-03 14:20:51.563
               6                      

In [9]:
# Preview up to ten rows of the `address` table.
conn = create_connection()

try:
    query = 'SELECT * FROM address LIMIT 10;'
    results = conn.run(query)

    # conn.columns holds one metadata dict per result column; keep its 'name'.
    column_names = [desc['name'] for desc in conn.columns]
    df_address = pd.DataFrame(results, columns=column_names)
    print("Table: address")
    # Render the frame as plain text without the index column.
    print(df_address.to_string(index=False))
finally:
    # Close the connection even if the query or the DataFrame build fails.
    conn.close()

Table: address
 address_id       address_line_1   address_line_2        district              city postal_code                               country       phone              created_at            last_updated
          1      6826 Herzog Via             None            Avon New Patienceburgh       28441                                Turkey 1803 637401 2022-11-03 14:20:49.962 2022-11-03 14:20:49.962
          2    179 Alexie Cliffs             None            None       Aliso Viejo  99305-7380                            San Marino 9621 880720 2022-11-03 14:20:49.962 2022-11-03 14:20:49.962
          3     148 Sincere Fort             None            None      Lake Charles       89360                                 Samoa 0730 783349 2022-11-03 14:20:49.962 2022-11-03 14:20:49.962
          4   6102 Rogahn Skyway             None    Bedfordshire         Olsonside       47518                     Republic of Korea 1239 706295 2022-11-03 14:20:49.962 2022-11-03 14:20:49.962
          5    

In [10]:
# Preview up to ten rows of the `department` table.
conn = create_connection()

try:
    query = 'SELECT * FROM department LIMIT 10;'
    results = conn.run(query)

    # conn.columns holds one metadata dict per result column; keep its 'name'.
    column_names = [desc['name'] for desc in conn.columns]
    df_department = pd.DataFrame(results, columns=column_names)
    print("Table: department")
    # Render the frame as plain text without the index column.
    print(df_department.to_string(index=False))
finally:
    # Close the connection even if the query or the DataFrame build fails.
    conn.close()

Table: department
 department_id department_name   location        manager              created_at            last_updated
             1           Sales Manchester   Richard Roma 2022-11-03 14:20:49.962 2022-11-03 14:20:49.962
             2      Purchasing Manchester Naomi Lapaglia 2022-11-03 14:20:49.962 2022-11-03 14:20:49.962
             3      Production      Leeds   Chester Ming 2022-11-03 14:20:49.962 2022-11-03 14:20:49.962
             4        Dispatch       Leds     Mark Hanna 2022-11-03 14:20:49.962 2022-11-03 14:20:49.962
             5         Finance Manchester Jordan Belfort 2022-11-03 14:20:49.962 2022-11-03 14:20:49.962
             6      Facilities Manchester Shelley Levene 2022-11-03 14:20:49.962 2022-11-03 14:20:49.962
             7  Communications      Leeds      Ann Blake 2022-11-03 14:20:49.962 2022-11-03 14:20:49.962
             8              HR      Leeds     James Link 2022-11-03 14:20:49.962 2022-11-03 14:20:49.962


In [11]:
# Preview up to ten rows of the `purchase_order` table.
conn = create_connection()

try:
    query = 'SELECT * FROM purchase_order LIMIT 10;'
    results = conn.run(query)

    # conn.columns holds one metadata dict per result column; keep its 'name'.
    column_names = [desc['name'] for desc in conn.columns]
    df_purchase_order = pd.DataFrame(results, columns=column_names)
    print("Table: purchase_order")
    # Render the frame as plain text without the index column.
    print(df_purchase_order.to_string(index=False))
finally:
    # Close the connection even if the query or the DataFrame build fails.
    conn.close()

Table: purchase_order
 purchase_order_id              created_at            last_updated  staff_id  counterparty_id item_code  item_quantity item_unit_price  currency_id agreed_delivery_date agreed_payment_date  agreed_delivery_location_id
                 1 2022-11-03 14:20:52.187 2022-11-03 14:20:52.187        12               11   ZDOI5EA            371          361.39            2           2022-11-09          2022-11-07                            6
                 2 2022-11-03 14:20:52.186 2022-11-03 14:20:52.186        20               17   QLZLEXR            286          199.04            2           2022-11-04          2022-11-07                            8
                 3 2022-11-03 14:20:52.187 2022-11-03 14:20:52.187        12               15   AN3D85L            839          658.58            2           2022-11-05          2022-11-04                           16
                 5 2022-11-03 14:20:52.186 2022-11-03 14:20:52.186        18                2   I9MET53   

In [12]:
# Preview up to ten rows of the `payment_type` table.
conn = create_connection()

try:
    query = 'SELECT * FROM payment_type LIMIT 10;'
    results = conn.run(query)

    # conn.columns holds one metadata dict per result column; keep its 'name'.
    column_names = [desc['name'] for desc in conn.columns]
    df_payment_type = pd.DataFrame(results, columns=column_names)
    print("Table: payment_type")
    # Render the frame as plain text without the index column.
    print(df_payment_type.to_string(index=False))
finally:
    # Close the connection even if the query or the DataFrame build fails.
    conn.close()

Table: payment_type
 payment_type_id payment_type_name              created_at            last_updated
               1     SALES_RECEIPT 2022-11-03 14:20:49.962 2022-11-03 14:20:49.962
               2      SALES_REFUND 2022-11-03 14:20:49.962 2022-11-03 14:20:49.962
               3  PURCHASE_PAYMENT 2022-11-03 14:20:49.962 2022-11-03 14:20:49.962
               4   PURCHASE_REFUND 2022-11-03 14:20:49.962 2022-11-03 14:20:49.962


In [13]:
# Preview up to ten rows of the `payment` table.
# NOTE(review): the saved output of this cell shows account-number columns;
# clear it before sharing the notebook if the data is real.
conn = create_connection()

try:
    query = 'SELECT * FROM payment LIMIT 10;'
    results = conn.run(query)

    # conn.columns holds one metadata dict per result column; keep its 'name'.
    column_names = [desc['name'] for desc in conn.columns]
    df_payment = pd.DataFrame(results, columns=column_names)
    print("Table: payment")
    # Render the frame as plain text without the index column.
    print(df_payment.to_string(index=False))
finally:
    # Close the connection even if the query or the DataFrame build fails.
    conn.close()

Table: payment
 payment_id              created_at            last_updated  transaction_id  counterparty_id payment_amount  currency_id  payment_type_id  paid payment_date  company_ac_number  counterparty_ac_number
          2 2022-11-03 14:20:52.187 2022-11-03 14:20:52.187               2               15      552548.62            2                3 False   2022-11-04           67305075                31622269
          3 2022-11-03 14:20:52.186 2022-11-03 14:20:52.186               3               18      205952.22            3                1 False   2022-11-03           81718079                47839086
          5 2022-11-03 14:20:52.187 2022-11-03 14:20:52.187               5               17       57067.20            2                3 False   2022-11-06           66213052                91659548
          8 2022-11-03 14:20:52.186 2022-11-03 14:20:52.186               8                2      254007.12            3                3 False   2022-11-05           32948439          

In [14]:
# Preview up to ten rows of the `transaction` table.
conn = create_connection()

try:
    query = 'SELECT * FROM transaction LIMIT 10;'
    results = conn.run(query)

    # conn.columns holds one metadata dict per result column; keep its 'name'.
    column_names = [desc['name'] for desc in conn.columns]
    # In the saved output the two order-id columns print as floats with NaN
    # where the value is missing.
    df_transaction = pd.DataFrame(results, columns=column_names)
    print("Table: transaction")
    # Render the frame as plain text without the index column.
    print(df_transaction.to_string(index=False))
finally:
    # Close the connection even if the query or the DataFrame build fails.
    conn.close()

Table: transaction
 transaction_id transaction_type  sales_order_id  purchase_order_id              created_at            last_updated
              1         PURCHASE             NaN                2.0 2022-11-03 14:20:52.186 2022-11-03 14:20:52.186
              2         PURCHASE             NaN                3.0 2022-11-03 14:20:52.187 2022-11-03 14:20:52.187
              3             SALE             1.0                NaN 2022-11-03 14:20:52.186 2022-11-03 14:20:52.186
              4         PURCHASE             NaN                1.0 2022-11-03 14:20:52.187 2022-11-03 14:20:52.187
              5         PURCHASE             NaN                4.0 2022-11-03 14:20:52.187 2022-11-03 14:20:52.187
              6             SALE             2.0                NaN 2022-11-03 14:20:52.186 2022-11-03 14:20:52.186
              7             SALE             3.0                NaN 2022-11-03 14:20:52.188 2022-11-03 14:20:52.188
              8         PURCHASE             NaN     

In [15]:
import boto3
from botocore.exceptions import ClientError


def get_secret(secret_name):
    """Fetch one of the two allowed database credential secrets from AWS.

    Args:
        secret_name: Name of the secret to read. Must be the string
            ``"DataSource_PostgresDB_Credentials"`` or
            ``"DataTarget_PostgresDB_Credentials"``.

    Returns:
        The ``SecretString`` field of the Secrets Manager response.

    Raises:
        TypeError: If ``secret_name`` is not a string.
        ValueError: If ``secret_name`` is a string but not an allowed name.
        botocore.exceptions.ClientError: Propagated unchanged when the
            ``get_secret_value`` call fails.
    """
    allowed_secret_names = (
        "DataSource_PostgresDB_Credentials",
        "DataTarget_PostgresDB_Credentials",
    )

    # Validate before touching AWS so a bad argument never costs a network
    # call, and say what was wrong instead of raising a bare exception.
    if not isinstance(secret_name, str):
        raise TypeError(
            f"secret_name must be a str, got {type(secret_name).__name__}"
        )
    if secret_name not in allowed_secret_names:
        raise ValueError(
            f"Unknown secret name {secret_name!r}; "
            f"expected one of: {', '.join(allowed_secret_names)}"
        )

    region_name = "eu-west-2"

    # Create a Secrets Manager client
    session = boto3.session.Session()
    client = session.client(service_name="secretsmanager", region_name=region_name)

    # A ClientError raised here is deliberately left to reach the caller.
    get_secret_value_response = client.get_secret_value(SecretId=secret_name)

    return get_secret_value_response["SecretString"]


In [16]:
import boto3
from botocore.exceptions import ClientError
from pg8000.native import Connection, Error
import pandas as pd
import json
import os

if os.environ.get("AWS_EXECUTION_ENV") is not None:
    from get_secret import get_secret
else:
    from src.load.get_secret import get_secret


def create_connection(stage_name):
    """Open a pg8000 connection for the given stage.

    Args:
        stage_name: ``"extraction"`` selects the
            ``DataSource_PostgresDB_Credentials`` secret and ``"load"``
            selects ``DataTarget_PostgresDB_Credentials``.

    Returns:
        The ``Connection`` built from the secret's ``user``, ``database``,
        ``host``, ``password`` and ``port`` values. Closing it is the
        caller's responsibility.

    Raises:
        ValueError: If ``stage_name`` is neither ``"extraction"`` nor
            ``"load"``.
        KeyError: If the secret lacks one of the expected keys.
    """
    # Retrieve the secret
    if stage_name == "extraction":
        secret_name = "DataSource_PostgresDB_Credentials"
    elif stage_name == "load":
        secret_name = "DataTarget_PostgresDB_Credentials"
    else:
        # An unrecognised stage used to fall through and fail later with
        # UnboundLocalError; fail here with a message that names the problem.
        raise ValueError(
            f"Unknown stage_name {stage_name!r}; expected 'extraction' or 'load'"
        )
    secret_dict = json.loads(get_secret(secret_name))

    # Extract credentials from the secret
    user = secret_dict["user"]
    password = secret_dict["password"]
    host = secret_dict["host"]
    database = secret_dict["database"]
    port = secret_dict["port"]

    # Establish the database connection
    conn = Connection(
        user=user, database=database, host=host, password=password, port=port
    )
    return conn

In [17]:
def have_a_look_at_the_warehouse():
    """Print the warehouse's user tables and the contents of fact_sales_order.

    Opens a connection with ``create_connection("load")``, lists every base
    table outside the system schemas, then prints the rows and the column
    metadata of ``project_team_9.fact_sales_order``. The connection is
    closed before returning, including when a query raises.
    """
    db = create_connection("load")
    try:
        are_there_tables_query = """
    SELECT table_schema , table_name 
    FROM information_schema.tables
    WHERE table_schema not in ('information_schema', 'pg_catalog')
        AND table_type = 'BASE TABLE'
    ORDER BY table_schema, table_name
    """  # the answer was yes

        tables = db.run(are_there_tables_query)
        print("Tables in the database:")
        for schema, table_name in tables:
            print(f"- {schema}.{table_name}")

        sql_query = """
    SELECT * from project_team_9.fact_sales_order;
    """
        response = db.run(sql_query)
        # db.columns describes the columns of the most recent query result.
        columns = db.columns
        print(response, columns)
    finally:
        # The original never closed the connection; release it on every path.
        db.close()

# Run the warehouse inspection when this code executes as the main module.
if __name__ == "__main__":
    have_a_look_at_the_warehouse()

Tables in the database:
- project_team_9.dim_counterparty
- project_team_9.dim_currency
- project_team_9.dim_date
- project_team_9.dim_design
- project_team_9.dim_location
- project_team_9.dim_payment_type
- project_team_9.dim_staff
- project_team_9.dim_transaction
- project_team_9.fact_payment
- project_team_9.fact_purchase_order
- project_team_9.fact_sales_order
[] [{'table_oid': 404637, 'column_attrnum': 1, 'type_oid': 23, 'type_size': 4, 'type_modifier': -1, 'format': 0, 'name': 'sales_record_id'}, {'table_oid': 404637, 'column_attrnum': 2, 'type_oid': 23, 'type_size': 4, 'type_modifier': -1, 'format': 0, 'name': 'sales_order_id'}, {'table_oid': 404637, 'column_attrnum': 3, 'type_oid': 1082, 'type_size': 4, 'type_modifier': -1, 'format': 0, 'name': 'created_date'}, {'table_oid': 404637, 'column_attrnum': 4, 'type_oid': 1083, 'type_size': 8, 'type_modifier': -1, 'format': 0, 'name': 'created_time'}, {'table_oid': 404637, 'column_attrnum': 5, 'type_oid': 1082, 'type_size': 4, 'type_m

In [18]:
def print_table_and_column_names(connection):
    """Print each non-system table followed by the names of its columns.

    Args:
        connection: Object whose ``run(sql)`` method returns an iterable of
            ``(table_schema, table_name, column_name)`` rows.

    Returns:
        None. Any exception raised along the way is caught and reported
        with a printed message instead of propagating.
    """
    try:
        # One row per column of every table outside the system schemas,
        # already sorted so each table's columns arrive in ordinal order.
        list_columns_query = """
        SELECT table_schema, table_name, column_name
        FROM information_schema.columns
        WHERE table_schema NOT IN ('information_schema', 'pg_catalog')
        ORDER BY table_schema, table_name, ordinal_position
        """

        # Group the column names under their schema-qualified table name.
        columns_by_table = {}
        for schema, table_name, column_name in connection.run(list_columns_query):
            columns_by_table.setdefault(f"{schema}.{table_name}", []).append(column_name)

        print("Tables and their columns in the database:")
        for qualified_name, column_names in columns_by_table.items():
            print(f"- {qualified_name}:")
            for name in column_names:
                print(f"  - {name}")

    except Exception as e:
        print(f"An error occurred while listing tables and columns: {e}")

In [19]:
# Open a connection using the "load" stage credentials.
conn = create_connection("load")

# Print every table together with its column names.
print_table_and_column_names(conn)

# Close the connection
conn.close()

Tables and their columns in the database:
- project_team_9.dim_counterparty:
  - counterparty_id
  - counterparty_legal_name
  - counterparty_legal_address_line_1
  - counterparty_legal_address_line_2
  - counterparty_legal_district
  - counterparty_legal_city
  - counterparty_legal_postal_code
  - counterparty_legal_country
  - counterparty_legal_phone_number
- project_team_9.dim_currency:
  - currency_id
  - currency_code
  - currency_name
- project_team_9.dim_date:
  - date_id
  - year
  - month
  - day
  - day_of_week
  - day_name
  - month_name
  - quarter
- project_team_9.dim_design:
  - design_id
  - design_name
  - file_location
  - file_name
- project_team_9.dim_location:
  - location_id
  - address_line_1
  - address_line_2
  - district
  - city
  - postal_code
  - country
  - phone
- project_team_9.dim_payment_type:
  - payment_type_id
  - payment_type_name
- project_team_9.dim_staff:
  - staff_id
  - first_name
  - last_name
  - department_name
  - location
  - email_addres

In [20]:
def print_table_and_column_names_with_keys(connection):
    """Print each non-system table with its columns, primary and foreign keys.

    Args:
        connection: Object whose ``run(sql)`` method returns an iterable of
            row sequences for the three catalogue queries issued below.

    Returns:
        None. Any exception raised along the way is caught and reported
        with a printed message instead of propagating.
    """
    try:
        # Query to list all tables and their columns in non-system schemas
        list_columns_query = """
        SELECT table_schema, table_name, column_name
        FROM information_schema.columns
        WHERE table_schema NOT IN ('information_schema', 'pg_catalog')
        ORDER BY table_schema, table_name, ordinal_position
        """

        # Query to find primary keys.
        # PostgreSQL does not force constraint names to be unique within a
        # schema, so joining on constraint_name alone can attach another
        # table's key columns. Match schema and table as well.
        primary_keys_query = """
        SELECT tc.table_schema, tc.table_name, kcu.column_name
        FROM information_schema.table_constraints AS tc
        JOIN information_schema.key_column_usage AS kcu
        ON tc.constraint_schema = kcu.constraint_schema
        AND tc.constraint_name = kcu.constraint_name
        AND tc.table_schema = kcu.table_schema
        AND tc.table_name = kcu.table_name
        WHERE tc.constraint_type = 'PRIMARY KEY'
        AND tc.table_schema NOT IN ('information_schema', 'pg_catalog')
        ORDER BY tc.table_schema, tc.table_name, kcu.ordinal_position;
        """

        # Query to find foreign keys, with the same stricter join.
        # constraint_column_usage has no referencing-table column, so it can
        # only be narrowed by constraint schema and name.
        foreign_keys_query = """
        SELECT tc.table_schema, tc.table_name, kcu.column_name, ccu.table_schema AS foreign_table_schema,
        ccu.table_name AS foreign_table_name, ccu.column_name AS foreign_column_name
        FROM information_schema.table_constraints AS tc
        JOIN information_schema.key_column_usage AS kcu
        ON tc.constraint_schema = kcu.constraint_schema
        AND tc.constraint_name = kcu.constraint_name
        AND tc.table_schema = kcu.table_schema
        AND tc.table_name = kcu.table_name
        JOIN information_schema.constraint_column_usage AS ccu
        ON ccu.constraint_schema = tc.constraint_schema
        AND ccu.constraint_name = tc.constraint_name
        WHERE tc.constraint_type = 'FOREIGN KEY'
        AND tc.table_schema NOT IN ('information_schema', 'pg_catalog')
        ORDER BY tc.table_schema, tc.table_name, kcu.ordinal_position;
        """

        # Execute the queries
        columns_info = connection.run(list_columns_query)
        primary_keys = connection.run(primary_keys_query)
        foreign_keys = connection.run(foreign_keys_query)

        # Organize the data into a dictionary for easy printing
        tables = {}
        for schema, table_name, column_name in columns_info:
            entry = tables.setdefault(
                f"{schema}.{table_name}",
                {'columns': [], 'primary_keys': [], 'foreign_keys': []},
            )
            entry['columns'].append(column_name)

        # Keys are attached only to tables that the column listing returned.
        for schema, table_name, column_name in primary_keys:
            full_table_name = f"{schema}.{table_name}"
            if full_table_name in tables:
                tables[full_table_name]['primary_keys'].append(column_name)

        for schema, table_name, column_name, foreign_schema, foreign_table, foreign_column in foreign_keys:
            full_table_name = f"{schema}.{table_name}"
            if full_table_name in tables:
                fk_info = f"{column_name} -> {foreign_schema}.{foreign_table}.{foreign_column}"
                tables[full_table_name]['foreign_keys'].append(fk_info)

        # Print the tables, their columns, and keys
        print("Tables, their columns, and keys in the database:")
        for table, info in tables.items():
            print(f"- {table}:")
            print("  Columns:")
            for column in info['columns']:
                print(f"    - {column}")
            # The key sections are omitted entirely when a table has none.
            if info['primary_keys']:
                print("  Primary Keys:")
                for pk in info['primary_keys']:
                    print(f"    - {pk}")
            if info['foreign_keys']:
                print("  Foreign Keys:")
                for fk in info['foreign_keys']:
                    print(f"    - {fk}")

    except Exception as e:
        print(f"An error occurred while listing tables, columns, and keys: {e}")



In [21]:


# List the warehouse tables with their columns and keys, then disconnect.
conn = create_connection("load")
# NOTE(review): create_connection as defined in this notebook returns a
# Connection or raises, so this truthiness check presumably always passes.
# Confirm before relying on it.
if conn:
    print_table_and_column_names_with_keys(conn)
    conn.close()

Tables, their columns, and keys in the database:
- project_team_9.dim_counterparty:
  Columns:
    - counterparty_id
    - counterparty_legal_name
    - counterparty_legal_address_line_1
    - counterparty_legal_address_line_2
    - counterparty_legal_district
    - counterparty_legal_city
    - counterparty_legal_postal_code
    - counterparty_legal_country
    - counterparty_legal_phone_number
  Primary Keys:
    - counterparty_id
- project_team_9.dim_currency:
  Columns:
    - currency_id
    - currency_code
    - currency_name
  Primary Keys:
    - currency_id
- project_team_9.dim_date:
  Columns:
    - date_id
    - year
    - month
    - day
    - day_of_week
    - day_name
    - month_name
    - quarter
  Primary Keys:
    - date_id
- project_team_9.dim_design:
  Columns:
    - design_id
    - design_name
    - file_location
    - file_name
  Primary Keys:
    - design_id
- project_team_9.dim_location:
  Columns:
    - location_id
    - address_line_1
    - address_line_2
    - 

In [22]:
# Inspect the currency test fixture; the bare expression is the cell's output.
# The path is relative, so it resolves against the kernel's working directory.
pd.read_parquet("test/test_load/test_data/currency-2024-08-23_10.36.53.parquet")

Unnamed: 0,currency_id,currency_code,currency_name
0,1,GBP,British Pound
1,2,USD,US Dollar
2,3,EUR,Euro


In [23]:
import pandas as pd
# NOTE(review): the saved output of this cell is a FileNotFoundError for this
# path. The " (1)" suffix looks like a duplicate-download name; confirm the
# intended fixture file or remove the cell.
pd.read_parquet("test/test_load/test_data/dim_counterparty-2024-08-23_11.41.31 (1).parquet")

FileNotFoundError: [Errno 2] No such file or directory: 'test/test_load/test_data/dim_counterparty-2024-08-23_11.41.31 (1).parquet'

In [None]:
# Inspect the dim_counterparty test fixture.
# NOTE(review): this cell has no execution count and its saved output shows
# currency columns, which does not match the file name. The output is
# presumably stale; re-run to confirm.
pd.read_parquet("test/test_load/test_data/dim_counterparty-2024-08-23_13.51.46.parquet")

Unnamed: 0,currency_id,currency_code,currency_name
0,1,GBP,British Pound
1,2,USD,US Dollar
2,3,EUR,Euro


In [None]:
import pandas as pd
# Inspect the fact_sales_order test fixture; the bare expression is the
# cell's output.
pd.read_parquet("test/test_load/test_data/fact_sales_order-2024-08-23_11.41.35.parquet")

Unnamed: 0,sales_order_id,created_date,created_time,last_updated_date,last_updated_time,sales_staff_id,counterparty_id,units_sold,unit_price,currency_id,design_id,agreed_payment_date,agreed_delivery_date,agreed_delivery_location_id
0,2,2022-11-03,14:20:52.186000,2022-11-03,14:20:52.186000,19,8,42972,3.94,2,3,2022-11-08,2022-11-07,8
1,3,2022-11-03,14:20:52.188000,2022-11-03,14:20:52.188000,10,4,65839,2.91,3,4,2022-11-07,2022-11-06,19
2,4,2022-11-03,14:20:52.188000,2022-11-03,14:20:52.188000,10,16,32069,3.89,2,4,2022-11-07,2022-11-05,15
3,5,2022-11-03,14:20:52.186000,2022-11-03,14:20:52.186000,18,4,49659,2.41,3,7,2022-11-08,2022-11-05,25
4,6,2022-11-04,11:37:10.341000,2022-11-04,11:37:10.341000,13,18,83908,3.99,3,3,2022-11-07,2022-11-04,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9864,9866,2024-08-23,09:42:10.083000,2024-08-23,09:42:10.083000,5,7,82022,3.91,1,163,2024-08-28,2024-08-29,11
9865,9867,2024-08-23,09:48:09.739000,2024-08-23,09:48:09.739000,5,6,79799,2.22,2,60,2024-08-24,2024-08-28,22
9866,8175,2024-05-09,13:31:10.043000,2024-05-09,13:31:10.043000,3,2,87155,2.70,3,237,2024-05-12,2024-05-12,29
9867,9868,2024-08-23,10:22:10.139000,2024-08-23,10:22:10.139000,19,13,31324,3.64,1,370,2024-08-25,2024-08-23,6


In [None]:
# Inspect the dim_date test fixture; the bare expression is the cell's output.
pd.read_parquet("test/test_load/test_data/dim_date-2024-08-23_11.05.48.parquet")

Unnamed: 0,date_id,year,month,day,day_of_week,day_name,month_name,quarter
0,2022-01-01,2022,1,1,5,Saturday,January,1
1,2022-01-02,2022,1,2,6,Sunday,January,1
2,2022-01-03,2022,1,3,0,Monday,January,1
3,2022-01-04,2022,1,4,1,Tuesday,January,1
4,2022-01-05,2022,1,5,2,Wednesday,January,1
...,...,...,...,...,...,...,...,...
1091,2024-12-27,2024,12,27,4,Friday,December,4
1092,2024-12-28,2024,12,28,5,Saturday,December,4
1093,2024-12-29,2024,12,29,6,Sunday,December,4
1094,2024-12-30,2024,12,30,0,Monday,December,4


In [28]:
import pandas as pd

# Load the dim_location test fixture.
location_df = pd.read_parquet('test/test_load/test_data/dim_location-2024-08-23_11.41.33.parquet')
# Double every single quote in `country` (' becomes ''), which is how a quote
# is escaped inside a SQL string literal.
# NOTE(review): presumably this prepares values for hand-built INSERT
# statements. Confirm; passing the values as query parameters would make the
# manual escaping unnecessary.
location_df['country'] = location_df['country'].str.replace("'", "''")
print(location_df)

    location_id           address_line_1    address_line_2         district  \
0             1          6826 Herzog Via           Unknown             Avon   
1             2        179 Alexie Cliffs           Unknown          Unknown   
2             3         148 Sincere Fort           Unknown          Unknown   
3             4       6102 Rogahn Skyway           Unknown     Bedfordshire   
4             5        34177 Upton Track           Unknown          Unknown   
5             6        846 Kailey Island           Unknown          Unknown   
6             7     75653 Ernestine Ways           Unknown  Buckinghamshire   
7             8       0579 Durgan Common           Unknown          Unknown   
8             9        644 Edward Garden           Unknown          Borders   
9            10        49967 Kaylah Flat  Tremaine Circles     Bedfordshire   
10           11      249 Bernier Mission           Unknown  Buckinghamshire   
11           12  6461 Ernesto Expressway           U