In [None]:
import os
import pandas as pd
from snowflake.snowpark import Session
from dotenv import load_dotenv

# Read the .env file so the SNOWFLAKE_* variables are available via os.getenv
load_dotenv()

# Connection settings for the source (cleaned) schema; every value except the
# schema name comes from the environment and is None when the variable is unset.
snowflake_params = dict(
    account=os.getenv("SNOWFLAKE_ACCOUNT"),
    user=os.getenv("SNOWFLAKE_USER"),
    password=os.getenv("SNOWFLAKE_PASSWORD"),
    role=os.getenv("SNOWFLAKE_ROLE"),
    warehouse=os.getenv("SNOWFLAKE_WAREHOUSE"),
    database=os.getenv("SNOWFLAKE_DATABASE"),
    schema="ERD_SCHEMA_CLEANED",
)
# Name of the target star schema, switched to later via set_schema()
final_schema = os.getenv("SNOWFLAKE_STAR_SCHEMA")

# Open the Snowpark session used by the rest of the notebook
session = Session.builder.configs(snowflake_params).create()

def set_schema(schema):
    """Make *schema* the session's current schema inside the configured database.

    Args:
        schema: Schema name; it is double-quoted in the statement, so it is
            matched case-sensitively.

    Raises:
        ValueError: If *schema* or the configured database name is empty or
            None (for example because its environment variable is unset).
            Without this check the statement would target a schema literally
            named "None".
    """
    if not schema:
        raise ValueError("set_schema() needs a schema name; is SNOWFLAKE_STAR_SCHEMA set?")
    database = snowflake_params["database"]
    if not database:
        raise ValueError("No database configured; is SNOWFLAKE_DATABASE set?")
    session.sql(f'USE SCHEMA "{database}"."{schema}"').collect()

# Pull every source table of the cleaned schema into a pandas DataFrame.
# The variable list and the table-name list are matched by position.
print("🔄 Loading source tables...")
(
    df_customer,
    df_employee,
    df_artist,
    df_album,
    df_invoice,
    df_invoiceline,
    df_track,
    df_playlisttrack,
    df_genre,
    df_mediatype,
    df_playlist,
) = (
    session.table(source_name).to_pandas()
    for source_name in (
        "CUSTOMER",
        "EMPLOYEE",
        "ARTIST",
        "ALBUM",
        "INVOICE",
        "INVOICELINE",
        "TRACK",
        "PLAYLISTTRACK",
        "GENRE",
        "MEDIATYPE",
        "PLAYLIST",
    )
)

# Ensure that columns exist and are correctly typed
def ensure_int_key(df, col):
    """Convert column *col* of *df* to integers in place and return *df*.

    Values that cannot be parsed as numbers, and missing values, become 0.
    If the column is absent, or the conversion fails, a warning is printed
    and *df* is returned unchanged.
    """
    if col not in df.columns:
        print(f"⚠️ Column {col} is missing in the DataFrame.")
        return df

    try:
        as_numbers = pd.to_numeric(df[col], errors='coerce')
        df[col] = as_numbers.fillna(0).astype(int)
    except Exception as e:
        print(f"⚠️ Error while converting {col} to int: {e}")
    return df

# Fact table creation logic
def _upper_copy(df, int_cols=()):
    """Return a copy of *df* with upper-cased column names and integer join keys."""
    out = df.copy()
    out.columns = out.columns.str.upper()
    for col in int_cols:
        if col in out.columns:
            # Unparseable or missing keys become 0, the same convention as ensure_int_key.
            out[col] = pd.to_numeric(out[col], errors='coerce').fillna(0).astype(int)
    return out


def create_fact_sales(df_invoiceline, df_invoice, df_track, df_album, df_artist,
                      df_customer, df_employee, df_playlisttrack, dim_location):
    """Build the sales fact table with one row per invoice line.

    Invoice lines are left-joined to invoice, customer, employee (through the
    customer's SUPPORTREPID), track, album, artist and finally dim_location.
    The inputs are not modified; the function works on upper-cased copies.

    Args:
        df_invoiceline, df_invoice, df_track, df_album, df_artist,
        df_customer, df_employee: Source tables as DataFrames.
        df_playlisttrack: Accepted for interface compatibility; not used.
        dim_location: Location dimension with CITY, STATE, COUNTRY and
            POSTALCODE columns (used as given, without upper-casing).

    Returns:
        A DataFrame with every joined column, where the key and measure
        columns are renamed to INVOICELINE_ID, INVOICE_ID, TRACK_ID,
        CUSTOMER_ID, EMPLOYEE_ID, and UNIT_PRICE / EXTENDED_PRICE hold the
        invoice-line price and price * QUANTITY. A UNITPRICE column coming
        from the track table is carried through under its original name.
    """
    lines = _upper_copy(df_invoiceline, ['INVOICEID', 'TRACKID', 'CUSTOMERID', 'SUPPORTREPID'])
    invoices = _upper_copy(df_invoice, ['INVOICEID', 'CUSTOMERID'])
    customers = _upper_copy(df_customer, ['CUSTOMERID', 'SUPPORTREPID'])
    employees = _upper_copy(df_employee, ['EMPLOYEEID'])
    tracks = _upper_copy(df_track, ['TRACKID', 'ALBUMID'])
    albums = _upper_copy(df_album, ['ALBUMID', 'ARTISTID'])
    artists = _upper_copy(df_artist, ['ARTISTID'])

    # Rename customer billing address columns for the location join (no-op when absent)
    customers = customers.rename(columns={
        'BILLINGCITY': 'CITY',
        'BILLINGSTATE': 'STATE',
        'BILLINGCOUNTRY': 'COUNTRY',
        'BILLINGPOSTALCODE': 'POSTALCODE'
    })

    # The sale price lives on the invoice line. Rename it BEFORE merging so the
    # later track merge (which brings its own UNITPRICE) cannot turn it into
    # UNITPRICE_x / UNITPRICE_y and leave UNIT_PRICE undefined.
    lines = lines.rename(columns={'UNITPRICE': 'UNIT_PRICE'})
    invoices = invoices.rename(columns={'INVOICEDATE': 'INVOICE_DATE'})

    # Invoice header, then customer and (when the link column exists) employee
    fact = lines.merge(invoices, on='INVOICEID', how='left', suffixes=('', '_inv'))
    fact = fact.merge(customers, on='CUSTOMERID', how='left')
    if 'SUPPORTREPID' in customers.columns:
        fact = fact.merge(employees, left_on='SUPPORTREPID', right_on='EMPLOYEEID', how='left')

    # Track, album, artist
    fact = fact.merge(tracks, on='TRACKID', how='left')
    fact = fact.merge(albums, on='ALBUMID', how='left')
    fact = fact.merge(artists, on='ARTISTID', how='left')

    # Location: prefer unsuffixed address columns, otherwise fall back to the
    # invoice billing address columns.
    location_keys = ['CITY', 'STATE', 'COUNTRY', 'POSTALCODE']
    invoice_address_keys = ['BILLINGCITY', 'BILLINGSTATE', 'BILLINGCOUNTRY', 'BILLINGPOSTALCODE']
    if all(col in fact.columns for col in location_keys):
        fact = fact.merge(dim_location, on=location_keys, how='left')
    elif all(col in fact.columns for col in invoice_address_keys):
        fact = fact.merge(dim_location, left_on=invoice_address_keys, right_on=location_keys, how='left')

    # Line-level amount
    if 'UNIT_PRICE' not in fact.columns:
        print('⚠️ UNIT_PRICE column is missing! Check if renaming was done correctly.')
    else:
        fact['EXTENDED_PRICE'] = fact['UNIT_PRICE'] * fact['QUANTITY']

    # Output names for the key columns (columns absent from the frame are ignored)
    rename_map = {
        'INVOICELINEID': 'INVOICELINE_ID',
        'INVOICEID': 'INVOICE_ID',
        'TRACKID': 'TRACK_ID',
        'CUSTOMERID': 'CUSTOMER_ID',
        'SUPPORTREPID': 'EMPLOYEE_ID',
    }
    return fact.rename(columns=rename_map).reset_index(drop=True)

# Build the star-schema tables, save them as .csv.gz files, then switch to the final schema

# Build the dimensions first, then the fact table (it needs dim_location).
# This must happen BEFORE the to_csv() calls below, which use these frames.
dim_date = create_dim_date(df_invoice)
dim_location = create_dim_location(df_customer)
dim_album_artist = create_dim_album_artist(df_album, df_artist)
dim_track = create_dim_track(df_track, df_genre, df_mediatype)
dim_playlist_track = create_dim_playlist_track(df_playlisttrack, df_playlist)
dim_employee = create_dim_employee(df_employee)
dim_customer = create_dim_customer(df_customer)

fact_sales = create_fact_sales(df_invoiceline, df_invoice, df_track, df_album, df_artist,
                               df_customer, df_employee, df_playlisttrack, dim_location)

# Save transformed dataframes as CSV.gz files
# (the fact table goes to its own file, not to the customer dimension's file)
fact_sales.to_csv('Fact_Sales_star.csv.gz', index=False, compression='gzip')
dim_location.to_csv('Dim_Location_star.csv.gz', index=False, compression='gzip')
dim_date.to_csv('Dim_Date_star.csv.gz', index=False, compression='gzip')

# Upload CSVs to Snowflake stage FINAL_TRANSFORMED_STAGE (upload manually via Snowflake UI or use PUT command)

# Switch to final schema
set_schema(final_schema)

# Nothing is loaded into Snowflake by this cell, so report only what was done.
print("✅ Star-schema CSV files written and session switched to the final schema.")


🔄 Loading source tables...
fact columns before calculating EXTENDED_PRICE: ['INVOICELINEID', 'INVOICEID', 'TRACKID', 'UNITPRICE_x', 'QUANTITY', 'CUSTOMERID', 'INVOICE_DATE', 'BILLINGADDRESS', 'BILLINGCITY', 'BILLINGSTATE', 'BILLINGCOUNTRY', 'BILLINGPOSTALCODE', 'TOTAL', 'FIRSTNAME_x', 'LASTNAME_x', 'COMPANY', 'ADDRESS_x', 'CITY_x', 'STATE_x', 'COUNTRY_x', 'POSTALCODE_x', 'PHONE_x', 'FAX_x', 'EMAIL_x', 'SUPPORTREPID', 'EMPLOYEEID', 'LASTNAME_y', 'FIRSTNAME_y', 'TITLE_x', 'REPORTSTO', 'BIRTHDATE', 'HIREDATE', 'ADDRESS_y', 'CITY_y', 'STATE_y', 'COUNTRY_y', 'POSTALCODE_y', 'PHONE_y', 'FAX_y', 'EMAIL_y', 'NAME_x', 'ALBUMID', 'MEDIATYPEID', 'GENREID', 'COMPOSER', 'MILLISECONDS', 'BYTES', 'UNITPRICE_y', 'TITLE_y', 'ARTISTID', 'NAME_y', 'LOCATION_ID', 'CITY', 'STATE', 'COUNTRY', 'POSTALCODE']
⚠️ UNIT_PRICE column is missing! Check if renaming was done correctly.
✅ All dimensions and facts loaded into ERD_SCHEMA_STAR successfully.


In [None]:
import os
import pandas as pd
from snowflake.snowpark import Session
from dotenv import load_dotenv

# Read the .env file so the SNOWFLAKE_* variables are available via os.getenv
load_dotenv()

# Connection settings for the source (cleaned) schema; every value except the
# schema name comes from the environment and is None when the variable is unset.
snowflake_params = dict(
    account=os.getenv("SNOWFLAKE_ACCOUNT"),
    user=os.getenv("SNOWFLAKE_USER"),
    password=os.getenv("SNOWFLAKE_PASSWORD"),
    role=os.getenv("SNOWFLAKE_ROLE"),
    warehouse=os.getenv("SNOWFLAKE_WAREHOUSE"),
    database=os.getenv("SNOWFLAKE_DATABASE"),
    schema="ERD_SCHEMA_CLEANED",
)
# Name of the target star schema, switched to later via set_schema()
final_schema = os.getenv("SNOWFLAKE_STAR_SCHEMA")

# Open the Snowpark session used by the rest of the notebook
session = Session.builder.configs(snowflake_params).create()

def set_schema(schema):
    """Make *schema* the session's current schema inside the configured database.

    Args:
        schema: Schema name; it is double-quoted in the statement, so it is
            matched case-sensitively.

    Raises:
        ValueError: If *schema* or the configured database name is empty or
            None (for example because its environment variable is unset).
            Without this check the statement would target a schema literally
            named "None".
    """
    if not schema:
        raise ValueError("set_schema() needs a schema name; is SNOWFLAKE_STAR_SCHEMA set?")
    database = snowflake_params["database"]
    if not database:
        raise ValueError("No database configured; is SNOWFLAKE_DATABASE set?")
    session.sql(f'USE SCHEMA "{database}"."{schema}"').collect()

# Pull every source table of the cleaned schema into a pandas DataFrame.
# The variable list and the table-name list are matched by position.
print("🔄 Loading source tables...")
(
    df_customer,
    df_employee,
    df_artist,
    df_album,
    df_invoice,
    df_invoiceline,
    df_track,
    df_playlisttrack,
    df_genre,
    df_mediatype,
    df_playlist,
) = (
    session.table(source_name).to_pandas()
    for source_name in (
        "CUSTOMER",
        "EMPLOYEE",
        "ARTIST",
        "ALBUM",
        "INVOICE",
        "INVOICELINE",
        "TRACK",
        "PLAYLISTTRACK",
        "GENRE",
        "MEDIATYPE",
        "PLAYLIST",
    )
)

# Ensure that columns exist and are correctly typed
def ensure_int_key(df, col):
    """Convert column *col* of *df* to integers in place and return *df*.

    Values that cannot be parsed as numbers, and missing values, become 0.
    If the column is absent, or the conversion fails, a warning is printed
    and *df* is returned unchanged.
    """
    if col not in df.columns:
        print(f"⚠️ Column {col} is missing in the DataFrame.")
        return df

    try:
        as_numbers = pd.to_numeric(df[col], errors='coerce')
        df[col] = as_numbers.fillna(0).astype(int)
    except Exception as e:
        print(f"⚠️ Error while converting {col} to int: {e}")
    return df

# Fact table creation logic
def _upper_copy(df, int_cols=()):
    """Return a copy of *df* with upper-cased column names and integer join keys."""
    out = df.copy()
    out.columns = out.columns.str.upper()
    for col in int_cols:
        if col in out.columns:
            # Unparseable or missing keys become 0, the same convention as ensure_int_key.
            out[col] = pd.to_numeric(out[col], errors='coerce').fillna(0).astype(int)
    return out


def create_fact_sales(df_invoiceline, df_invoice, df_track, df_album, df_artist,
                      df_customer, df_employee, df_playlisttrack, dim_location):
    """Build the sales fact table with one row per invoice line.

    Invoice lines are left-joined to invoice, customer, employee (through the
    customer's SUPPORTREPID), track, album, artist and finally dim_location.
    The inputs are not modified; the function works on upper-cased copies.

    Args:
        df_invoiceline, df_invoice, df_track, df_album, df_artist,
        df_customer, df_employee: Source tables as DataFrames.
        df_playlisttrack: Accepted for interface compatibility; not used.
        dim_location: Location dimension with CITY, STATE, COUNTRY and
            POSTALCODE columns (used as given, without upper-casing).

    Returns:
        A DataFrame with exactly the 16 fact columns listed in
        ``columns_to_keep``, in that order. A column that the joined data
        cannot provide is still present, filled with nulls, and a warning
        naming it is printed.
    """
    lines = _upper_copy(df_invoiceline, ['INVOICEID', 'TRACKID', 'CUSTOMERID', 'SUPPORTREPID'])
    invoices = _upper_copy(df_invoice, ['INVOICEID', 'CUSTOMERID'])
    customers = _upper_copy(df_customer, ['CUSTOMERID', 'SUPPORTREPID'])
    employees = _upper_copy(df_employee, ['EMPLOYEEID'])
    tracks = _upper_copy(df_track, ['TRACKID', 'ALBUMID'])
    albums = _upper_copy(df_album, ['ALBUMID', 'ARTISTID'])
    artists = _upper_copy(df_artist, ['ARTISTID'])

    # Rename customer billing address columns for the location join (no-op when absent)
    customers = customers.rename(columns={
        'BILLINGCITY': 'CITY',
        'BILLINGSTATE': 'STATE',
        'BILLINGCOUNTRY': 'COUNTRY',
        'BILLINGPOSTALCODE': 'POSTALCODE'
    })

    # The sale price lives on the invoice line. Rename it BEFORE merging so the
    # later track merge (which brings its own UNITPRICE) cannot turn it into
    # UNITPRICE_x / UNITPRICE_y and leave UNIT_PRICE undefined.
    lines = lines.rename(columns={'UNITPRICE': 'UNIT_PRICE'})
    invoices = invoices.rename(columns={'INVOICEDATE': 'INVOICE_DATE'})

    # Invoice header, then customer and (when the link column exists) employee
    fact = lines.merge(invoices, on='INVOICEID', how='left', suffixes=('', '_inv'))
    fact = fact.merge(customers, on='CUSTOMERID', how='left')
    if 'SUPPORTREPID' in customers.columns:
        fact = fact.merge(employees, left_on='SUPPORTREPID', right_on='EMPLOYEEID', how='left')

    # Track, album, artist
    fact = fact.merge(tracks, on='TRACKID', how='left')
    fact = fact.merge(albums, on='ALBUMID', how='left')
    fact = fact.merge(artists, on='ARTISTID', how='left')

    # Location: prefer unsuffixed address columns, otherwise fall back to the
    # invoice billing address columns.
    location_keys = ['CITY', 'STATE', 'COUNTRY', 'POSTALCODE']
    invoice_address_keys = ['BILLINGCITY', 'BILLINGSTATE', 'BILLINGCOUNTRY', 'BILLINGPOSTALCODE']
    if all(col in fact.columns for col in location_keys):
        fact = fact.merge(dim_location, on=location_keys, how='left')
    elif all(col in fact.columns for col in invoice_address_keys):
        fact = fact.merge(dim_location, left_on=invoice_address_keys, right_on=location_keys, how='left')

    # Output names. Every name in columns_to_keep must be produced here,
    # otherwise the selection below cannot find it.
    rename_map = {
        'INVOICELINEID': 'INVOICELINE_ID',
        'INVOICEID': 'INVOICE_ID',
        'TRACKID': 'TRACK_ID',
        'CUSTOMERID': 'CUSTOMER_ID',
        'SUPPORTREPID': 'EMPLOYEE_ID',
        'MEDIATYPEID': 'MEDIATYPE_ID',
        'ALBUMID': 'ALBUM_ID',
        'ARTISTID': 'ARTIST_ID',
        # NOTE(review): TOTAL_AMOUNT is taken from the invoice header's TOTAL,
        # which repeats the invoice total on each of its lines. If the
        # line-level amount (UNIT_PRICE * QUANTITY) was intended, change this.
        'TOTAL': 'TOTAL_AMOUNT',
    }
    fact_sales = fact.rename(columns=rename_map)

    # Keep only relevant columns in fact_sales
    columns_to_keep = [
        'INVOICELINE_ID', 'LOCATION_ID', 'INVOICE_ID', 'TRACK_ID', 'CUSTOMER_ID',
        'EMPLOYEE_ID', 'MEDIATYPE_ID', 'INVOICE_DATE', 'UNIT_PRICE', 'QUANTITY',
        'TOTAL_AMOUNT', 'PLAYLIST_ID', 'ALBUM_ID', 'ARTIST_ID', 'MILLISECONDS', 'BYTES'
    ]
    # PLAYLIST_ID is never produced: none of the joins above involves the
    # playlist table. TODO confirm how (or whether) it should be populated.
    unavailable = [col for col in columns_to_keep if col not in fact_sales.columns]
    if unavailable:
        print(f"⚠️ Columns not available in the joined data, filled with nulls: {unavailable}")
    # reindex keeps the declared column order and adds missing columns as nulls
    # instead of raising KeyError.
    fact_sales = fact_sales.reindex(columns=columns_to_keep)

    return fact_sales.reset_index(drop=True)

# Build the star-schema tables, save them as .csv.gz files, then switch to the final schema

# Build the dimensions first, then the fact table (it needs dim_location).
# This must happen BEFORE the to_csv() calls below, which use these frames.
dim_date = create_dim_date(df_invoice)
dim_location = create_dim_location(df_customer)
dim_album_artist = create_dim_album_artist(df_album, df_artist)
dim_track = create_dim_track(df_track, df_genre, df_mediatype)
dim_playlist_track = create_dim_playlist_track(df_playlisttrack, df_playlist)
dim_employee = create_dim_employee(df_employee)
dim_customer = create_dim_customer(df_customer)

fact_sales = create_fact_sales(df_invoiceline, df_invoice, df_track, df_album, df_artist,
                               df_customer, df_employee, df_playlisttrack, dim_location)

# Save transformed dataframes as CSV.gz files
fact_sales.to_csv('Fact_Sales_star.csv.gz', index=False, compression='gzip')
dim_location.to_csv('Dim_Location_star.csv.gz', index=False, compression='gzip')
dim_date.to_csv('Dim_Date_star.csv.gz', index=False, compression='gzip')
dim_album_artist.to_csv('Dim_Album_Artist_star.csv.gz', index=False, compression='gzip')
dim_track.to_csv('Dim_Track_star.csv.gz', index=False, compression='gzip')
dim_playlist_track.to_csv('Dim_Playlist_Track_star.csv.gz', index=False, compression='gzip')
dim_employee.to_csv('Dim_Employee_star.csv.gz', index=False, compression='gzip')
dim_customer.to_csv('Dim_Customer_star.csv.gz', index=False, compression='gzip')

# Upload CSVs to Snowflake stage FINAL_TRANSFORMED_STAGE (upload manually via Snowflake UI or use PUT command)

# Switch to final schema
set_schema(final_schema)

def load_df_to_table(df, table_name):
    # Print a short summary of the frame: target name, row count, first rows.
    row_count = len(df)
    preview = df.head()
    print(f"\nLoading {table_name}...")
    print(f"{table_name} rows: {row_count}")
    print(preview)
    # Data loading

It seems like you've shared a code snippet, and I understand that you're facing some issues with the final steps: saving the CSVs, uploading them to Snowflake, and loading the data into the final Snowflake tables. I'll summarize and update the code to make sure it works as intended.

Here’s an **updated, cleaned version** of your code with corrections:

### Complete Code:

```python
import os
import pandas as pd
from snowflake.snowpark import Session
from dotenv import load_dotenv

# Read the .env file so the SNOWFLAKE_* variables are available via os.getenv
load_dotenv()

# Connection settings for the source (cleaned) schema; every value except the
# schema name comes from the environment and is None when the variable is unset.
snowflake_params = dict(
    account=os.getenv("SNOWFLAKE_ACCOUNT"),
    user=os.getenv("SNOWFLAKE_USER"),
    password=os.getenv("SNOWFLAKE_PASSWORD"),
    role=os.getenv("SNOWFLAKE_ROLE"),
    warehouse=os.getenv("SNOWFLAKE_WAREHOUSE"),
    database=os.getenv("SNOWFLAKE_DATABASE"),
    schema="ERD_SCHEMA_CLEANED",
)
# Name of the target star schema, switched to later via set_schema()
final_schema = os.getenv("SNOWFLAKE_STAR_SCHEMA")

# Open the Snowpark session used by the rest of the notebook
session = Session.builder.configs(snowflake_params).create()

def set_schema(schema):
    """Make *schema* the session's current schema inside the configured database.

    Args:
        schema: Schema name; it is double-quoted in the statement, so it is
            matched case-sensitively.

    Raises:
        ValueError: If *schema* or the configured database name is empty or
            None (for example because its environment variable is unset).
            Without this check the statement would target a schema literally
            named "None".
    """
    if not schema:
        raise ValueError("set_schema() needs a schema name; is SNOWFLAKE_STAR_SCHEMA set?")
    database = snowflake_params["database"]
    if not database:
        raise ValueError("No database configured; is SNOWFLAKE_DATABASE set?")
    session.sql(f'USE SCHEMA "{database}"."{schema}"').collect()

# Pull every source table of the cleaned schema into a pandas DataFrame.
# The variable list and the table-name list are matched by position.
print("🔄 Loading source tables...")
(
    df_customer,
    df_employee,
    df_artist,
    df_album,
    df_invoice,
    df_invoiceline,
    df_track,
    df_playlisttrack,
    df_genre,
    df_mediatype,
    df_playlist,
) = (
    session.table(source_name).to_pandas()
    for source_name in (
        "CUSTOMER",
        "EMPLOYEE",
        "ARTIST",
        "ALBUM",
        "INVOICE",
        "INVOICELINE",
        "TRACK",
        "PLAYLISTTRACK",
        "GENRE",
        "MEDIATYPE",
        "PLAYLIST",
    )
)

# Ensure that columns exist and are correctly typed
def ensure_int_key(df, col):
    """Convert column *col* of *df* to integers in place and return *df*.

    Values that cannot be parsed as numbers, and missing values, become 0.
    If the column is absent, or the conversion fails, a warning is printed
    and *df* is returned unchanged.
    """
    if col not in df.columns:
        print(f"⚠️ Column {col} is missing in the DataFrame.")
        return df

    try:
        as_numbers = pd.to_numeric(df[col], errors='coerce')
        df[col] = as_numbers.fillna(0).astype(int)
    except Exception as e:
        print(f"⚠️ Error while converting {col} to int: {e}")
    return df

# Fact table creation logic
def _upper_copy(df, int_cols=()):
    """Return a copy of *df* with upper-cased column names and integer join keys."""
    out = df.copy()
    out.columns = out.columns.str.upper()
    for col in int_cols:
        if col in out.columns:
            # Unparseable or missing keys become 0, the same convention as ensure_int_key.
            out[col] = pd.to_numeric(out[col], errors='coerce').fillna(0).astype(int)
    return out


def create_fact_sales(df_invoiceline, df_invoice, df_track, df_album, df_artist,
                      df_customer, df_employee, df_playlisttrack, dim_location):
    """Build the sales fact table with one row per invoice line.

    Invoice lines are left-joined to invoice, customer, employee (through the
    customer's SUPPORTREPID), track, album, artist and finally dim_location.
    The inputs are not modified; the function works on upper-cased copies.

    Args:
        df_invoiceline, df_invoice, df_track, df_album, df_artist,
        df_customer, df_employee: Source tables as DataFrames.
        df_playlisttrack: Accepted for interface compatibility; not used.
        dim_location: Location dimension with CITY, STATE, COUNTRY and
            POSTALCODE columns (used as given, without upper-casing).

    Returns:
        A DataFrame with exactly the 16 fact columns listed in
        ``columns_to_keep``, in that order. A column that the joined data
        cannot provide is still present, filled with nulls, and a warning
        naming it is printed.
    """
    lines = _upper_copy(df_invoiceline, ['INVOICEID', 'TRACKID', 'CUSTOMERID', 'SUPPORTREPID'])
    invoices = _upper_copy(df_invoice, ['INVOICEID', 'CUSTOMERID'])
    customers = _upper_copy(df_customer, ['CUSTOMERID', 'SUPPORTREPID'])
    employees = _upper_copy(df_employee, ['EMPLOYEEID'])
    tracks = _upper_copy(df_track, ['TRACKID', 'ALBUMID'])
    albums = _upper_copy(df_album, ['ALBUMID', 'ARTISTID'])
    artists = _upper_copy(df_artist, ['ARTISTID'])

    # Rename customer billing address columns for the location join (no-op when absent)
    customers = customers.rename(columns={
        'BILLINGCITY': 'CITY',
        'BILLINGSTATE': 'STATE',
        'BILLINGCOUNTRY': 'COUNTRY',
        'BILLINGPOSTALCODE': 'POSTALCODE'
    })

    # The sale price lives on the invoice line. Rename it BEFORE merging so the
    # later track merge (which brings its own UNITPRICE) cannot turn it into
    # UNITPRICE_x / UNITPRICE_y and leave UNIT_PRICE undefined.
    lines = lines.rename(columns={'UNITPRICE': 'UNIT_PRICE'})
    invoices = invoices.rename(columns={'INVOICEDATE': 'INVOICE_DATE'})

    # Invoice header, then customer and (when the link column exists) employee
    fact = lines.merge(invoices, on='INVOICEID', how='left', suffixes=('', '_inv'))
    fact = fact.merge(customers, on='CUSTOMERID', how='left')
    if 'SUPPORTREPID' in customers.columns:
        fact = fact.merge(employees, left_on='SUPPORTREPID', right_on='EMPLOYEEID', how='left')

    # Track, album, artist
    fact = fact.merge(tracks, on='TRACKID', how='left')
    fact = fact.merge(albums, on='ALBUMID', how='left')
    fact = fact.merge(artists, on='ARTISTID', how='left')

    # Location: prefer unsuffixed address columns, otherwise fall back to the
    # invoice billing address columns.
    location_keys = ['CITY', 'STATE', 'COUNTRY', 'POSTALCODE']
    invoice_address_keys = ['BILLINGCITY', 'BILLINGSTATE', 'BILLINGCOUNTRY', 'BILLINGPOSTALCODE']
    if all(col in fact.columns for col in location_keys):
        fact = fact.merge(dim_location, on=location_keys, how='left')
    elif all(col in fact.columns for col in invoice_address_keys):
        fact = fact.merge(dim_location, left_on=invoice_address_keys, right_on=location_keys, how='left')

    # Output names. Every name in columns_to_keep must be produced here,
    # otherwise the selection below cannot find it.
    rename_map = {
        'INVOICELINEID': 'INVOICELINE_ID',
        'INVOICEID': 'INVOICE_ID',
        'TRACKID': 'TRACK_ID',
        'CUSTOMERID': 'CUSTOMER_ID',
        'SUPPORTREPID': 'EMPLOYEE_ID',
        'MEDIATYPEID': 'MEDIATYPE_ID',
        'ALBUMID': 'ALBUM_ID',
        'ARTISTID': 'ARTIST_ID',
        # NOTE(review): TOTAL_AMOUNT is taken from the invoice header's TOTAL,
        # which repeats the invoice total on each of its lines. If the
        # line-level amount (UNIT_PRICE * QUANTITY) was intended, change this.
        'TOTAL': 'TOTAL_AMOUNT',
    }
    fact_sales = fact.rename(columns=rename_map)

    # Keep only relevant columns in fact_sales
    columns_to_keep = [
        'INVOICELINE_ID', 'LOCATION_ID', 'INVOICE_ID', 'TRACK_ID', 'CUSTOMER_ID',
        'EMPLOYEE_ID', 'MEDIATYPE_ID', 'INVOICE_DATE', 'UNIT_PRICE', 'QUANTITY',
        'TOTAL_AMOUNT', 'PLAYLIST_ID', 'ALBUM_ID', 'ARTIST_ID', 'MILLISECONDS', 'BYTES'
    ]
    # PLAYLIST_ID is never produced: none of the joins above involves the
    # playlist table. TODO confirm how (or whether) it should be populated.
    unavailable = [col for col in columns_to_keep if col not in fact_sales.columns]
    if unavailable:
        print(f"⚠️ Columns not available in the joined data, filled with nulls: {unavailable}")
    # reindex keeps the declared column order and adds missing columns as nulls
    # instead of raising KeyError.
    fact_sales = fact_sales.reindex(columns=columns_to_keep)

    return fact_sales.reset_index(drop=True)

# Build the star-schema tables, save them as .csv.gz files, then switch to the final schema

# Build the dimensions first, then the fact table (it needs dim_location).
# This must happen BEFORE the to_csv() calls below, which use these frames.
dim_date = create_dim_date(df_invoice)
dim_location = create_dim_location(df_customer)
dim_album_artist = create_dim_album_artist(df_album, df_artist)
dim_track = create_dim_track(df_track, df_genre, df_mediatype)
dim_playlist_track = create_dim_playlist_track(df_playlisttrack, df_playlist)
dim_employee = create_dim_employee(df_employee)
dim_customer = create_dim_customer(df_customer)

fact_sales = create_fact_sales(df_invoiceline, df_invoice, df_track, df_album, df_artist,
                               df_customer, df_employee, df_playlisttrack, dim_location)

# Save transformed dataframes as CSV.gz files
fact_sales.to_csv('Fact_Sales_star.csv.gz', index=False, compression='gzip')
dim_location.to_csv('Dim_Location_star.csv.gz', index=False, compression='gzip')
dim_date.to_csv('Dim_Date_star.csv.gz', index=False, compression='gzip')
dim_album_artist.to_csv('Dim_Album_Artist_star.csv.gz', index=False, compression='gzip')
dim_track.to_csv('Dim_Track_star.csv.gz', index=False, compression='gzip')
dim_playlist_track.to_csv('Dim_Playlist_Track_star.csv.gz', index=False, compression='gzip')
dim_employee.to_csv('Dim_Employee_star.csv.gz', index=False, compression='gzip')
dim_customer.to_csv('Dim_Customer_star.csv.gz', index=False, compression='gzip')

# Upload CSVs to Snowflake stage FINAL_TRANSFORMED_STAGE (upload manually via Snowflake UI or use PUT command)

# Switch to final schema
set_schema(final_schema)

def load_df_to_table(df, table_name):
    """Load the staged CSV for *table_name* into its ERD_SCHEMA_STAR table.

    Prints a short summary of *df* and then runs a single COPY INTO for the
    requested table only. The .csv.gz file is expected to be in the
    FINAL_TRANSFORMED_STAGE stage already; this function does not upload it.

    Args:
        df: The DataFrame that was written to the staged file; used only for
            the printed row count and preview.
        table_name: Target table, e.g. "DIM_DATE" or "FACTSALES"
            (case-insensitive).

    Raises:
        ValueError: If no staged file is known for *table_name*.
    """
    # Staged file per target table; the names match the to_csv() calls that
    # write the .csv.gz files.
    staged_files = {
        "DIM_DATE": "Dim_Date_star.csv.gz",
        "DIM_LOCATION": "Dim_Location_star.csv.gz",
        "DIM_ALBUM_ARTIST": "Dim_Album_Artist_star.csv.gz",
        "DIM_TRACK": "Dim_Track_star.csv.gz",
        "DIM_PLAYLIST_TRACK": "Dim_Playlist_Track_star.csv.gz",
        "DIM_EMPLOYEE": "Dim_Employee_star.csv.gz",
        "DIM_CUSTOMER": "Dim_Customer_star.csv.gz",
        "FACTSALES": "Fact_Sales_star.csv.gz",
    }
    target = str(table_name).upper()
    if target not in staged_files:
        raise ValueError(f"No staged file is known for table {table_name!r}")

    print(f"\nLoading {table_name}...")
    print(f"{table_name} rows: {len(df)}")
    print(df.head())

    # SKIP_HEADER: to_csv() writes a header row that must not be loaded as data.
    # FIELD_OPTIONALLY_ENCLOSED_BY: to_csv() double-quotes fields containing commas.
    session.sql(f"""
        COPY INTO ERD_SCHEMA_STAR.{target}
        FROM @FINAL_TRANSFORMED_STAGE/{staged_files[target]}
        FILE_FORMAT = (
            TYPE = 'CSV'
            COMPRESSION = 'GZIP'
            SKIP_HEADER = 1
            FIELD_OPTIONALLY_ENCLOSED_BY = '"'
        )
    """).collect()

# Run the load for every star-schema table: dimensions first, fact table last
for frame, target_table in (
    (dim_date, "DIM_DATE"),
    (dim_location, "DIM_LOCATION"),
    (dim_album_artist, "DIM_ALBUM_ARTIST"),
    (dim_track, "DIM_TRACK"),
    (dim_playlist_track, "DIM_PLAYLIST_TRACK"),
    (dim_employee, "DIM_EMPLOYEE"),
    (dim_customer, "DIM_CUSTOMER"),
    (fact_sales, "FACTSALES"),
):
    load_df_to_table(frame, target_table)

print("✅ All dimensions and facts loaded into ERD_SCHEMA_STAR successfully.")


In [None]:
# Switch to the final schema BEFORE loading: the stage name used by the COPY
# statements is unqualified, so it is resolved against the current schema.
set_schema(final_schema)

# Run the load once per table (dimensions first, fact table last). The whole
# pass used to be issued twice, the first time before the schema switch.
for frame, target_table in (
    (dim_date, "DIM_DATE"),
    (dim_location, "DIM_LOCATION"),
    (dim_album_artist, "DIM_ALBUM_ARTIST"),
    (dim_track, "DIM_TRACK"),
    (dim_playlist_track, "DIM_PLAYLIST_TRACK"),
    (dim_employee, "DIM_EMPLOYEE"),
    (dim_customer, "DIM_CUSTOMER"),
    (fact_sales, "FACTSALES"),
):
    load_df_to_table(frame, target_table)


def load_df_to_table(df, table_name):
    """Load the staged CSV for *table_name* into its ERD_SCHEMA_STAR table.

    Prints a short summary of *df* and then runs a single COPY INTO for the
    requested table only. The .csv.gz file is expected to be in the
    FINAL_TRANSFORMED_STAGE stage already; this function does not upload it.

    Args:
        df: The DataFrame that was written to the staged file; used only for
            the printed row count and preview.
        table_name: Target table, e.g. "DIM_DATE" or "FACTSALES"
            (case-insensitive).

    Raises:
        ValueError: If no staged file is known for *table_name*.
    """
    # Staged file per target table; the names match the to_csv() calls that
    # write the .csv.gz files.
    staged_files = {
        "DIM_DATE": "Dim_Date_star.csv.gz",
        "DIM_LOCATION": "Dim_Location_star.csv.gz",
        "DIM_ALBUM_ARTIST": "Dim_Album_Artist_star.csv.gz",
        "DIM_TRACK": "Dim_Track_star.csv.gz",
        "DIM_PLAYLIST_TRACK": "Dim_Playlist_Track_star.csv.gz",
        "DIM_EMPLOYEE": "Dim_Employee_star.csv.gz",
        "DIM_CUSTOMER": "Dim_Customer_star.csv.gz",
        "FACTSALES": "Fact_Sales_star.csv.gz",
    }
    target = str(table_name).upper()
    if target not in staged_files:
        raise ValueError(f"No staged file is known for table {table_name!r}")

    print(f"\nLoading {table_name}...")
    print(f"{table_name} rows: {len(df)}")
    print(df.head())

    # SKIP_HEADER: to_csv() writes a header row that must not be loaded as data.
    # FIELD_OPTIONALLY_ENCLOSED_BY: to_csv() double-quotes fields containing commas.
    session.sql(f"""
    COPY INTO ERD_SCHEMA_STAR.{target}
    FROM @FINAL_TRANSFORMED_STAGE/{staged_files[target]}
    FILE_FORMAT = (
        TYPE = 'CSV'
        COMPRESSION = 'GZIP'
        SKIP_HEADER = 1
        FIELD_OPTIONALLY_ENCLOSED_BY = '"'
    )
    """).collect()

