# United outdoors datawarehouse

## Imports

In [None]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, BigInteger, Integer, String, LargeBinary, VARCHAR, NVARCHAR, DECIMAL, CHAR, \
    DATE
from sqlalchemy.dialects.mssql import BIT, XML, MONEY, TIME
from sqlalchemy.exc import OperationalError
from urllib import parse
import re
import time

## Starting timer

In [None]:
start_time = time.time()

## Database connection details

In [None]:
DB = {
    'servername' : '(local)\\SQLEXPRESS',
    'united_outdoors_database' : 'UnitedOutdoors',
    'northwind_database' : 'Northwind',
    'aenc_database' : 'Aenc',
    'adventureworks_database' : 'AdventureWorks2019',
    'master' : 'master'
}

In [None]:
def create_connection(servername, database):
    params = parse.quote_plus(f'DRIVER={{SQL Server}};SERVER={servername};DATABASE={database};Trusted_Connection=yes')
    engine = create_engine(f'mssql+pyodbc:///?odbc_connect={params}', use_setinputsizes=False, connect_args={'options': '-c search_path=dbo'}, fast_executemany=True) # setinputsizes needs to be turned off for sql server, idk why but gives errors otherwise
    try:
        establish_conn = engine.connect()
        print(f'Connection to {database} database successful')
        return establish_conn, engine
    except OperationalError as e:
        print(f'Error: {e}')
        return None, None

In [None]:
def split_and_execute_sql_script(script, execute_engine):
    # creating a connection
    connection = execute_engine.connect()
    
    # splitting the script into the database creation and the rest
    commands = re.split(r'GO\n', script)
    # removing all \bGO\b from the commands
    commands = [re.sub(r'\bGO\b', '', command) for command in commands]
    
    # Execute the commands
    for command in commands:
        command = command.strip()
        # Skip if the command is empty or 'GO'
        if not command or command.upper() == 'GO':
            continue
        try:
            connection.connection.execute(command)
            connection.connection.commit()
            #print(f'Command executed: {command}')
        except OperationalError as e:
            print(f'Error: {e} at command: {command}')

In [None]:
def bulk_insert(df, dtypes, table_name, engine):
    # calculating chunk size
    chunk_size = (2000 // len(df.columns))  # 2100 is the maximum number of parameters in a query, -100 for safety

    print(f'Inserting data into table: {table_name} with chunk size: {chunk_size}')
    df.to_sql(name=table_name, schema='dbo', con=engine, if_exists='append', index=False, dtype=dtypes, method='multi', chunksize= chunk_size)

In [None]:
def prepare(dataframe, nk_sk_dict=None):
    # replacing the natural keys with the surrogate keys
    if nk_sk_dict:
        for column in nk_sk_dict:
            # Check for duplicate keys
            if len(nk_sk_dict[column]) != len(set(nk_sk_dict[column])):
                raise ValueError(f'Duplicate keys found in nk_sk_dict for column: {column}')
            else:
                print(f'Replacing natural keys with surrogate keys for column: {column}')
                for natural_key in nk_sk_dict[column]:
                    dataframe[column] = dataframe[column].replace(natural_key, nk_sk_dict[column][natural_key])
    
    # replace empty values with None
    dataframe = dataframe.where(pd.notnull(dataframe), None)
    dataframe = dataframe.replace({np.nan: None})
    
    # stripping all columns with string data
    dataframe = dataframe.map(lambda x: x.strip() if isinstance(x, str) else x)
    # replacing all empty strings with None
    dataframe = dataframe.replace(r'^\s*$', None, regex=True)

In [None]:
def prepare_and_insert(dataframe, dtypes, table_name, insert_engine, nk_sk_dict=None):
    """
    Prepares the dataframe for insertion into the database and inserts it into the database.
        @param dataframe: The dataframe to be inserted into the database
        @param dtypes: The data types of the columns in the dataframe
        @param table_name: The name of the table in the database
        @param insert_engine: The engine to insert the data into
        @param nk_sk_dict: A 3d dictionary containing the natural keys and their corresponding surrogate keys, per column (so nk_sk_dict[column][natural_key] = surrogate_key)
    """
       
    prepare(dataframe, nk_sk_dict)
    
    # adding the data to the database
    bulk_insert(dataframe, dtypes, table_name, insert_engine)

In [None]:
def prepare_and_insert_return_sk(dataframe, dtypes, table_name, insert_engine, natural_key_column, nk_sk_dict=None):
    """
    Prepares the dataframe for insertion into the database and inserts it into the database.
        @param dataframe: The dataframe to be inserted into the database
        @param dtypes: The data types of the columns in the dataframe
        @param table_name: The name of the table in the database
        @param insert_engine: The engine to insert the data into
        @param natural_key_column: The name of the column containing the natural keys
        @param nk_sk_dict: A 3d dictionary containing the natural keys and their corresponding surrogate keys, per column (so nk_sk_dict[column][natural_key] = surrogate_key)
        @return: A dictionary containing the natural keys and their corresponding surrogate keys
    """
    
    prepare(dataframe, nk_sk_dict)
    
    # adding the data to the database
    bulk_insert(dataframe, dtypes, table_name, insert_engine)
    
    # making a cleaned dictionary without None/nan values in the natural_key_column
    mask = dataframe[natural_key_column].notnull()
    filtered_dataframe = dataframe[mask]
    
    # getting the natural keys and their corresponding surrogate keys
    # TODO FIX THIS, relying on the dataframe index is not a good idea
    nk_sk_dict = dict(zip(filtered_dataframe[natural_key_column], dataframe.index))
    
    # adding +1 to the index to get the surrogate key
    nk_sk_dict = {k: v + 1 for k, v in nk_sk_dict.items()}
    
    return nk_sk_dict

In [None]:
def prepare_and_update(table_name, update_engine, nk_sk_dict=None):   
    # creating a connection
    connection = update_engine.connect()

    # Start a transaction
    trans = connection.begin()

    try:
        # adding the data to the database
        print(f'Updating data in table: {table_name}')
        for column in nk_sk_dict:
            # Prepare a batch update statement
            update_stmts = []
            for natural_key in nk_sk_dict[column]:
                # TODO this would not work with updated data in datawarehouse, since this would replace the old and new data. Maybe check the datetime or something
                statement = f'UPDATE {table_name} SET {column} = {nk_sk_dict[column][natural_key]} WHERE {column} = {int(natural_key)}'
                update_stmts.append(statement)

                # If the number of parameters reaches the limit, execute the batch update and clear the list
                if len(update_stmts) * 2 >= 2000:  # Each update statement has 2 parameters, cause of the Column and Where clause
                    connection.connection.execute(";".join(update_stmts))
                    update_stmts = []

            # Execute the remaining update statements
            if update_stmts:
                connection.connection.execute(";".join(update_stmts))

        # Commit the transaction
        trans.commit()
    except:
        # Rollback the transaction in case of error
        trans.rollback()
        raise
    finally:
        # Close the connection
        connection.close()

In [None]:
def drop_modified_date_rowguid(dataframe):
    # dropping all columns with 'rowguid' in their name
    columns_to_drop_mr = dataframe.filter(like='rowguid').columns
    
    # dropping all columns with 'ModifiedDate' in their name
    columns_to_drop_mr = columns_to_drop_mr.append(dataframe.filter(like='ModifiedDate').columns)
    
    # dropping the columns
    dataframe.drop(columns=columns_to_drop_mr, inplace=True)

## Create the UnitedOutdoors datawarehouse

In [None]:
_, creation_engine = create_connection(DB["servername"], DB["master"])

# Open the SQL script file and read its contents
with open('sql/UnitedOutdoors_creation.sql', 'r') as file:
    sql_script = file.read()

split_and_execute_sql_script(sql_script, creation_engine)

creation_engine.dispose()

## Connecting to the UnitedOutdoors datawarehouse

In [None]:
_ , united_outdoors_engine = create_connection(DB["servername"], DB["united_outdoors_database"])

## Loading the data from the source databases

### Northwind database

#### Connection

In [None]:
northwind_conn, northwind_engine = create_connection(DB["servername"], DB["northwind_database"])

#### Loading data

In [None]:
# Load the data from the source database
northwind_categories = pd.read_sql('SELECT * FROM Categories', northwind_conn)
northwind_customer_customer_demo = pd.read_sql('SELECT * FROM CustomerCustomerDemo', northwind_conn)
northwind_customer_demographics = pd.read_sql('SELECT * FROM CustomerDemographics', northwind_conn)
northwind_customers = pd.read_sql('SELECT * FROM Customers', northwind_conn)
northwind_employees = pd.read_sql('SELECT *, CONVERT(VARCHAR(MAX), CONVERT(VARBINARY(MAX), Photo), 1) as EMPLOYEE_EMPLOYEES_PhotoHexString FROM Employees', northwind_conn) # image is deprecated, but still in northwind. Direct conversion from image to varchar is not possible, so we need to convert it to varbinary first
northwind_employee_territories = pd.read_sql('SELECT * FROM EmployeeTerritories', northwind_conn)
northwind_order_details = pd.read_sql('SELECT * FROM [Order Details]', northwind_conn)
northwind_orders = pd.read_sql('SELECT * FROM Orders', northwind_conn)
northwind_products = pd.read_sql('SELECT * FROM Products', northwind_conn)
northwind_region = pd.read_sql('SELECT * FROM Region', northwind_conn)
northwind_shippers = pd.read_sql('SELECT * FROM Shippers', northwind_conn)
northwind_suppliers = pd.read_sql('SELECT * FROM Suppliers', northwind_conn)
northwind_territories = pd.read_sql('SELECT * FROM Territories', northwind_conn)

northwind_conn.close()

### Aenc database

#### Connection

In [None]:
aenc_conn , aenc_engine = create_connection(DB["servername"], DB["aenc_database"])

#### Loading data

In [None]:
aenc_bonus              = pd.read_sql('SELECT * FROM Bonus', aenc_conn)
aenc_customer           = pd.read_sql('SELECT * FROM Customer', aenc_conn)
aenc_department         = pd.read_sql('SELECT * FROM Department', aenc_conn)
aenc_employee           = pd.read_sql('SELECT * FROM Employee', aenc_conn)
aenc_product            = pd.read_sql('SELECT * FROM Product', aenc_conn)
aenc_region             = pd.read_sql('SELECT * FROM Region', aenc_conn)
aenc_sales_order        = pd.read_sql('SELECT * FROM SalesOrder', aenc_conn)
aenc_sales_order_item   = pd.read_sql('SELECT * FROM SalesOrderItem', aenc_conn)
aenc_state              = pd.read_sql('SELECT * FROM State', aenc_conn)

aenc_conn.close()

### AdventureWorks database

#### Connection

In [None]:
adventureworks_conn, adventureworks_engine = create_connection(DB["servername"], DB["adventureworks_database"])

#### Loading data

In [None]:
adventureworks_humanresources_department = pd.read_sql('SELECT * FROM HumanResources.Department', adventureworks_conn)
adventureworks_humanresources_employee = pd.read_sql('SELECT * FROM HumanResources.Employee', adventureworks_conn)
adventureworks_humanresources_employeedepartmenthistory = pd.read_sql('SELECT * FROM HumanResources.EmployeeDepartmentHistory', adventureworks_conn)
adventureworks_humanresources_employeepayhistory = pd.read_sql('SELECT * FROM HumanResources.EmployeePayHistory', adventureworks_conn)
adventureworks_humanresources_jobcandidate = pd.read_sql('SELECT * FROM HumanResources.JobCandidate', adventureworks_conn)
adventureworks_humanresources_shift = pd.read_sql('SELECT * FROM HumanResources.Shift', adventureworks_conn)

In [None]:
adventureworks_person_address = pd.read_sql('SELECT AddressID, AddressLine1, AddressLine2, City, StateProvinceID, PostalCode, CAST(SpatialLocation AS VARCHAR(MAX)) AS SpatialLocation,rowguid, ModifiedDate   FROM Person.Address', adventureworks_conn)
adventureworks_person_address_type = pd.read_sql('SELECT * FROM Person.AddressType', adventureworks_conn)
adventureworks_person_businessentity = pd.read_sql('SELECT * FROM Person.BusinessEntity', adventureworks_conn)
adventureworks_person_businessentityaddress = pd.read_sql('SELECT * FROM Person.BusinessEntityAddress', adventureworks_conn)
adventureworks_person_businessentitycontact = pd.read_sql('SELECT * FROM Person.BusinessEntityContact', adventureworks_conn)
adventureworks_person_contacttype = pd.read_sql('SELECT * FROM Person.ContactType', adventureworks_conn)
adventureworks_person_countryregion = pd.read_sql('SELECT * FROM Person.CountryRegion', adventureworks_conn)
adventureworks_person_emailaddress = pd.read_sql('SELECT * FROM Person.EmailAddress', adventureworks_conn)
adventureworks_person_password = pd.read_sql('SELECT * FROM Person.Password', adventureworks_conn)
adventureworks_person_person = pd.read_sql('SELECT * FROM Person.Person', adventureworks_conn)
adventureworks_person_personphone = pd.read_sql('SELECT * FROM Person.PersonPhone', adventureworks_conn)
adventureworks_person_phonenumbertype = pd.read_sql('SELECT * FROM Person.PhoneNumberType', adventureworks_conn)
adventureworks_person_stateprovince = pd.read_sql('SELECT * FROM Person.StateProvince', adventureworks_conn)

In [None]:
adventureworks_production_bill_of_materials = pd.read_sql('SELECT * FROM Production.BillOfMaterials', adventureworks_conn)
adventureworks_production_culture = pd.read_sql('SELECT * FROM Production.Culture', adventureworks_conn)
adventureworks_production_document = pd.read_sql('SELECT * FROM Production.Document', adventureworks_conn)
adventureworks_production_illustration = pd.read_sql('SELECT * FROM Production.Illustration', adventureworks_conn)
adventureworks_production_location = pd.read_sql('SELECT * FROM Production.Location', adventureworks_conn)
adventureworks_production_product = pd.read_sql('SELECT * FROM Production.Product', adventureworks_conn)
adventureworks_production_productcategory = pd.read_sql('SELECT * FROM Production.ProductCategory', adventureworks_conn)
adventureworks_production_productcosthistory = pd.read_sql('SELECT * FROM Production.ProductCostHistory', adventureworks_conn)
adventureworks_production_productdescription = pd.read_sql('SELECT * FROM Production.ProductDescription', adventureworks_conn)
adventureworks_production_productdocument = pd.read_sql('SELECT * , CAST(DocumentNode AS VARCHAR(MAX)) AS DocumentNodeString  FROM Production.ProductDocument', adventureworks_conn)
adventureworks_production_productinventory = pd.read_sql('SELECT * FROM Production.ProductInventory', adventureworks_conn)
adventureworks_production_productlistpricehistory = pd.read_sql('SELECT * FROM Production.ProductListPriceHistory', adventureworks_conn)
adventureworks_production_productmodel = pd.read_sql('SELECT * FROM Production.ProductModel', adventureworks_conn)
adventureworks_production_productmodelillustration = pd.read_sql('SELECT * FROM Production.ProductModelIllustration', adventureworks_conn)
adventureworks_production_productmodelproductdescriptionculture = pd.read_sql('SELECT * FROM Production.ProductModelProductDescriptionCulture', adventureworks_conn)
adventureworks_production_productphoto = pd.read_sql('SELECT ProductPhotoID, CONVERT(VARCHAR(MAX),ThumbNailPhoto, 1) as ThumbNailPhotoHexString, ThumbNailPhotoFileName, CONVERT(VARCHAR(MAX), LargePhoto, 1) as LargePhotoHexString, LargePhotoFileName, ModifiedDate FROM Production.ProductPhoto', adventureworks_conn)
adventureworks_production_productproductphoto = pd.read_sql('SELECT * FROM Production.ProductProductPhoto', adventureworks_conn)
adventureworks_production_productreview = pd.read_sql('SELECT * FROM Production.ProductReview', adventureworks_conn)
adventureworks_production_productsubcategory = pd.read_sql('SELECT * FROM Production.ProductSubcategory', adventureworks_conn)
adventureworks_production_scrapreason = pd.read_sql('SELECT * FROM Production.ScrapReason', adventureworks_conn)
adventureworks_production_transactionhistory = pd.read_sql('SELECT * FROM Production.TransactionHistory', adventureworks_conn)
adventureworks_production_transactionhistoryarchive = pd.read_sql('SELECT * FROM Production.TransactionHistoryArchive', adventureworks_conn)
adventureworks_production_unitmeasure = pd.read_sql('SELECT * FROM Production.UnitMeasure', adventureworks_conn)
adventureworks_production_workorder = pd.read_sql('SELECT * FROM Production.WorkOrder', adventureworks_conn)
adventureworks_production_workorderrouting = pd.read_sql('SELECT * FROM Production.WorkOrderRouting', adventureworks_conn)

In [None]:
adventureworks_purchasing_productvendor = pd.read_sql('SELECT * FROM Purchasing.ProductVendor', adventureworks_conn)
adventureworks_purchasing_purchaseorderdetail = pd.read_sql('SELECT * FROM Purchasing.PurchaseOrderDetail', adventureworks_conn)
adventureworks_purchasing_purchaseorderheader = pd.read_sql('SELECT * FROM Purchasing.PurchaseOrderHeader', adventureworks_conn)
adventureworks_purchasing_shipmethod = pd.read_sql('SELECT * FROM Purchasing.ShipMethod', adventureworks_conn)
adventureworks_purchasing_vendor = pd.read_sql('SELECT * FROM Purchasing.Vendor', adventureworks_conn)

In [None]:
adventureworks_sales_countryregioncurrency = pd.read_sql('SELECT * FROM Sales.CountryRegionCurrency', adventureworks_conn)
adventureworks_sales_creditcard = pd.read_sql('SELECT * FROM Sales.CreditCard', adventureworks_conn)
adventureworks_sales_currency = pd.read_sql('SELECT * FROM Sales.Currency', adventureworks_conn)
adventureworks_sales_currencyrate = pd.read_sql('SELECT * FROM Sales.CurrencyRate', adventureworks_conn)
adventureworks_sales_customer = pd.read_sql('SELECT * FROM Sales.Customer', adventureworks_conn)
adventureworks_sales_personcreditcard = pd.read_sql('SELECT * FROM Sales.PersonCreditCard', adventureworks_conn)
adventureworks_sales_salesorderdetail = pd.read_sql('SELECT * FROM Sales.SalesOrderDetail', adventureworks_conn)
adventureworks_sales_salesorderheader = pd.read_sql('SELECT * FROM Sales.SalesOrderHeader', adventureworks_conn)
adventureworks_sales_salesorderhearerrsaleseason = pd.read_sql('SELECT * FROM Sales.SalesOrderHeaderSalesReason', adventureworks_conn)
adventureworks_sales_salesperson = pd.read_sql('SELECT * FROM Sales.SalesPerson', adventureworks_conn)
adventureworks_sales_salespersonquotahistory = pd.read_sql('SELECT * FROM Sales.SalesPersonQuotaHistory', adventureworks_conn)
adventureworks_sales_salesreason = pd.read_sql('SELECT * FROM Sales.SalesReason', adventureworks_conn)
adventureworks_sales_salestaxrate = pd.read_sql('SELECT * FROM Sales.SalesTaxRate', adventureworks_conn)
adventureworks_sales_salesterritory = pd.read_sql('SELECT * FROM Sales.SalesTerritory', adventureworks_conn)
adventureworks_sales_salesterritoryhistory = pd.read_sql('SELECT * FROM Sales.SalesTerritoryHistory', adventureworks_conn)
adventureworks_sales_shoppingcartitem = pd.read_sql('SELECT * FROM Sales.ShoppingCartItem', adventureworks_conn)
adventureworks_sales_specialoffer = pd.read_sql('SELECT * FROM Sales.SpecialOffer', adventureworks_conn)
adventureworks_sales_specialofferproduct = pd.read_sql('SELECT * FROM Sales.SpecialOfferProduct', adventureworks_conn)
adventureworks_sales_store = pd.read_sql('SELECT * FROM Sales.Store', adventureworks_conn)

adventureworks_conn.close()

## Combining the data
ORDER MATTERS, CAUSE SURROGATE KEYS
SK STILL NEEDS TO BE DONE

### Departments
Combining aenc and adventureworks department data

In [None]:
# adding DEPARTMENT_source_database columns to the dataframes
aenc_department['DEPARTMENT_source_database'] = 'aenc'
adventureworks_humanresources_department['DEPARTMENT_source_database'] = 'adventureworks'

# combining all department data
departments = pd.concat([aenc_department, adventureworks_humanresources_department], ignore_index=True)

# combining name and department name columns to create a name column
departments['DEPARTMENT_DEPARTMENT_DeptName'] = departments['Name'].combine_first(departments['dept_name'])
# combining dept_id and DepartmentID columns
departments['DEPARTMENT_DEPARTMENT_DeptID'] = departments['dept_id'].combine_first(departments['DepartmentID'])

# dropping the redundant columns
drop_modified_date_rowguid(departments)
departments.drop(columns=['dept_id', 'Name', 'dept_name', 'DepartmentID'], inplace=True)

# renaming the remaining columns
departments.rename(columns={'dept_head_id': 'DEPARTMENT_DEPARTMENT_DeptHeadID', 'GroupName': 'DEPARTMENT_DEPARTMENT_GroupName'}, inplace=True)

departments.head()

### Employees

In [None]:
# merge northwind with territory and employee
employee1 = pd.merge(northwind_employees, northwind_employee_territories, on='EmployeeID')
employee1 = employee1.drop(columns=['EmployeeID', 'LastName', 'FirstName', 'BirthDate', 'Address', 'City', 'Region', 'PostalCode', 'Country', 'HomePhone', 'ReportsTo'])

# merge employee from aenc with bonus
employee2 = pd.merge(aenc_employee, aenc_bonus, on='emp_id')

# after that concat together
employees = pd.concat([employee1, employee2], ignore_index=True)

# rename
employees.rename(columns={
    'emp_id': 'EMPLOYEE_EMPLOYEE_EmployeeID', 
    'dept_id': 'EMPLOYEE_EMPLOYEE_DepartmentID', 
    'manager_id': 'EMPLOYEE_EMPLOYEE_ManagerID', 
    'territory_id' : 'EMPLOYEE_EMPLOYEETERRITORIES_TerritoryID',
    'emp_fname' : 'EMPLOYEE_EMPLOYEE_Emp_Fname',
    'emp_lname' : 'EMPLOYEE_EMPLOYEE_Emp_Lname',
    'street' : 'EMPLOYEE_EMPLOYEE_Street',
    'city' : 'EMPLOYEE_EMPLOYEE_City',
    'state' : 'EMPLOYEE_EMPLOYEE_State',
    'zip_code' : 'EMPLOYEE_EMPLOYEE_Zip_Code',
    'phone' : 'EMPLOYEE_EMPLOYEE_Phone',
    'status' : 'EMPLOYEE_EMPLOYEE_Status',
    'ss_number' : 'EMPLOYEE_EMPLOYEE_SS_Number',
    'salary' : 'EMPLOYEE_EMPLOYEE_Salary',
    'start_date' : 'EMPLOYEE_EMPLOYEE_Start_Date',
    'termination_date' : 'EMPLOYEE_EMPLOYEE_Termination',
    'birth_date' : 'EMPLOYEE_EMPLOYEE_Birth_Date',
    'bene_health_ins' : 'EMPLOYEE_EMPLOYEE_Bene_Health_Ins',
    'bene_life_ins' : 'EMPLOYEE_EMPLOYEE_Bene_Life_Ins',
    'bene_day_care' : 'EMPLOYEE_EMPLOYEE_Bene_Day_Care',
    'sex' : 'EMPLOYEE_EMPLOYEE_Sex',
    'bonus_date' : 'EMPLOYEE_BONUS_Bonus_Date',
    'bonus_amount' : 'EMPLOYEE_BONUS_Bonus_Amount',
    'Title' : 'EMPLOYEE_EMPLOYEES_Title',
    'TitleOfCourtesy' : 'EMPLOYEE_EMPLOYEES_TitleOfCourtesy',
    'HireDate' : 'EMPLOYEE_EMPLOYEES_HireDate',
    'HomePhone' : 'EMPLOYEE_EMPLOYEES_HomePhone',
    'Extension' : 'EMPLOYEE_EMPLOYEES_Extension',
    'PhotoPath' : 'EMPLOYEE_EMPLOYEES_PhotoPath',
    'Notes' : 'EMPLOYEE_EMPLOYEES_Notes'
}, inplace=True)

desired_columns_order = ['EMPLOYEE_EMPLOYEE_EmployeeID', 'EMPLOYEE_EMPLOYEE_DepartmentID', 'EMPLOYEE_EMPLOYEE_ManagerID', 'EMPLOYEE_EMPLOYEETERRITORIES_TerritoryID','EMPLOYEE_EMPLOYEE_Emp_Fname','EMPLOYEE_EMPLOYEE_Emp_Lname','EMPLOYEE_EMPLOYEE_Street','EMPLOYEE_EMPLOYEE_City','EMPLOYEE_EMPLOYEE_State','EMPLOYEE_EMPLOYEE_Zip_Code','EMPLOYEE_EMPLOYEE_Phone','EMPLOYEE_EMPLOYEE_Status','EMPLOYEE_EMPLOYEE_SS_Number','EMPLOYEE_EMPLOYEE_Salary','EMPLOYEE_EMPLOYEE_Start_Date','EMPLOYEE_EMPLOYEE_Termination','EMPLOYEE_EMPLOYEE_Birth_Date','EMPLOYEE_EMPLOYEE_Bene_Health_Ins','EMPLOYEE_EMPLOYEE_Bene_Life_Ins','EMPLOYEE_EMPLOYEE_Bene_Day_Care','EMPLOYEE_EMPLOYEE_Sex','EMPLOYEE_BONUS_Bonus_Date','EMPLOYEE_BONUS_Bonus_Amount','EMPLOYEE_EMPLOYEES_Title','EMPLOYEE_EMPLOYEES_TitleOfCourtesy','EMPLOYEE_EMPLOYEES_HireDate','EMPLOYEE_EMPLOYEES_HomePhone','EMPLOYEE_EMPLOYEES_Extension','EMPLOYEE_EMPLOYEES_PhotoPath','EMPLOYEE_EMPLOYEES_Notes', 'EMPLOYEE_EMPLOYEES_PhotoHexString']

#order
employees = employees.reindex(columns=desired_columns_order)

employees.head()

### BusinessEntities

In [None]:
# Combining the adventureworks Person.BusinessEntity, BusinessEntityContact and ContactType data
businessentities= pd.merge(adventureworks_person_businessentity, adventureworks_person_businessentitycontact, left_on='BusinessEntityID', right_on='BusinessEntityID', suffixes=('_person_businessentity', '_businessentitycontact'), how="outer")

businessentities = pd.merge(businessentities, adventureworks_person_contacttype, left_on='ContactTypeID', right_on='ContactTypeID', suffixes=('', '_contacttype'), how="outer")

# dropping the modified date and rowguid columns
drop_modified_date_rowguid(businessentities)

# renaming the columns
businessentities.rename(columns={'BusinessEntityID': 'BUSINESSENTITY_BUSINESSENTITY_BusinessEntityID', 'PersonID': 'BUSINESSENTITY_BUSINESSENTITYCONTACT_PersonID', 'ContactTypeID': 'BUSINESSENTITY_CONTACTTYPE_ContactTypeID', 'Name' : 'BUSINESSENTITY_CONTACTTYPE_Name'}, inplace=True)

businessentities.head()

### BillOfMaterial

In [None]:
# merge table
billofmaterials = pd.merge(adventureworks_production_bill_of_materials, adventureworks_production_unitmeasure, on='UnitMeasureCode')

# rename
billofmaterials.rename(columns={
    'BillOfMaterialsID': 'BILLOFMATERIAL_BILLOFMATERIAL_BillOfMaterialID',
    'ProductAssemblyID': 'BILLOFMATERIAL_BILLOFMATERIAL_ProductAssemblyID',
    'ComponentID': 'BILLOFMATERIAL_BILLOFMATERIAL_ComponentID',
    'StartDate': 'BILLOFMATERIAL_BILLOFMATERIAL_StartDate',
    'EndDate': 'BILLOFMATERIAL_BILLOFMATERIAL_EndDate',
    'UnitMeasureCode': 'BILLOFMATERIAL_BILLOFMATERIAL_UnitMeasureCode',
    'BOMLevel': 'BILLOFMATERIAL_BILLOFMATERIAL_BOMLevel',
    'PerAssemblyQty': 'BILLOFMATERIAL_BILLOFMATERIAL_PerAssemblyQty',
    'Name': 'BILLOFMATERIAL_UNITMEASURE_Name'
}, inplace=True)

# drop unneeded column
drop_modified_date_rowguid(billofmaterials)

billofmaterials.head()

### Document

In [None]:
# merge table
documents = pd.merge(adventureworks_production_document, adventureworks_production_productdocument, on='DocumentNode')

# rename
documents.rename(columns={
    'DocumentNode': 'DOCUMENT_DOCUMENT_DocumentNode',
    'DocumentLevel': 'DOCUMENT_DOCUMENT_DocumentLevel',
    'Title': 'DOCUMENT_DOCUMENT_Title',
    'Owner': 'DOCUMENT_DOCUMENT_Owner',
    'FolderFlag': 'DOCUMENT_DOCUMENT_FolderFlag',
    'FileName': 'DOCUMENT_DOCUMENT_FileName',
    'FileExtension': 'DOCUMENT_DOCUMENT_FileExtension',
    'Revision': 'DOCUMENT_DOCUMENT_Revision',
    'ChangeNumber': 'DOCUMENT_DOCUMENT_ChangeNumber',
    'Status': 'DOCUMENT_DOCUMENT_Status',
    'DocumentSummary': 'DOCUMENT_DOCUMENT_DocumentSummary',
    'Document': 'DOCUMENT_DOCUMENT_Document',
    'ProductID': 'DOCUMENT_PRODUCTDOCUMENT_ProductID'
}, inplace=True)

# drop unneeded column
drop_modified_date_rowguid(documents)
documents.drop(columns=['DocumentNodeString'], inplace=True)

documents.head()

### Illustration

In [None]:
# merge table
illustrations = pd.merge(adventureworks_production_illustration, adventureworks_production_productmodelillustration, on='IllustrationID')

# rename
illustrations.rename(columns={
    'IllustrationID': 'ILLUSTRATION_ILLUSTRATION_IllustrationID',
    'Diagram': 'ILLUSTRATION_ILLUSTRATION_Diagram',
    'ProductModelID': 'ILLUSTRATION_PRODUCTMODELILLUSTRATION_ProductModelID',
}, inplace=True)

# drop unneeded column
drop_modified_date_rowguid(illustrations)

illustrations.head()

### ProductPhoto

In [None]:
# merge table
productphotos = pd.merge(adventureworks_production_productphoto, adventureworks_production_productproductphoto, on='ProductPhotoID')

# rename
productphotos.rename(columns={
    'ProductPhotoID': 'PRODUCTPHOTO_PRODUCTPHOTO_ProductPhotoID',
    'ThumbNailPhotoHexString': 'PRODUCTPHOTO_PRODUCTPHOTO_ThumbNailPhoto',
    'ThumbNailPhotoFileName': 'PRODUCTPHOTO_PRODUCTPHOTO_ThumbNailPhotoFileName',
    'LargePhotoHexString': 'PRODUCTPHOTO_PRODUCTPHOTO_LargePhoto',
    'LargePhotoFileName': 'PRODUCTPHOTO_PRODUCTPHOTO_LargePhotoFileName',
    'ProductID': 'PRODUCTPHOTO_PRODUCTPRODUCTPHOTO_ProductID',
    'Primary': 'PRODUCTPHOTO_PRODUCTPRODUCTPHOTO_Primary'
}, inplace=True)

# drop unneeded column
drop_modified_date_rowguid(productphotos)

productphotos.head()

### WorkOrder

In [None]:
# drop product id
adventureworks_production_workorderrouting = adventureworks_production_workorderrouting.drop(columns=('ProductID'))

#merge table
workorders = pd.merge(adventureworks_production_workorder, adventureworks_production_scrapreason, on="ScrapReasonID")
workorders = pd.merge(workorders, adventureworks_production_workorderrouting, on="WorkOrderID")

#rename
workorders.rename(columns={
    'WorkOrderID': 'WORKORDER_WORKORDER_WorkOrderID',
    'ProductID': 'WORKORDER_WORKORDER_ProductID',
    'OrderQty': 'WORKORDER_WORKORDER_OrderQty',
    'StockedQty': 'WORKORDER_WORKORDER_StockedQty',
    'ScrappedQty': 'WORKORDER_WORKORDER_ScrappedQty',
    'StartDate': 'WORKORDER_WORKORDER_StartDate',
    'EndDate': 'WORKORDER_WORKORDER_EndDate',
    'DueDate': 'WORKORDER_WORKORDER_DueDate',
    'ScrapReasonID': 'WORKORDER_SCRAPREASON_ScrapReasonID',
    'Name': 'WORKORDER_SCRAPREASON_Name',
    'OperationSequence': 'WORKORDER_WORKORDERINGROUTING_OperationSequence',
    'LocationID': 'WORKORDER_WORKORDERINGROUTING_LocationID',
    'ScheduledStartDate': 'WORKORDER_WORKORDERINGROUTING_ScheduledStartDate',
    'ScheduledEndDate': 'WORKORDER_WORKORDERINGROUTING_ScheduledEndDate',
    'ActualStartDate': 'WORKORDER_WORKORDERINGROUTING_ActualStartDate',
    'ActualEndDate': 'WORKORDER_WORKORDERINGROUTING_ActualEndDate',
    'ActualResourceHrs': 'WORKORDER_WORKORDERINGROUTING_ActualResourcesHrs',
    'PlannedCost': 'WORKORDER_WORKORDERINGROUTING_PlannedCost',
    'ActualCost': 'WORKORDER_WORKORDERINGROUTING_ActualCost'
}, inplace=True)

#drop unneeded column
drop_modified_date_rowguid(workorders)

workorders.head()

### BusinessEntityAddresses

In [None]:
# Combining the adventureworks Person.BusinessEntityAddress, Address and AddressType data
businessentityaddresses = pd.merge(adventureworks_person_businessentityaddress, adventureworks_person_address, left_on='AddressID', right_on='AddressID', suffixes=('', '_address'), how="outer")

businessentityaddresses = pd.merge(businessentityaddresses, adventureworks_person_address_type, left_on='AddressTypeID', right_on='AddressTypeID', suffixes=('', '_address_type'), how="outer")


# dropping the modified date and rowguid columns
drop_modified_date_rowguid(businessentityaddresses)

# renaming the columns
businessentityaddresses.rename(columns={'BusinessEntityID': 'BUSINESSENTITYADDRESS_BUSINESSENTITYADDRESS_BusinessEntityID', 'AddressID': 'BUSINESSENTITYADDRESS_ADDRESS_AddressID', 'AddressTypeID': 'BUSINESSENTITYADDRESS_ADDRESSTYPE_AddressTypeID', 'AddressLine1' : 'BUSINESSENTITYADDRESS_ADDRESS_AddressLine1', 'AddressLine2' : 'BUSINESSENTITYADDRESS_ADDRESS_AddressLine2', 'City' : 'BUSINESSENTITYADDRESS_ADDRESS_City', 'StateProvinceID' : 'BUSINESSENTITYADDRESS_ADDRESS_StateProvinceID', 'PostalCode' : 'BUSINESSENTITYADDRESS_ADDRESS_POSTALCODE', 'SpatialLocation' : 'BUSINESSENTITYADDRESS_ADDRESS_SpatialLocation', 'Name' : 'BUSINESSENTITYADDRESS_ADDRESSTYPE_Name'}, inplace=True)

businessentityaddresses.head()

### People

In [None]:
# combining person data from adventureworks HumanResources.Person, PersonPhone, PhoneNumberType, EmailAddress and Password
people = pd.merge(adventureworks_person_person, adventureworks_person_personphone, left_on='BusinessEntityID', right_on='BusinessEntityID', suffixes= ('_person', '_personphone'), how="outer")

people = pd.merge(people, adventureworks_person_phonenumbertype, left_on='PhoneNumberTypeID', right_on='PhoneNumberTypeID', suffixes=('', '_phonenumbertype'), how="outer")

people = pd.merge(people, adventureworks_person_emailaddress, left_on='BusinessEntityID', right_on='BusinessEntityID', suffixes=('','_emailaddress'), how="outer")

people = pd.merge(people, adventureworks_person_password, left_on='BusinessEntityID', right_on='BusinessEntityID', suffixes=('','_password'), how="outer")

# dropping the modified date and rowguid columns
drop_modified_date_rowguid(people)

# renaming the columns
people.rename(columns={'BusinessEntityID': 'PERSON_PERSON_BusinessEntityID', 'PersonType': 'PERSON_PERSON_PersonType', 'NameStyle': 'PERSON_PERSON_NameStyle', 'Title': 'PERSON_PERSON_Title', 'FirstName': 'PERSON_PERSON_FirstName', 'MiddleName' : 'PERSON_PERSON_MiddleName', 'LastName' : 'PERSON_PERSON_LastName', 'Suffix': 'PERSON_PERSON_Suffix', 'EmailPromotion' : 'PERSON_PERSON_EmailPromotion', 'AdditionalContactInfo' : 'PERSON_PERSON_AdditionalContactInfo', 'Demographics' : 'PERSON_PERSON_Demographics', 'PhoneNumber' : 'PERSON_PERSONPHONE_PhoneNumber', 'PhoneNumberTypeID' : 'PERSON_PHONENUMBERTYPE_PhoneNumberTypeID', 'Name' : 'PERSON_PHONENUMBERTYPE_Name', 'EmailAddressID': 'PERSON_EMAILADDRESS_EmailAddressID', 'EmailAddress' : 'PERSON_EMAILADDRESS_EmailAddress', 'PasswordHash' : 'PERSON_PASSWORD_PasswordHash', 'PasswordSalt' : 'PERSON_PASSWORD_PasswordSalt'}, inplace=True)

people.head()

### JobCandidate

In [None]:
jobcandidates = adventureworks_humanresources_jobcandidate

# dropping the modified date and rowguid columns
drop_modified_date_rowguid(jobcandidates)

# renaming the columns
jobcandidates.rename(columns={'JobCandidateID': 'JOBCANDIDATE_JOBCANDIDATE_JobCandidateID', 'BusinessEntityID': 'JOBCANDIDATE_JOBCANDIDATE_BusinessEntityID', 'Resume': 'JOBCANDIDATE_JOBCANDIDATE_Resume'}, inplace=True)

jobcandidates.head()

### EmployeeDepartmentHistory

In [None]:
employeedepartmenthistories = adventureworks_humanresources_employeedepartmenthistory

# dropping the modified date and rowguid columns
drop_modified_date_rowguid(employeedepartmenthistories)

# renaming the columns
employeedepartmenthistories.rename(columns={'BusinessEntityID': 'EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_BusinessEntityID', 'DepartmentID': 'EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_DepartmentID', 'ShiftID': 'EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_ShiftID', 'StartDate': 'EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_StartDate', 'EndDate': 'EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_EndDate'}, inplace=True)

employeedepartmenthistories.head()

### EmployeePayHistory

In [None]:
employeepayhistories = adventureworks_humanresources_employeepayhistory

# dropping the modified date and rowguid columns
drop_modified_date_rowguid(employeepayhistories)

# renaming the columns
employeepayhistories.rename(columns={'BusinessEntityID': 'EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_BusinessEntityID', 'RateChangeDate': 'EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_RateChangeDate', 'Rate': 'EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_Rate', 'PayFrequency': 'EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_PayFrequency'}, inplace=True)

employeepayhistories.head()

### Shift

In [None]:
shifts = adventureworks_humanresources_shift

# dropping the modified date and rowguid columns
drop_modified_date_rowguid(shifts)

# renaming the columns
shifts.rename(columns={'ShiftID': 'SHIFT_SHIFT_ShiftID', 'Name': 'SHIFT_SHIFT_Name', 'StartTime': 'SHIFT_SHIFT_StartTime', 'EndTime': 'SHIFT_SHIFT_EndTime'}, inplace=True)

shifts.head()

### SalesPerson

In [None]:
salespeople = adventureworks_sales_salesperson

# dropping the modified date and rowguid columns
drop_modified_date_rowguid(salespeople)

# renaming the columns
salespeople.rename(columns={'BusinessEntityID': 'SALESPERSON_SALESPERSON_BusinessEntityID', 'TerritoryID': 'SALESPERSON_SALESPERSON_TerritoryID', 'SalesQuota': 'SALESPERSON_SALESPERSON_SalesQuota', 'Bonus': 'SALESPERSON_SALESPERSON_Bonus', 'CommissionPct': 'SALESPERSON_SALESPERSON_CommissionPct', 'SalesYTD': 'SALESPERSON_SALESPERSON_SalesYTD', 'SalesLastYear': 'SALESPERSON_SALESPERSON_SalesLastYear'}, inplace=True)

salespeople.head()

### ProductVendor

In [None]:
productvendors = adventureworks_purchasing_productvendor

# dropping the modified date and rowguid columns
drop_modified_date_rowguid(productvendors)

# renaming the columns
productvendors.rename(columns={'ProductID': 'PRODUCTVENDOR_PRODUCTVENDOR_ProductID', 'BusinessEntityID': 'PRODUCTVENDOR_PRODUCTVENDOR_BusinessEntityID', 'AverageLeadTime': 'PRODUCTVENDOR_PRODUCTVENDOR_AverageLeadTime', 'StandardPrice': 'PRODUCTVENDOR_PRODUCTVENDOR_StandardPrice', 'LastReceiptCost': 'PRODUCTVENDOR_PRODUCTVENDOR_LastReceiptCost', 'LastReceiptDate': 'PRODUCTVENDOR_PRODUCTVENDOR_LastReceiptDate', 'MinOrderQty': 'PRODUCTVENDOR_PRODUCTVENDOR_MinOrderQty', 'MaxOrderQty': 'PRODUCTVENDOR_PRODUCTVENDOR_MaxOrderQty', 'OnOrderQty': 'PRODUCTVENDOR_PRODUCTVENDOR_OnOrderQty', 'UnitMeasureCode' : 'PRODUCTVENDOR_PRODUCTVENDOR_UnitMeasureCode'}, inplace=True)

productvendors.head()

### CustomerCustomerDemo

In [None]:
customercustomerdemos = northwind_customer_customer_demo

# dropping the modified date and rowguid columns
drop_modified_date_rowguid(customercustomerdemos)

# renaming the columns
customercustomerdemos.rename(columns={'CustomerID': 'CUSTOMERCUSTOMERDEMO_CUSTOMERCUSTOMERDEMO_CustomerID', 'CustomerTypeID': 'CUSTOMERCUSTOMERDEMO_CUSTOMERCUSTOMERDEMO_CustomerTypeID'}, inplace=True)

customercustomerdemos.head()

### CustomerDemographics

In [None]:
customerdemographics = northwind_customer_demographics

# dropping the modified date and rowguid columns
drop_modified_date_rowguid(customerdemographics)

# renaming the columns
customerdemographics.rename(columns={'CustomerTypeID': 'CUSTOMERDEMOGRAPHIC_CUSTOMERDEMOGRAPHICS_CustomerTypeID', 'CustomerDesc': 'CUSTOMERDEMOGRAPHIC_CUSTOMERDEMOGRAPHICS_CustomerDesc'}, inplace=True)

customerdemographics.head()

### SalesTerritoryHistory

In [None]:
salesterritoryhistories = adventureworks_sales_salesterritoryhistory

# dropping the modified date and rowguid columns
drop_modified_date_rowguid(salesterritoryhistories)

# renaming the columns
salesterritoryhistories.rename(columns={'BusinessEntityID': 'SALESTERRITORYHISTORY_SALESTERRITORYHISTORY_BusinessEntityID', 'TerritoryID': 'SALESTERRITORYHISTORY_SALESTERRITORYHISTORY_TerritoryID', 'StartDate': 'SALESTERRITORYHISTORY_SALESTERRITORYHISTORY_StartDate', 'EndDate': 'SALESTERRITORYHISTORY_SALESTERRITORYHISTORY_EndDate'}, inplace=True)

salesterritoryhistories.head()

### ProductListPriceHistory

In [None]:
productlistpricehistories = adventureworks_production_productlistpricehistory

# dropping the modified date and rowguid columns
drop_modified_date_rowguid(productlistpricehistories)

# renaming the columns
productlistpricehistories.rename(columns={'ProductID': 'PRODUCTLISTPRICEHISTORY_PRODUCTLISTPRICEHISTORY_ProductID', 'StartDate': 'PRODUCTLISTPRICEHISTORY_PRODUCTLISTPRICEHISTORY_StartDate', 'EndDate': 'PRODUCTLISTPRICEHISTORY_PRODUCTLISTPRICEHISTORY_EndDate', 'ListPrice': 'PRODUCTLISTPRICEHISTORY_PRODUCTLISTPRICEHISTORY_ListPrice'}, inplace=True)

productlistpricehistories.head()

### ProductCostHistory

In [None]:
productcosthistories = adventureworks_production_productcosthistory

# dropping the modified date and rowguid columns
drop_modified_date_rowguid(productcosthistories)

# renaming the columns
productcosthistories.rename(columns={'ProductID': 'PRODUCTCOSTHISTORY_PRODUCTCOSTHISTORY_ProductID', 'StartDate': 'PRODUCTCOSTHISTORY_PRODUCTCOSTHISTORY_StartDate', 'EndDate': 'PRODUCTCOSTHISTORY_PRODUCTCOSTHISTORY_EndDate', 'StandardCost': 'PRODUCTCOSTHISTORY_PRODUCTCOSTHISTORY_StandardCost'}, inplace=True)

productcosthistories.head()

### ShoppingCartItem

In [None]:
shoppingcartitems = adventureworks_sales_shoppingcartitem

# dropping the modified date and rowguid columns
drop_modified_date_rowguid(shoppingcartitems)

# renaming the columns
shoppingcartitems.rename(columns={'ShoppingCartItemID': 'SHOPPINGCARTITEM_SHOPPINGCARTITEM_ShoppingCartItemID', 'ShoppingCartID': 'SHOPPINGCARTITEM_SHOPPINGCARTITEM_ShoppingCartID', 'Quantity': 'SHOPPINGCARTITEM_SHOPPINGCARTITEM_Quantity', 'ProductID': 'SHOPPINGCARTITEM_SHOPPINGCARTITEM_ProductID', 'DateCreated': 'SHOPPINGCARTITEM_SHOPPINGCARTITEM_DateCreated'}, inplace=True)

shoppingcartitems.head()

### SalesPersonQuotaHistory

In [None]:
salespersonquotahistories = adventureworks_sales_salespersonquotahistory

# dropping the modified date and rowguid columns
drop_modified_date_rowguid(salespersonquotahistories)

# renaming the columns
salespersonquotahistories.rename(columns={'BusinessEntityID': 'SALESPERSONQUOTAHISTORY_SALESPERSONQUOTAHISTORY_BusinessEntityID', 'QuotaDate': 'SALESPERSONQUOTAHISTORY_SALESPERSONQUOTAHISTORY_QuotaDate', 'SalesQuota': 'SALESPERSONQUOTAHISTORY_SALESPERSONQUOTAHISTORY_SalesQuota'}, inplace=True)

salespersonquotahistories.head()

### Customer

In [None]:
aenc_customer = aenc_customer[['fname', 'lname', 'state']]
northwind_customers = northwind_customers.drop(columns='CustomerID')

# combining all customer data
customers = pd.concat([northwind_customers, aenc_customer, adventureworks_sales_customer], ignore_index=True)

# rename
customers.rename(columns={
    'CustomerID': 'CUSTOMER_CUSTOMERS_ID',
    'CompanyName': 'CUSTOMER_CUSTOMERS_CompanyName',
    'ContactName': 'CUSTOMER_CUSTOMERS_ContactName',
    'ContactTitle': 'CUSTOMER_CUSTOMERS_ContactTitle',
    'Address': 'CUSTOMER_CUSTOMERS_Address',
    'City': 'CUSTOMER_CUSTOMERS_City',
    'Region': 'CUSTOMER_CUSTOMERS_Region',
    'PostalCode': 'CUSTOMER_CUSTOMERS_PostalCode',
    'Country': 'CUSTOMER_CUSTOMERS_Country',
    'Phone': 'CUSTOMER_CUSTOMERS_Phone',
    'Fax': 'CUSTOMER_CUSTOMERS_Fax',
    'fname': 'CUSTOMER_CUSTOMER_Fname',
    'lname': 'CUSTOMER_CUSTOMER_Lname',
    'state': 'CUSTOMER_CUSTOMER_State',
    'PersonID': 'CUSTOMER_CUSTOMER_PersonID',
    'StoreID': 'CUSTOMER_CUSTOMER_StoreID',
    'TerritoryID': 'CUSTOMER_CUSTOMER_TerritoryID',
    'AccountNumber': 'CUSTOMER_CUSTOMER_AccountNumber'
}, inplace=True)

drop_modified_date_rowguid(customers)

customers.head()

### SpecialOffer

In [None]:
# merge specialoffer with specialofferproduct
specialoffers = pd.merge(adventureworks_sales_specialoffer, adventureworks_sales_specialofferproduct, on="SpecialOfferID")

# drop rowguid and modifieddate
drop_modified_date_rowguid(specialoffers)

# rename to correct columns like in database
specialoffers = specialoffers.rename(columns={
    'SpecialOfferID': 'SPECIALOFFER_SPECIALOFFER_ID',
    'ProductID': 'SPECIALOFFER_SPECIALOFFERPRODUCT_ProductID',
    'Description': 'SPECIALOFFER_SPECIALOFFER_Description',
    'DiscountPct': 'SPECIALOFFER_SPECIALOFFER_DiscountPCT',
    'Type': 'SPECIALOFFER_SPECIALOFFER_Type',
    'Category': 'SPECIALOFFER_SPECIALOFFER_Category',
    'StartDate': 'SPECIALOFFER_SPECIALOFFER_StartDate',
    'EndDate': 'SPECIALOFFER_SPECIALOFFER_EndDate',
    'MinQty': 'SPECIALOFFER_SPECIALOFFER_MinQty',
    'MaxQty': 'SPECIALOFFER_SPECIALOFFER_MaxQty'
})

specialoffers.head()

### CreditCard

In [None]:
# merge creditcard with personcreditcard
creditcards = pd.merge(adventureworks_sales_creditcard, adventureworks_sales_personcreditcard, on="CreditCardID")

# drop rowguid and modifieddate
drop_modified_date_rowguid(creditcards)

# rename to correct columns like in database
creditcards = creditcards.rename(columns={
    'CreditCardID': 'CREDITCARD_CREDITCARD_ID',
    'CardType': 'CREDITCARD_CREDITCARD_CardType',
    'CardNumber': 'CREDITCARD_CREDITCARD_CardNumber ',
    'ExpMonth': 'CREDITCARD_CREDITCARD_ExpMonth',
    'ExpYear': 'CREDITCARD_CREDITCARD_ExpYear',
    'BusinessEntityID': 'CREDITCARD_PERSONCREDITCARD_BusinessEntityID'
})

creditcards.head()

### Supplier

In [None]:
# rename
suppliers = northwind_suppliers.rename(columns={
    'SupplierID': 'SUPPLIER_SUPPLIERS_SupplierID',
    'CompanyName': 'SUPPLIER_SUPPLIERS_CompanyName',
    'ContactName': 'SUPPLIER_SUPPLIERS_ContactName',
    'ContactTitle': 'SUPPLIER_SUPPLIERS_ContactTitle',
    'Address': 'SUPPLIER_SUPPLIERS_Address',
    'City': 'SUPPLIER_SUPPLIERS_City',
    'Region': 'SUPPLIER_SUPPLIERS_Region',
    'PostalCode': 'SUPPLIER_SUPPLIERS_PostalCode',
    'Country': 'SUPPLIER_SUPPLIERS_Country',
    'Phone': 'SUPPLIER_SUPPLIERS_Phone',
    'Fax': 'SUPPLIER_SUPPLIERS_Fax',
    'HomePage': 'SUPPLIER_SUPPLIERS_HomePage'
})

suppliers.head()

### Currency

In [None]:
# combine currency and countryregioncurrency
currencies = pd.merge(adventureworks_sales_currency, adventureworks_sales_countryregioncurrency, on="CurrencyCode")

# now combine with currencyrate
currencies = pd.concat([currencies, adventureworks_sales_currencyrate], ignore_index=True)

#drop the modifieddate
drop_modified_date_rowguid(currencies)

#rename 
currencies = currencies.rename(columns={
    'CurrencyCode': 'CURRENCY_CURRENCY_CurrencyCode',
    'Name': 'CURRENCY_CURRENCY_Name',
    'CountryRegionCode': 'CURRENCY_COUNTRYREGIONCURRENCY_CountryRegionCode',
    'CurrencyRateID': 'CURRENCY_CURRENCYRATE_CurrencyRateID',
    'CurrencyRateDate': 'CURRENCY_CURRENCYRATE_CurrencyRateDate',
    'FromCurrencyCode': 'CURRENCY_CURRENCYRATE_FromCurrencyCode',
    'ToCurrencyCode': 'CURRENCY_CURRENCYRATE_ToCurrencyCode',
    'AverageRate': 'CURRENCY_CURRENCYRATE_AverageRate ',
    'EndOfDayRate': 'CURRENCY_CURRENCYRATE_EndOfDayRate',
})

currencies.head()

### Territory

In [None]:
# combine sales territory first
adventureworks_combined_territory = pd.merge(adventureworks_person_countryregion, adventureworks_person_stateprovince, on="CountryRegionCode", suffixes=('_pcr', '_sts'))

# now merge with salestaxrate
adventureworks_combined_territory = pd.merge(adventureworks_combined_territory, adventureworks_sales_salestaxrate, on="StateProvinceID", suffixes=('_st', '_sst'))

# combine northwind region with territory
nw_combined_territory = pd.merge(northwind_region, northwind_territories, on="RegionID")

# do the same for aenc
aenc_combined_regionstate = pd.merge(aenc_region, aenc_state, on="region")

#concatinate the tables together
territories = pd.concat([nw_combined_territory, aenc_combined_regionstate, adventureworks_combined_territory], ignore_index=True)

# drop unneeded rowguid and modifieddate
drop_modified_date_rowguid(territories)

# rename columns
territories = territories.rename(columns={
    'RegionID': 'TERRITORY_REGION_RegionID',
    'RegionDescription': 'TERRITORY_REGION_RegionDescription',
    'TerritoryID': 'TERRITORY_TERRITORIES_TerritoryID',
    'TerritoryDescription': 'TERRITORY_TERRITORIES_TerritoryDescription',
    'region': 'TERRITORY_REGION_Region',
    'state_id': 'TERRITORY_STATE_StateID',
    'state_name': 'TERRITORY_STATE_StateName',
    'state_capital': 'TERRITORY_STATE_StateCapital',
    'country': 'TERRITORY_STATE_Country',
    'CountryRegionCode': 'TERRITORY_COUNTRYREGION_CountryRegionCode',
    'Name_pcr': 'TERRITORY_COUNTRYREGION_Name',
    'StateProvinceID': 'TERRITORY_STATEPROVINCE_StateProvinceID',
    'StateProvinceCode': 'TERRITORY_STATEPROVINCE_StateProvinceCode',
    'IsOnlyStateProvinceFlag': 'TERRITORY_STATEPROVINCE_IsOnlyStateProvinceFlag',
    'Name_sts': 'TERRITORY_STATEPROVINCE_Name',
    'SalesTaxRateID': 'TERRITORY_SALESTAXRATE_SalesTaxRateID',
    'TaxType': 'TERRITORY_SALESTAXRATE_TaxType',
    'TaxRate': 'TERRITORY_SALESTAXRATE_TaxRate',
    'Name': 'TERRITORY_SALESTAXRATE_Name'
})

territories.head()

### OrderHeader

In [None]:
# combine orderdetail with orderheader
orderheaders = pd.merge(adventureworks_sales_salesorderdetail, adventureworks_sales_salesorderheader, on="SalesOrderID", suffixes=('_sod', '_soh'))

# combine with salesorderheadersalesreason
orderheaders = pd.merge(orderheaders, adventureworks_sales_salesorderhearerrsaleseason, on="SalesOrderID")

# combine with salesreason
orderheaders = pd.merge(orderheaders, adventureworks_sales_salesreason, on="SalesReasonID")

#drop unneeded
drop_modified_date_rowguid(orderheaders)

# rename columns
orderheaders = orderheaders.rename(columns={
    'SalesOrderID': 'ORDERHEADER_SALESORDERDETAIL_SalesOrderID',
    'SalesOrderDetailID': 'ORDERHEADER_SALESORDERDETAIL_SalesOrderDetailID',
    'CarrierTrackingNumber': 'ORDERHEADER_SALESORDERDETAIL_CarrierTrackingNumber',
    'OrderQty': 'ORDERHEADER_SALESORDERDETAIL_OrderQty',
    'ProductID': 'ORDERHEADER_SALESORDERDETAIL_ProductID',
    'SpecialOfferID': 'ORDERHEADER_SALESORDERDETAIL_SpecialOfferID',
    'UnitPrice': 'ORDERHEADER_SALESORDERDETAIL_UnitPrice',
    'UnitPriceDiscount': 'ORDERHEADER_SALESORDERDETAIL_UnitPriceDiscount',
    'LineTotal': 'ORDERHEADER_SALESORDERDETAIL_LineTotal',
    'RevisionNumber': 'ORDERHEADER_SALESORDERHEADER_RevisionNumber',
    'OrderDate': 'ORDERHEADER_SALESORDERHEADER_OrderDate',
    'DueDate': 'ORDERHEADER_SALESORDERHEADER_DueDate',
    'ShipDate': 'ORDERHEADER_SALESORDERHEADER_ShipDate',
    'Status': 'ORDERHEADER_SALESORDERHEADER_Status',
    'OnlineOrderFlag': 'ORDERHEADER_SALESORDERHEADER_OnlineOrderFlag',
    'SalesOrderNumber': 'ORDERHEADER_SALESORDERHEADER_SalesOrderNumber',
    'PurchaseOrderNumber': 'ORDERHEADER_SALESORDERHEADER_PurchaseOrderNumber',
    'AccountNumber': 'ORDERHEADER_SALESORDERHEADER_AccountNumber',
    'CustomerID': 'ORDERHEADER_SALESORDERHEADER_CustomerID',
    'SalesPersonID': 'ORDERHEADER_SALESORDERHEADER_SalesPersonID',
    'TerritoryID': 'ORDERHEADER_SALESORDERHEADER_TerritoryID',
    'BillToAddressID': 'ORDERHEADER_SALESORDERHEADER_BillToAddress',
    'ShipToAddressID': 'ORDERHEADER_SALESORDERHEADER_ShipToAddress',
    'ShipMethodID': 'ORDERHEADER_SALESORDERHEADER_ShipMethodID',
    'CreditCardID': 'ORDERHEADER_SALESORDERHEADER_CreditCardID',
    'CreditCardApprovalCode': 'ORDERHEADER_SALESORDERHEADER_CreditCardApprovalCode',
    'CurrencyRateID': 'ORDERHEADER_SALESORDERHEADER_CurrencyRateID',
    'SubTotal': 'ORDERHEADER_SALESORDERHEADER_SubTotal',
    'TaxAmt': 'ORDERHEADER_SALESORDERHEADER_TaxAmt',
    'Freight': 'ORDERHEADER_SALESORDERHEADER_Freight',
    'TotalDue': 'ORDERHEADER_SALESORDERHEADER_TotalDue',
    'Comment': 'ORDERHEADER_SALESORDERHEADER_Comment',
    'SalesReasonID': 'ORDERHEADER_SALESREASON_SalesReasonID',
    'Name': 'ORDERHEADER_SALESREASON_Name',
    'ReasonType': 'ORDERHEADER_SALESREASON_ReasonType',
})

orderheaders.head()

### Date

In [None]:
def calculate_periods(periods_in_years): 
    return 365 * periods_in_years + (periods_in_years // 4) # divide by 4 for leap years

periods = calculate_periods(50)

date_table = {
    'DATE_ID': pd.date_range(start='1996-01-01', periods=periods).strftime('%Y%m%d').astype(int),
    'DATE_Date': pd.date_range(start='1996-01-01', periods=periods),
    'DATE_Weekday': pd.date_range(start='1996-01-01', periods=periods).strftime('%A'),
    'DATE_WeekdayNum': pd.date_range(start='1996-01-01', periods=periods).weekday + 1,
    'DATE_DayMonth': pd.date_range(start='1996-01-01', periods=periods).day,
    'DATE_DayOfYear': pd.date_range(start='1996-01-01', periods=periods).dayofyear,
    'DATE_WeekOfYear': pd.date_range(start='1996-01-01', periods=periods).isocalendar().week,
    'DATE_MonthNum': pd.date_range(start='1996-01-01', periods=periods).month,
    'DATE_MonthName': pd.date_range(start='1996-01-01', periods=periods).strftime('%B'),
    'DATE_MonthNameShort': pd.date_range(start='1996-01-01', periods=periods).strftime('%b'),
    'DATE_Quarter': pd.date_range(start='1996-01-01', periods=periods).quarter,
    'DATE_Year': pd.date_range(start='1996-01-01', periods=periods).year,
    'DATE_FirstDayOfMonth': pd.date_range(start='1996-01-01', periods=periods).to_period('M').start_time,
    'DATE_LastDayOfMonth': pd.date_range(start='1996-01-01', periods=periods).to_period('M').end_time.date,
    'DATE_YYYYMM': pd.date_range(start='1996-01-01', periods=periods).strftime('%Y-%m'),
    'DATE_WeekendIndr': pd.date_range(start='1996-01-01', periods=periods).weekday // 5
}

date_table = pd.DataFrame(date_table)
date_table['DATE_WeekendIndr'] = date_table['DATE_WeekendIndr'].replace({0: 'weekday', 1: 'weekend'})

date_table.tail()

### Time

In [None]:
# Define the number of minutes in a day
minutes_in_a_day = 24 * 60

# Create the data dictionary for time dimension
time_table = {
    'TIME_ID': list(range(minutes_in_a_day)),
    'TIME_Hour': [i // 60 for i in range(minutes_in_a_day)],
    'TIME_Minute': [i % 60 for i in range(minutes_in_a_day)],
    'TIME_HourMinute': [f'{hour:02d}:{minute:02d}' for hour in range(24) for minute in range(60)],
}

# Convert the dictionary to a DataFrame
time_table = pd.DataFrame(time_table)

# # Fetch existing data from the time dimension table
# existing_data_query = "SELECT TIME_ID FROM Time"
# existing_data = pd.read_sql(existing_data_query, united_outdoors_conn)

# # Filter new data to include only records that are not already in the table
# new_data_to_insert = time_table[~time_table['TIME_ID'].isin(existing_data['TIME_ID'])]

time_table.head()

### TransactionHistory

In [None]:
transactionhistories = adventureworks_production_transactionhistory

# dropping the modified date and rowguid columns
drop_modified_date_rowguid(transactionhistories)

# renaming the columns
transactionhistories.rename(columns={'TransactionID': 'TRANSACTIONHISTORY_TRANSACTIONHISTORY_TransactionID', 'ProductID': 'TRANSACTIONHISTORY_TRANSACTIONHISTORY_ProductID', 'ReferenceOrderID': 'TRANSACTIONHISTORY_TRANSACTIONHISTORY_ReferenceOrderID', 'ReferenceOrderLineID': 'TRANSACTIONHISTORY_TRANSACTIONHISTORY_ReferenceOrderLineID', 'TransactionDate': 'TRANSACTIONHISTORY_TRANSACTIONHISTORY_TransactionDate', 'TransactionType': 'TRANSACTIONHISTORY_TRANSACTIONHISTORY_TransactionType', 'Quantity': 'TRANSACTIONHISTORY_TRANSACTIONHISTORY_Quantity', 'ActualCost': 'TRANSACTIONHISTORY_TRANSACTIONHISTORY_ActualCost'}, inplace=True)

transactionhistories.head()

### TransactionHistoryArchive

In [None]:
transactionhistoryarchives = adventureworks_production_transactionhistoryarchive

# dropping the modified date and rowguid columns
drop_modified_date_rowguid(transactionhistoryarchives)

# renaming the columns
transactionhistoryarchives.rename(columns={'TransactionID': 'TRANSACTIONHISTORYARCHIVE_TRANSACTIONHISTORYARCHIVE_TransactionID', 'ProductID': 'TRANSACTIONHISTORYARCHIVE_TRANSACTIONHISTORYARCHIVE_ProductID', 'ReferenceOrderID': 'TRANSACTIONHISTORYARCHIVE_TRANSACTIONHISTORYARCHIVE_ReferenceOrderID', 'ReferenceOrderLineID': 'TRANSACTIONHISTORYARCHIVE_TRANSACTIONHISTORYARCHIVE_ReferenceOrderLineID', 'TransactionDate': 'TRANSACTIONHISTORYARCHIVE_TRANSACTIONHISTORYARCHIVE_TransactionDate', 'TransactionType': 'TRANSACTIONHISTORYARCHIVE_TRANSACTIONHISTORYARCHIVE_TransactionType', 'Quantity': 'TRANSACTIONHISTORYARCHIVE_TRANSACTIONHISTORYARCHIVE_Quantity', 'ActualCost': 'TRANSACTIONHISTORYARCHIVE_TRANSACTIONHISTORYARCHIVE_ActualCost'}, inplace=True)

transactionhistoryarchives.head()

### ProductReview

In [None]:
productreviews = adventureworks_production_productreview

# dropping the modified date and rowguid columns
drop_modified_date_rowguid(productreviews)

# renaming the columns
productreviews.rename(columns={'ProductReviewID': 'PRODUCTREVIEW_PRODUCTREVIEW_ProductReviewID', 'ProductID': 'PRODUCTREVIEW_PRODUCTREVIEW_ProductID', 'ReviewerName': 'PRODUCTREVIEW_PRODUCTREVIEW_ReviewerName', 'ReviewDate': 'PRODUCTREVIEW_PRODUCTREVIEW_ReviewDate', 'EmailAddress': 'PRODUCTREVIEW_PRODUCTREVIEW_EmailAddress', 'Rating': 'PRODUCTREVIEW_PRODUCTREVIEW_Rating', 'Comments': 'PRODUCTREVIEW_PRODUCTREVIEW_Comments'}, inplace=True)

productreviews.head()

### Products (TODO CHANGE)

In [None]:
product_aenc = aenc_product.rename(columns={'id' : 'PRODUCT_PRODUCT_ID',
                                            'name' : 'PRODUCT_PRODUCT_Name',
                                            'description' : 'PRODUCT_PRODUCTDESCRIPTION_Desc',
                                            'prod_size' : 'PRODUCT_PRODUCT_Size',
                                            'color' : 'PRODUCT_PRODUCT_Color',
                                            'unit_price' : 'PRODUCT_PRODUCT_UnitPrice',
                                            'picture_name' : 'PRODUCT_PRODUCTPHOTO_ThumbnailPhotoFileName',
                                            'Category' : 'PRODUCT_PRODUCTCATEGORY_Category',
                                            })

product_aenc.drop(columns=['quantity'], inplace=True)

In [None]:
product_northwind = northwind_products.merge(northwind_categories, on='CategoryID', how='left')

product_northwind = product_northwind.rename(columns={'ProductID' : 'PRODUCT_PRODUCT_ID',
                                                        'ProductName' : 'PRODUCT_PRODUCT_Name',
                                                        'SupplierID' : 'PRODUCT_PRODUCT_SupplierID',
                                                        'CategoryID' : 'PRODUCT_PRODUCTSUBCATEGORY_CategoryID',
                                                        'QuantityPerUnit' : 'PRODUCT_PRODUCT_QuantityPerUnit',
                                                        'UnitPrice' : 'PRODUCT_PRODUCT_UnitPrice',
                                                        'UnitsOnOrder' : 'PRODUCT_PRODUCT_UnitsOnOrder',
                                                        'ReorderLevel' : 'PRODUCT_PRODUCT_ReorderLevel',
                                                        'Discontinued' : 'PRODUCT_PRODUCT_Discontinued',
                                                        'CategoryName' : 'PRODUCT_PRODUCTCATEGORY_Category',
                                                        'Description' : 'PRODUCT_PRODUCTDESCRIPTION_Desc',
                                                        'Picture' : 'PRODUCT_CATEGORY_Picture'
                                                        })

product_northwind.drop(columns=['UnitsInStock'], inplace=True)

product_northwind.dtypes

In [None]:
# combining all adventureworks product data
product_adventure_works = pd.merge(adventureworks_production_product, adventureworks_production_unitmeasure, left_on='SizeUnitMeasureCode', right_on='UnitMeasureCode', how='left')
product_adventure_works.rename(columns={'Name_y' : 'PRODUCT_PRODUCT_SizeUnitMeasureName'}, inplace=True)
product_adventure_works.rename(columns={'SizeUnitMeasureCode' : 'PRODUCT_PRODUCT_SizeUnitMeasureCode'}, inplace=True)
product_adventure_works.drop(columns=['UnitMeasureCode'], inplace=True)
product_adventure_works.drop(columns=['rowguid'], inplace=True)
product_adventure_works.drop(columns=['ModifiedDate_x'], inplace=True)
product_adventure_works.drop(columns=['ModifiedDate_y'], inplace=True)

product_adventure_works = pd.merge(product_adventure_works, adventureworks_production_unitmeasure, left_on='WeightUnitMeasureCode', right_on='UnitMeasureCode', how='left')
product_adventure_works.rename(columns={'Name' : 'PRODUCT_PRODUCT_WeightUnitMeasureName'}, inplace=True)
product_adventure_works.rename(columns={'WeightUnitMeasureCode' : 'PRODUCT_PRODUCT_WeightUnitMeasureCode'}, inplace=True)
product_adventure_works.drop(columns=['UnitMeasureCode'], inplace=True)
product_adventure_works.drop(columns=['ModifiedDate'], inplace=True)

product_adventure_works = pd.merge(product_adventure_works, adventureworks_production_productsubcategory, left_on='ProductSubcategoryID', right_on='ProductSubcategoryID', how='left')
product_adventure_works.rename(columns={'Name' : 'PRODUCT_PRODUCTSUBCATEGORY_SubCategory'}, inplace=True)
product_adventure_works.rename(columns={'ProductSubcategoryID' : 'PRODUCT_PRODUCT_SubCategoryID'}, inplace=True)
product_adventure_works.drop(columns=['rowguid'], inplace=True)
product_adventure_works.drop(columns=['ModifiedDate'], inplace=True)

product_adventure_works = pd.merge(product_adventure_works, adventureworks_production_productcategory, left_on='ProductCategoryID', right_on='ProductCategoryID', how='left')
product_adventure_works.rename(columns={'Name' : 'PRODUCT_PRODUCTCATEGORY_Category'}, inplace=True)
product_adventure_works.rename(columns={'ProductCategoryID' : 'PRODUCT_PRODUCTSUBCATEGORY_CategoryID'}, inplace=True)
product_adventure_works.drop(columns=['rowguid'], inplace=True)
product_adventure_works.drop(columns=['ModifiedDate'], inplace=True)

product_adventure_works = pd.merge(product_adventure_works, adventureworks_production_productmodel, left_on='ProductModelID', right_on='ProductModelID', how='left')
product_adventure_works.rename(columns={'Name' : 'PRODUCT_PRODUCTMODEL_Name'}, inplace=True)
product_adventure_works.rename(columns={'CatalogDescription' : 'PRODUCT_PRODUCTMODEL_CatalogDescription'}, inplace=True)
product_adventure_works.rename(columns={'Instructions' : 'PRODUCT_PRODUCTMODEL_Instructions'}, inplace=True)
product_adventure_works.drop(columns=['rowguid'], inplace=True)
product_adventure_works.drop(columns=['ModifiedDate'], inplace=True)

product_adventure_works = pd.merge(product_adventure_works, adventureworks_production_productmodelillustration, left_on='ProductModelID', right_on='ProductModelID', how='left')
product_adventure_works.rename(columns={'ProductModelID' : 'PRODUCT_PRODUCT_ModelID'}, inplace=True)
product_adventure_works.drop(columns=['ModifiedDate'], inplace=True)

product_adventure_works = pd.merge(product_adventure_works, adventureworks_production_illustration, left_on='IllustrationID', right_on='IllustrationID', how='left')
product_adventure_works.rename(columns={'IllustrationID' : 'PRODUCT_PRODUCTMODELILLUSTRATION_IllustrationID'}, inplace=True)
product_adventure_works.rename(columns={'Diagram' : 'PRODUCT_ILLUSTRATION_Diagram'}, inplace=True)
product_adventure_works.drop(columns=['ModifiedDate'], inplace=True)

product_adventure_works = pd.merge(product_adventure_works, adventureworks_production_productmodelproductdescriptionculture , left_on='PRODUCT_PRODUCT_ModelID', right_on='ProductModelID', how='left')
product_adventure_works.drop(columns=['ProductModelID'], inplace=True)
product_adventure_works.drop(columns=['ModifiedDate'], inplace=True)

product_adventure_works = pd.merge(product_adventure_works, adventureworks_production_culture , left_on='CultureID', right_on='CultureID', how='left')
product_adventure_works.rename(columns={'CultureID' : 'PRODUCT_PMPDC_CultureID'}, inplace=True)
product_adventure_works.rename(columns={'Name' : 'PRODUCT_CULTURE_Name'}, inplace=True)
product_adventure_works.drop(columns=['ModifiedDate'], inplace=True)

product_adventure_works = pd.merge(product_adventure_works, adventureworks_production_productdescription, left_on='ProductDescriptionID', right_on='ProductDescriptionID', how='left')
product_adventure_works.rename(columns={'ProductDescriptionID' : 'PRODUCT_PMPDC_DescriptionID'}, inplace=True)
product_adventure_works.rename(columns={'Description' : 'PRODUCT_PRODUCTDESCRIPTION_Desc'}, inplace=True)
product_adventure_works.drop(columns=['rowguid'], inplace=True)
product_adventure_works.drop(columns=['ModifiedDate'], inplace=True)

product_adventure_works = pd.merge(product_adventure_works, adventureworks_production_productproductphoto, left_on='ProductID', right_on='ProductID', how='left')
product_adventure_works.rename(columns={'Primary' : 'PRODUCT_PRODUCTPRODUCTPHOTO_Primary'}, inplace=True)
product_adventure_works.drop(columns=['ModifiedDate'], inplace=True)

product_adventure_works = pd.merge(product_adventure_works, adventureworks_production_productphoto, left_on='ProductPhotoID', right_on='ProductPhotoID', how='left')
product_adventure_works.rename(columns={'ProductPhotoID' : 'PRODUCT_PRODUCTPRODUCTPHOTO_PhotoID'}, inplace=True)
product_adventure_works.rename(columns={'ThumbNailPhotoHexString' : 'PRODUCT_PRODUCTPHOTO_ThumbnailPhotoHexString'}, inplace=True)
product_adventure_works.rename(columns={'LargePhotoHexString' : 'PRODUCT_PRODUCTPHOTO_LargePhotoHexString'}, inplace=True)
product_adventure_works.rename(columns={'ThumbNailPhotoFileName' : 'PRODUCT_PRODUCTPHOTO_ThumbnailPhotoFileName'}, inplace=True)
product_adventure_works.rename(columns={'LargePhotoFileName' : 'PRODUCT_PRODUCTPHOTO_LargePhotoFileName'}, inplace=True)
product_adventure_works.drop(columns=['ModifiedDate'], inplace=True)

product_adventure_works = product_adventure_works.rename(columns={'ProductID' : 'PRODUCT_PRODUCT_ID',
                                                                    'Name_x' : 'PRODUCT_PRODUCT_Name',
                                                                    'ProductNumber' : 'PRODUCT_PRODUCT_Number',
                                                                    'MakeFlag' : 'PRODUCT_PRODUCT_MakeFlag',
                                                                    'FinishedGoodsFlag' : 'PRODUCT_PRODUCT_FinishedGoodsFlag',
                                                                    'SafetyStockLevel' : 'PRODUCT_PRODUCT_SafetyStockLevel',
                                                                    'ReorderPoint' : 'PRODUCT_PRODUCT_ReorderPoint',
                                                                    'StandardCost' : 'PRODUCT_PRODUCT_StandardCost',
                                                                    'ListPrice' : 'PRODUCT_PRODUCT_ListPrice',
                                                                    'Size' : 'PRODUCT_PRODUCT_Size',
                                                                    'Weight' : 'PRODUCT_PRODUCT_Weight',
                                                                    'DaysToManufacture' : 'PRODUCT_PRODUCT_DaysToManufacture',
                                                                    'ProductLine' : 'PRODUCT_PRODUCT_ProductLine',
                                                                    'Class' : 'PRODUCT_PRODUCT_Class',
                                                                    'Style' : 'PRODUCT_PRODUCT_Style',
                                                                    'SellStartDate' : 'PRODUCT_PRODUCT_SellStartDate',
                                                                    'SellEndDate' : 'PRODUCT_PRODUCT_SellEndDate',
                                                                    'DiscontinuedDate' : 'PRODUCT_PRODUCT_DiscountedDate',
                                                                    'Color' : 'PRODUCT_PRODUCT_Color',
                                                                    }) 

# # applying the data types to the columns
product_adventure_works['PRODUCT_PRODUCT_ID'] = product_adventure_works['PRODUCT_PRODUCT_ID'].astype(float) # casting to float seems counterintuitive but it is necessary to avoid an error in Pandas

product_combined = pd.concat([product_adventure_works, product_aenc, product_northwind], ignore_index=True)


# Wasted hours of my life, sometimes it works and sometimes it doesn't.
# product_combined.drop(columns=['PRODUCT_CATEGORY_Picture'], inplace=True)       
# product_combined.drop(columns=['PRODUCT_PRODUCTPHOTO_ThumbnailPhotoHexString'], inplace=True) 
# product_combined.drop(columns=['PRODUCT_PRODUCTPHOTO_LargePhotoHexString'], inplace=True)     
# product_combined.drop(columns=['PRODUCT_PRODUCTMODEL_CatalogDescription'], inplace=True)     
# product_combined.drop(columns=['PRODUCT_PRODUCTMODEL_Instructions'], inplace=True)     
# product_combined.drop(columns=['PRODUCT_ILLUSTRATION_Diagram'], inplace=True)
# product_combined.drop(columns=['PRODUCT_PRODUCTPHOTO_ThumbnailPhotoFileName'], inplace=True)
# product_combined.drop(columns=['PRODUCT_PRODUCTPHOTO_LargePhotoFileName'], inplace=True)
# product_combined.drop(columns=['PRODUCT_PRODUCTDESCRIPTION_Desc'], inplace=True)

In [120]:
# combining all adventureworks product data
adventureworks_combined_products = pd.concat([adventureworks_production_product, adventureworks_production_productcategory, adventureworks_production_productsubcategory, adventureworks_production_productdescription, adventureworks_production_productdocument, adventureworks_production_productmodel, adventureworks_production_productmodelillustration, adventureworks_production_productmodelproductdescriptionculture, adventureworks_production_productphoto, adventureworks_production_productproductphoto  ], ignore_index=True)

adventureworks_combined_products.head()

Unnamed: 0,ProductID,Name,ProductNumber,MakeFlag,FinishedGoodsFlag,Color,SafetyStockLevel,ReorderPoint,StandardCost,ListPrice,...,CatalogDescription,Instructions,IllustrationID,CultureID,ProductPhotoID,ThumbNailPhotoHexString,ThumbNailPhotoFileName,LargePhotoHexString,LargePhotoFileName,Primary
0,1.0,Adjustable Race,AR-5381,False,False,,1000.0,750.0,0.0,0.0,...,,,,,,,,,,
1,2.0,Bearing Ball,BA-8327,False,False,,1000.0,750.0,0.0,0.0,...,,,,,,,,,,
2,3.0,BB Ball Bearing,BE-2349,True,False,,800.0,600.0,0.0,0.0,...,,,,,,,,,,
3,4.0,Headset Ball Bearings,BE-2908,False,False,,800.0,600.0,0.0,0.0,...,,,,,,,,,,
4,316.0,Blade,BL-2036,True,False,,800.0,600.0,0.0,0.0,...,,,,,,,,,,


In [121]:
# Combining all product data
products = pd.concat([northwind_products, aenc_product, adventureworks_combined_products], ignore_index=True)

# replacing the document node with documentnodestring
products['DocumentNode'] = products['DocumentNodeString']
products.drop(columns=['DocumentNodeString'], inplace=True)

# applying the data types to the columns
products['ProductID'] = products['ProductID'].astype(float) # casting to float seems counterintuitive but it is necessary to avoid an error in Pandas

# dropping documentnode column (need to fix later)
products.drop(columns=['DocumentNode'], inplace=True)


products.head()

Unnamed: 0,ProductID,ProductName,SupplierID,CategoryID,QuantityPerUnit,UnitPrice,UnitsInStock,UnitsOnOrder,ReorderLevel,Discontinued,...,CatalogDescription,Instructions,IllustrationID,CultureID,ProductPhotoID,ThumbNailPhotoHexString,ThumbNailPhotoFileName,LargePhotoHexString,LargePhotoFileName,Primary
0,1.0,Chai,1.0,1.0,10 boxes x 20 bags,18.0,39.0,0.0,10.0,False,...,,,,,,,,,,
1,2.0,Chang,1.0,1.0,24 - 12 oz bottles,19.0,17.0,40.0,25.0,False,...,,,,,,,,,,
2,3.0,Aniseed Syrup,1.0,2.0,12 - 550 ml bottles,10.0,13.0,70.0,25.0,False,...,,,,,,,,,,
3,4.0,Chef Anton's Cajun Seasoning,2.0,2.0,48 - 6 oz jars,22.0,53.0,0.0,0.0,False,...,,,,,,,,,,
4,5.0,Chef Anton's Gumbo Mix,2.0,2.0,36 boxes,21.35,0.0,0.0,0.0,True,...,,,,,,,,,,


### Regions (TODO CHANGE)

In [None]:
# Combining all region data
regions = pd.concat([northwind_region, aenc_region, adventureworks_person_stateprovince], ignore_index=True)

# combining regiondescription and region columns to create a name column (one is always None)
regions['RegionName'] = regions['RegionDescription'].combine_first(regions['region'])

# dropping the other columns
regions.drop(columns=['RegionDescription', 'region'], inplace=True)


regions.head()

### Customers

In [None]:
aenc_customer = aenc_customer[['fname', 'lname', 'state']]
#northwind_customers = northwind_customers.drop(columns='CustomerID')

# combining all customer data
customers = pd.concat([northwind_customers, aenc_customer, adventureworks_sales_customer], ignore_index=True)

# rename
customers.rename(columns={
    'CustomerID': 'CUSTOMER_CUSTOMERS_ID',
    'CompanyName': 'CUSTOMER_CUSTOMERS_CompanyName',
    'ContactName': 'CUSTOMER_CUSTOMERS_ContactName',
    'ContactTitle': 'CUSTOMER_CUSTOMERS_ContactTitle',
    'Address': 'CUSTOMER_CUSTOMERS_Address',
    'City': 'CUSTOMER_CUSTOMERS_City',
    'Region': 'CUSTOMER_CUSTOMERS_Region',
    'PostalCode': 'CUSTOMER_CUSTOMERS_PostalCode',
    'Country': 'CUSTOMER_CUSTOMERS_Country',
    'Phone': 'CUSTOMER_CUSTOMERS_Phone',
    'Fax': 'CUSTOMER_CUSTOMERS_Fax',
    'fname': 'CUSTOMER_CUSTOMER_Fname',
    'lname': 'CUSTOMER_CUSTOMER_Lname',
    'state': 'CUSTOMER_CUSTOMER_State',
    'PersonID': 'CUSTOMER_CUSTOMER_PersonID',
    'StoreID': 'CUSTOMER_CUSTOMER_StoreID',
    'TerritoryID': 'CUSTOMER_CUSTOMER_TerritoryID',
    'AccountNumber': 'CUSTOMER_CUSTOMER_AccountNumber'
}, inplace=True)

drop_modified_date_rowguid(customers)

customers.head()

In [123]:
# combining northwind customer data
northwind_combined_customer = pd.concat([northwind_customers, northwind_customer_customer_demo, northwind_customer_demographics], ignore_index=True)

# renaming northwind customer columns to match the other customer data
northwind_combined_customer.rename(columns={'PostalCode' : 'Zip'}, inplace=True)

northwind_combined_customer.head()

Unnamed: 0,CustomerID,CompanyName,ContactName,ContactTitle,Address,City,Region,Zip,Country,Phone,Fax,CUSTOMERCUSTOMERDEMO_CUSTOMERCUSTOMERDEMO_CustomerID,CUSTOMERCUSTOMERDEMO_CUSTOMERCUSTOMERDEMO_CustomerTypeID,CUSTOMERDEMOGRAPHIC_CUSTOMERDEMOGRAPHICS_CustomerTypeID,CUSTOMERDEMOGRAPHIC_CUSTOMERDEMOGRAPHICS_CustomerDesc
0,ALFKI,Alfreds Futterkiste,Maria Anders,Sales Representative,Obere Str. 57,Berlin,,12209,Germany,030-0074321,030-0076545,,,,
1,ANATR,Ana Trujillo Emparedados y helados,Ana Trujillo,Owner,Avda. de la ConstituciÃ³n 2222,MÃ©xico D.F.,,05021,Mexico,(5) 555-4729,(5) 555-3745,,,,
2,ANTON,Antonio Moreno TaquerÃ­a,Antonio Moreno,Owner,Mataderos 2312,MÃ©xico D.F.,,05023,Mexico,(5) 555-3932,,,,,
3,AROUT,Around the Horn,Thomas Hardy,Sales Representative,120 Hanover Sq.,London,,WA1 1DP,UK,(171) 555-7788,(171) 555-6750,,,,
4,BERGS,Berglunds snabbkÃ¶p,Christina Berglund,Order Administrator,BerguvsvÃ¤gen 8,LuleÃ¥,,S-958 22,Sweden,0921-12 34 65,0921-12 34 67,,,,


In [124]:
# combining all adventureworks customer data
adventureworks_combined_customers = pd.merge(adventureworks_sales_customer, adventureworks_person_person, left_on='PersonID', right_on='BusinessEntityID', how='outer', suffixes=('_sales_customer', '_person'))

adventureworks_combined_customers = pd.merge(adventureworks_combined_customers, adventureworks_person_businessentityaddress, left_on='BusinessEntityID', right_on='BusinessEntityID', suffixes=('', '_businessentityaddress'))

adventureworks_combined_customers = pd.merge(adventureworks_combined_customers, adventureworks_person_address, left_on='AddressID', right_on='AddressID', how='left', suffixes=('', '_address'))

adventureworks_combined_customers = pd.merge(adventureworks_combined_customers, adventureworks_person_address_type, left_on='AddressTypeID', right_on='AddressTypeID', how='left', suffixes=('', '_address_type'))

adventureworks_combined_customers = pd.merge(adventureworks_combined_customers, adventureworks_person_stateprovince, left_on=['StateProvinceID', 'TerritoryID'], right_on=['StateProvinceID', 'TerritoryID'], suffixes=('', '_stateprovince'))

adventureworks_combined_customers = pd.merge(adventureworks_combined_customers, adventureworks_person_countryregion, left_on='CountryRegionCode', right_on='CountryRegionCode', suffixes=('', '_countryregion'))

# combining fist, middle and last name columns to create a contact name column
adventureworks_combined_customers['ContactName'] = adventureworks_combined_customers['FirstName'] + ' ' + adventureworks_combined_customers['MiddleName'] + ' ' + adventureworks_combined_customers['LastName']
adventureworks_combined_customers.drop(columns=['FirstName', 'MiddleName', 'LastName'], inplace=True)

# Remove any double spaces caused by missing middle names
adventureworks_combined_customers['ContactName'] = adventureworks_combined_customers['ContactName'].str.replace('  ', ' ')

# Get all columns that contain 'rowguid' in their name
columns_to_drop = adventureworks_combined_customers.filter(like='rowguid').columns

# Drop these columns
adventureworks_combined_customers.drop(columns=columns_to_drop, inplace=True)

# only keeping the most recent modified date from the two tables
adventureworks_combined_customers['ModifiedDate'] = adventureworks_combined_customers[['ModifiedDate_sales_customer', 'ModifiedDate_person', 'ModifiedDate', 'ModifiedDate_address', 'ModifiedDate_address_type', 'ModifiedDate_stateprovince', 'ModifiedDate_countryregion']].max(axis=1)

# dropping the other modified date columns
adventureworks_combined_customers.drop(columns=['ModifiedDate_sales_customer', 'ModifiedDate_person', 'ModifiedDate', 'ModifiedDate_address', 'ModifiedDate_address_type', 'ModifiedDate_stateprovince', 'ModifiedDate_countryregion'], inplace=True)

# combining PersonID and BusinessEntityID columns
adventureworks_combined_customers['PersonID'] = adventureworks_combined_customers['PersonID'].combine_first(adventureworks_combined_customers['BusinessEntityID'])
adventureworks_combined_customers.drop(columns=['BusinessEntityID'], inplace=True)

# renaming columns to match the other customer data
adventureworks_combined_customers.rename(columns={'AddressLine1' : 'Address', 'PostalCode' : 'Zip', 'Name' : 'AddressType', 'Name_stateprovince' : 'StateProvince', 'Name_countryregion': 'CountryRegion' }, inplace=True)

adventureworks_combined_customers.head()

Unnamed: 0,CustomerID,PersonID,StoreID,TerritoryID,AccountNumber,PersonType,NameStyle,Title,Suffix,EmailPromotion,...,StateProvinceID,Zip,SpatialLocation,AddressType,StateProvinceCode,CountryRegionCode,IsOnlyStateProvinceFlag,StateProvince,CountryRegion,ContactName
0,11377.0,1699.0,,8.0,AW00011377,IN,False,Mr.,,1.0,...,53,42651,POINT (7.11082410683939 51.2015555665827),Home,NW,DE,False,Nordrhein-Westfalen,Germany,David R. Robinett
1,11913.0,1700.0,,9.0,AW00011913,IN,False,Ms.,,0.0,...,77,3198,POINT (145.141451560879 -38.0612939642931),Home,VIC,AU,False,Victoria,Australia,Rebecca A. Robinson
2,11952.0,1701.0,,9.0,AW00011952,IN,False,Ms.,,2.0,...,77,3220,POINT (144.201620782255 -38.1464342680786),Home,VIC,AU,False,Victoria,Australia,Dorothy B. Robinson
3,20164.0,1702.0,,10.0,AW00020164,IN,False,Ms.,,0.0,...,14,LA1 1LN,POINT (-2.80326155845985 54.1184442932464),Home,ENG,GB,True,England,United Kingdom,Carol Ann F. Rockne
4,20211.0,1703.0,,9.0,AW00020211,IN,False,Mr.,,0.0,...,64,4169,POINT (152.9802503342 -27.4802117164592),Home,QLD,AU,False,Queensland,Australia,Scott M. Rodgers


In [125]:
# renaming aenc customer columns to match the other customer data
aenc_customer.rename(columns={'id': 'CustomerID', 'address' : 'Address', 'city':'City' , 'state' : 'State', 'zip' : 'Zip', 'phone': 'Phone', 'company_name' : 'CompanyName'}, inplace=True)

# combining fname and lname columns to create a contact name column
aenc_customer['ContactName'] = aenc_customer['fname'] + ' ' + aenc_customer['lname']
aenc_customer.drop(columns=['fname', 'lname'], inplace=True)

aenc_customer.head()

Unnamed: 0,CustomerID,Address,City,State,Zip,Phone,CompanyName,ContactName
0,101,3114 Pioneer Avenue,Rutherford,NJ,7070,2015558966,The Power Group,Michaels Devlin
1,102,1033 Whippany Road,New York,NY,10154,2125558725,AMF Corp.,Beth Reiser
2,103,1990 Windsor Street,Paoli,PA,19301,2155556513,Darling Associates,Erin Niedringhaus
3,104,550 Dundas Street East,Knoxville,TN,37919,6155555463,P.S.C.,Meghan Mason
4,105,1210 Highway 36,Carmel,IN,46032,3175558437,Amo & Sons,Laura McCarthy


In [126]:
# Combining all customer data
customers = pd.concat([northwind_combined_customer, aenc_customer, adventureworks_combined_customers], ignore_index=True)

customers.head()

Unnamed: 0,CustomerID,CompanyName,ContactName,ContactTitle,Address,City,Region,Zip,Country,Phone,...,AddressTypeID,AddressLine2,StateProvinceID,SpatialLocation,AddressType,StateProvinceCode,CountryRegionCode,IsOnlyStateProvinceFlag,StateProvince,CountryRegion
0,ALFKI,Alfreds Futterkiste,Maria Anders,Sales Representative,Obere Str. 57,Berlin,,12209,Germany,030-0074321,...,,,,,,,,,,
1,ANATR,Ana Trujillo Emparedados y helados,Ana Trujillo,Owner,Avda. de la ConstituciÃ³n 2222,MÃ©xico D.F.,,05021,Mexico,(5) 555-4729,...,,,,,,,,,,
2,ANTON,Antonio Moreno TaquerÃ­a,Antonio Moreno,Owner,Mataderos 2312,MÃ©xico D.F.,,05023,Mexico,(5) 555-3932,...,,,,,,,,,,
3,AROUT,Around the Horn,Thomas Hardy,Sales Representative,120 Hanover Sq.,London,,WA1 1DP,UK,(171) 555-7788,...,,,,,,,,,,
4,BERGS,Berglunds snabbkÃ¶p,Christina Berglund,Order Administrator,BerguvsvÃ¤gen 8,LuleÃ¥,,S-958 22,Sweden,0921-12 34 65,...,,,,,,,,,,


In [127]:
def calculate_periods(periods_in_years): 
    return 365 * periods_in_years + (periods_in_years // 4) # divide by 4 for leap years

periods = calculate_periods(50)

date_table = {
    'DATE_ID': pd.date_range(start='1996-01-01', periods=periods).strftime('%Y%m%d').astype(int),
    'DATE_Date': pd.date_range(start='1996-01-01', periods=periods),
    'DATE_Weekday': pd.date_range(start='1996-01-01', periods=periods).strftime('%A'),
    'DATE_WeekdayNum': pd.date_range(start='1996-01-01', periods=periods).weekday + 1,
    'DATE_DayMonth': pd.date_range(start='1996-01-01', periods=periods).day,
    'DATE_DayOfYear': pd.date_range(start='1996-01-01', periods=periods).dayofyear,
    'DATE_WeekOfYear': pd.date_range(start='1996-01-01', periods=periods).isocalendar().week,
    'DATE_MonthNum': pd.date_range(start='1996-01-01', periods=periods).month,
    'DATE_MonthName': pd.date_range(start='1996-01-01', periods=periods).strftime('%B'),
    'DATE_MonthNameShort': pd.date_range(start='1996-01-01', periods=periods).strftime('%b'),
    'DATE_Quarter': pd.date_range(start='1996-01-01', periods=periods).quarter,
    'DATE_Year': pd.date_range(start='1996-01-01', periods=periods).year,
    'DATE_FirstDayOfMonth': pd.date_range(start='1996-01-01', periods=periods).to_period('M').start_time,
    'DATE_LastDayOfMonth': pd.date_range(start='1996-01-01', periods=periods).to_period('M').end_time.date,
    'DATE_YYYYMM': pd.date_range(start='1996-01-01', periods=periods).strftime('%Y-%m'),
    'DATE_WeekendIndr': pd.date_range(start='1996-01-01', periods=periods).weekday // 5
}

date_table = pd.DataFrame(date_table)
date_table['DATE_WeekendIndr'] = date_table['DATE_WeekendIndr'].replace({0: 'weekday', 1: 'weekend'})

date_table.tail()

Unnamed: 0,DATE_ID,DATE_Date,DATE_Weekday,DATE_WeekdayNum,DATE_DayMonth,DATE_DayOfYear,DATE_WeekOfYear,DATE_MonthNum,DATE_MonthName,DATE_MonthNameShort,DATE_Quarter,DATE_Year,DATE_FirstDayOfMonth,DATE_LastDayOfMonth,DATE_YYYYMM,DATE_WeekendIndr
2045-12-26,20451226,2045-12-26,Tuesday,2,26,360,52,12,December,Dec,4,2045,2045-12-01,2045-12-31,2045-12,weekday
2045-12-27,20451227,2045-12-27,Wednesday,3,27,361,52,12,December,Dec,4,2045,2045-12-01,2045-12-31,2045-12,weekday
2045-12-28,20451228,2045-12-28,Thursday,4,28,362,52,12,December,Dec,4,2045,2045-12-01,2045-12-31,2045-12,weekday
2045-12-29,20451229,2045-12-29,Friday,5,29,363,52,12,December,Dec,4,2045,2045-12-01,2045-12-31,2045-12,weekday
2045-12-30,20451230,2045-12-30,Saturday,6,30,364,52,12,December,Dec,4,2045,2045-12-01,2045-12-31,2045-12,weekend


In [128]:
# Define the number of minutes in a day
minutes_in_a_day = 24 * 60

# Create the data dictionary for time dimension
time_table = {
    'TIME_ID': list(range(minutes_in_a_day)),
    'TIME_Hour': [i // 60 for i in range(minutes_in_a_day)],
    'TIME_Minute': [i % 60 for i in range(minutes_in_a_day)],
    'TIME_HourMinute': [f'{hour:02d}:{minute:02d}' for hour in range(24) for minute in range(60)],
}

# Convert the dictionary to a DataFrame
time_table = pd.DataFrame(time_table)

# # Fetch existing data from the time dimension table
# existing_data_query = "SELECT TIME_ID FROM Time"
# existing_data = pd.read_sql(existing_data_query, united_outdoors_conn)

# # Filter new data to include only records that are not already in the table
# new_data_to_insert = time_table[~time_table['TIME_ID'].isin(existing_data['TIME_ID'])]

time_table

Unnamed: 0,TIME_ID,TIME_Hour,TIME_Minute,TIME_HourMinute
0,0,0,0,00:00
1,1,0,1,00:01
2,2,0,2,00:02
3,3,0,3,00:03
4,4,0,4,00:04
...,...,...,...,...
1435,1435,23,55,23:55
1436,1436,23,56,23:56
1437,1437,23,57,23:57
1438,1438,23,58,23:58


## Loading the data into the UnitedOutdoors datawarehouse

### Date

In [None]:
date_dtypes = {
    'DATE_ID': Integer,
    'DATE_Date': DATE,
    'DATE_Weekday': VARCHAR(10),
    'DATE_WeekdayNum': Integer,
    'DATE_DayMonth': Integer,
    'DATE_DayOfYear': Integer,
    'DATE_WeekOfYear': Integer,
    'DATE_MonthNum': Integer,
    'DATE_MonthName': VARCHAR(10),
    'DATE_MonthNameShort': CHAR(10),
    'DATE_Quarter': Integer,
    'DATE_Year': Integer,
    'DATE_FirstDayOfMonth': DATE,
    'DATE_LastDayOfMonth': DATE,
    'DATE_YYYYMM': CHAR(10),
    'DATE_WeekendIndr': CHAR(15)
}

prepare_and_insert(date_table, date_dtypes, 'Date', united_outdoors_engine)

### Time

In [None]:
time_dtypes = {
    'TIME_ID': Integer,
    'TIME_Hour': Integer,
    'TIME_Minute': Integer,
    'TIME_HourMinute': VARCHAR(10),
}

prepare_and_insert(time_table, time_dtypes, 'Time', united_outdoors_engine)

### BillOfMaterial

In [None]:
billofmaterial_dtypes = {
    'BILLOFMATERIAL_BILLOFMATERIAL_BillOfMaterialID': Integer,
    'BILLOFMATERIAL_BILLOFMATERIAL_ProductAssemblyID': Integer,
    'BILLOFMATERIAL_BILLOFMATERIAL_ComponentID': Integer,
    'BILLOFMATERIAL_BILLOFMATERIAL_StartDate': DATE,
    'BILLOFMATERIAL_BILLOFMATERIAL_EndDate': DATE,
    'BILLOFMATERIAL_BILLOFMATERIAL_UnitMeasureCode': CHAR(3),
    'BILLOFMATERIAL_BILLOFMATERIAL_BOMLevel': Integer,
    'BILLOFMATERIAL_BILLOFMATERIAL_PerAssemblyQty': DECIMAL(8,2),
    'BILLOFMATERIAL_UNITMEASURE_Name': NVARCHAR(50)
}

prepare_and_insert(billofmaterials, billofmaterial_dtypes, "BillOfMaterial", united_outdoors_engine)

### Document

In [None]:
document_dtypes = {
    'DOCUMENT_DOCUMENT_DocumentNode': NVARCHAR,
    'DOCUMENT_DOCUMENT_DocumentLevel': Integer,
    'DOCUMENT_DOCUMENT_Title': NVARCHAR(50),
    'DOCUMENT_DOCUMENT_Owner': Integer,
    'DOCUMENT_DOCUMENT_FolderFlag': BIT,
    'DOCUMENT_DOCUMENT_FileName': NVARCHAR(400),
    'DOCUMENT_DOCUMENT_FileExtension': NVARCHAR(8),
    'DOCUMENT_DOCUMENT_Revision': CHAR(5),
    'DOCUMENT_DOCUMENT_ChangeNumber': Integer,
    'DOCUMENT_DOCUMENT_Status': Integer,
    'DOCUMENT_DOCUMENT_DocumentSummary': NVARCHAR,
    'DOCUMENT_DOCUMENT_Document': LargeBinary,
    'DOCUMENT_PRODUCTDOCUMENT_ProductID': Integer,
}

prepare_and_insert(documents, document_dtypes, "Document", united_outdoors_engine)

### Illustration

In [None]:
# illustration_dtypes = {
#     'ILLUSTRATION_ILLUSTRATION_IllustrationID': Integer,
#     'ILLUSTRATION_ILLUSTRATION_Diagram': XML,
#     'ILLUSTRATION_PRODUCTMODELILLUSTRATION_ProductModelID': Integer,
# }

# illustration_nk_sk_dict = prepare_and_insert_return_sk(illustrations, illustration_dtypes, "Illustration", united_outdoors_engine, 'ILLUSTRATION_ILLUSTRATION_IllustrationID')

### ProductPhoto

In [None]:
# productphoto_dtypes = {
#     'PRODUCTPHOTO_PRODUCTPHOTO_ProductPhotoID': Integer,
#     'PRODUCTPHOTO_PRODUCTPHOTO_ThumbNailPhoto': LargeBinary,
#     'PRODUCTPHOTO_PRODUCTPHOTO_ThumbNailPhotoFileName': NVARCHAR(50),
#     'PRODUCTPHOTO_PRODUCTPHOTO_LargePhoto': LargeBinary,
#     'PRODUCTPHOTO_PRODUCTPHOTO_LargePhotoFileName': NVARCHAR(50),
#     'PRODUCTPHOTO_PRODUCTPRODUCTPHOTO_ProductID': Integer,
#     'PRODUCTPHOTO_PRODUCTPRODUCTPHOTO_Primary': Integer,
# }

# productphotos['PRODUCTPHOTO_PRODUCTPHOTO_ThumbNailPhoto'] = productphotos['PRODUCTPHOTO_PRODUCTPHOTO_ThumbNailPhoto'].apply(lambda x: x.encode('utf-8') if isinstance(x, str) else x)
# productphotos['PRODUCTPHOTO_PRODUCTPHOTO_LargePhoto'] = productphotos['PRODUCTPHOTO_PRODUCTPHOTO_LargePhoto'].apply(lambda x: x.encode('utf-8') if isinstance(x, str) else x)

# productphoto_nk_sk_dict = prepare_and_insert_return_sk(productphotos, productphoto_dtypes, "ProductPhoto", united_outdoors_engine, 'PRODUCTPHOTO_PRODUCTPHOTO_ProductPhotoID')

### WorkOrder

In [None]:
workorder_dtypes = {
    'WORKORDER_WORKORDER_WorkOrderID': Integer,
    'WORKORDER_WORKORDER_ProductID': Integer,
    'WORKORDER_WORKORDER_OrderQty': Integer,
    'WORKORDER_WORKORDER_StockedQty': Integer,
    'WORKORDER_WORKORDER_ScrappedQty': Integer,
    'WORKORDER_WORKORDER_StartDate': DATE,
    'WORKORDER_WORKORDER_EndDate': DATE,
    'WORKORDER_WORKORDER_DueDate': DATE,
    'WORKORDER_WORKORDER_ScrapReasonID': Integer,
    'WORKORDER_WORKORDERINGROUTING_ProductID': Integer,
    'WORKORDER_WORKORDERINGROUTING_OperationSequence': Integer,
    'WORKORDER_WORKORDERINGROUTING_LocationID': Integer,
    'WORKORDER_WORKORDERINGROUTING_ScheduledStartDate': DATE,
    'WORKORDER_WORKORDERINGROUTING_ScheduledEndDate': DATE,
    'WORKORDER_WORKORDERINGROUTING_ActualStartDate': DATE,
    'WORKORDER_WORKORDERINGROUTING_ActualEndDate': DATE,
    'WORKORDER_WORKORDERINGROUTING_ActualResourcesHrs': DECIMAL(9,4),
    'WORKORDER_WORKORDERINGROUTING_PlannedCost': MONEY,
    'WORKORDER_WORKORDERINGROUTING_ActualCost': MONEY,
    'WORKORDER_SCRAPREASON_ScrapReasonID': Integer,
    'WORKORDER_SCRAPREASON_Name': NVARCHAR(50)
}

prepare_and_insert(workorders, workorder_dtypes, "WorkOrder", united_outdoors_engine)

### Departments

In [None]:
departments_dtypes = {
    'DEPARTMENT_DEPARTMENT_DeptID': Integer,
    'DEPARTMENT_DEPARTMENT_DeptName': String(100),
    'DEPARTMENT_DEPARTMENT_GroupName': String(100),
    'DEPARTMENT_DEPARTMENT_DeptHeadID': Integer,
    'DEPARTMENT_source_database': String(100)
}

# TODO dept_head_id needs to refer to an employee
departments_nk_sk_dict = prepare_and_insert_return_sk(departments, departments_dtypes, 'Department', united_outdoors_engine, 'DEPARTMENT_DEPARTMENT_DeptID', { 'DEPARTMENT_DEPARTMENT_DeptHeadID' : {}})

### Employees

In [None]:
employees_dtypes = {
    'EMPLOYEE_EMPLOYEE_EmployeeID': Integer,
    'EMPLOYEE_EMPLOYEE_DepartmentID': Integer,
    'EMPLOYEE_EMPLOYEE_ManagerID': Integer,
    'EMPLOYEE_EMPLOYEETERRITORIES_TerritoryID': Integer,
    'EMPLOYEE_EMPLOYEE_Emp_Fname': NVARCHAR(255),
    'EMPLOYEE_EMPLOYEE_Emp_Lname': NVARCHAR(255),
    'EMPLOYEE_EMPLOYEE_Street': NVARCHAR(150),
    'EMPLOYEE_EMPLOYEE_City': NVARCHAR(100),
    'EMPLOYEE_EMPLOYEE_State': CHAR(1),
    'EMPLOYEE_EMPLOYEE_Zip_Code': CHAR(5),
    'EMPLOYEE_EMPLOYEE_Phone': NVARCHAR(20),
    'EMPLOYEE_EMPLOYEE_Status': CHAR(1),
    'EMPLOYEE_EMPLOYEE_SS_Number': Integer,
    'EMPLOYEE_EMPLOYEE_Salary': Integer,
    'EMPLOYEE_EMPLOYEE_Start_Date': DATE,
    'EMPLOYEE_EMPLOYEE_Termination': DATE,
    'EMPLOYEE_EMPLOYEE_Birth_Date': DATE,
    'EMPLOYEE_EMPLOYEE_Bene_Health_Ins': CHAR(1),
    'EMPLOYEE_EMPLOYEE_Bene_Life_Ins': CHAR(1),
    'EMPLOYEE_EMPLOYEE_Bene_Day_Care': CHAR(1),
    'EMPLOYEE_EMPLOYEE_Sex': CHAR(1),
    'EMPLOYEE_BONUS_Bonus_Date': DATE,
    'EMPLOYEE_BONUS_Bonus_Amount': Integer,
    'EMPLOYEE_EMPLOYEES_Title': NVARCHAR(50),
    'EMPLOYEE_EMPLOYEES_TitleOfCourtesy': NVARCHAR(50),
    'EMPLOYEE_EMPLOYEES_HireDate': DATE,
    'EMPLOYEE_EMPLOYEES_HomePhone': NVARCHAR(20),
    'EMPLOYEE_EMPLOYEES_Extension': Integer,
    'EMPLOYEE_EMPLOYEES_PhotoHexString': String,
    'EMPLOYEE_EMPLOYEES_PhotoPath': NVARCHAR(255),
    'EMPLOYEE_EMPLOYEES_Notes': String,
}

prepare_and_insert(employees, employees_dtypes, 'Employee', united_outdoors_engine, { 'EMPLOYEE_EMPLOYEE_DepartmentID' : departments_nk_sk_dict, 'EMPLOYEE_EMPLOYEE_ManagerID' : departments_nk_sk_dict})

### BusinessEntities

In [None]:
businessentities_dtypes = {
    'BUSINESSENTITY_BUSINESSENTITY_BusinessEntityID': Integer,
    'BUSINESSENTITY_BUSINESSENTITYCONTACT_PersonID': Integer,
    'BUSINESSENTITY_CONTACTTYPE_ContactTypeID': Integer,
    'BUSINESSENTITY_CONTACTTYPE_Name': String(100)
}

businessentities_nk_sk_dict = prepare_and_insert_return_sk(businessentities, businessentities_dtypes, 'BusinessEntity',united_outdoors_engine, 'BUSINESSENTITY_BUSINESSENTITY_BusinessEntityID')

### People

In [None]:
people_dtypes = {
    'PERSON_PERSON_BusinessEntityID': Integer,
    'PERSON_PERSON_PersonType': String(2),
    'PERSON_PERSON_NameStyle': BIT,
    'PERSON_PERSON_Title': String(100),
    'PERSON_PERSON_FirstName': String(100),
    'PERSON_PERSON_MiddleName': String(100),
    'PERSON_PERSON_LastName': String(100),
    'PERSON_PERSON_Suffix': String(100),
    'PERSON_PERSON_EmailPromotion': Integer,
    'PERSON_PERSON_AdditionalContactInfo': XML,
    'PERSON_PERSON_Demographics': XML,
    'PERSON_PERSONPHONE_PhoneNumber': String(100),
    'PERSON_PHONENUMBERTYPE_PhoneNumberTypeID': Integer,
    'PERSON_PHONENUMBERTYPE_Name': String(100),
    'PERSON_EMAILADDRESS_EmailAddressID': Integer,
    'PERSON_EMAILADDRESS_EmailAddress': String(100),
    'PERSON_PASSWORD_PasswordHash': LargeBinary,
    'PERSON_PASSWORD_PasswordSalt': LargeBinary
}

# Convert the 'PERSON_PASSWORD_PasswordHash' and 'PERSON_PASSWORD_PasswordSalt' columns to bytes
people['PERSON_PASSWORD_PasswordHash'] = people['PERSON_PASSWORD_PasswordHash'].apply(lambda x: x.encode('utf-8') if isinstance(x, str) else x)
people['PERSON_PASSWORD_PasswordSalt'] = people['PERSON_PASSWORD_PasswordSalt'].apply(lambda x: x.encode('utf-8') if isinstance(x, str) else x)

prepare_and_insert(people, people_dtypes, 'Person',united_outdoors_engine, { 'PERSON_PERSON_BusinessEntityID' : businessentities_nk_sk_dict})

### Updating the BusinessEntity table
replacing the natural keys with the surrogate keys for the PersonID column

In [55]:
prepare_and_update('BusinessEntity', united_outdoors_engine, { 'BUSINESSENTITY_BUSINESSENTITYCONTACT_PersonID' : businessentities_nk_sk_dict})

### BusinessEntityAddresses

In [None]:
businessentityaddresses_dtypes = {
    'BUSINESSENTITYADDRESS_BUSINESSENTITYADDRESS_BusinessEntityID': Integer,
    'BUSINESSENTITYADDRESS_ADDRESSTYPE_AddressTypeID': Integer,
    'BUSINESSENTITYADDRESS_ADDRESSTYPE_Name': String(100),
    'BUSINESSENTITYADDRESS_ADDRESS_AddressID': Integer,
    'BUSINESSENTITYADDRESS_ADDRESS_AddressLine1': String(100),
    'BUSINESSENTITYADDRESS_ADDRESS_AddressLine2': String(100),
    'BUSINESSENTITYADDRESS_ADDRESS_City': String(100),
    'BUSINESSENTITYADDRESS_ADDRESS_POSTALCODE': String(100),
    'BUSINESSENTITYADDRESS_ADDRESS_SpatialLocation': VARCHAR,
    'BUSINESSENTITYADDRESS_ADDRESS_StateProvinceID': Integer
}

# TODO the StateProvinceID needs to refer to sk of Territory
prepare_and_insert(businessentityaddresses, businessentityaddresses_dtypes, 'BusinessEntityAddress',united_outdoors_engine, { 'BUSINESSENTITYADDRESS_BUSINESSENTITYADDRESS_BusinessEntityID' : businessentities_nk_sk_dict, 'BUSINESSENTITYADDRESS_ADDRESS_StateProvinceID' : {}})

In [134]:
jobcandidates_dtypes = {
    'JOBCANDIDATE_JOBCANDIDATE_JobCandidateID': Integer,
    'JOBCANDIDATE_JOBCANDIDATE_BusinessEntityID': Integer,
    'JOBCANDIDATE_JOBCANDIDATE_Resume': XML
}

prepare_and_insert(jobcandidates, jobcandidates_dtypes, 'JobCandidate',united_outdoors_engine, { 'JOBCANDIDATE_JOBCANDIDATE_BusinessEntityID' : businessentities_nk_sk_dict})

Replacing natural keys with surrogate keys for column: JOBCANDIDATE_JOBCANDIDATE_BusinessEntityID
Inserting data into table: JobCandidate with chunk size: 666


In [135]:
employeedepartmenthistories_dtypes = {
    'EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_BusinessEntityID': Integer,
    'EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_DepartmentID': Integer,
    'EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_ShiftID': Integer,
    'EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_StartDate': DATE,
    'EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_EndDate': DATE
}

prepare_and_insert(employeedepartmenthistories, employeedepartmenthistories_dtypes, 'EmployeeDepartmentHistory',united_outdoors_engine, { 'EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_BusinessEntityID' : businessentities_nk_sk_dict, 'EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_DepartmentID' : departments_nk_sk_dict, 'EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_ShiftID' : {}})

Replacing natural keys with surrogate keys for column: EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_BusinessEntityID
Replacing natural keys with surrogate keys for column: EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_DepartmentID
Replacing natural keys with surrogate keys for column: EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_ShiftID
Inserting data into table: EmployeeDepartmentHistory with chunk size: 400


In [136]:
employeepayhistories_dtypes = {
    'EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_BusinessEntityID': Integer,
    'EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_RateChangeDate': DATE,
    'EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_Rate': MONEY,
    'EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_PayFrequency': Integer
}

prepare_and_insert(employeepayhistories, employeepayhistories_dtypes, 'EmployeePayHistory',united_outdoors_engine, { 'EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_BusinessEntityID' : businessentities_nk_sk_dict})

Replacing natural keys with surrogate keys for column: EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_BusinessEntityID
Inserting data into table: EmployeePayHistory with chunk size: 500


In [137]:
shifts_dtypes = {
    'SHIFT_SHIFT_ShiftID': Integer,
    'SHIFT_SHIFT_Name': String(100),
    'SHIFT_SHIFT_StartTime': TIME,
    'SHIFT_SHIFT_EndTime': TIME
}

prepare_and_insert(shifts, shifts_dtypes, 'Shift',united_outdoors_engine)

Inserting data into table: Shift with chunk size: 500


In [138]:
salespeople_dtypes = {
    'SALESPERSON_SALESPERSON_BusinessEntityID': Integer,
    'SALESPERSON_SALESPERSON_TerritoryID': Integer,
    'SALESPERSON_SALESPERSON_SalesQuota': MONEY,
    'SALESPERSON_SALESPERSON_Bonus': MONEY,
    'SALESPERSON_SALESPERSON_CommissionPct': DECIMAL(8,4),
    'SALESPERSON_SALESPERSON_SalesYTD': MONEY,
    'SALESPERSON_SALESPERSON_SalesLastYear': MONEY
}

prepare_and_insert(salespeople, salespeople_dtypes, 'SalesPerson',united_outdoors_engine, { 'SALESPERSON_SALESPERSON_BusinessEntityID' : businessentities_nk_sk_dict, 'SALESPERSON_SALESPERSON_TerritoryID' : {}})

Replacing natural keys with surrogate keys for column: SALESPERSON_SALESPERSON_BusinessEntityID
Replacing natural keys with surrogate keys for column: SALESPERSON_SALESPERSON_TerritoryID
Inserting data into table: SalesPerson with chunk size: 285


In [139]:
productvendors_dtypes = {
    'PRODUCTVENDOR_PRODUCTVENDOR_ProductID': Integer,
    'PRODUCTVENDOR_PRODUCTVENDOR_BusinessEntityID': Integer,
    'PRODUCTVENDOR_PRODUCTVENDOR_AverageLeadTime': Integer,
    'PRODUCTVENDOR_PRODUCTVENDOR_StandardPrice': MONEY,
    'PRODUCTVENDOR_PRODUCTVENDOR_LastReceiptCost': MONEY,
    'PRODUCTVENDOR_PRODUCTVENDOR_LastReceiptDate': DATE,
    'PRODUCTVENDOR_PRODUCTVENDOR_MinOrderQty': Integer,
    'PRODUCTVENDOR_PRODUCTVENDOR_MaxOrderQty': Integer,
    'PRODUCTVENDOR_PRODUCTVENDOR_OnOrderQty': Integer,
    'PRODUCTVENDOR_PRODUCTVENDOR_UnitMeasureCode': CHAR(3)
}

prepare_and_insert(productvendors, productvendors_dtypes, 'ProductVendor', united_outdoors_engine, { 'PRODUCTVENDOR_PRODUCTVENDOR_ProductID' : {}, 'PRODUCTVENDOR_PRODUCTVENDOR_BusinessEntityID' : businessentities_nk_sk_dict, 'PRODUCTVENDOR_PRODUCTVENDOR_UnitMeasureCode' : {}})

Replacing natural keys with surrogate keys for column: PRODUCTVENDOR_PRODUCTVENDOR_ProductID
Replacing natural keys with surrogate keys for column: PRODUCTVENDOR_PRODUCTVENDOR_BusinessEntityID
Replacing natural keys with surrogate keys for column: PRODUCTVENDOR_PRODUCTVENDOR_UnitMeasureCode
Inserting data into table: ProductVendor with chunk size: 200


In [140]:
customerdemographics_dtypes = {
    'CUSTOMERDEMOGRAPHIC_CUSTOMERDEMOGRAPHICS_CustomerTypeID': NVARCHAR(10),
    'CUSTOMERDEMOGRAPHIC_CUSTOMERDEMOGRAPHICS_CustomerDesc': String(100)
}

customerdemographics_nk_sk_dict = prepare_and_insert_return_sk(customerdemographics, customerdemographics_dtypes, 'CustomerDemographic',united_outdoors_engine, 'CUSTOMERDEMOGRAPHIC_CUSTOMERDEMOGRAPHICS_CustomerTypeID')

Inserting data into table: CustomerDemographic with chunk size: 1000


In [141]:
customercustomerdemos_dtypes = {
    'CUSTOMERCUSTOMERDEMO_CUSTOMERCUSTOMERDEMO_CustomerID': NVARCHAR(10),
    'CUSTOMERCUSTOMERDEMO_CUSTOMERCUSTOMERDEMO_CustomerTypeID': NVARCHAR(10)
}

# TODO how to handle fk? the nk is nvarchar, not an integer

prepare_and_insert(customercustomerdemos, customercustomerdemos_dtypes, 'CustomerCustomerDemo',united_outdoors_engine, { 'CUSTOMERCUSTOMERDEMO_CUSTOMERCUSTOMERDEMO_CustomerTypeID' : customerdemographics_nk_sk_dict, 'CUSTOMERCUSTOMERDEMO_CUSTOMERCUSTOMERDEMO_CustomerID' : {}})

Replacing natural keys with surrogate keys for column: CUSTOMERCUSTOMERDEMO_CUSTOMERCUSTOMERDEMO_CustomerTypeID
Replacing natural keys with surrogate keys for column: CUSTOMERCUSTOMERDEMO_CUSTOMERCUSTOMERDEMO_CustomerID
Inserting data into table: CustomerCustomerDemo with chunk size: 1000


In [142]:
salesterritoryhistories_dtypes = {
    'SALESTERRITORYHISTORY_SALESTERRITORYHISTORY_BusinessEntityID': Integer,
    'SALESTERRITORYHISTORY_SALESTERRITORYHISTORY_TerritoryID': Integer,
    'SALESTERRITORYHISTORY_SALESTERRITORYHISTORY_StartDate': DATE,
    'SALESTERRITORYHISTORY_SALESTERRITORYHISTORY_EndDate': DATE
}

prepare_and_insert(salesterritoryhistories, salesterritoryhistories_dtypes, 'SalesTerritoryHistory',united_outdoors_engine, { 'SALESTERRITORYHISTORY_SALESTERRITORYHISTORY_BusinessEntityID' : businessentities_nk_sk_dict, 'SALESTERRITORYHISTORY_SALESTERRITORYHISTORY_TerritoryID' : {}})

Replacing natural keys with surrogate keys for column: SALESTERRITORYHISTORY_SALESTERRITORYHISTORY_BusinessEntityID
Replacing natural keys with surrogate keys for column: SALESTERRITORYHISTORY_SALESTERRITORYHISTORY_TerritoryID
Inserting data into table: SalesTerritoryHistory with chunk size: 500


In [143]:
productlistpricehistories_dtypes = {
    'PRODUCTLISTPRICEHISTORY_PRODUCTLISTPRICEHISTORY_ProductID': Integer,
    'PRODUCTLISTPRICEHISTORY_PRODUCTLISTPRICEHISTORY_StartDate': DATE,
    'PRODUCTLISTPRICEHISTORY_PRODUCTLISTPRICEHISTORY_EndDate': DATE,
    'PRODUCTLISTPRICEHISTORY_PRODUCTLISTPRICEHISTORY_ListPrice': MONEY
}

prepare_and_insert(productlistpricehistories, productlistpricehistories_dtypes, 'ProductListPriceHistory',united_outdoors_engine, { 'PRODUCTLISTPRICEHISTORY_PRODUCTLISTPRICEHISTORY_ProductID' : {}})

Replacing natural keys with surrogate keys for column: PRODUCTLISTPRICEHISTORY_PRODUCTLISTPRICEHISTORY_ProductID
Inserting data into table: ProductListPriceHistory with chunk size: 500


In [144]:
productcosthistories_dtypes = {
    'PRODUCTCOSTHISTORY_PRODUCTCOSTHISTORY_ProductID': Integer,
    'PRODUCTCOSTHISTORY_PRODUCTCOSTHISTORY_StartDate': DATE,
    'PRODUCTCOSTHISTORY_PRODUCTCOSTHISTORY_EndDate': DATE,
    'PRODUCTCOSTHISTORY_PRODUCTCOSTHISTORY_StandardCost': MONEY
}

prepare_and_insert(productcosthistories, productcosthistories_dtypes, 'ProductCostHistory', united_outdoors_engine, { 'PRODUCTCOSTHISTORY_PRODUCTCOSTHISTORY_ProductID' : {}})

Replacing natural keys with surrogate keys for column: PRODUCTCOSTHISTORY_PRODUCTCOSTHISTORY_ProductID
Inserting data into table: ProductCostHistory with chunk size: 500


In [145]:
shoppingcartitems_dtypes = {
    'SHOPPINGCARTITEM_SHOPPINGCARTITEM_ShoppingCartItemID': Integer,
    'SHOPPINGCARTITEM_SHOPPINGCARTITEM_ShoppingCartID': Integer,
    'SHOPPINGCARTITEM_SHOPPINGCARTITEM_ProductID': Integer,
    'SHOPPINGCARTITEM_SHOPPINGCARTITEM_Quantity': Integer,
    'SHOPPINGCARTITEM_SHOPPINGCARTITEM_DateCreated': DATE
}

prepare_and_insert(shoppingcartitems, shoppingcartitems_dtypes, 'ShoppingCartItem', united_outdoors_engine, { 'SHOPPINGCARTITEM_SHOPPINGCARTITEM_ProductID' : {}, 'SHOPPINGCARTITEM_SHOPPINGCARTITEM_ShoppingCartID' : {}})

Replacing natural keys with surrogate keys for column: SHOPPINGCARTITEM_SHOPPINGCARTITEM_ProductID
Replacing natural keys with surrogate keys for column: SHOPPINGCARTITEM_SHOPPINGCARTITEM_ShoppingCartID
Inserting data into table: ShoppingCartItem with chunk size: 400


In [146]:
salespersonquotahistories_dtypes = {
    'SALESPERSONQUOTAHISTORY_SALESPERSONQUOTAHISTORY_BusinessEntityID': Integer,
    'SALESPERSONQUOTAHISTORY_SALESPERSONQUOTAHISTORY_QuotaDate': DATE,
    'SALESPERSONQUOTAHISTORY_SALESPERSONQUOTAHISTORY_SalesQuota': MONEY
}

prepare_and_insert(salespersonquotahistories, salespersonquotahistories_dtypes, 'SalesPersonQuotaHistory', united_outdoors_engine, { 'SALESPERSONQUOTAHISTORY_SALESPERSONQUOTAHISTORY_BusinessEntityID' : businessentities_nk_sk_dict})

Replacing natural keys with surrogate keys for column: SALESPERSONQUOTAHISTORY_SALESPERSONQUOTAHISTORY_BusinessEntityID
Inserting data into table: SalesPersonQuotaHistory with chunk size: 666


### Products

In [None]:
product_dtypes =  {
    'PRODUCT_SK': Integer,
    'PRODUCT_PRODUCT_ID': Integer,
    'PRODUCT_PRODUCT_SizeUnitMeasureCode': VARCHAR(10),
    'PRODUCT_PRODUCT_WeightUnitMeasureCode': VARCHAR(10),
    'PRODUCT_PRODUCT_SubCategoryID': Integer,
    'PRODUCT_PRODUCTSUBCATEGORY_CategoryID': Integer,
    'PRODUCT_PRODUCT_ModelID': Integer,
    'PRODUCT_PRODUCTMODELILLUSTRATION_IllustrationID': Integer,
    'PRODUCT_PRODUCTPRODUCTPHOTO_PhotoID': Integer,
    'PRODUCT_PMPDC_DescriptionID': Integer,
    'PRODUCT_PMPDC_CultureID': Integer,
    'PRODUCT_PRODUCT_Name': VARCHAR(150),
    'PRODUCT_PRODUCT_Number': VARCHAR(25),
    'PRODUCT_PRODUCT_MakeFlag': Integer,
    'PRODUCT_PRODUCT_FinishedGoodsFlag': Integer,
    'PRODUCT_PRODUCT_Color': VARCHAR(25),
    'PRODUCT_PRODUCT_SafetyStockLevel': Integer,
    'PRODUCT_PRODUCT_ReorderPoint': Integer,
    'PRODUCT_PRODUCT_ReorderLevel': Integer,
    'PRODUCT_PRODUCT_StandardCost': DECIMAL(8, 4),
    'PRODUCT_PRODUCT_UnitPrice': DECIMAL(19, 4),
    'PRODUCT_PRODUCT_ListPrice': DECIMAL(19, 4),
    'PRODUCT_PRODUCT_Size': VARCHAR(50),
    'PRODUCT_PRODUCT_SizeUnitMeasureName': VARCHAR(50),
    'PRODUCT_PRODUCT_WeightUnitMeasureName': VARCHAR(50),
    'PRODUCT_Color': VARCHAR(50),
    'PRODUCT_PRODUCT_Weight': DECIMAL(8, 2),
    'PRODUCT_PRODUCT_DaysToManufacture': Integer,
    'PRODUCT_PRODUCT_ProductLine': VARCHAR(10),
    'PRODUCT_PRODUCT_Class': VARCHAR(10),
    'PRODUCT_PRODUCT_Style': VARCHAR(10),
    'PRODUCT_PRODUCTSUBCATEGORY_SubCategory': VARCHAR(50),
    'PRODUCT_PRODUCTCATEGORY_Category': VARCHAR(50),
    'PRODUCT_CATEGORY_Picture': LargeBinary,
    'PRODUCT_PRODUCTMODEL_Name': VARCHAR(100),
    'PRODUCT_PRODUCTMODEL_CatalogDescription': XML,
    'PRODUCT_PRODUCTMODEL_Instructions': XML,
    'PRODUCT_ILLUSTRATION_Diagram': XML,
    'PRODUCT_CULTURE_Name': VARCHAR(50),
    'PRODUCT_PRODUCTDESCRIPTION_Desc': NVARCHAR,
    'PRODUCT_PRODUCTPRODUCTPHOTO_Primary': Integer,
    'PRODUCT_PRODUCTPHOTO_ThumbnailPhoto': LargeBinary,
    'PRODUCT_PRODUCTPHOTO_ThumbnailPhotoHexString': NVARCHAR,
    'PRODUCT_PRODUCTPHOTO_ThumbnailPhotoFileName': VARCHAR(50),
    'PRODUCT_PRODUCTPHOTO_LargePhoto': LargeBinary,
    'PRODUCT_PRODUCTPHOTO_LargePhotoHexString': NVARCHAR,
    'PRODUCT_PRODUCTPHOTO_LargePhotoFileName': VARCHAR(50),
    'PRODUCT_PRODUCT_SellStartDate': DATE,
    'PRODUCT_DATE_SellStartDateFK': Integer,
    'PRODUCT_PRODUCT_SellEndDate': DATE,
    'PRODUCT_DATE_SellEndDateFK': Integer,
    'PRODUCT_PRODUCT_DiscountedDate': DATE,
    'PRODUCT_DATE_DiscountedDateFK': Integer,
    'PRODUCT_PRODUCT_Discontinued': Integer,
    'PRODUCT_DATE_DateTimeAdded': Integer
}

<<<<<<< local
prepare_and_insert(products, products_dtypes, 'Product', united_outdoors_engine)
=======
prepare_and_insert(product_combined, product_dtypes, 'Product', united_outdoors_engine)
>>>>>>> remote

### JobCandidate

In [None]:
jobcandidates_dtypes = {
    'JOBCANDIDATE_JOBCANDIDATE_JobCandidateID': Integer,
    'JOBCANDIDATE_JOBCANDIDATE_BusinessEntityID': Integer,
    'JOBCANDIDATE_JOBCANDIDATE_Resume': XML
}

prepare_and_insert(jobcandidates, jobcandidates_dtypes, 'JobCandidate',united_outdoors_engine, { 'JOBCANDIDATE_JOBCANDIDATE_BusinessEntityID' : businessentities_nk_sk_dict})

### EmployeeDepartmentHistory

In [None]:
employeedepartmenthistories_dtypes = {
    'EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_BusinessEntityID': Integer,
    'EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_DepartmentID': Integer,
    'EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_ShiftID': Integer,
    'EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_StartDate': DATE,
    'EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_EndDate': DATE
}

prepare_and_insert(employeedepartmenthistories, employeedepartmenthistories_dtypes, 'EmployeeDepartmentHistory',united_outdoors_engine, { 'EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_BusinessEntityID' : businessentities_nk_sk_dict, 'EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_DepartmentID' : departments_nk_sk_dict, 'EMPLOYEEDEPARTMENTHISTORY_EMPLOYEEDEPARTMENTHISTORY_ShiftID' : {}})

### EmployeePayHistory

In [None]:
employeepayhistories_dtypes = {
    'EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_BusinessEntityID': Integer,
    'EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_RateChangeDate': DATE,
    'EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_Rate': MONEY,
    'EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_PayFrequency': Integer
}

prepare_and_insert(employeepayhistories, employeepayhistories_dtypes, 'EmployeePayHistory',united_outdoors_engine, { 'EMPLOYEEPAYHISTORY_EMPLOYEEPAYHISTORY_BusinessEntityID' : businessentities_nk_sk_dict})

### Shift

In [None]:
shifts_dtypes = {
    'SHIFT_SHIFT_ShiftID': Integer,
    'SHIFT_SHIFT_Name': String(100),
    'SHIFT_SHIFT_StartTime': TIME,
    'SHIFT_SHIFT_EndTime': TIME
}

prepare_and_insert(shifts, shifts_dtypes, 'Shift',united_outdoors_engine)

### SalesPerson

In [None]:
salespeople_dtypes = {
    'SALESPERSON_SALESPERSON_BusinessEntityID': Integer,
    'SALESPERSON_SALESPERSON_TerritoryID': Integer,
    'SALESPERSON_SALESPERSON_SalesQuota': MONEY,
    'SALESPERSON_SALESPERSON_Bonus': MONEY,
    'SALESPERSON_SALESPERSON_CommissionPct': DECIMAL(8,4),
    'SALESPERSON_SALESPERSON_SalesYTD': MONEY,
    'SALESPERSON_SALESPERSON_SalesLastYear': MONEY
}

prepare_and_insert(salespeople, salespeople_dtypes, 'SalesPerson',united_outdoors_engine, { 'SALESPERSON_SALESPERSON_BusinessEntityID' : businessentities_nk_sk_dict, 'SALESPERSON_SALESPERSON_TerritoryID' : {}})

### ProductVendor

In [None]:
productvendors_dtypes = {
    'PRODUCTVENDOR_PRODUCTVENDOR_ProductID': Integer,
    'PRODUCTVENDOR_PRODUCTVENDOR_BusinessEntityID': Integer,
    'PRODUCTVENDOR_PRODUCTVENDOR_AverageLeadTime': Integer,
    'PRODUCTVENDOR_PRODUCTVENDOR_StandardPrice': MONEY,
    'PRODUCTVENDOR_PRODUCTVENDOR_LastReceiptCost': MONEY,
    'PRODUCTVENDOR_PRODUCTVENDOR_LastReceiptDate': DATE,
    'PRODUCTVENDOR_PRODUCTVENDOR_MinOrderQty': Integer,
    'PRODUCTVENDOR_PRODUCTVENDOR_MaxOrderQty': Integer,
    'PRODUCTVENDOR_PRODUCTVENDOR_OnOrderQty': Integer,
    'PRODUCTVENDOR_PRODUCTVENDOR_UnitMeasureCode': CHAR(3)
}

prepare_and_insert(productvendors, productvendors_dtypes, 'ProductVendor', united_outdoors_engine, { 'PRODUCTVENDOR_PRODUCTVENDOR_ProductID' : {}, 'PRODUCTVENDOR_PRODUCTVENDOR_BusinessEntityID' : businessentities_nk_sk_dict, 'PRODUCTVENDOR_PRODUCTVENDOR_UnitMeasureCode' : {}})

### CustomerDemographics

In [None]:
customerdemographics_dtypes = {
    'CUSTOMERDEMOGRAPHIC_CUSTOMERDEMOGRAPHICS_CustomerTypeID': NVARCHAR(10),
    'CUSTOMERDEMOGRAPHIC_CUSTOMERDEMOGRAPHICS_CustomerDesc': String(100)
}

customerdemographics_nk_sk_dict = prepare_and_insert_return_sk(customerdemographics, customerdemographics_dtypes, 'CustomerDemographic',united_outdoors_engine, 'CUSTOMERDEMOGRAPHIC_CUSTOMERDEMOGRAPHICS_CustomerTypeID')

### CustomerCustomerDemo

In [None]:
customercustomerdemos_dtypes = {
    'CUSTOMERCUSTOMERDEMO_CUSTOMERCUSTOMERDEMO_CustomerID': NVARCHAR(10),
    'CUSTOMERCUSTOMERDEMO_CUSTOMERCUSTOMERDEMO_CustomerTypeID': NVARCHAR(10)
}

# TODO how to handle fk? the nk is nvarchar, not an integer

prepare_and_insert(customercustomerdemos, customercustomerdemos_dtypes, 'CustomerCustomerDemo',united_outdoors_engine, { 'CUSTOMERCUSTOMERDEMO_CUSTOMERCUSTOMERDEMO_CustomerTypeID' : customerdemographics_nk_sk_dict, 'CUSTOMERCUSTOMERDEMO_CUSTOMERCUSTOMERDEMO_CustomerID' : {}})

### SalesTerritoryHistory

In [None]:
salesterritoryhistories_dtypes = {
    'SALESTERRITORYHISTORY_SALESTERRITORYHISTORY_BusinessEntityID': Integer,
    'SALESTERRITORYHISTORY_SALESTERRITORYHISTORY_TerritoryID': Integer,
    'SALESTERRITORYHISTORY_SALESTERRITORYHISTORY_StartDate': DATE,
    'SALESTERRITORYHISTORY_SALESTERRITORYHISTORY_EndDate': DATE
}

prepare_and_insert(salesterritoryhistories, salesterritoryhistories_dtypes, 'SalesTerritoryHistory',united_outdoors_engine, { 'SALESTERRITORYHISTORY_SALESTERRITORYHISTORY_BusinessEntityID' : businessentities_nk_sk_dict, 'SALESTERRITORYHISTORY_SALESTERRITORYHISTORY_TerritoryID' : {}})

### ProductListPriceHistory

In [None]:
productlistpricehistories_dtypes = {
    'PRODUCTLISTPRICEHISTORY_PRODUCTLISTPRICEHISTORY_ProductID': Integer,
    'PRODUCTLISTPRICEHISTORY_PRODUCTLISTPRICEHISTORY_StartDate': DATE,
    'PRODUCTLISTPRICEHISTORY_PRODUCTLISTPRICEHISTORY_EndDate': DATE,
    'PRODUCTLISTPRICEHISTORY_PRODUCTLISTPRICEHISTORY_ListPrice': MONEY
}

prepare_and_insert(productlistpricehistories, productlistpricehistories_dtypes, 'ProductListPriceHistory',united_outdoors_engine, { 'PRODUCTLISTPRICEHISTORY_PRODUCTLISTPRICEHISTORY_ProductID' : {}})

### ProductCostHistory

In [None]:
productcosthistories_dtypes = {
    'PRODUCTCOSTHISTORY_PRODUCTCOSTHISTORY_ProductID': Integer,
    'PRODUCTCOSTHISTORY_PRODUCTCOSTHISTORY_StartDate': DATE,
    'PRODUCTCOSTHISTORY_PRODUCTCOSTHISTORY_EndDate': DATE,
    'PRODUCTCOSTHISTORY_PRODUCTCOSTHISTORY_StandardCost': MONEY
}

prepare_and_insert(productcosthistories, productcosthistories_dtypes, 'ProductCostHistory', united_outdoors_engine, { 'PRODUCTCOSTHISTORY_PRODUCTCOSTHISTORY_ProductID' : {}})

### ShoppingCartItem

In [None]:
shoppingcartitems_dtypes = {
    'SHOPPINGCARTITEM_SHOPPINGCARTITEM_ShoppingCartItemID': Integer,
    'SHOPPINGCARTITEM_SHOPPINGCARTITEM_ShoppingCartID': Integer,
    'SHOPPINGCARTITEM_SHOPPINGCARTITEM_ProductID': Integer,
    'SHOPPINGCARTITEM_SHOPPINGCARTITEM_Quantity': Integer,
    'SHOPPINGCARTITEM_SHOPPINGCARTITEM_DateCreated': DATE
}

prepare_and_insert(shoppingcartitems, shoppingcartitems_dtypes, 'ShoppingCartItem', united_outdoors_engine, { 'SHOPPINGCARTITEM_SHOPPINGCARTITEM_ProductID' : {}, 'SHOPPINGCARTITEM_SHOPPINGCARTITEM_ShoppingCartID' : {}})

### SalesPersonQuotaHistory

In [None]:
salespersonquotahistories_dtypes = {
    'SALESPERSONQUOTAHISTORY_SALESPERSONQUOTAHISTORY_BusinessEntityID': Integer,
    'SALESPERSONQUOTAHISTORY_SALESPERSONQUOTAHISTORY_QuotaDate': DATE,
    'SALESPERSONQUOTAHISTORY_SALESPERSONQUOTAHISTORY_SalesQuota': MONEY
}

prepare_and_insert(salespersonquotahistories, salespersonquotahistories_dtypes, 'SalesPersonQuotaHistory', united_outdoors_engine, { 'SALESPERSONQUOTAHISTORY_SALESPERSONQUOTAHISTORY_BusinessEntityID' : businessentities_nk_sk_dict})

In [148]:
regions_dtypes = {
    'RegionID': Integer,
    'RegionName': VARCHAR(10),
    'StateProvinceID': Integer,
    'StateProvinceCode': VARCHAR(10),
    'CountryRegionCode': CHAR(2),
    'IsOnlyStateProvinceFlag': BIT,
    'Name': VARCHAR(50),
    'TerritoryID': Integer,
    'ModifiedDate': DATE
}

regions['StateProvinceCode'] = regions['StateProvinceCode'].astype(str)

prepare_and_insert(regions, regions_dtypes, 'Region', united_outdoors_engine)

Inserting data into table: Region with chunk size: 200


### Customers

In [None]:
customers_dtypes = {
    'CUSTOMER_CUSTOMERS_ID': Integer,
    'CUSTOMER_CUSTOMER_PersonID': Integer,
    'CUSTOMER_CUSTOMER_StoreID': Integer,
    'CUSTOMER_CUSTOMER_TerritoryID': Integer,
    'CUSTOMER_CUSTOMER_AccountNumber': CHAR(10),
    'CUSTOMER_CUSTOMERS_CompanyName': NVARCHAR(100),
    'CUSTOMER_CUSTOMERS_ContactName': NVARCHAR(255),
    'CUSTOMER_CUSTOMERS_ContactTitle': NVARCHAR(100),
    'CUSTOMER_CUSTOMERS_Address': NVARCHAR(255),
    'CUSTOMER_CUSTOMERS_City': NVARCHAR(100),
    'CUSTOMER_CUSTOMERS_Region': NVARCHAR(50),
    'CUSTOMER_CUSTOMERS_PostalCode': NVARCHAR(20),
    'CUSTOMER_CUSTOMERS_Country': NVARCHAR(150),
    'CUSTOMER_CUSTOMERS_Phone': NVARCHAR(24),
    'CUSTOMER_CUSTOMERS_Fax': NVARCHAR(24),
    'CUSTOMER_CUSTOMER_Fname': NVARCHAR(255),
    'CUSTOMER_CUSTOMER_Lname': NVARCHAR(255),
    'CUSTOMER_CUSTOMER_State': CHAR(2)
}

customers_nk_sk_dict = prepare_and_insert_return_sk(customers, customers_dtypes, 'Customer', united_outdoors_engine, 'CUSTOMER_CUSTOMERS_ID', {'CUSTOMER_CUSTOMER_PersonID': {}, 'CUSTOMER_CUSTOMER_StoreID': {}, 'CUSTOMER_CUSTOMER_TerritoryID': {}})

### SpecialOffer

In [None]:
specialoffers_dtypes = {
    'SPECIALOFFER_SPECIALOFFER_ID': Integer,
    'SPECIALOFFER_SPECIALOFFER_Description': String,
    'SPECIALOFFER_SPECIALOFFER_DiscountPCT': Integer,
    'SPECIALOFFER_SPECIALOFFER_Type': NVARCHAR(100),
    'SPECIALOFFER_SPECIALOFFER_Category': NVARCHAR(100),
    'SPECIALOFFER_SPECIALOFFER_StartDate': DATE,
    'SPECIALOFFER_SPECIALOFFER_EndDate': DATE,
    'SPECIALOFFER_SPECIALOFFER_MinQty': Integer,
    'SPECIALOFFER_SPECIALOFFER_MaxQty': Integer,
    'SPECIALOFFER_SPECIALOFFERPRODUCT_ProductID': Integer
}

specialoffer_nk_sk_dict = prepare_and_insert_return_sk(specialoffers, specialoffers_dtypes, 'SpecialOffer', united_outdoors_engine, 'SPECIALOFFER_SPECIALOFFER_ID')

### TransactionHistory

In [None]:
transactionhistories_dtypes = {
    'TRANSACTIONHISTORY_TRANSACTIONHISTORY_TransactionID': Integer,
    'TRANSACTIONHISTORY_TRANSACTIONHISTORY_ProductID': Integer,
    'TRANSACTIONHISTORY_TRANSACTIONHISTORY_ReferenceOrderID': Integer,
    'TRANSACTIONHISTORY_TRANSACTIONHISTORY_ReferenceOrderLineID': Integer,
    'TRANSACTIONHISTORY_TRANSACTIONHISTORY_TransactionDate': DATE,
    'TRANSACTIONHISTORY_TRANSACTIONHISTORY_TransactionType': CHAR(1),
    'TRANSACTIONHISTORY_TRANSACTIONHISTORY_Quantity': Integer,
    'TRANSACTIONHISTORY_TRANSACTIONHISTORY_ActualCost': MONEY,
}

prepare_and_insert(transactionhistories, transactionhistories_dtypes, 'TransactionHistory', united_outdoors_engine, { 'TRANSACTIONHISTORY_TRANSACTIONHISTORY_ProductID' : {}})

### CreditCard

In [None]:
creditcards_dtypes = {
    'CREDITCARD_CREDITCARD_ID': Integer,
    'CREDITCARD_CREDITCARD_CardType': NVARCHAR(100),
    'CREDITCARD_CREDITCARD_CardNumber': BigInteger,
    'CREDITCARD_CREDITCARD_ExpMonth': Integer,
    'CREDITCARD_CREDITCARD_ExpYear': Integer,
    'CREDITCARD_PERSONCREDITCARD_BusinessEntityID': Integer
}

creditcards_nk_sk_dict = prepare_and_insert_return_sk(creditcards, creditcards_dtypes, 'CreditCard', united_outdoors_engine, 'CREDITCARD_CREDITCARD_ID')

### Supplier

In [None]:
suppliers_dtypes = {
    'SUPPLIER_SUPPLIERS_SupplierID': Integer,
    'SUPPLIER_SUPPLIERS_CompanyName': NVARCHAR(100),
    'SUPPLIER_SUPPLIERS_ContactName': NVARCHAR(255),
    'SUPPLIER_SUPPLIERS_ContactTitle': NVARCHAR(100),
    'SUPPLIER_SUPPLIERS_Address': NVARCHAR(150),
    'SUPPLIER_SUPPLIERS_City': NVARCHAR(50),
    'SUPPLIER_SUPPLIERS_Region': NVARCHAR(50),
    'SUPPLIER_SUPPLIERS_PostalCode': NVARCHAR(20),
    'SUPPLIER_SUPPLIERS_Country': NVARCHAR(100),
    'SUPPLIER_SUPPLIERS_Phone': VARCHAR(20),
    'SUPPLIER_SUPPLIERS_Fax': NVARCHAR(30),
    'SUPPLIER_SUPPLIERS_HomePage': NVARCHAR(255)
}

prepare_and_insert(suppliers, suppliers_dtypes, 'Supplier', united_outdoors_engine)

### Currency

In [None]:
currencies_dtypes = {
    'CURRENCY_CURRENCY_CurrencyCode': NVARCHAR(10),
    'CURRENCY_CURRENCY_Name': NVARCHAR(100),
    'CURRENCY_COUNTRYREGIONCURRENCY_CountryRegionCode': NVARCHAR(10),
    'CURRENCY_CURRENCYRATE_CurrencyRateID': Integer,
    'CURRENCY_CURRENCYRATE_CurrencyRateDate': DATE,
    'CURRENCY_CURRENCYRATE_FromCurrencyCode': NVARCHAR(10),
    'CURRENCY_CURRENCYRATE_ToCurrencyCode': NVARCHAR(10),
    'CURRENCY_CURRENCYRATE_AverageRate': Integer,
    'CURRENCY_CURRENCYRATE_EndOfDayRate': Integer
}

prepare_and_insert(currencies, currencies_dtypes, 'Currency', united_outdoors_engine, {'CURRENCY_CURRENCYRATE_CurrencyRateID': {}, 'CURRENCY_CURRENCYRATE_FromCurrencyCode': {}, 'CURRENCY_CURRENCYRATE_ToCurrencyCode': {}})

### Territory

In [None]:
territories_dtypes = {
    'TERRITORY_REGION_RegionID': Integer,
    'TERRITORY_REGION_RegionDescription': String,
    'TERRITORY_TERRITORIES_TerritoryID': Integer,
    'TERRITORY_TERRITORIES_TerritoryDescription': String,
    'TERRITORY_REGION_Region': NVARCHAR(50),
    'TERRITORY_STATE_StateID': NVARCHAR(10),
    'TERRITORY_STATE_StateName': NVARCHAR(100),
    'TERRITORY_STATE_StateCapital': NVARCHAR(50),
    'TERRITORY_STATE_Country': NVARCHAR(50),
    'TERRITORY_COUNTRYREGION_CountryRegionCode': NVARCHAR(10),
    'TERRITORY_COUNTRYREGION_Name': NVARCHAR(100),
    'TERRITORY_STATEPROVINCE_StateProvinceID': Integer,
    'TERRITORY_STATEPROVINCE_StateProvinceCode': NVARCHAR(10),
    'TERRITORY_STATEPROVINCE_IsOnlyStateProvinceFlag': CHAR(1),
    'TERRITORY_STATEPROVINCE_Name': NVARCHAR(100),
    'TERRITORY_SALESTAXRATE_SalesTaxRateID': Integer,
    'TERRITORY_SALESTAXRATE_TaxType': NVARCHAR(50),
    'TERRITORY_SALESTAXRATE_TaxRate': DECIMAL(8,2),
    'TERRITORY_SALESTAXRATE_Name': NVARCHAR(150)
}

prepare_and_insert(territories, territories_dtypes, 'Territory', united_outdoors_engine, {'TERRITORY_TERRITORIES_TerritoryID': {}, 'TERRITORY_STATE_StateID': {}, 'TERRITORY_COUNTRYREGION_CountryRegionCode': {}, 'TERRITORY_STATEPROVINCE_StateProvinceID': {}, 'TERRITORY_SALESTAXRATE_SalesTaxRateID': {}})

### OrderHeader

In [None]:
orderheaders_dtypes = {
    'ORDERHEADER_SALESORDERDETAIL_SalesOrderID': Integer,
    'ORDERHEADER_SALESORDERDETAIL_SalesOrderDetailID': Integer,
    'ORDERHEADER_SALESORDERDETAIL_CarrierTrackingNumber': NVARCHAR(25),
    'ORDERHEADER_SALESORDERDETAIL_OrderQty': Integer,
    'ORDERHEADER_SALESORDERDETAIL_ProductID': Integer,
    'ORDERHEADER_SALESORDERDETAIL_SpecialOfferID': Integer,
    'ORDERHEADER_SALESORDERDETAIL_UnitPrice': DECIMAL(8,2),
    'ORDERHEADER_SALESORDERDETAIL_UnitPriceDiscount': DECIMAL(8,2),
    'ORDERHEADER_SALESORDERDETAIL_LineTotal': NVARCHAR,
    'ORDERHEADER_SALESORDERHEADER_RevisionNumber': Integer,
    'ORDERHEADER_SALESORDERHEADER_OrderDate': DATE,
    'ORDERHEADER_SALESORDERHEADER_DueDate': DATE,
    'ORDERHEADER_SALESORDERHEADER_ShipDate': DATE,
    'ORDERHEADER_SALESORDERHEADER_Status': CHAR(1),
    'ORDERHEADER_SALESORDERHEADER_OnlineOrderFlag': BIT,
    'ORDERHEADER_SALESORDERHEADER_SalesOrderNumber': VARCHAR(100),
    'ORDERHEADER_SALESORDERHEADER_PurchaseOrderNumber': VARCHAR(25),
    'ORDERHEADER_SALESORDERHEADER_AccountNumber': NVARCHAR(15),
    'ORDERHEADER_SALESORDERHEADER_CustomerID': Integer,
    'ORDERHEADER_SALESORDERHEADER_SalesPersonID': Integer,
    'ORDERHEADER_SALESORDERHEADER_TerritoryID': Integer,
    'ORDERHEADER_SALESORDERHEADER_BillToAddress': Integer,
    'ORDERHEADER_SALESORDERHEADER_ShipToAddress': Integer,
    'ORDERHEADER_SALESORDERHEADER_ShipMethodID': Integer,
    'ORDERHEADER_SALESORDERHEADER_CreditCardID': Integer,
    'ORDERHEADER_SALESORDERHEADER_CreditCardApprovalCode': NVARCHAR(15),
    'ORDERHEADER_SALESORDERHEADER_CurrencyRateID': Integer,
    'ORDERHEADER_SALESORDERHEADER_SubTotal': DECIMAL(11,2),
    'ORDERHEADER_SALESORDERHEADER_TaxAmt': DECIMAL(11,2),
    'ORDERHEADER_SALESORDERHEADER_Freight': DECIMAL(11,2),
    'ORDERHEADER_SALESORDERHEADER_TotalDue': DECIMAL(11,2),
    'ORDERHEADER_SALESORDERHEADER_Comment': NVARCHAR(128),
    'ORDERHEADER_SALESORDERHEADERSALESREASON_SalesOrderID': Integer,
    'ORDERHEADER_SALESREASON_SalesReasonID': Integer,
    'ORDERHEADER_SALESREASON_Name': NVARCHAR(100),
    'ORDERHEADER_SALESREASON_ReasonType': NVARCHAR(100),
}

prepare_and_insert(orderheaders, orderheaders_dtypes, 'OrderHeader', united_outdoors_engine, { 'ORDERHEADER_SALESORDERHEADER_CustomerID' : {}, 'ORDERHEADER_SALESORDERDETAIL_SpecialOfferID' : specialoffer_nk_sk_dict, 'ORDERHEADER_SALESORDERHEADER_CreditCardID': creditcards_nk_sk_dict, 'ORDERHEADER_SALESORDERHEADER_CurrencyRateID': {}})

### TransactionHistoryArchive

In [None]:
transactionhistoryarchives_dtypes = {
    'TRANSACTIONHISTORYARCHIVE_TRANSACTIONHISTORYARCHIVE_TransactionID': Integer,
    'TRANSACTIONHISTORYARCHIVE_TRANSACTIONHISTORYARCHIVE_ProductID': Integer,
    'TRANSACTIONHISTORYARCHIVE_TRANSACTIONHISTORYARCHIVE_ReferenceOrderID': Integer,
    'TRANSACTIONHISTORYARCHIVE_TRANSACTIONHISTORYARCHIVE_ReferenceOrderLineID': Integer,
    'TRANSACTIONHISTORYARCHIVE_TRANSACTIONHISTORYARCHIVE_TransactionDate': DATE,
    'TRANSACTIONHISTORYARCHIVE_TRANSACTIONHISTORYARCHIVE_TransactionType': CHAR(1),
    'TRANSACTIONHISTORYARCHIVE_TRANSACTIONHISTORYARCHIVE_Quantity': Integer,
    'TRANSACTIONHISTORYARCHIVE_TRANSACTIONHISTORYARCHIVE_ActualCost': MONEY,
}   

prepare_and_insert(transactionhistoryarchives, transactionhistoryarchives_dtypes, 'TransactionHistoryArchive', united_outdoors_engine, { 'TRANSACTIONHISTORYARCHIVE_TRANSACTIONHISTORYARCHIVE_ProductID' : {}})

### ProductReview

In [None]:
productreviews_dtypes = {
    'PRODUCTREVIEW_PRODUCTREVIEW_ProductReviewID': Integer,
    'PRODUCTREVIEW_PRODUCTREVIEW_ProductID': Integer,
    'PRODUCTREVIEW_PRODUCTREVIEW_ReviewerName': NVARCHAR(50),
    'PRODUCTREVIEW_PRODUCTREVIEW_ReviewDate': DATE,
    'PRODUCTREVIEW_PRODUCTREVIEW_EmailAddress': NVARCHAR(50),
    'PRODUCTREVIEW_PRODUCTREVIEW_Rating': Integer,
    'PRODUCTREVIEW_PRODUCTREVIEW_Comments': NVARCHAR(3850),
}

prepare_and_insert(productreviews, productreviews_dtypes, 'ProductReview', united_outdoors_engine, { 'PRODUCTREVIEW_PRODUCTREVIEW_ProductID' : {}})

### Regions (TODO CHANGE)

In [None]:
regions_dtypes = {
    'RegionID': Integer,
    'RegionName': VARCHAR(10),
    'StateProvinceID': Integer,
    'StateProvinceCode': VARCHAR(10),
    'CountryRegionCode': CHAR(2),
    'IsOnlyStateProvinceFlag': BIT,
    'Name': VARCHAR(50),
    'TerritoryID': Integer,
    'ModifiedDate': DATE
}

regions['StateProvinceCode'] = regions['StateProvinceCode'].astype(str)

prepare_and_insert(regions, regions_dtypes, 'Region', united_outdoors_engine)

In [151]:
date_dtypes = {
    'DATE_ID': Integer,
    'DATE_Date': DATE,
    'DATE_Weekday': VARCHAR(10),
    'DATE_WeekdayNum': Integer,
    'DATE_DayMonth': Integer,
    'DATE_DayOfYear': Integer,
    'DATE_WeekOfYear': Integer,
    'DATE_MonthNum': Integer,
    'DATE_MonthName': VARCHAR(10),
    'DATE_MonthNameShort': CHAR(10),
    'DATE_Quarter': Integer,
    'DATE_Year': Integer,
    'DATE_FirstDayOfMonth': DATE,
    'DATE_LastDayOfMonth': DATE,
    'DATE_YYYYMM': CHAR(10),
    'DATE_WeekendIndr': CHAR(15)
}

prepare_and_insert(date_table, date_dtypes, 'Date', united_outdoors_engine)

Inserting data into table: Date with chunk size: 125


In [152]:
time_dtypes = {
    'TIME_ID': Integer,
    'TIME_Hour': Integer,
    'TIME_Minute': Integer,
    'TIME_HourMinute': VARCHAR(10),
}

prepare_and_insert(time_table, time_dtypes, 'Time', united_outdoors_engine)

Inserting data into table: Time with chunk size: 500


## Constraints
altering the tables to add the (foreign key) constraints

In [None]:
# opening the UnitedOutdoors_constraints.sql file
with open('sql/UnitedOutdoors_constraints.sql', 'r') as file:
    sql_script = file.read()

# Execute the script
split_and_execute_sql_script(sql_script, united_outdoors_engine)

## Closing connections

In [None]:
try:
    united_outdoors_engine.dispose()
    northwind_engine.dispose()
    aenc_engine.dispose()
    adventureworks_engine.dispose()
except OperationalError as e:
        print(f'Error: {e}')

# Time elapsed

In [None]:
end_time = time.time()

print(f'Time elapsed: {end_time - start_time} seconds')