# United outdoors datawarehouse

## Imports

In [294]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, Integer, VARCHAR, CHAR, NVARCHAR, Date, DECIMAL, DATE, String, LargeBinary
from sqlalchemy.dialects.mssql import XML, BIT, MONEY
from sqlalchemy.exc import OperationalError
import urllib
import re

## Database connection details

In [295]:
DB = {
    'servername' : '(local)\\SQLEXPRESS',
    'united_outdoors_database' : 'UnitedOutdoors',
    'northwind_database' : 'Northwind',
    'aenc_database' : 'Aenc',
    'adventureworks_database' : 'AdventureWorks2019',
    'master' : 'master'
}

In [296]:
def create_connection(servername, database):
    params = urllib.parse.quote_plus(f'DRIVER={{SQL Server}};SERVER={servername};DATABASE={database};Trusted_Connection=yes')
    engine = create_engine(f'mssql+pyodbc:///?odbc_connect={params}', use_setinputsizes=False, connect_args={'options': '-c search_path=dbo'}) # setinputsizes needs to be turned off for sql server, idk why but gives errors otherwise
    try:
        conn = engine.connect()
        print(f'Connection to {database} database successful')
        return conn, engine
    except OperationalError as e:
        print(f'Error: {e}')
        return None, None

In [297]:
def split_and_execute_sql_script(script, connection):
    # splitting the script into the database creation and the rest
    commands = re.split(r'GO\n', script)
    # removing all \bGO\b from the commands
    commands = [re.sub(r'\bGO\b', '', command) for command in commands]
    
    # Execute the commands
    for command in commands:
        command = command.strip()
        # Skip if the command is empty or 'GO'
        if not command or command.upper() == 'GO':
            continue
        try:
            connection.connection.execute(command)
            connection.connection.commit()
            #print(f'Command executed: {command}')
        except OperationalError as e:
            print(f'Error: {e} at command: {command}')

In [298]:
def prepare(dataframe, nk_sk_dict=None):
    # replacing the natural keys with the surrogate keys
    if nk_sk_dict:
        for column in nk_sk_dict:
            # Check for duplicate keys
            if len(nk_sk_dict[column]) != len(set(nk_sk_dict[column])):
                raise ValueError(f'Duplicate keys found in nk_sk_dict for column: {column}')
            else:
                print(f'Replacing natural keys with surrogate keys for column: {column}')
                for natural_key in nk_sk_dict[column]:
                    dataframe[column] = dataframe[column].replace(natural_key, nk_sk_dict[column][natural_key])
    
    # replace empty values with None
    dataframe = dataframe.where(pd.notnull(dataframe), None)
    dataframe = dataframe.replace({np.nan: None})
    # stripping all columns with string data
    dataframe = dataframe.map(lambda x: x.strip() if isinstance(x, str) else x)
    # replacing all empty strings with None
    dataframe = dataframe.replace(r'^\s*$', None, regex=True)

In [299]:
def prepare_and_insert(dataframe, dtypes, table_name, nk_sk_dict=None):
    """
    Prepares the dataframe for insertion into the database and inserts it into the database.
        @param dataframe: The dataframe to be inserted into the database
        @param dtypes: The data types of the columns in the dataframe
        @param table_name: The name of the table in the database
        @param nk_sk_dict: A 3d dictionary containing the natural keys and their corresponding surrogate keys, per column (so nk_sk_dict[column][natural_key] = surrogate_key)
    """
       
    prepare(dataframe, nk_sk_dict)
    
    # adding the data to the database
    print(f'Inserting data into table: {table_name}')
    dataframe.to_sql(name=table_name, schema='dbo', con=united_outdoors_engine, if_exists='append', index=False, dtype=dtypes)

In [300]:
def prepare_and_insert_return_sk(dataframe, dtypes, table_name, natural_key_column, nk_sk_dict=None):
    """
    Prepares the dataframe for insertion into the database and inserts it into the database.
        @param dataframe: The dataframe to be inserted into the database
        @param dtypes: The data types of the columns in the dataframe
        @param table_name: The name of the table in the database
        @param natural_key_column: The name of the column containing the natural keys
        @param nk_sk_dict: A 3d dictionary containing the natural keys and their corresponding surrogate keys, per column (so nk_sk_dict[column][natural_key] = surrogate_key)
        @return: A dictionary containing the natural keys and their corresponding surrogate keys
    """
    
    prepare(dataframe, nk_sk_dict)
    
    # adding the data to the database
    dataframe.to_sql(name=table_name, schema='dbo', con=united_outdoors_engine, if_exists='append', index=False, dtype=dtypes)
    
    # making a cleaned dictionary without None/nan values in the natural_key_column
    mask = dataframe[natural_key_column].notnull()
    filtered_dataframe = dataframe[mask]
    
    # getting the natural keys and their corresponding surrogate keys
    # TODO FIX THIS, relying on the dataframe index is not a good idea
    nk_sk_dict = dict(zip(filtered_dataframe[natural_key_column], dataframe.index))
    
    # adding +1 to the index to get the surrogate key
    nk_sk_dict = {k: v + 1 for k, v in nk_sk_dict.items()}
    
    return nk_sk_dict

In [301]:
def prepare_and_update(table_name, connection, nk_sk_dict=None):    
    # adding the data to the database
    print(f'Updating data in table: {table_name}')
    for column in nk_sk_dict:
        for natural_key in nk_sk_dict[column]:
            #print(f'Updating column: {column} with natural key: {natural_key} and surrogate key: {nk_sk_dict[column][natural_key]}')
            # TODO this would not work with updated data in datawarehouse, since this would replace the old and new data
            connection.connection.execute(f'UPDATE {table_name} SET {column} = {nk_sk_dict[column][natural_key]} WHERE {column} = {int(natural_key)}')
    connection.connection.commit()

In [302]:
def drop_modified_date_rowguid(dataframe):
    # dropping all columns with 'rowguid' in their name
    columns_to_drop_mr = dataframe.filter(like='rowguid').columns
    
    # dropping all columns with 'ModifiedDate' in their name
    columns_to_drop_mr = columns_to_drop_mr.append(dataframe.filter(like='ModifiedDate').columns)
    
    # dropping the columns
    dataframe.drop(columns=columns_to_drop_mr, inplace=True)

## Create the UnitedOutdoors datawarehouse

In [303]:
conn, creation_engine = create_connection(DB["servername"], DB["master"])

# Open the SQL script file and read its contents
with open('sql/UnitedOutdoors_creation.sql', 'r') as file:
    sql_script = file.read()

split_and_execute_sql_script(sql_script, conn)

conn.close()
creation_engine.dispose()

Connection to master database successful


## Connecting to the UnitedOutdoors datawarehouse

In [304]:
united_outdoors_conn , united_outdoors_engine = create_connection(DB["servername"], DB["united_outdoors_database"])

Connection to UnitedOutdoors database successful


## Loading the data from the source databases

### Northwind database

#### Connection

In [305]:
northwind_conn, northwind_engine = create_connection(DB["servername"], DB["northwind_database"])

Connection to Northwind database successful


#### Loading data

In [306]:
# Load the data from the source database
northwind_categories = pd.read_sql('SELECT * FROM Categories', northwind_conn)
northwind_customer_customer_demo = pd.read_sql('SELECT * FROM CustomerCustomerDemo', northwind_conn)
northwind_customer_demographics = pd.read_sql('SELECT * FROM CustomerDemographics', northwind_conn)
northwind_customers = pd.read_sql('SELECT * FROM Customers', northwind_conn)
northwind_employees = pd.read_sql('SELECT * FROM Employees', northwind_conn)
northwind_employee_territories = pd.read_sql('SELECT * FROM EmployeeTerritories', northwind_conn)
northwind_order_details = pd.read_sql('SELECT * FROM [Order Details]', northwind_conn)
northwind_orders = pd.read_sql('SELECT * FROM Orders', northwind_conn)
northwind_products = pd.read_sql('SELECT * FROM Products', northwind_conn)
northwind_region = pd.read_sql('SELECT * FROM Region', northwind_conn)
northwind_shippers = pd.read_sql('SELECT * FROM Shippers', northwind_conn)
northwind_suppliers = pd.read_sql('SELECT * FROM Suppliers', northwind_conn)
northwind_territories = pd.read_sql('SELECT * FROM Territories', northwind_conn)

### Aenc database

#### Connection

In [307]:
aenc_conn , aenc_engine = create_connection(DB["servername"], DB["aenc_database"])

Connection to Aenc database successful


#### Loading data

In [308]:
aenc_bonus              = pd.read_sql('SELECT * FROM Bonus', aenc_conn)
aenc_customer           = pd.read_sql('SELECT * FROM Customer', aenc_conn)
aenc_department         = pd.read_sql('SELECT * FROM Department', aenc_conn)
aenc_employee           = pd.read_sql('SELECT * FROM Employee', aenc_conn)
aenc_product            = pd.read_sql('SELECT * FROM Product', aenc_conn)
aenc_region             = pd.read_sql('SELECT * FROM Region', aenc_conn)
aenc_sales_order        = pd.read_sql('SELECT * FROM SalesOrder', aenc_conn)
aenc_sales_order_item   = pd.read_sql('SELECT * FROM SalesOrderItem', aenc_conn)
aenc_state              = pd.read_sql('SELECT * FROM State', aenc_conn)

### AdventureWorks database

#### Connection

In [309]:
adventureworks_conn, adventureworks_engine = create_connection(DB["servername"], DB["adventureworks_database"])

Connection to AdventureWorks2019 database successful


#### Loading data

In [310]:
adventureworks_humanresources_department = pd.read_sql('SELECT * FROM HumanResources.Department', adventureworks_conn)
adventureworks_humanresources_employee = pd.read_sql('SELECT * FROM HumanResources.Employee', adventureworks_conn)
adventureworks_humanresources_employeedepartmenthistory = pd.read_sql('SELECT * FROM HumanResources.EmployeeDepartmentHistory', adventureworks_conn)
adventureworks_humanresources_employeepayhistory = pd.read_sql('SELECT * FROM HumanResources.EmployeePayHistory', adventureworks_conn)
adventureworks_humanresources_jobcandidate = pd.read_sql('SELECT * FROM HumanResources.JobCandidate', adventureworks_conn)
adventureworks_humanresources_shift = pd.read_sql('SELECT * FROM HumanResources.Shift', adventureworks_conn)

In [311]:
adventureworks_person_address = pd.read_sql('SELECT AddressID, AddressLine1, AddressLine2, City, StateProvinceID, PostalCode, CAST(SpatialLocation AS VARCHAR(MAX)) AS SpatialLocation,rowguid, ModifiedDate   FROM Person.Address', adventureworks_conn)
adventureworks_person_address_type = pd.read_sql('SELECT * FROM Person.AddressType', adventureworks_conn)
adventureworks_person_businessentity = pd.read_sql('SELECT * FROM Person.BusinessEntity', adventureworks_conn)
adventureworks_person_businessentityaddress = pd.read_sql('SELECT * FROM Person.BusinessEntityAddress', adventureworks_conn)
adventureworks_person_businessentitycontact = pd.read_sql('SELECT * FROM Person.BusinessEntityContact', adventureworks_conn)
adventureworks_person_contacttype = pd.read_sql('SELECT * FROM Person.ContactType', adventureworks_conn)
adventureworks_person_countryregion = pd.read_sql('SELECT * FROM Person.CountryRegion', adventureworks_conn)
adventureworks_person_emailaddress = pd.read_sql('SELECT * FROM Person.EmailAddress', adventureworks_conn)
adventureworks_person_password = pd.read_sql('SELECT * FROM Person.Password', adventureworks_conn)
adventureworks_person_person = pd.read_sql('SELECT * FROM Person.Person', adventureworks_conn)
adventureworks_person_personphone = pd.read_sql('SELECT * FROM Person.PersonPhone', adventureworks_conn)
adventureworks_person_phonenumbertype = pd.read_sql('SELECT * FROM Person.PhoneNumberType', adventureworks_conn)
adventureworks_person_stateprovince = pd.read_sql('SELECT * FROM Person.StateProvince', adventureworks_conn)

In [312]:
adventureworks_production_bill_of_materials = pd.read_sql('SELECT * FROM Production.BillOfMaterials', adventureworks_conn)
adventureworks_production_culture = pd.read_sql('SELECT * FROM Production.Culture', adventureworks_conn)
adventureworks_production_document = pd.read_sql('SELECT * FROM Production.Document', adventureworks_conn)
adventureworks_production_illustration = pd.read_sql('SELECT * FROM Production.Illustration', adventureworks_conn)
adventureworks_production_location = pd.read_sql('SELECT * FROM Production.Location', adventureworks_conn)
adventureworks_production_product = pd.read_sql('SELECT * FROM Production.Product', adventureworks_conn)
adventureworks_production_productcategory = pd.read_sql('SELECT * FROM Production.ProductCategory', adventureworks_conn)
adventureworks_production_productcosthistory = pd.read_sql('SELECT * FROM Production.ProductCostHistory', adventureworks_conn)
adventureworks_production_productdescription = pd.read_sql('SELECT * FROM Production.ProductDescription', adventureworks_conn)
adventureworks_production_productdocument = pd.read_sql('SELECT * , CAST(DocumentNode AS VARCHAR(MAX)) AS DocumentNodeString  FROM Production.ProductDocument', adventureworks_conn)
adventureworks_production_productinventory = pd.read_sql('SELECT * FROM Production.ProductInventory', adventureworks_conn)
adventureworks_production_productlistpricehistory = pd.read_sql('SELECT * FROM Production.ProductListPriceHistory', adventureworks_conn)
adventureworks_production_productmodel = pd.read_sql('SELECT * FROM Production.ProductModel', adventureworks_conn)
adventureworks_production_productmodelillustration = pd.read_sql('SELECT * FROM Production.ProductModelIllustration', adventureworks_conn)
adventureworks_production_productmodelproductdescriptionculture = pd.read_sql('SELECT * FROM Production.ProductModelProductDescriptionCulture', adventureworks_conn)
adventureworks_production_productphoto = pd.read_sql('SELECT ProductPhotoID, CONVERT(VARCHAR(MAX),ThumbNailPhoto, 1) as ThumbNailPhotoHexString, ThumbNailPhotoFileName, CONVERT(VARCHAR(MAX), LargePhoto, 1) as LargePhotoHexString, LargePhotoFileName, ModifiedDate FROM Production.ProductPhoto', adventureworks_conn)
adventureworks_production_productproductphoto = pd.read_sql('SELECT * FROM Production.ProductProductPhoto', adventureworks_conn)
adventureworks_production_productreview = pd.read_sql('SELECT * FROM Production.ProductReview', adventureworks_conn)
adventureworks_production_productsubcategory = pd.read_sql('SELECT * FROM Production.ProductSubcategory', adventureworks_conn)
adventureworks_production_scrapreason = pd.read_sql('SELECT * FROM Production.ScrapReason', adventureworks_conn)
adventureworks_production_transactionhistory = pd.read_sql('SELECT * FROM Production.TransactionHistory', adventureworks_conn)
adventureworks_production_transactionhistoryarchive = pd.read_sql('SELECT * FROM Production.TransactionHistoryArchive', adventureworks_conn)
adventureworks_production_unitmeasure = pd.read_sql('SELECT * FROM Production.UnitMeasure', adventureworks_conn)
adventureworks_production_workorder = pd.read_sql('SELECT * FROM Production.WorkOrder', adventureworks_conn)
adventureworks_production_workorderrouting = pd.read_sql('SELECT * FROM Production.WorkOrderRouting', adventureworks_conn)

In [313]:
adventureworks_purchasing_productvendor = pd.read_sql('SELECT * FROM Purchasing.ProductVendor', adventureworks_conn)
adventureworks_purchasing_purchaseorderdetail = pd.read_sql('SELECT * FROM Purchasing.PurchaseOrderDetail', adventureworks_conn)
adventureworks_purchasing_purchaseorderheader = pd.read_sql('SELECT * FROM Purchasing.PurchaseOrderHeader', adventureworks_conn)
adventureworks_purchasing_shipmethod = pd.read_sql('SELECT * FROM Purchasing.ShipMethod', adventureworks_conn)
adventureworks_purchasing_vendor = pd.read_sql('SELECT * FROM Purchasing.Vendor', adventureworks_conn)

In [314]:
adventureworks_sales_countryregioncurrency = pd.read_sql('SELECT * FROM Sales.CountryRegionCurrency', adventureworks_conn)
adventureworks_sales_creditcard = pd.read_sql('SELECT * FROM Sales.CreditCard', adventureworks_conn)
adventureworks_sales_currency = pd.read_sql('SELECT * FROM Sales.Currency', adventureworks_conn)
adventureworks_sales_currencyrate = pd.read_sql('SELECT * FROM Sales.CurrencyRate', adventureworks_conn)
adventureworks_sales_customer = pd.read_sql('SELECT * FROM Sales.Customer', adventureworks_conn)
adventureworks_sales_personcreditcard = pd.read_sql('SELECT * FROM Sales.PersonCreditCard', adventureworks_conn)
adventureworks_sales_salesorderdetail = pd.read_sql('SELECT * FROM Sales.SalesOrderDetail', adventureworks_conn)
adventureworks_sales_salesorderheader = pd.read_sql('SELECT * FROM Sales.SalesOrderHeader', adventureworks_conn)
adventureworks_sales_salesorderhearerrsaleseason = pd.read_sql('SELECT * FROM Sales.SalesOrderHeaderSalesReason', adventureworks_conn)
adventureworks_sales_salesperson = pd.read_sql('SELECT * FROM Sales.SalesPerson', adventureworks_conn)
adventureworks_sales_salespersonquotahistory = pd.read_sql('SELECT * FROM Sales.SalesPersonQuotaHistory', adventureworks_conn)
adventureworks_sales_salesreason = pd.read_sql('SELECT * FROM Sales.SalesReason', adventureworks_conn)
adventureworks_sales_salestaxrate = pd.read_sql('SELECT * FROM Sales.SalesTaxRate', adventureworks_conn)
adventureworks_sales_salesterritory = pd.read_sql('SELECT * FROM Sales.SalesTerritory', adventureworks_conn)
adventureworks_sales_salesterritoryhistory = pd.read_sql('SELECT * FROM Sales.SalesTerritoryHistory', adventureworks_conn)
adventureworks_sales_shoppingcartitem = pd.read_sql('SELECT * FROM Sales.ShoppingCartItem', adventureworks_conn)
adventureworks_sales_specialoffer = pd.read_sql('SELECT * FROM Sales.SpecialOffer', adventureworks_conn)
adventureworks_sales_specialofferproduct = pd.read_sql('SELECT * FROM Sales.SpecialOfferProduct', adventureworks_conn)
adventureworks_sales_store = pd.read_sql('SELECT * FROM Sales.Store', adventureworks_conn)

## Combining the data
ORDER MATTERS, CAUSE SURROGATE KEYS
SK STILL NEEDS TO BE DONE

### Departments
Combining aenc and adventureworks department data

In [315]:
# adding DEPARTMENT_source_database columns to the dataframes
aenc_department['DEPARTMENT_source_database'] = 'aenc'
adventureworks_humanresources_department['DEPARTMENT_source_database'] = 'adventureworks'

# combining all department data
departments = pd.concat([aenc_department, adventureworks_humanresources_department], ignore_index=True)

# combining name and department name columns to create a name column
departments['DEPARTMENT_DEPARTMENT_dept_name '] = departments['Name'].combine_first(departments['dept_name'])
# combining dept_id and DepartmentID columns
departments['DEPARTMENT_DEPARTMENT_dept_id'] = departments['dept_id'].combine_first(departments['DepartmentID'])

# dropping the redundant columns
drop_modified_date_rowguid(departments)
departments.drop(columns=['dept_id', 'Name', 'dept_name', 'DepartmentID'], inplace=True)

# renaming the remaining columns
departments.rename(columns={'dept_head_id': 'DEPARTMENT_DEPARTMENT_dept_head_id', 'GroupName': 'DEPARTMENT_DEPARTMENT_group_name'}, inplace=True)

departments

  departments['DEPARTMENT_DEPARTMENT_dept_id'] = departments['dept_id'].combine_first(departments['DepartmentID'])


Unnamed: 0,DEPARTMENT_DEPARTMENT_dept_head_id,DEPARTMENT_source_database,DEPARTMENT_DEPARTMENT_group_name,DEPARTMENT_DEPARTMENT_dept_name,DEPARTMENT_DEPARTMENT_dept_id
0,,adventureworks,Research and Development,Engineering,1.0
1,,adventureworks,Research and Development,Tool Design,2.0
2,,adventureworks,Sales and Marketing,Sales,3.0
3,,adventureworks,Sales and Marketing,Marketing,4.0
4,,adventureworks,Inventory Management,Purchasing,5.0
5,,adventureworks,Research and Development,Research and Development,6.0
6,,adventureworks,Manufacturing,Production,7.0
7,,adventureworks,Manufacturing,Production Control,8.0
8,,adventureworks,Executive General and Administration,Human Resources,9.0
9,,adventureworks,Executive General and Administration,Finance,10.0


### Employee

In [316]:
# merge northwind with territory and employee
employee1 = pd.merge(northwind_employees, northwind_employee_territories, on='EmployeeID')
employee1 = employee1.drop(columns=['EmployeeID', 'LastName', 'FirstName', 'BirthDate', 'Address', 'City', 'Region', 'PostalCode', 'Country', 'HomePhone', 'ReportsTo'])

# merge employee from aenc with bonus
employee2 = pd.merge(aenc_employee, aenc_bonus, on='emp_id')

# after that concat together
employees = pd.concat([employee1, employee2], ignore_index=True)

# rename
employees.rename(columns={
    'emp_id': 'EMPLOYEE_EMPLOYEE_EmployeeID', 
    'dept_id': 'EMPLOYEE_EMPLOYEE_DeptID', 
    'manager_id': 'EMPLOYEE_EMPLOYEE_ManagerID', 
    'territory_id' : 'EMPLOYEE_EMPLOYEETERRITORIES_TerritoryID',
    'emp_fname' : 'EMPLOYEE_EMPLOYEE_Emp_Fname',
    'emp_lname' : 'EMPLOYEE_EMPLOYEE_Emp_Lname',
    'street' : 'EMPLOYEE_EMPLOYEE_Street',
    'city' : 'EMPLOYEE_EMPLOYEE_City',
    'state' : 'EMPLOYEE_EMPLOYEE_State',
    'zip_code' : 'EMPLOYEE_EMPLOYEE_Zip_Code',
    'phone' : 'EMPLOYEE_EMPLOYEE_Phone',
    'status' : 'EMPLOYEE_EMPLOYEE_Status',
    'ss_number' : 'EMPLOYEE_EMPLOYEE_SS_Number',
    'salary' : 'EMPLOYEE_EMPLOYEE_Salary',
    'start_date' : 'EMPLOYEE_EMPLOYEE_Start_Date',
    'termination_date' : 'EMPLOYEE_EMPLOYEE_Termination',
    'birth_date' : 'EMPLOYEE_EMPLOYEE_Birth_Date',
    'bene_health_ins' : 'EMPLOYEE_EMPLOYEE_Bene_Health_Ins',
    'bene_life_ins' : 'EMPLOYEE_EMPLOYEE_Bene_Life_Ins',
    'bene_day_care' : 'EMPLOYEE_EMPLOYEE_Bene_Day_Care',
    'sex' : 'EMPLOYEE_EMPLOYEE_Sex',
    'bonus_date' : 'EMPLOYEE_BONUS_Bonus_Date',
    'bonus_amount' : 'EMPLOYEE_BONUS_Bonus_Amount',
    'Title' : 'EMPLOYEE_EMPLOYEES_Title',
    'TitleOfCourtesy' : 'EMPLOYEE_EMPLOYEES_TitleOfCourtesy',
    'HireDate' : 'EMPLOYEE_EMPLOYEES_HireDate',
    'HomePhone' : 'EMPLOYEE_EMPLOYEES_HomePhone',
    'Extension' : 'EMPLOYEE_EMPLOYEES_Extension',
    'Photo' : 'EMPLOYEE_EMPLOYEES_Photo',
    'PhotoPath' : 'EMPLOYEE_EMPLOYEES_PhotoPath',
    'Notes' : 'EMPLOYEE_EMPLOYEES_Notes'
}, inplace=True)

desired_columns_order = ['EMPLOYEE_EMPLOYEE_EmployeeID', 'EMPLOYEE_EMPLOYEE_DeptID', 'EMPLOYEE_EMPLOYEE_ManagerID', 'EMPLOYEE_EMPLOYEETERRITORIES_TerritoryID','EMPLOYEE_EMPLOYEE_Emp_Fname','EMPLOYEE_EMPLOYEE_Emp_Lname','EMPLOYEE_EMPLOYEE_Street','EMPLOYEE_EMPLOYEE_City','EMPLOYEE_EMPLOYEE_State','EMPLOYEE_EMPLOYEE_Zip_Code','EMPLOYEE_EMPLOYEE_Phone','EMPLOYEE_EMPLOYEE_Status','EMPLOYEE_EMPLOYEE_SS_Number','EMPLOYEE_EMPLOYEE_Salary','EMPLOYEE_EMPLOYEE_Start_Date','EMPLOYEE_EMPLOYEE_Termination','EMPLOYEE_EMPLOYEE_Birth_Date','EMPLOYEE_EMPLOYEE_Bene_Health_Ins','EMPLOYEE_EMPLOYEE_Bene_Life_Ins','EMPLOYEE_EMPLOYEE_Bene_Day_Care','EMPLOYEE_EMPLOYEE_Sex','EMPLOYEE_BONUS_Bonus_Date','EMPLOYEE_BONUS_Bonus_Amount','EMPLOYEE_EMPLOYEES_Title','EMPLOYEE_EMPLOYEES_TitleOfCourtesy','EMPLOYEE_EMPLOYEES_HireDate','EMPLOYEE_EMPLOYEES_HomePhone','EMPLOYEE_EMPLOYEES_Extension','EMPLOYEE_EMPLOYEES_Photo','EMPLOYEE_EMPLOYEES_PhotoPath','EMPLOYEE_EMPLOYEES_Notes']

#order
employees = employees.reindex(columns=desired_columns_order)

employees

Unnamed: 0,EMPLOYEE_EMPLOYEE_EmployeeID,EMPLOYEE_EMPLOYEE_DeptID,EMPLOYEE_EMPLOYEE_ManagerID,EMPLOYEE_EMPLOYEETERRITORIES_TerritoryID,EMPLOYEE_EMPLOYEE_Emp_Fname,EMPLOYEE_EMPLOYEE_Emp_Lname,EMPLOYEE_EMPLOYEE_Street,EMPLOYEE_EMPLOYEE_City,EMPLOYEE_EMPLOYEE_State,EMPLOYEE_EMPLOYEE_Zip_Code,...,EMPLOYEE_BONUS_Bonus_Date,EMPLOYEE_BONUS_Bonus_Amount,EMPLOYEE_EMPLOYEES_Title,EMPLOYEE_EMPLOYEES_TitleOfCourtesy,EMPLOYEE_EMPLOYEES_HireDate,EMPLOYEE_EMPLOYEES_HomePhone,EMPLOYEE_EMPLOYEES_Extension,EMPLOYEE_EMPLOYEES_Photo,EMPLOYEE_EMPLOYEES_PhotoPath,EMPLOYEE_EMPLOYEES_Notes
0,,,,,,,,,,,...,,,Sales Representative,Ms.,1992-05-01,,5467,b'\x15\x1c/\x00\x02\x00\x00\x00\r\x00\x0e\x00\...,http://accweb/emmployees/davolio.bmp,Education includes a BA in psychology from Col...
1,,,,,,,,,,,...,,,Sales Representative,Ms.,1992-05-01,,5467,b'\x15\x1c/\x00\x02\x00\x00\x00\r\x00\x0e\x00\...,http://accweb/emmployees/davolio.bmp,Education includes a BA in psychology from Col...
2,,,,,,,,,,,...,,,"Vice President, Sales",Dr.,1992-08-14,,3457,b'\x15\x1c/\x00\x02\x00\x00\x00\r\x00\x0e\x00\...,http://accweb/emmployees/fuller.bmp,Andrew received his BTS commercial in 1974 and...
3,,,,,,,,,,,...,,,"Vice President, Sales",Dr.,1992-08-14,,3457,b'\x15\x1c/\x00\x02\x00\x00\x00\r\x00\x0e\x00\...,http://accweb/emmployees/fuller.bmp,Andrew received his BTS commercial in 1974 and...
4,,,,,,,,,,,...,,,"Vice President, Sales",Dr.,1992-08-14,,3457,b'\x15\x1c/\x00\x02\x00\x00\x00\r\x00\x0e\x00\...,http://accweb/emmployees/fuller.bmp,Andrew received his BTS commercial in 1974 and...
5,,,,,,,,,,,...,,,"Vice President, Sales",Dr.,1992-08-14,,3457,b'\x15\x1c/\x00\x02\x00\x00\x00\r\x00\x0e\x00\...,http://accweb/emmployees/fuller.bmp,Andrew received his BTS commercial in 1974 and...
6,,,,,,,,,,,...,,,"Vice President, Sales",Dr.,1992-08-14,,3457,b'\x15\x1c/\x00\x02\x00\x00\x00\r\x00\x0e\x00\...,http://accweb/emmployees/fuller.bmp,Andrew received his BTS commercial in 1974 and...
7,,,,,,,,,,,...,,,"Vice President, Sales",Dr.,1992-08-14,,3457,b'\x15\x1c/\x00\x02\x00\x00\x00\r\x00\x0e\x00\...,http://accweb/emmployees/fuller.bmp,Andrew received his BTS commercial in 1974 and...
8,,,,,,,,,,,...,,,"Vice President, Sales",Dr.,1992-08-14,,3457,b'\x15\x1c/\x00\x02\x00\x00\x00\r\x00\x0e\x00\...,http://accweb/emmployees/fuller.bmp,Andrew received his BTS commercial in 1974 and...
9,,,,,,,,,,,...,,,Sales Representative,Ms.,1992-04-01,,3355,b'\x15\x1c/\x00\x02\x00\x00\x00\r\x00\x0e\x00\...,http://accweb/emmployees/leverling.bmp,Janet has a BS degree in chemistry from Boston...


### BusinessEntities

In [317]:
# Combining the adventureworks Person.BusinessEntity, BusinessEntityContact and ContactType data
businessentities= pd.merge(adventureworks_person_businessentity, adventureworks_person_businessentitycontact, left_on='BusinessEntityID', right_on='BusinessEntityID', suffixes=('_person_businessentity', '_businessentitycontact'), how="outer")

businessentities = pd.merge(businessentities, adventureworks_person_contacttype, left_on='ContactTypeID', right_on='ContactTypeID', suffixes=('', '_contacttype'), how="outer")

# dropping the modified date and rowguid columns
drop_modified_date_rowguid(businessentities)

# renaming the columns
businessentities.rename(columns={'BusinessEntityID': 'BUSINESSENTITY_BUSINESSENTITY_BusinessEntityID', 'PersonID': 'BUSINESSENTITY_BUSINESSENTITYCONTACT_PersonID', 'ContactTypeID': 'BUSINESSENTITY_CONTACTTYPE_ContactTypeID', 'Name' : 'BUSINESSENTITY_CONTACTTYPE_Name'}, inplace=True)

businessentities

Unnamed: 0,BUSINESSENTITY_BUSINESSENTITY_BusinessEntityID,BUSINESSENTITY_BUSINESSENTITYCONTACT_PersonID,BUSINESSENTITY_CONTACTTYPE_ContactTypeID,BUSINESSENTITY_CONTACTTYPE_Name
0,,,1.0,Accounting Manager
1,1510.0,1509.0,2.0,Assistant Sales Agent
2,1518.0,1517.0,2.0,Assistant Sales Agent
3,1522.0,1521.0,2.0,Assistant Sales Agent
4,1528.0,1527.0,2.0,Assistant Sales Agent
...,...,...,...,...
20889,20773.0,,,
20890,20774.0,,,
20891,20775.0,,,
20892,20776.0,,,


### BusinessEntityAddresses

In [318]:
# Combining the adventureworks Person.BusinessEntityAddress, Address and AddressType data
businessentityaddresses = pd.merge(adventureworks_person_businessentityaddress, adventureworks_person_address, left_on='AddressID', right_on='AddressID', suffixes=('', '_address'), how="outer")

businessentityaddresses = pd.merge(businessentityaddresses, adventureworks_person_address_type, left_on='AddressTypeID', right_on='AddressTypeID', suffixes=('', '_address_type'), how="outer")


# dropping the modified date and rowguid columns
drop_modified_date_rowguid(businessentityaddresses)

# renaming the columns
businessentityaddresses.rename(columns={'BusinessEntityID': 'BUSINESSENTITYADDRESS_BUSINESSENTITYADDRESS_BusinessEntityID', 'AddressID': 'BUSINESSENTITYADDRESS_ADDRESS_AddressID', 'AddressTypeID': 'BUSINESSENTITYADDRESS_ADDRESSTYPE_AddressTypeID', 'AddressLine1' : 'BUSINESSENTITYADDRESS_ADDRESS_AddressLine1', 'AddressLine2' : 'BUSINESSENTITYADDRESS_ADDRESS_AddressLine2', 'City' : 'BUSINESSENTITYADDRESS_ADDRESS_City', 'StateProvinceID' : 'BUSINESSENTITYADDRESS_ADDRESS_StateProvinceID', 'PostalCode' : 'BUSINESSENTITYADDRESS_ADDRESS_POSTALCODE', 'SpatialLocation' : 'BUSINESSENTITYADDRESS_ADDRESS_SpatialLocation', 'Name' : 'BUSINESSENTITYADDRESS_ADDRESSTYPE_Name'}, inplace=True)

businessentityaddresses

Unnamed: 0,BUSINESSENTITYADDRESS_BUSINESSENTITYADDRESS_BusinessEntityID,BUSINESSENTITYADDRESS_ADDRESS_AddressID,BUSINESSENTITYADDRESS_ADDRESSTYPE_AddressTypeID,BUSINESSENTITYADDRESS_ADDRESS_AddressLine1,BUSINESSENTITYADDRESS_ADDRESS_AddressLine2,BUSINESSENTITYADDRESS_ADDRESS_City,BUSINESSENTITYADDRESS_ADDRESS_StateProvinceID,BUSINESSENTITYADDRESS_ADDRESS_POSTALCODE,BUSINESSENTITYADDRESS_ADDRESS_SpatialLocation,BUSINESSENTITYADDRESS_ADDRESSTYPE_Name
0,,,1,,,,,,,Billing
1,12.0,1.0,2,1970 Napa Ct.,,Bothell,79.0,98011,POINT (-122.164644615406 47.7869921906598),Home
2,123.0,2.0,2,9833 Mt. Dias Blv.,,Bothell,79.0,98011,POINT (-122.250185528911 47.6867097047995),Home
3,285.0,3.0,2,7484 Roundtree Drive,,Bothell,79.0,98011,POINT (-122.274625789912 47.7631154083121),Home
4,251.0,4.0,2,9539 Glenside Dr,,Bothell,79.0,98011,POINT (-122.335726442416 47.7392386259644),Home
...,...,...,...,...,...,...,...,...,...,...
19612,1126.0,11382.0,5,99 Front Street,,Minneapolis,36.0,55402,POINT (-93.3841566986425 44.9534539399322),Shipping
19613,8892.0,11383.0,5,1010 Maple,,Baltimore,32.0,21201,POINT (-76.4103239443251 39.357562662092),Shipping
19614,5479.0,11384.0,5,500 35th Ave NE,,Los Angeles,9.0,90012,POINT (-118.29597673611 34.1025534611904),Shipping
19615,16746.0,11385.0,5,9 Olive Way,,Seattle,79.0,98104,POINT (-122.442024277584 47.5435883766252),Shipping


### People

In [319]:
# combining person data from adventureworks HumanResources.Person, PersonPhone, PhoneNumberType, EmailAddress and Password
people = pd.merge(adventureworks_person_person, adventureworks_person_personphone, left_on='BusinessEntityID', right_on='BusinessEntityID', suffixes= ('_person', '_personphone'), how="outer")

people = pd.merge(people, adventureworks_person_phonenumbertype, left_on='PhoneNumberTypeID', right_on='PhoneNumberTypeID', suffixes=('', '_phonenumbertype'), how="outer")

people = pd.merge(people, adventureworks_person_emailaddress, left_on='BusinessEntityID', right_on='BusinessEntityID', suffixes=('','_emailaddress'), how="outer")

people = pd.merge(people, adventureworks_person_password, left_on='BusinessEntityID', right_on='BusinessEntityID', suffixes=('','_password'), how="outer")

# dropping the modified date and rowguid columns
drop_modified_date_rowguid(people)

# renaming the columns
people.rename(columns={'BusinessEntityID': 'PERSON_PERSON_BusinessEntityID', 'PersonType': 'PERSON_PERSON_PersonType', 'NameStyle': 'PERSON_PERSON_NameStyle', 'Title': 'PERSON_PERSON_Title', 'FirstName': 'PERSON_PERSON_FirstName', 'MiddleName' : 'PERSON_PERSON_MiddleName', 'LastName' : 'PERSON_PERSON_LastName', 'Suffix': 'PERSON_PERSON_Suffix', 'EmailPromotion' : 'PERSON_PERSON_EmailPromotion', 'AdditionalContactInfo' : 'PERSON_PERSON_AdditionalContactInfo', 'Demographics' : 'PERSON_PERSON_Demographics', 'PhoneNumber' : 'PERSON_PERSONPHONE_PhoneNumber', 'PhoneNumberTypeID' : 'PERSON_PHONENUMBERTYPE_PhoneNumberTypeID', 'Name' : 'PERSON_PHONENUMBERTYPE_Name', 'EmailAddressID': 'PERSON_EMAILADDRESS_EmailAddressID', 'EmailAddress' : 'PERSON_EMAILADDRESS_EmailAddress', 'PasswordHash' : 'PERSON_PASSWORD_PasswordHash', 'PasswordSalt' : 'PERSON_PASSWORD_PasswordSalt'}, inplace=True)

people

Unnamed: 0,PERSON_PERSON_BusinessEntityID,PERSON_PERSON_PersonType,PERSON_PERSON_NameStyle,PERSON_PERSON_Title,PERSON_PERSON_FirstName,PERSON_PERSON_MiddleName,PERSON_PERSON_LastName,PERSON_PERSON_Suffix,PERSON_PERSON_EmailPromotion,PERSON_PERSON_AdditionalContactInfo,PERSON_PERSON_Demographics,PERSON_PERSONPHONE_PhoneNumber,PERSON_PHONENUMBERTYPE_PhoneNumberTypeID,PERSON_PHONENUMBERTYPE_Name,PERSON_EMAILADDRESS_EmailAddressID,PERSON_EMAILADDRESS_EmailAddress,PERSON_PASSWORD_PasswordHash,PERSON_PASSWORD_PasswordSalt
0,1,EM,False,,Ken,J,Sánchez,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",697-555-0142,1,Cell,1,ken0@adventure-works.com,pbFwXWE99vobT6g+vPWFy93NtUU/orrIWafF01hccfM=,bE3XiWw=
1,2,EM,False,,Terri,Lee,Duffy,,1,,"<IndividualSurvey xmlns=""http://schemas.micros...",819-555-0175,3,Work,2,terri0@adventure-works.com,bawRVNrZQYQ05qF05Gz6VLilnviZmrqBReTTAGAudm0=,EjJaC3U=
2,3,EM,False,,Roberto,,Tamburello,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",212-555-0187,1,Cell,3,roberto0@adventure-works.com,8BUXrZfDqO1IyHCWOYzYmqN1IhTUn3CJMpdx/UCQ3iY=,wbPZqMw=
3,4,EM,False,,Rob,,Walters,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",612-555-0100,1,Cell,4,rob0@adventure-works.com,SjLXpiarHSlz+6AG+H+4QpB/IPRzras/+9q/5Wr7tf8=,PwSunQU=
4,5,EM,False,Ms.,Gail,A,Erickson,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",849-555-0139,1,Cell,5,gail0@adventure-works.com,8FYdAiY6gWuBsgjCFdg0UibtsqOcWHf9TyaHIP7+paA=,qYhZRiM=
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19967,20773,IN,False,,Crystal,,Guo,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",1 (11) 500 555-0171,1,Cell,19968,crystal18@adventure-works.com,4gSNTcSKHtKW1k9te824egho2RixU5Gc+LRDNYyMDx4=,qh4YKRQ=
19968,20774,IN,False,,Isabella,F,Richardson,,2,,"<IndividualSurvey xmlns=""http://schemas.micros...",910-555-0166,1,Cell,19969,isabella91@adventure-works.com,gOO6OEoRpCe9TiQ4+1fX1qXIzavOQ0Ccvl1JHS/Pseg=,s+EMJTA=
19969,20775,IN,False,,Crystal,S,He,,0,,"<IndividualSurvey xmlns=""http://schemas.micros...",813-555-0148,2,Home,19970,crystal19@adventure-works.com,r5nZct0C8mWL6KM0DE4pM8fO/0nmUYAtya8ref2efg8=,axcde7k=
19970,20776,IN,False,,Crystal,,Zheng,,1,,"<IndividualSurvey xmlns=""http://schemas.micros...",1 (11) 500 555-0171,2,Home,19971,crystal20@adventure-works.com,5eVmZbWYJXVwZkBkvpxlhA3/bKMLRReav9CgRP4NRbU=,nJbmm88=


### Products (TODO CHANGE)

In [320]:
# renaming aenc product columns to match the other product data
aenc_product.rename(columns={'id': 'ProductID', 'name' : 'ProductName', 'description':'Description' , 'prod_size' : 'ProdSize', 'color': 'Color', 'quantity': 'Quantity','unit_price' : 'UnitPrice'}, inplace=True)

aenc_product

Unnamed: 0,ProductID,ProductName,Description,ProdSize,Color,Quantity,UnitPrice,picture_name,Category


In [321]:
# combining all adventureworks product data
adventureworks_combined_products = pd.concat([adventureworks_production_product, adventureworks_production_productcategory, adventureworks_production_productsubcategory, adventureworks_production_productdescription, adventureworks_production_productdocument, adventureworks_production_productmodel, adventureworks_production_productmodelillustration, adventureworks_production_productmodelproductdescriptionculture, adventureworks_production_productphoto, adventureworks_production_productproductphoto  ], ignore_index=True)

adventureworks_combined_products

Unnamed: 0,ProductID,Name,ProductNumber,MakeFlag,FinishedGoodsFlag,Color,SafetyStockLevel,ReorderPoint,StandardCost,ListPrice,...,CatalogDescription,Instructions,IllustrationID,CultureID,ProductPhotoID,ThumbNailPhotoHexString,ThumbNailPhotoFileName,LargePhotoHexString,LargePhotoFileName,Primary
0,1.0,Adjustable Race,AR-5381,False,False,,1000.0,750.0,0.0,0.0,...,,,,,,,,,,
1,2.0,Bearing Ball,BA-8327,False,False,,1000.0,750.0,0.0,0.0,...,,,,,,,,,,
2,3.0,BB Ball Bearing,BE-2349,True,False,,800.0,600.0,0.0,0.0,...,,,,,,,,,,
3,4.0,Headset Ball Bearings,BE-2908,False,False,,800.0,600.0,0.0,0.0,...,,,,,,,,,,
4,316.0,Blade,BL-2036,True,False,,800.0,600.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2836,995.0,,,,,,,,,,...,,,,,1.0,,,,,True
2837,996.0,,,,,,,,,,...,,,,,1.0,,,,,True
2838,997.0,,,,,,,,,,...,,,,,102.0,,,,,True
2839,998.0,,,,,,,,,,...,,,,,102.0,,,,,True


In [322]:
# Combining all product data
products = pd.concat([northwind_products, aenc_product, adventureworks_combined_products], ignore_index=True)

# replacing the document node with documentnodestring
products['DocumentNode'] = products['DocumentNodeString']
products.drop(columns=['DocumentNodeString'], inplace=True)

# applying the data types to the columns
products['ProductID'] = products['ProductID'].astype(float) # casting to float seems counterintuitive but it is necessary to avoid an error in Pandas

# dropping documentnode column (need to fix later)
products.drop(columns=['DocumentNode'], inplace=True)


products

  products = pd.concat([northwind_products, aenc_product, adventureworks_combined_products], ignore_index=True)


Unnamed: 0,ProductID,ProductName,SupplierID,CategoryID,QuantityPerUnit,UnitPrice,UnitsInStock,UnitsOnOrder,ReorderLevel,Discontinued,...,CatalogDescription,Instructions,IllustrationID,CultureID,ProductPhotoID,ThumbNailPhotoHexString,ThumbNailPhotoFileName,LargePhotoHexString,LargePhotoFileName,Primary
0,1.0,Chai,1.0,1.0,10 boxes x 20 bags,18.00,39.0,0.0,10.0,False,...,,,,,,,,,,
1,2.0,Chang,1.0,1.0,24 - 12 oz bottles,19.00,17.0,40.0,25.0,False,...,,,,,,,,,,
2,3.0,Aniseed Syrup,1.0,2.0,12 - 550 ml bottles,10.00,13.0,70.0,25.0,False,...,,,,,,,,,,
3,4.0,Chef Anton's Cajun Seasoning,2.0,2.0,48 - 6 oz jars,22.00,53.0,0.0,0.0,False,...,,,,,,,,,,
4,5.0,Chef Anton's Gumbo Mix,2.0,2.0,36 boxes,21.35,0.0,0.0,0.0,True,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2913,995.0,,,,,,,,,,...,,,,,1.0,,,,,True
2914,996.0,,,,,,,,,,...,,,,,1.0,,,,,True
2915,997.0,,,,,,,,,,...,,,,,102.0,,,,,True
2916,998.0,,,,,,,,,,...,,,,,102.0,,,,,True


### Regions (TODO CHANGE)

In [323]:
# Combining all region data
regions = pd.concat([northwind_region, aenc_region, adventureworks_person_stateprovince], ignore_index=True)

# combining regiondescription and region columns to create a name column (one is always None)
regions['RegionName'] = regions['RegionDescription'].combine_first(regions['region'])

# dropping the other columns
regions.drop(columns=['RegionDescription', 'region'], inplace=True)


regions

Unnamed: 0,RegionID,StateProvinceID,StateProvinceCode,CountryRegionCode,IsOnlyStateProvinceFlag,Name,TerritoryID,rowguid,ModifiedDate,RegionName
0,1.0,,,,,,,,NaT,Eastern ...
1,2.0,,,,,,,,NaT,Western ...
2,3.0,,,,,,,,NaT,Northern ...
3,4.0,,,,,,,,NaT,Southern ...
4,,1.0,AB,CA,False,Alberta,6.0,298C2880-AB1C-4982-A5AD-A36EB4BA0D34,2014-02-08 10:17:21.587,
...,...,...,...,...,...,...,...,...,...,...
180,,177.0,91,FR,False,Essonne,7.0,35894A81-C267-4511-A706-99EA2C08181F,2008-04-30 00:00:00.000,
181,,178.0,92,FR,False,Hauts de Seine,7.0,F8FD6D62-A913-4F10-9E42-D348EDA02BD9,2008-04-30 00:00:00.000,
182,,179.0,93,FR,False,Seine Saint Denis,7.0,466C15BC-46EC-427D-99DB-98C380634527,2008-04-30 00:00:00.000,
183,,180.0,94,FR,False,Val de Marne,7.0,FE0A2A02-FE1D-4B79-B970-167EC7F724FC,2008-04-30 00:00:00.000,


### Customers

In [324]:
aenc_customer = aenc_customer[['fname', 'lname', 'state']]
northwind_customers = northwind_customers.drop(columns='CustomerID')

# combining all customer data
customers = pd.concat([northwind_customers, aenc_customer, adventureworks_sales_customer], ignore_index=True)

# rename
customers.rename(columns={
    'CustomerID': 'CUSTOMER_CUSTOMERS_ID',
    'CompanyName': 'CUSTOMER_CUSTOMERS_CompanyName',
    'ContactName': 'CUSTOMER_CUSTOMERS_ContactName',
    'ContactTitle': 'CUSTOMER_CUSTOMERS_ContactTitle',
    'Address': 'CUSTOMER_CUSTOMERS_Address',
    'City': 'CUSTOMER_CUSTOMERS_City',
    'Region': 'CUSTOMER_CUSTOMERS_Region',
    'PostalCode': 'CUSTOMER_CUSTOMERS_PostalCode',
    'Country': 'CUSTOMER_CUSTOMERS_Country',
    'Phone': 'CUSTOMER_CUSTOMERS_Phone',
    'Fax': 'CUSTOMER_CUSTOMERS_Fax',
    'fname': 'CUSTOMER_CUSTOMER_Fname',
    'lname': 'CUSTOMER_CUSTOMER_Lname',
    'state': 'CUSTOMER_CUSTOMER_State',
    'PersonID': 'CUSTOMER_CUSTOMER_PersonID',
    'StoreID': 'CUSTOMER_CUSTOMER_StoreID',
    'TerritoryID': 'CUSTOMER_CUSTOMER_TerritoryID',
    'AccountNumber': 'CUSTOMER_CUSTOMER_AccountNumber'
}, inplace=True)

drop_modified_date_rowguid(customers)

customers

Unnamed: 0,CUSTOMER_CUSTOMERS_CompanyName,CUSTOMER_CUSTOMERS_ContactName,CUSTOMER_CUSTOMERS_ContactTitle,CUSTOMER_CUSTOMERS_Address,CUSTOMER_CUSTOMERS_City,CUSTOMER_CUSTOMERS_Region,CUSTOMER_CUSTOMERS_PostalCode,CUSTOMER_CUSTOMERS_Country,CUSTOMER_CUSTOMERS_Phone,CUSTOMER_CUSTOMERS_Fax,CUSTOMER_CUSTOMER_Fname,CUSTOMER_CUSTOMER_Lname,CUSTOMER_CUSTOMER_State,CUSTOMER_CUSTOMERS_ID,CUSTOMER_CUSTOMER_PersonID,CUSTOMER_CUSTOMER_StoreID,CUSTOMER_CUSTOMER_TerritoryID,CUSTOMER_CUSTOMER_AccountNumber
0,Alfreds Futterkiste,Maria Anders,Sales Representative,Obere Str. 57,Berlin,,12209,Germany,030-0074321,030-0076545,,,,,,,,
1,Ana Trujillo Emparedados y helados,Ana Trujillo,Owner,Avda. de la Constitución 2222,México D.F.,,05021,Mexico,(5) 555-4729,(5) 555-3745,,,,,,,,
2,Antonio Moreno Taquería,Antonio Moreno,Owner,Mataderos 2312,México D.F.,,05023,Mexico,(5) 555-3932,,,,,,,,,
3,Around the Horn,Thomas Hardy,Sales Representative,120 Hanover Sq.,London,,WA1 1DP,UK,(171) 555-7788,(171) 555-6750,,,,,,,,
4,Berglunds snabbköp,Christina Berglund,Order Administrator,Berguvsvägen 8,Luleå,,S-958 22,Sweden,0921-12 34 65,0921-12 34 67,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19906,,,,,,,,,,,,,,30114.0,1985.0,1986.0,7.0,AW00030114
19907,,,,,,,,,,,,,,30115.0,1987.0,1988.0,6.0,AW00030115
19908,,,,,,,,,,,,,,30116.0,1989.0,1990.0,4.0,AW00030116
19909,,,,,,,,,,,,,,30117.0,1991.0,1992.0,4.0,AW00030117


## Loading the data into the UnitedOutdoors datawarehouse

### Departments

In [325]:
departments_dtypes = {
    'DEPARTMENT_DEPARTMENT_dept_id': Integer,
    'DEPARTMENT_DEPARTMENT_dept_name': String(100),
    'DEPARTMENT_DEPARTMENT_group_name': String(100),
    'DEPARTMENT_DEPARTMENT_dept_head_id': Integer,
    'DEPARTMENT_source_database': String(100)
}

# TODO dept_head_id needs to refer to an employee
prepare_and_insert(departments, departments_dtypes, 'Department')

Inserting data into table: Department


ProgrammingError: (pyodbc.ProgrammingError) ('42S22', "[42S22] [Microsoft][ODBC SQL Server Driver][SQL Server]Invalid column name 'DEPARTMENT_DEPARTMENT_dept_head_id'. (207) (SQLExecDirectW); [42S22] [Microsoft][ODBC SQL Server Driver][SQL Server]Invalid column name 'DEPARTMENT_DEPARTMENT_group_name'. (207); [42S22] [Microsoft][ODBC SQL Server Driver][SQL Server]Invalid column name 'DEPARTMENT_DEPARTMENT_dept_name '. (207); [42S22] [Microsoft][ODBC SQL Server Driver][SQL Server]Invalid column name 'DEPARTMENT_DEPARTMENT_dept_id'. (207); [42S22] [Microsoft][ODBC SQL Server Driver][SQL Server]Invalid column name 'DEPARTMENT_DEPARTMENT_dept_head_id'. (207); [42S22] [Microsoft][ODBC SQL Server Driver][SQL Server]Invalid column name 'DEPARTMENT_DEPARTMENT_group_name'. (207); [42S22] [Microsoft][ODBC SQL Server Driver][SQL Server]Invalid column name 'DEPARTMENT_DEPARTMENT_dept_name '. (207); [42S22] [Microsoft][ODBC SQL Server Driver][SQL Server]Invalid column name 'DEPARTMENT_DEPARTMENT_dept_id'. (207); [42S22] [Microsoft][ODBC SQL Server Driver][SQL Server]Statement(s) could not be prepared. (8180)")
[SQL: INSERT INTO dbo.[Department] ([DEPARTMENT_DEPARTMENT_dept_head_id], [DEPARTMENT_source_database], [DEPARTMENT_DEPARTMENT_group_name], [DEPARTMENT_DEPARTMENT_dept_name ], [DEPARTMENT_DEPARTMENT_dept_id]) VALUES (?, ?, ?, ?, ?), (?, ?, ?, ?, ?), (?, ?, ?, ?, ?), (?, ?, ?, ?, ?), (?, ?, ?, ?, ?), (?, ?, ?, ?, ?), (?, ?, ?, ?, ?), (?, ?, ?, ?, ?), (?, ?, ?, ?, ?), (?, ?, ?, ?, ?), (?, ?, ?, ?, ?), (?, ?, ?, ?, ?), (?, ?, ?, ?, ?), (?, ?, ?, ?, ?), (?, ?, ?, ?, ?), (?, ?, ?, ?, ?)]
[parameters: (None, 'adventureworks', 'Research and Development', 'Engineering', 1.0, None, 'adventureworks', 'Research and Development', 'Tool Design', 2.0, None, 'adventureworks', 'Sales and Marketing', 'Sales', 3.0, None, 'adventureworks', 'Sales and Marketing', 'Marketing', 4.0, None, 'adventureworks', 'Inventory Management', 'Purchasing', 5.0, None, 'adventureworks', 'Research and Development', 'Research and Development', 6.0, None, 'adventureworks', 'Manufacturing', 'Production', 7.0, None, 'adventureworks', 'Manufacturing', 'Production Control', 8.0, None, 'adventureworks', 'Executive General and Administration', 'Human Resources', 9.0, None, 'adventureworks', 'Executive General and Administration', 'Finance', 10.0, None, 'adventureworks', 'Executive General and Administration', 'Information Services', 11.0, None, 'adventureworks', 'Quality Assurance', 'Document Control', 12.0, None, 'adventureworks', 'Quality Assurance', 'Quality Assurance', 13.0, None, 'adventureworks', 'Executive General and Administration', 'Facilities and Maintenance', 14.0, None, 'adventureworks', 'Inventory Management', 'Shipping and Receiving', 15.0, None, 'adventureworks', 'Executive General and Administration', 'Executive', 16.0)]
(Background on this error at: https://sqlalche.me/e/20/f405)

### Employee

In [326]:
employees_dtypes = {
    'EMPLOYEE_EMPLOYEE_EmployeeID': Integer,
    'EMPLOYEE_EMPLOYEE_DeptID': Integer,
    'EMPLOYEE_EMPLOYEE_ManagerID': Integer,
    'EMPLOYEE_EMPLOYEETERRITORIES_TerritoryID': Integer,
    'EMPLOYEE_EMPLOYEE_Emp_Fname': NVARCHAR(255),
    'EMPLOYEE_EMPLOYEE_Emp_Lname': NVARCHAR(255),
    'EMPLOYEE_EMPLOYEE_Street': NVARCHAR(150),
    'EMPLOYEE_EMPLOYEE_City': NVARCHAR(100),
    'EMPLOYEE_EMPLOYEE_State': CHAR(1),
    'EMPLOYEE_EMPLOYEE_Zip_Code': CHAR(5),
    'EMPLOYEE_EMPLOYEE_Phone': Integer,
    'EMPLOYEE_EMPLOYEE_Status': CHAR(1),
    'EMPLOYEE_EMPLOYEE_SS_Number': Integer,
    'EMPLOYEE_EMPLOYEE_Salary': Integer,
    'EMPLOYEE_EMPLOYEE_Start_Date': DATE,
    'EMPLOYEE_EMPLOYEE_Termination': DATE,
    'EMPLOYEE_EMPLOYEE_Birth_Date': DATE,
    'EMPLOYEE_EMPLOYEE_Bene_Health_Ins': CHAR(1),
    'EMPLOYEE_EMPLOYEE_Bene_Life_Ins': CHAR(1),
    'EMPLOYEE_EMPLOYEE_Bene_Day_Care': CHAR(1),
    'EMPLOYEE_EMPLOYEE_Sex': CHAR(1),
    'EMPLOYEE_BONUS_Bonus_Date': DATE,
    'EMPLOYEE_BONUS_Bonus_Amount': Integer,
    'EMPLOYEE_EMPLOYEES_Title': NVARCHAR(50),
    'EMPLOYEE_EMPLOYEES_TitleOfCourtesy': NVARCHAR(50),
    'EMPLOYEE_EMPLOYEES_HireDate': DATE,
    'EMPLOYEE_EMPLOYEES_HomePhone': NVARCHAR(20),
    'EMPLOYEE_EMPLOYEES_Extension': Integer,
    'EMPLOYEE_EMPLOYEES_Photo': String,
    'EMPLOYEE_EMPLOYEES_PhotoPath': NVARCHAR(255),
    'EMPLOYEE_EMPLOYEES_Notes': String,
}

prepare_and_insert(employees, employees_dtypes, 'Employee')

Inserting data into table: Employee


### BusinessEntities

In [None]:
businessentities_dtypes = {
    'BUSINESSENTITY_BUSINESSENTITY_BusinessEntityID': Integer,
    'BUSINESSENTITY_BUSINESSENTITYCONTACT_PersonID': Integer,
    'BUSINESSENTITY_CONTACTTYPE_ContactTypeID': Integer,
    'BUSINESSENTITY_CONTACTTYPE_Name': String(100)
}

businessentities_nk_sk_dict = prepare_and_insert_return_sk(businessentities, businessentities_dtypes, 'BusinessEntity', 'BUSINESSENTITY_BUSINESSENTITY_BusinessEntityID')

### People

In [None]:
people_dtypes = {
    'PERSON_PERSON_BusinessEntityID': Integer,
    'PERSON_PERSON_PersonType': String(2),
    'PERSON_PERSON_NameStyle': BIT,
    'PERSON_PERSON_Title': String(100),
    'PERSON_PERSON_FirstName': String(100),
    'PERSON_PERSON_MiddleName': String(100),
    'PERSON_PERSON_LastName': String(100),
    'PERSON_PERSON_Suffix': String(100),
    'PERSON_PERSON_EmailPromotion': Integer,
    'PERSON_PERSON_AdditionalContactInfo': XML,
    'PERSON_PERSON_Demographics': XML,
    'PERSON_PERSONPHONE_PhoneNumber': String(100),
    'PERSON_PHONENUMBERTYPE_PhoneNumberTypeID': Integer,
    'PERSON_PHONENUMBERTYPE_Name': String(100),
    'PERSON_EMAILADDRESS_EmailAddressID': Integer,
    'PERSON_EMAILADDRESS_EmailAddress': String(100),
    'PERSON_PASSWORD_PasswordHash': LargeBinary,
    'PERSON_PASSWORD_PasswordSalt': LargeBinary
}

# Convert the 'PERSON_PASSWORD_PasswordHash' and 'PERSON_PASSWORD_PasswordSalt' columns to bytes
people['PERSON_PASSWORD_PasswordHash'] = people['PERSON_PASSWORD_PasswordHash'].apply(lambda x: x.encode('utf-8') if isinstance(x, str) else x)
people['PERSON_PASSWORD_PasswordSalt'] = people['PERSON_PASSWORD_PasswordSalt'].apply(lambda x: x.encode('utf-8') if isinstance(x, str) else x)

prepare_and_insert(people, people_dtypes, 'Person', { 'PERSON_PERSON_BusinessEntityID' : businessentities_nk_sk_dict})

Replacing natural keys with surrogate keys for column: PERSON_PERSON_BusinessEntityID
Inserting data into table: Person


### Updating the BusinessEntity table
replacing the natural keys with the surrogate keys for the PersonID column

In [None]:
prepare_and_update('BusinessEntity', united_outdoors_conn, { 'BUSINESSENTITY_BUSINESSENTITYCONTACT_PersonID' : businessentities_nk_sk_dict})

Updating data in table: BusinessEntity


### BusinessEntityAddresses

In [None]:
businessentityaddresses_dtypes = {
    'BUSINESSENTITYADDRESS_BUSINESSENTITYADDRESS_BusinessEntityID': Integer,
    'BUSINESSENTITYADDRESS_ADDRESSTYPE_AddressTypeID': Integer,
    'BUSINESSENTITYADDRESS_ADDRESSTYPE_Name': String(100),
    'BUSINESSENTITYADDRESS_ADDRESS_AddressID': Integer,
    'BUSINESSENTITYADDRESS_ADDRESS_AddressLine1': String(100),
    'BUSINESSENTITYADDRESS_ADDRESS_AddressLine2': String(100),
    'BUSINESSENTITYADDRESS_ADDRESS_City': String(100),
    'BUSINESSENTITYADDRESS_ADDRESS_POSTALCODE': String(100),
    'BUSINESSENTITYADDRESS_ADDRESS_SpatialLocation': VARCHAR,
    'BUSINESSENTITYADDRESS_ADDRESS_StateProvinceID': Integer
}

# TODO the StateProvinceID needs to refer to sk of Territory
prepare_and_insert(businessentityaddresses, businessentityaddresses_dtypes, 'BusinessEntityAddress', { 'BUSINESSENTITYADDRESS_BUSINESSENTITYADDRESS_BusinessEntityID' : businessentities_nk_sk_dict, 'BUSINESSENTITYADDRESS_ADDRESS_StateProvinceID' : {}})

Replacing natural keys with surrogate keys for column: BUSINESSENTITYADDRESS_BUSINESSENTITYADDRESS_BusinessEntityID
Replacing natural keys with surrogate keys for column: BUSINESSENTITYADDRESS_ADDRESS_StateProvinceID
Inserting data into table: BusinessEntityAddress


### Products (TODO CHANGE)

In [None]:
products_dtypes = {
    'ProductID': Integer,
    'ProductName': NVARCHAR(50),
    'SupplierID': Integer,
    'CategoryID': Integer,
    'QuantityPerUnit': NVARCHAR(30),
    'UnitPrice': MONEY,
    'UnitsInStock': Integer,
    'UnitsOnOrder': Integer,
    'ReorderLevel': Integer,
    'Discontinued': BIT,
    'Description': NVARCHAR,
    'ProdSize': NVARCHAR(50),
    'Color': NVARCHAR(15),
    'Quantity': Integer,
    'picture_name': NVARCHAR(50),
    'Category': NVARCHAR(20),
    'Name': NVARCHAR(50),
    'ProductNumber': NVARCHAR(25),
    'MakeFlag': BIT,
    'FinishedGoodsFlag': BIT,
    'SafetyStockLevel': Integer,
    'ReorderPoint': Integer,
    'StandardCost': DECIMAL(8,4),
    'ListPrice': MONEY,
    'Size': NVARCHAR(5),
    'SizeUnitMeasureCode': CHAR(2),
    'WeightUnitMeasureCode': NVARCHAR(3),
    'Weight': DECIMAL(8,2),
    'DaysToManufacture': Integer,
    'ProductLine': CHAR(1),
    'Class': CHAR(1),
    'Style': CHAR(1),
    'ProductSubcategoryID': Integer,
    'ProductModelID': Integer,
    'SellStartDate': DATE,
    'SellEndDate': DATE,
    'DiscontinuedDate': DATE,
    'ModifiedDate': DATE,
    'ProductCategoryID': Integer,
    'ProductDescriptionID': Integer,
    'CatalogDescription': XML,
    'Instructions': XML,
    'IllustrationID': Integer,
    'CultureID': NVARCHAR(10),
    'ProductPhotoID': Integer,
    'ThumbNailPhotoHexString': String,
    'ThumbnailPhotoFileName': NVARCHAR(50),
    'LargePhotoHexString': String,
    'LargePhotoFileName': NVARCHAR(50),
    'Primary': BIT
}

prepare_and_insert(products, products_dtypes, 'Product')

Inserting data into table: Product


### Regions (TODO CHANGE)

In [None]:
regions_dtypes = {
    'RegionID': Integer,
    'RegionName': VARCHAR(10),
    'StateProvinceID': Integer,
    'StateProvinceCode': VARCHAR(10),
    'CountryRegionCode': CHAR(2),
    'IsOnlyStateProvinceFlag': BIT,
    'Name': VARCHAR(50),
    'TerritoryID': Integer,
    'ModifiedDate': Date
}

regions['StateProvinceCode'] = regions['StateProvinceCode'].astype(str)

prepare_and_insert(regions, regions_dtypes, 'Region')

Inserting data into table: Region


### Customers

In [None]:
customers_dtypes = {
    'CUSTOMER_CUSTOMERS_ID': Integer,
    'CUSTOMER_CUSTOMER_PersonID': Integer,
    'CUSTOMER_CUSTOMER_StoreID': Integer,
    'CUSTOMER_CUSTOMER_TerritoryID': Integer,
    'CUSTOMER_CUSTOMER_AccountNumber': CHAR(10),
    'CUSTOMER_CUSTOMERS_CompanyName': NVARCHAR(100),
    'CUSTOMER_CUSTOMERS_ContactName': NVARCHAR(255),
    'CUSTOMER_CUSTOMERS_ContactTitle': NVARCHAR(100),
    'CUSTOMER_CUSTOMERS_Address': NVARCHAR(255),
    'CUSTOMER_CUSTOMERS_City': NVARCHAR(100),
    'CUSTOMER_CUSTOMERS_Region': NVARCHAR(50),
    'CUSTOMER_CUSTOMERS_PostalCode': NVARCHAR(20),
    'CUSTOMER_CUSTOMERS_Country': NVARCHAR(150),
    'CUSTOMER_CUSTOMERS_Phone': NVARCHAR(24),
    'CUSTOMER_CUSTOMERS_Fax': NVARCHAR(24),
    'CUSTOMER_CUSTOMER_Fname': NVARCHAR(255),
    'CUSTOMER_CUSTOMER_Lname': NVARCHAR(255),
    'CUSTOMER_CUSTOMER_State': CHAR(2)
}

prepare_and_insert(customers, customers_dtypes, 'Customer')

Inserting data into table: Customer


## Constraints
altering the tables to add the (foreign key) constraints

In [None]:
# opening the UnitedOutdoors_constraints.sql file
with open('sql/UnitedOutdoors_constraints.sql', 'r') as file:
    sql_script = file.read()

# Execute the script
split_and_execute_sql_script(sql_script, united_outdoors_conn)

# Closing connections

In [None]:
try:
    united_outdoors_conn.close()
    northwind_conn.close()
    aenc_conn.close()
    adventureworks_conn.close()
except OperationalError as e:
        print(f'Error: {e}')
finally:
    united_outdoors_engine.dispose()
    northwind_engine.dispose()
    aenc_engine.dispose()
    adventureworks_engine.dispose()