# United outdoors datawarehouse

## Imports

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, Integer, VARCHAR, CHAR, NVARCHAR, Date, LargeBinary, DECIMAL, DATE, VARBINARY
from sqlalchemy.dialects.mssql import XML, BIT, MONEY
from sqlalchemy.exc import OperationalError
import urllib
import re

## Database connection details

In [2]:
DB = {
    'servername' : '(local)\\SQLEXPRESS',
    'united_outdoors_database' : 'UnitedOutdoors',
    'northwind_database' : 'Northwind',
    'aenc_database' : 'Aenc',
    'adventureworks_database' : 'AdventureWorks2019',
    'master' : 'master'
}

In [3]:
def create_connection(servername, database):
    params = urllib.parse.quote_plus(f'DRIVER={{SQL Server}};SERVER={servername};DATABASE={database};Trusted_Connection=yes')
    engine = create_engine(f'mssql+pyodbc:///?odbc_connect={params}', use_setinputsizes=False) # setinputsizes needs to be turned off for sql server, idk why but gives errors otherwise
    try:
        conn = engine.connect()
        print(f'Connection to {database} database successful')
        return conn, engine
    except OperationalError as e:
        print(f'Error: {e}')
        return None, None

In [4]:
def split_and_execute_sql_script(script, connection):
    # splitting the script into the database creation and the rest
    commands = re.split(r'GO\n', script)
    # removing all \bGO\b from the commands
    commands = [re.sub(r'\bGO\b', '', command) for command in commands]
    
    # Execute the commands
    for command in commands:
        command = command.strip()
        # Skip if the command is empty or 'GO'
        if not command or command.upper() == 'GO':
            continue
        try:
            connection.connection.execute(command)
            connection.connection.commit()
            #print(f'Command executed: {command}')
        except OperationalError as e:
            print(f'Error: {e} at command: {command}')

## Create the UnitedOutdoors datawarehouse

In [5]:
conn, creation_engine = create_connection(DB["servername"], DB["master"])

# Open the SQL script file and read its contents
with open('sql/UnitedOutdoors_creation.sql', 'r') as file:
    sql_script = file.read()

split_and_execute_sql_script(sql_script, conn)

conn.close()
creation_engine.dispose()

Connection to master database successful


## Connecting to the UnitedOutdoors datawarehouse

In [6]:
united_outdoors_conn , united_outdoors_engine = create_connection(DB["servername"], DB["united_outdoors_database"])

Connection to UnitedOutdoors database successful


## Loading the data from the source databases

### Northwind database

#### Connection

In [7]:
northwind_conn, northwind_engine = create_connection(DB["servername"], DB["northwind_database"])

Connection to Northwind database successful


#### Loading data

In [8]:
# Load the data from the source database
northwind_categories = pd.read_sql('SELECT * FROM Categories', northwind_conn)
northwind_customer_customer_demo = pd.read_sql('SELECT * FROM CustomerCustomerDemo', northwind_conn)
northwind_customer_demographics = pd.read_sql('SELECT * FROM CustomerDemographics', northwind_conn)
northwind_customers = pd.read_sql('SELECT * FROM Customers', northwind_conn)
northwind_employees = pd.read_sql('SELECT * FROM Employees', northwind_conn)
northwind_employee_territories = pd.read_sql('SELECT * FROM EmployeeTerritories', northwind_conn)
northwind_order_details = pd.read_sql('SELECT * FROM [Order Details]', northwind_conn)
northwind_orders = pd.read_sql('SELECT * FROM Orders', northwind_conn)
northwind_products = pd.read_sql('SELECT * FROM Products', northwind_conn)
northwind_region = pd.read_sql('SELECT * FROM Region', northwind_conn)
northwind_shippers = pd.read_sql('SELECT * FROM Shippers', northwind_conn)
northwind_suppliers = pd.read_sql('SELECT * FROM Suppliers', northwind_conn)
northwind_territories = pd.read_sql('SELECT * FROM Territories', northwind_conn)

### Aenc database

#### Connection

In [9]:
aenc_conn , aenc_engine = create_connection(DB["servername"], DB["aenc_database"])

Connection to Aenc database successful


#### Loading data

In [10]:
aenc_bonus = pd.read_sql('SELECT * FROM Bonus', aenc_conn)
aenc_customer = pd.read_sql('SELECT * FROM customer', aenc_conn)
aenc_department = pd.read_sql('SELECT * FROM department', aenc_conn)
aenc_employee = pd.read_sql('SELECT * FROM Employee', aenc_conn)
aenc_product = pd.read_sql('SELECT * FROM product', aenc_conn)
aenc_region = pd.read_sql('SELECT * FROM region', aenc_conn)
aenc_sales_order = pd.read_sql('SELECT * FROM sales_order', aenc_conn)
aenc_sales_order_item = pd.read_sql('SELECT * FROM sales_order_item', aenc_conn)
aenc_state = pd.read_sql('SELECT * FROM state', aenc_conn)

### AdventureWorks database

#### Connection

In [11]:
adventureworks_conn, adventureworks_engine = create_connection(DB["servername"], DB["adventureworks_database"])

Connection to AdventureWorks2019 database successful


#### Loading data

In [12]:
adventureworks_humanresources_department = pd.read_sql('SELECT * FROM HumanResources.Department', adventureworks_conn)
adventureworks_humanresources_employee = pd.read_sql('SELECT * FROM HumanResources.Employee', adventureworks_conn)
adventureworks_humanresources_employeedepartmenthistory = pd.read_sql('SELECT * FROM HumanResources.EmployeeDepartmentHistory', adventureworks_conn)
adventureworks_humanresources_employeepayhistory = pd.read_sql('SELECT * FROM HumanResources.EmployeePayHistory', adventureworks_conn)
adventureworks_humanresources_jobcandidate = pd.read_sql('SELECT * FROM HumanResources.JobCandidate', adventureworks_conn)
adventureworks_humanresources_shift = pd.read_sql('SELECT * FROM HumanResources.Shift', adventureworks_conn)

In [13]:
adventureworks_person_address = pd.read_sql('SELECT AddressID, AddressLine1, AddressLine2, City, StateProvinceID, PostalCode, CAST(SpatialLocation AS VARCHAR(MAX)) AS SpatialLocation,rowguid, ModifiedDate   FROM Person.Address', adventureworks_conn)
adventureworks_person_address_type = pd.read_sql('SELECT * FROM Person.AddressType', adventureworks_conn)
adventureworks_person_businessentity = pd.read_sql('SELECT * FROM Person.BusinessEntity', adventureworks_conn)
adventureworks_person_businessentityaddress = pd.read_sql('SELECT * FROM Person.BusinessEntityAddress', adventureworks_conn)
adventureworks_person_businessentitycontact = pd.read_sql('SELECT * FROM Person.BusinessEntityContact', adventureworks_conn)
adventureworks_person_contacttype = pd.read_sql('SELECT * FROM Person.ContactType', adventureworks_conn)
adventureworks_person_countryregion = pd.read_sql('SELECT * FROM Person.CountryRegion', adventureworks_conn)
adventureworks_person_emailaddress = pd.read_sql('SELECT * FROM Person.EmailAddress', adventureworks_conn)
adventureworks_person_password = pd.read_sql('SELECT * FROM Person.Password', adventureworks_conn)
adventureworks_person_person = pd.read_sql('SELECT * FROM Person.Person', adventureworks_conn)
adventureworks_person_personphone = pd.read_sql('SELECT * FROM Person.PersonPhone', adventureworks_conn)
adventureworks_person_phonenumbertype = pd.read_sql('SELECT * FROM Person.PhoneNumberType', adventureworks_conn)
adventureworks_person_stateprovince = pd.read_sql('SELECT * FROM Person.StateProvince', adventureworks_conn)

In [14]:
adventureworks_production_bill_of_materials = pd.read_sql('SELECT * FROM Production.BillOfMaterials', adventureworks_conn)
adventureworks_production_culture = pd.read_sql('SELECT * FROM Production.Culture', adventureworks_conn)
adventureworks_production_document = pd.read_sql('SELECT * FROM Production.Document', adventureworks_conn)
adventureworks_production_illustration = pd.read_sql('SELECT * FROM Production.Illustration', adventureworks_conn)
adventureworks_production_location = pd.read_sql('SELECT * FROM Production.Location', adventureworks_conn)
adventureworks_production_product = pd.read_sql('SELECT * FROM Production.Product', adventureworks_conn)
adventureworks_production_productcategory = pd.read_sql('SELECT * FROM Production.ProductCategory', adventureworks_conn)
adventureworks_production_productcosthistory = pd.read_sql('SELECT * FROM Production.ProductCostHistory', adventureworks_conn)
adventureworks_production_productdescription = pd.read_sql('SELECT * FROM Production.ProductDescription', adventureworks_conn)
adventureworks_production_productdocument = pd.read_sql('SELECT * FROM Production.ProductDocument', adventureworks_conn)
adventureworks_production_productinventory = pd.read_sql('SELECT * FROM Production.ProductInventory', adventureworks_conn)
adventureworks_production_productlistpricehistory = pd.read_sql('SELECT * FROM Production.ProductListPriceHistory', adventureworks_conn)
adventureworks_production_productmodel = pd.read_sql('SELECT * FROM Production.ProductModel', adventureworks_conn)
adventureworks_production_productmodelillustration = pd.read_sql('SELECT * FROM Production.ProductModelIllustration', adventureworks_conn)
adventureworks_production_productmodelproductdescriptionculture = pd.read_sql('SELECT * FROM Production.ProductModelProductDescriptionCulture', adventureworks_conn)
adventureworks_production_productphoto = pd.read_sql('SELECT * FROM Production.ProductPhoto', adventureworks_conn)
adventureworks_production_productproductphoto = pd.read_sql('SELECT * FROM Production.ProductProductPhoto', adventureworks_conn)
adventureworks_production_productreview = pd.read_sql('SELECT * FROM Production.ProductReview', adventureworks_conn)
adventureworks_production_productsubcategory = pd.read_sql('SELECT * FROM Production.ProductSubcategory', adventureworks_conn)
adventureworks_production_scrapreason = pd.read_sql('SELECT * FROM Production.ScrapReason', adventureworks_conn)
adventureworks_production_transactionhistory = pd.read_sql('SELECT * FROM Production.TransactionHistory', adventureworks_conn)
adventureworks_production_transactionhistoryarchive = pd.read_sql('SELECT * FROM Production.TransactionHistoryArchive', adventureworks_conn)
adventureworks_production_unitmeasure = pd.read_sql('SELECT * FROM Production.UnitMeasure', adventureworks_conn)
adventureworks_production_workorder = pd.read_sql('SELECT * FROM Production.WorkOrder', adventureworks_conn)
adventureworks_production_workorderrouting = pd.read_sql('SELECT * FROM Production.WorkOrderRouting', adventureworks_conn)

In [15]:
adventureworks_purchasing_productvendor = pd.read_sql('SELECT * FROM Purchasing.ProductVendor', adventureworks_conn)
adventureworks_purchasing_purchaseorderdetail = pd.read_sql('SELECT * FROM Purchasing.PurchaseOrderDetail', adventureworks_conn)
adventureworks_purchasing_purchaseorderheader = pd.read_sql('SELECT * FROM Purchasing.PurchaseOrderHeader', adventureworks_conn)
adventureworks_purchasing_shipmethod = pd.read_sql('SELECT * FROM Purchasing.ShipMethod', adventureworks_conn)
adventureworks_purchasing_vendor = pd.read_sql('SELECT * FROM Purchasing.Vendor', adventureworks_conn)

In [16]:
adventureworks_sales_countryregioncurrency = pd.read_sql('SELECT * FROM Sales.CountryRegionCurrency', adventureworks_conn)
adventureworks_sales_creditcard = pd.read_sql('SELECT * FROM Sales.CreditCard', adventureworks_conn)
adventureworks_sales_currency = pd.read_sql('SELECT * FROM Sales.Currency', adventureworks_conn)
adventureworks_sales_currencyrate = pd.read_sql('SELECT * FROM Sales.CurrencyRate', adventureworks_conn)
adventureworks_sales_customer = pd.read_sql('SELECT * FROM Sales.Customer', adventureworks_conn)
adventureworks_sales_personcreditcard = pd.read_sql('SELECT * FROM Sales.PersonCreditCard', adventureworks_conn)
adventureworks_sales_salesorderdetail = pd.read_sql('SELECT * FROM Sales.SalesOrderDetail', adventureworks_conn)
adventureworks_sales_salesorderheader = pd.read_sql('SELECT * FROM Sales.SalesOrderHeader', adventureworks_conn)
adventureworks_sales_salesorderhearerrsaleseason = pd.read_sql('SELECT * FROM Sales.SalesOrderHeaderSalesReason', adventureworks_conn)
adventureworks_sales_salesperson = pd.read_sql('SELECT * FROM Sales.SalesPerson', adventureworks_conn)
adventureworks_sales_salespersonquotahistory = pd.read_sql('SELECT * FROM Sales.SalesPersonQuotaHistory', adventureworks_conn)
adventureworks_sales_salesreason = pd.read_sql('SELECT * FROM Sales.SalesReason', adventureworks_conn)
adventureworks_sales_salestaxrate = pd.read_sql('SELECT * FROM Sales.SalesTaxRate', adventureworks_conn)
adventureworks_sales_salesterritory = pd.read_sql('SELECT * FROM Sales.SalesTerritory', adventureworks_conn)
adventureworks_sales_salesterritoryhistory = pd.read_sql('SELECT * FROM Sales.SalesTerritoryHistory', adventureworks_conn)
adventureworks_sales_shoppingcartitem = pd.read_sql('SELECT * FROM Sales.ShoppingCartItem', adventureworks_conn)
adventureworks_sales_specialoffer = pd.read_sql('SELECT * FROM Sales.SpecialOffer', adventureworks_conn)
adventureworks_sales_specialofferproduct = pd.read_sql('SELECT * FROM Sales.SpecialOfferProduct', adventureworks_conn)
adventureworks_sales_store = pd.read_sql('SELECT * FROM Sales.Store', adventureworks_conn)

## Combining the data

### Products

In [17]:
# renaming aenc product columns to match the other product data
aenc_product.rename(columns={'id': 'ProductID', 'name' : 'ProductName', 'description':'Description' , 'prod_size' : 'ProdSize', 'color': 'Color', 'quantity': 'Quantity','unit_price' : 'UnitPrice'}, inplace=True)

aenc_product

Unnamed: 0,ProductID,ProductName,Description,ProdSize,Color,Quantity,UnitPrice,picture_name,Category
0,300,Tee Shirt,Tank Top,Small,White,18,9,tshirtw.bmp,Clothes
1,301,Tee Shirt,V-neck,Medium,Orange,39,14,tshirto.bmp,Clothes
2,302,Tee Shirt,Crew Neck,One size fits all,Black,72,14,tshirtb.bmp,Clothes
3,400,Baseball Cap,Cotton Cap,One size fits all,Black,92,9,capb.bmp,Accessories
4,401,Baseball Cap,Wool cap,One size fits all,White,12,10,capw.bmp,Accessories
5,500,Visor,Cloth Visor,One size fits all,White,36,7,visorw.bmp,Accessories
6,501,Visor,Plastic Visor,One size fits all,Black,28,7,visorb.bmp,Accessories
7,600,Sweatshirt,Hooded Sweatshirt,Large,Green,39,24,sshirtg.bmp,Clothes
8,601,Sweatshirt,Zipped Sweatshirt,Large,Blue,32,24,sshirtb.bmp,Clothes
9,700,Shorts,Cotton Shorts,Medium,Black,80,15,shortsb.bmp,Clothes


In [18]:
# combining all adventureworks product data
adventureworks_combined_products = pd.concat([adventureworks_production_product, adventureworks_production_productcategory, adventureworks_production_productsubcategory, adventureworks_production_productdescription, adventureworks_production_productdocument, adventureworks_production_productmodel, adventureworks_production_productmodelillustration, adventureworks_production_productmodelproductdescriptionculture, adventureworks_production_productphoto, adventureworks_production_productproductphoto  ], ignore_index=True)

adventureworks_combined_products

Unnamed: 0,ProductID,Name,ProductNumber,MakeFlag,FinishedGoodsFlag,Color,SafetyStockLevel,ReorderPoint,StandardCost,ListPrice,...,CatalogDescription,Instructions,IllustrationID,CultureID,ProductPhotoID,ThumbNailPhoto,ThumbnailPhotoFileName,LargePhoto,LargePhotoFileName,Primary
0,1.0,Adjustable Race,AR-5381,False,False,,1000.0,750.0,0.0,0.0,...,,,,,,,,,,
1,2.0,Bearing Ball,BA-8327,False,False,,1000.0,750.0,0.0,0.0,...,,,,,,,,,,
2,3.0,BB Ball Bearing,BE-2349,True,False,,800.0,600.0,0.0,0.0,...,,,,,,,,,,
3,4.0,Headset Ball Bearings,BE-2908,False,False,,800.0,600.0,0.0,0.0,...,,,,,,,,,,
4,316.0,Blade,BL-2036,True,False,,800.0,600.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2836,995.0,,,,,,,,,,...,,,,,1.0,,,,,True
2837,996.0,,,,,,,,,,...,,,,,1.0,,,,,True
2838,997.0,,,,,,,,,,...,,,,,102.0,,,,,True
2839,998.0,,,,,,,,,,...,,,,,102.0,,,,,True


In [19]:
# Combining all product data
products = pd.concat([northwind_products, aenc_product, adventureworks_combined_products], ignore_index=True)

# replace empty values with None
products = products.where(pd.notnull(products), None)
products = products.replace({np.nan: None})

products

Unnamed: 0,ProductID,ProductName,SupplierID,CategoryID,QuantityPerUnit,UnitPrice,UnitsInStock,UnitsOnOrder,ReorderLevel,Discontinued,...,CatalogDescription,Instructions,IllustrationID,CultureID,ProductPhotoID,ThumbNailPhoto,ThumbnailPhotoFileName,LargePhoto,LargePhotoFileName,Primary
0,1.0,Chai,1.0,1.0,10 boxes x 20 bags,18.0,39.0,0.0,10.0,False,...,,,,,,,,,,
1,2.0,Chang,1.0,1.0,24 - 12 oz bottles,19.0,17.0,40.0,25.0,False,...,,,,,,,,,,
2,3.0,Aniseed Syrup,1.0,2.0,12 - 550 ml bottles,10.0,13.0,70.0,25.0,False,...,,,,,,,,,,
3,4.0,Chef Anton's Cajun Seasoning,2.0,2.0,48 - 6 oz jars,22.0,53.0,0.0,0.0,False,...,,,,,,,,,,
4,5.0,Chef Anton's Gumbo Mix,2.0,2.0,36 boxes,21.35,0.0,0.0,0.0,True,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2923,995.0,,,,,,,,,,...,,,,,1.0,,,,,True
2924,996.0,,,,,,,,,,...,,,,,1.0,,,,,True
2925,997.0,,,,,,,,,,...,,,,,102.0,,,,,True
2926,998.0,,,,,,,,,,...,,,,,102.0,,,,,True


### Regions

In [20]:
# Combining all region data
regions = pd.concat([northwind_region, aenc_region, adventureworks_person_stateprovince], ignore_index=True)

# combining regiondescription and region columns to create a name column (one is always None)
regions['RegionName'] = regions['RegionDescription'].combine_first(regions['region'])

# dropping the other columns
regions.drop(columns=['RegionDescription', 'region'], inplace=True)

# replace empty values with None
regions = regions.where(pd.notnull(regions), None)
regions = regions.replace({np.nan: None})


regions

Unnamed: 0,RegionID,StateProvinceID,StateProvinceCode,CountryRegionCode,IsOnlyStateProvinceFlag,Name,TerritoryID,rowguid,ModifiedDate,RegionName
0,1.0,,,,,,,,,Eastern ...
1,2.0,,,,,,,,,Western ...
2,3.0,,,,,,,,,Northern ...
3,4.0,,,,,,,,,Southern ...
4,,,,,,,,,,Canada
...,...,...,...,...,...,...,...,...,...,...
186,,177.0,91,FR,False,Essonne,7.0,35894A81-C267-4511-A706-99EA2C08181F,2008-04-30 00:00:00,
187,,178.0,92,FR,False,Hauts de Seine,7.0,F8FD6D62-A913-4F10-9E42-D348EDA02BD9,2008-04-30 00:00:00,
188,,179.0,93,FR,False,Seine Saint Denis,7.0,466C15BC-46EC-427D-99DB-98C380634527,2008-04-30 00:00:00,
189,,180.0,94,FR,False,Val de Marne,7.0,FE0A2A02-FE1D-4B79-B970-167EC7F724FC,2008-04-30 00:00:00,


### Customers

In [21]:
# combining northwind customer data
northwind_combined_customer = pd.concat([northwind_customers, northwind_customer_customer_demo, northwind_customer_demographics], ignore_index=True)

# renaming northwind customer columns to match the other customer data
northwind_combined_customer.rename(columns={'PostalCode' : 'Zip'}, inplace=True)

northwind_combined_customer

Unnamed: 0,CustomerID,CompanyName,ContactName,ContactTitle,Address,City,Region,Zip,Country,Phone,Fax,CustomerTypeID,CustomerDesc
0,ALFKI,Alfreds Futterkiste,Maria Anders,Sales Representative,Obere Str. 57,Berlin,,12209,Germany,030-0074321,030-0076545,,
1,ANATR,Ana Trujillo Emparedados y helados,Ana Trujillo,Owner,Avda. de la ConstituciÃ³n 2222,MÃ©xico D.F.,,05021,Mexico,(5) 555-4729,(5) 555-3745,,
2,ANTON,Antonio Moreno TaquerÃ­a,Antonio Moreno,Owner,Mataderos 2312,MÃ©xico D.F.,,05023,Mexico,(5) 555-3932,,,
3,AROUT,Around the Horn,Thomas Hardy,Sales Representative,120 Hanover Sq.,London,,WA1 1DP,UK,(171) 555-7788,(171) 555-6750,,
4,BERGS,Berglunds snabbkÃ¶p,Christina Berglund,Order Administrator,BerguvsvÃ¤gen 8,LuleÃ¥,,S-958 22,Sweden,0921-12 34 65,0921-12 34 67,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,WARTH,Wartian Herkku,Pirkko Koskitalo,Accounting Manager,Torikatu 38,Oulu,,90110,Finland,981-443655,981-443655,,
87,WELLI,Wellington Importadora,Paula Parente,Sales Manager,"Rua do Mercado, 12",Resende,SP,08737-363,Brazil,(14) 555-8122,,,
88,WHITC,White Clover Markets,Karl Jablonski,Owner,305 - 14th Ave. S. Suite 3B,Seattle,WA,98128,USA,(206) 555-4112,(206) 555-4115,,
89,WILMK,Wilman Kala,Matti Karttunen,Owner/Marketing Assistant,Keskuskatu 45,Helsinki,,21240,Finland,90-224 8858,90-224 8858,,


In [22]:
# combining all adventureworks customer data
adventureworks_combined_customers = pd.merge(adventureworks_sales_customer, adventureworks_person_person, left_on='PersonID', right_on='BusinessEntityID', how='outer', suffixes=('_sales_customer', '_person'))

adventureworks_combined_customers = pd.merge(adventureworks_combined_customers, adventureworks_person_businessentityaddress, left_on='BusinessEntityID', right_on='BusinessEntityID', suffixes=('', '_businessentityaddress'))

adventureworks_combined_customers = pd.merge(adventureworks_combined_customers, adventureworks_person_address, left_on='AddressID', right_on='AddressID', how='left', suffixes=('', '_address'))

adventureworks_combined_customers = pd.merge(adventureworks_combined_customers, adventureworks_person_address_type, left_on='AddressTypeID', right_on='AddressTypeID', how='left', suffixes=('', '_address_type'))

adventureworks_combined_customers = pd.merge(adventureworks_combined_customers, adventureworks_person_stateprovince, left_on=['StateProvinceID', 'TerritoryID'], right_on=['StateProvinceID', 'TerritoryID'], suffixes=('', '_stateprovince'))

adventureworks_combined_customers = pd.merge(adventureworks_combined_customers, adventureworks_person_countryregion, left_on='CountryRegionCode', right_on='CountryRegionCode', suffixes=('', '_countryregion'))

# combining fist, middle and last name columns to create a contact name column
adventureworks_combined_customers['ContactName'] = adventureworks_combined_customers['FirstName'] + ' ' + adventureworks_combined_customers['MiddleName'] + ' ' + adventureworks_combined_customers['LastName']
adventureworks_combined_customers.drop(columns=['FirstName', 'MiddleName', 'LastName'], inplace=True)

# Remove any double spaces caused by missing middle names
adventureworks_combined_customers['ContactName'] = adventureworks_combined_customers['ContactName'].str.replace('  ', ' ')

# Get all columns that contain 'rowguid' in their name
columns_to_drop = adventureworks_combined_customers.filter(like='rowguid').columns

# Drop these columns
adventureworks_combined_customers.drop(columns=columns_to_drop, inplace=True)

# only keeping the most recent modified date from the two tables
adventureworks_combined_customers['ModifiedDate'] = adventureworks_combined_customers[['ModifiedDate_sales_customer', 'ModifiedDate_person', 'ModifiedDate', 'ModifiedDate_address', 'ModifiedDate_address_type', 'ModifiedDate_stateprovince', 'ModifiedDate_countryregion']].max(axis=1)

# dropping the other modified date columns
adventureworks_combined_customers.drop(columns=['ModifiedDate_sales_customer', 'ModifiedDate_person', 'ModifiedDate', 'ModifiedDate_address', 'ModifiedDate_address_type', 'ModifiedDate_stateprovince', 'ModifiedDate_countryregion'], inplace=True)

# combining PersonID and BusinessEntityID columns
adventureworks_combined_customers['PersonID'] = adventureworks_combined_customers['PersonID'].combine_first(adventureworks_combined_customers['BusinessEntityID'])
adventureworks_combined_customers.drop(columns=['BusinessEntityID'], inplace=True)

# renaming columns to match the other customer data
adventureworks_combined_customers.rename(columns={'AddressLine1' : 'Address', 'PostalCode' : 'Zip', 'Name' : 'AddressType', 'Name_stateprovince' : 'StateProvince', 'Name_countryregion': 'CountryRegion' }, inplace=True)

adventureworks_combined_customers

Unnamed: 0,CustomerID,PersonID,StoreID,TerritoryID,AccountNumber,PersonType,NameStyle,Title,Suffix,EmailPromotion,...,StateProvinceID,Zip,SpatialLocation,AddressType,StateProvinceCode,CountryRegionCode,IsOnlyStateProvinceFlag,StateProvince,CountryRegion,ContactName
0,11377.0,1699.0,,8.0,AW00011377,IN,False,Mr.,,1.0,...,53,42651,POINT (7.11082410683939 51.2015555665827),Home,NW,DE,False,Nordrhein-Westfalen,Germany,David R. Robinett
1,11913.0,1700.0,,9.0,AW00011913,IN,False,Ms.,,0.0,...,77,3198,POINT (145.141451560879 -38.0612939642931),Home,VIC,AU,False,Victoria,Australia,Rebecca A. Robinson
2,11952.0,1701.0,,9.0,AW00011952,IN,False,Ms.,,2.0,...,77,3220,POINT (144.201620782255 -38.1464342680786),Home,VIC,AU,False,Victoria,Australia,Dorothy B. Robinson
3,20164.0,1702.0,,10.0,AW00020164,IN,False,Ms.,,0.0,...,14,LA1 1LN,POINT (-2.80326155845985 54.1184442932464),Home,ENG,GB,True,England,United Kingdom,Carol Ann F. Rockne
4,20211.0,1703.0,,9.0,AW00020211,IN,False,Mr.,,0.0,...,64,4169,POINT (152.9802503342 -27.4802117164592),Home,QLD,AU,False,Queensland,Australia,Scott M. Rodgers
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18503,19379.0,20773.0,,10.0,AW00019379,IN,False,,,0.0,...,14,BD1 4SJ,POINT (-1.74961030805808 53.8678047171656),Home,ENG,GB,True,England,United Kingdom,
18504,13933.0,20774.0,,4.0,AW00013933,IN,False,,,2.0,...,9,90505,POINT (-118.20918788284 33.7799182919931),Home,CA,US,False,California,United States,Isabella F Richardson
18505,24634.0,20775.0,,1.0,AW00024634,IN,False,,,0.0,...,79,98312,POINT (-122.900630062084 47.5744175658405),Home,WA,US,False,Washington,United States,Crystal S He
18506,21127.0,20776.0,,7.0,AW00021127,IN,False,,,1.0,...,164,78000,POINT (2.1423481823744 48.7821960594244),Home,78,FR,False,Yveline,France,


In [23]:
# renaming aenc customer columns to match the other customer data
aenc_customer.rename(columns={'id': 'CustomerID', 'address' : 'Address', 'city':'City' , 'state' : 'State', 'zip' : 'Zip', 'phone': 'Phone', 'company_name' : 'CompanyName'}, inplace=True)

# combining fname and lname columns to create a contact name column
aenc_customer['ContactName'] = aenc_customer['fname'] + ' ' + aenc_customer['lname']
aenc_customer.drop(columns=['fname', 'lname'], inplace=True)

aenc_customer

Unnamed: 0,CustomerID,Address,City,State,Zip,Phone,CompanyName,ContactName
0,101,3114 Pioneer Avenue,Rutherford,NJ,07070,2015558966,The Power Group,Michaels Devlin
1,102,1033 Whippany Road,New York,NY,10154,2125558725,AMF Corp.,Beth Reiser
2,103,1990 Windsor Street,Paoli,PA,19301,2155556513,Darling Associates,Erin Niedringhaus
3,104,550 Dundas Street East,Knoxville,TN,37919,6155555463,P.S.C.,Meghan Mason
4,105,1210 Highway 36,Carmel,IN,46032,3175558437,Amo & Sons,Laura McCarthy
...,...,...,...,...,...,...,...,...
121,552,654 West Hill,Nashville,TN,37320,6155553689,Greensleeves,Janice O'Toole
122,553,77 Recordings Circle,Tacoma,WA,96521,5095551695,It's a Hit!,Stevie Nickolas
123,555,99 Main Street,Los Angeles,CA,90205,2135554457,Quaker Fashions,Philipe Fernandez
124,661,3 Back Pages Lane,Missola,IL,60505,7085556857,Stutzman Advertising,Jennifer Stutzman


In [24]:
# Combining all customer data
customers = pd.concat([northwind_combined_customer, aenc_customer, adventureworks_combined_customers], ignore_index=True)

# replace empty values with None
customers = customers.where(pd.notnull(customers), None)
customers = customers.replace({np.nan: None})

# replacing all empty strings with None
customers = customers.infer_objects(copy=False).replace(r'^\s*$', None, regex=True)

customers

Unnamed: 0,CustomerID,CompanyName,ContactName,ContactTitle,Address,City,Region,Zip,Country,Phone,...,AddressTypeID,AddressLine2,StateProvinceID,SpatialLocation,AddressType,StateProvinceCode,CountryRegionCode,IsOnlyStateProvinceFlag,StateProvince,CountryRegion
0,ALFKI,Alfreds Futterkiste,Maria Anders,Sales Representative,Obere Str. 57,Berlin,,12209,Germany,030-0074321,...,,,,,,,,,,
1,ANATR,Ana Trujillo Emparedados y helados,Ana Trujillo,Owner,Avda. de la ConstituciÃ³n 2222,MÃ©xico D.F.,,05021,Mexico,(5) 555-4729,...,,,,,,,,,,
2,ANTON,Antonio Moreno TaquerÃ­a,Antonio Moreno,Owner,Mataderos 2312,MÃ©xico D.F.,,05023,Mexico,(5) 555-3932,...,,,,,,,,,,
3,AROUT,Around the Horn,Thomas Hardy,Sales Representative,120 Hanover Sq.,London,,WA1 1DP,UK,(171) 555-7788,...,,,,,,,,,,
4,BERGS,Berglunds snabbkÃ¶p,Christina Berglund,Order Administrator,BerguvsvÃ¤gen 8,LuleÃ¥,,S-958 22,Sweden,0921-12 34 65,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18720,19379.0,,,,988 Mt. Everest Court,W. York,,BD1 4SJ,,,...,2.0,,14.0,POINT (-1.74961030805808 53.8678047171656),Home,ENG,GB,True,England,United Kingdom
18721,13933.0,,Isabella F Richardson,,7413 Alpine Drive,Torrance,,90505,,,...,2.0,,9.0,POINT (-118.20918788284 33.7799182919931),Home,CA,US,False,California,United States
18722,24634.0,,Crystal S He,,4764 East Avenue,Bremerton,,98312,,,...,2.0,,79.0,POINT (-122.900630062084 47.5744175658405),Home,WA,US,False,Washington,United States
18723,21127.0,,,,"34334, rue Jean Mermoz",Versailles,,78000,,,...,2.0,,164.0,POINT (2.1423481823744 48.7821960594244),Home,78,FR,False,Yveline,France


## Loading the data into the UnitedOutdoors datawarehouse

### Products

In [27]:
products_dtypes = {
    'ProductID': Integer,
    'ProductName': NVARCHAR(50),
    'SupplierID': Integer,
    'CategoryID': Integer,
    'QuantityPerUnit': NVARCHAR(30),
    'UnitPrice': MONEY,
    'UnitsInStock': Integer,
    'UnitsOnOrder': Integer,
    'ReorderLevel': Integer,
    'Discontinued': BIT,
    'Description': NVARCHAR(100),
    'ProdSize': NVARCHAR(50),
    'Color': NVARCHAR(15),
    'Quantity': Integer,
    'picture_name': NVARCHAR(50),
    'Category': NVARCHAR(20),
    'Name': NVARCHAR(50),
    'ProductNumber': NVARCHAR(25),
    'MakeFlag': BIT,
    'FinishedGoodsFlag': BIT,
    'SafetyStockLevel': Integer,
    'ReorderPoint': Integer,
    'StandardCost': DECIMAL(8,4),
    'ListPrice': MONEY,
    'Size': NVARCHAR(5),
    'SizeUnitMeasureCode': CHAR(2),
    'WeightUnitMeasureCode': NVARCHAR(3),
    'Weight': DECIMAL(8,2),
    'DaysToManufacture': Integer,
    'ProductLine': CHAR(1),
    'Class': CHAR(1),
    'Style': CHAR(1),
    'ProductSubcategoryID': Integer,
    'ProductModelID': Integer,
    'SellStartDate': DATE,
    'SellEndDate': DATE,
    'DiscontinuedDate': DATE,
    'ModifiedDate': DATE,
    'ProductCategoryID': Integer,
    'ProductDescriptionID': Integer,
    'DocumentNode': NVARCHAR(50),  # HIERARCHYID is not supported in SQLAlchemy, using NVARCHAR as a substitute
    'CatalogDescription': XML,
    'Instructions': XML,
    'IllustrationID': Integer,
    'CultureID': NVARCHAR(10),
    'ProductPhotoID': Integer,
    'ThumbNailPhoto': NVARCHAR,
    'ThumbnailPhotoFileName': NVARCHAR(50),
    'LargePhoto': NVARCHAR,
    'LargePhotoFileName': NVARCHAR(50),
    'Primary': BIT
}

# applying the data types to the columns
products['ProductID'] = products['ProductID'].astype(str)
# TODO import these as strings from database
products['ThumbNailPhoto'] = products['ThumbNailPhoto'].astype(str)
products['LargePhoto'] = products['LargePhoto'].astype(str)

# stripping all columns with string data
products = products.map(lambda x: x.strip() if isinstance(x, str) else x)

products.to_sql(name='Products', schema='dbo', con=united_outdoors_engine, if_exists='append', index=False)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf7 in position 10: invalid start byte

### Regions

In [None]:
regions_dtypes = {
    'RegionID': Integer,
    'RegionName': VARCHAR(10),
    'StateProvinceID': Integer,
    'StateProvinceCode': VARCHAR(10),
    'CountryRegionCode': CHAR(2),
    'IsOnlyStateProvinceFlag': BIT,
    'Name': VARCHAR(50),
    'TerritoryID': Integer,
    'ModifiedDate': Date
}

regions['StateProvinceCode'] = regions['StateProvinceCode'].astype(str)

# stripping all columns with string data
regions = regions.map(lambda x: x.strip() if isinstance(x, str) else x)

#regions['StateProvinceCode'] = regions['StateProvinceCode'].str.slice(0, 2)

regions.to_sql(name='Regions', schema='dbo', con=united_outdoors_engine, if_exists='append', index=False, dtype=regions_dtypes)

### Customers

In [None]:
customers_dtypes = {
    'CustomerID': NVARCHAR(10),
    'CompanyName': NVARCHAR(40),
    'ContactName': NVARCHAR(60),
    'ContactTitle': NVARCHAR(30),
    'Address': NVARCHAR(60),
    'City': NVARCHAR(30),
    'Region': VARCHAR(15),
    'Zip': VARCHAR(10),
    'Country': VARCHAR(15),
    'Phone': VARCHAR(24),
    'Fax': VARCHAR(24),
    'CustomerTypeID': VARCHAR,
    'CustomerDesc': VARCHAR(100),
    'State': CHAR(2),
    'PersonID': Integer,
    'StoreID': Integer,
    'TerritoryID': Integer,
    'AccountNumber': CHAR(10),
    'PersonType': VARCHAR(2),
    'NameStyle': BIT,
    'Title': VARCHAR(10),
    'Suffix': VARCHAR(10),
    'EmailPromotion': Integer,
    'AdditionalContactInfo': XML,
    'Demographics': XML,
    'AddressID': Integer,
    'AddressTypeID': Integer,
    'AddressLine2': VARCHAR(60),
    'StateProvinceID': Integer,
    'SpatialLocation': VARCHAR,  # SQLAlchemy does not support the GEOGRAPHY data type
    'AddressType': VARCHAR(50),
    'StateProvinceCode': VARCHAR(10),
    'CountryRegionCode': CHAR(2),
    'IsOnlyStateProvinceFlag': BIT,
    'StateProvince': VARCHAR(50),
    'CountryRegion': VARCHAR(50)
}

# applying the data types to the columns
customers['CustomerID'] = customers['CustomerID'].astype(str)

# stripping all columns with string data
customers = customers.map(lambda x: x.strip() if isinstance(x, str) else x)

customers.to_sql(name='Customers', schema='dbo', con=united_outdoors_engine, if_exists='append', index=False)

## Constraints
altering the tables to add the (foreign key) constraints

In [None]:
# opening the UnitedOutdoors_constraints.sql file
with open('sql/UnitedOutdoors_constraints.sql', 'r') as file:
    sql_script = file.read()

# Execute the script
split_and_execute_sql_script(sql_script, united_outdoors_conn)

# Closing connections

In [None]:
try:
    united_outdoors_conn.close()
    northwind_conn.close()
    aenc_conn.close()
    adventureworks_conn.close()
except OperationalError as e:
        print(f'Error: {e}')
finally:
    united_outdoors_engine.dispose()
    northwind_engine.dispose()
    aenc_engine.dispose()
    adventureworks_engine.dispose()