<span style="color: #608b4e;">-- A Physical Foreign Key is a relation between two Physical Attributes</span>

<span style="color: #608b4e;">-- Sometimes multiple Physical Attributes relate to multiple Physical Attributes (a multi-column key), in which case the order of the attributes matters</span>

<span style="color: #608b4e;">-- A key always has a name</span>

<span style="color: #608b4e;">-- A key may be present in the database and ENFORCED</span>

<span style="color: #608b4e;">-- A key may be present in the database and NOTENFORCED</span>

<span style="color: #608b4e;">-- A key may NOT be present in the database and hence is purely LOGICAL<br><br>------------------------</span>

<span style="color: #608b4e;">In a 1-M relation, the 'parent' table is the table that holds the ID and the information about that ID, e.g. 'CustomerID'. The 'child' table is the table that refers to the parent by key, e.g. 'RelatedCustomerID'.</span>

<span style="color: #608b4e;"><br></span>

In [1]:
import sqlite3
import pandas as pd
import numpy as np
from datetime import datetime
import os
import sys
sys.path.append('../../')
from user_packages import hashing

In [None]:
# Run configuration: which CSV export to load and how the load is labelled.

# Value stamped into the RecordSource column of every row loaded by this run.
record_source = 'ForeignKeyCsv'

# Location of the export; the file format is the same for dataverse and sqlserver.
source_file_path = '../../imports/physical/sqlserver'
source_file_name = 'foreign_key_import.csv'

In [3]:
# Open the SQLite metadata database; every later cell reads and writes through `conn`.
conn = sqlite3.connect(database='../../full_metadata.db')

In [17]:
# Load the foreign-key export into a DataFrame.
csv_path = os.path.join(source_file_path, source_file_name)

# NaN (pandas' marker for an empty CSV cell) is swapped for None.
df = pd.read_csv(csv_path).replace({np.nan: None})

In [16]:
# Add the derived columns: load metadata, attribute key phrases and MD5 hashes.


def _attribute_key_phrases(frame, side):
    """Return one 'server.database.schema.table.column' phrase per row.

    Args:
        frame: DataFrame holding the SERVER_NAME_/DATABASE_NAME_/SCHEMA_NAME_/
            TABLE_NAME_/COLUMN_NAME_ columns for the requested side.
        side: column-name suffix selecting the side, 'CHILD' or 'PARENT'.

    Returns:
        A list of dot-joined phrases, one per row, in row order.

    Raises:
        KeyError: if one of the expected columns is absent.
        TypeError: if any part of a row is missing (None) or not a string.
    """
    part_columns = [
        part + '_' + side
        for part in (
            'SERVER_NAME',
            'DATABASE_NAME',
            'SCHEMA_NAME',
            'TABLE_NAME',
            'COLUMN_NAME',
        )
    ]
    # Zipping the columns walks the rows without DataFrame.apply(axis=1), which
    # hands back a DataFrame instead of a Series when the frame has no rows and
    # so cannot be assigned to a single column.
    return [
        '.'.join(parts)
        for parts in zip(*(frame[column] for column in part_columns))
    ]


# columns with variable data
df['RecordSource'] = record_source
df['LoadDate'] = datetime.now()

# derive keyphrase columns
df['PhysicalAttribute_childKeyPhrase'] = _attribute_key_phrases(df, 'CHILD')
df['PhysicalAttribute_parentKeyPhrase'] = _attribute_key_phrases(df, 'PARENT')

# Hash columns to add, as (new column name, source columns), in creation order.
# The first four are key hashes. HashDiff is the payload hash: as this is
# essentially an effectivity satellite, the only field that gets hashed is the
# enforcement; start/end dates are dealt with in the queries.
hash_column_specs = [
    ('PhysicalAttribute_childHashKey', ['PhysicalAttribute_childKeyPhrase']),
    ('PhysicalAttribute_parentHashKey', ['PhysicalAttribute_parentKeyPhrase']),
    ('PhysicalForeignKeyHashKey', ['CONSTRAINT_NAME']),
    (
        'PhysicalForeignKeyPhysicalAttributePhysicalAttributeHashKey',
        [
            'CONSTRAINT_NAME',
            'PhysicalAttribute_childKeyPhrase',
            'PhysicalAttribute_parentKeyPhrase',
            'ORDINAL_POSITION',
        ],
    ),
    ('HashDiff', ['CONSTRAINT_ENFORCEMENT']),
]
for hash_column_name, hashed_columns in hash_column_specs:
    # hashing.add_md5_hash_column is a project helper; its result replaces df.
    df = hashing.add_md5_hash_column(
        df,
        md5_column_name=hash_column_name,
        columns=hashed_columns,
    )

In [10]:
# Empty the staging table, then append the prepared rows to it.
staging_table = 'stg_Py_ForeignKeyCsvToPhysicalForeignKey'

conn.execute("DELETE FROM " + staging_table)
conn.commit()

# if_exists='append' keeps the existing table definition; the index is not written.
df.to_sql(name=staging_table, con=conn, if_exists='append', index=False)

6

In [11]:
# Hub load: insert one row per staged foreign key whose hash key the hub
# does not hold yet. Keys already in the hub are skipped.
sql_query = """
INSERT INTO rv_h_PhysicalForeignKey
(
    PhysicalForeignKeyHashKey
  , LoadDate
  , RecordSource
  , PhysicalForeignKeyName
)
SELECT DISTINCT
    PhysicalForeignKeyHashKey
  , LoadDate
  , RecordSource
  , CONSTRAINT_NAME
FROM
  stg_Py_ForeignKeyCsvToPhysicalForeignKey
WHERE
  PhysicalForeignKeyHashKey NOT IN (SELECT PhysicalForeignKeyHashKey FROM rv_h_PhysicalForeignKey)
"""

conn.execute(sql_query)
conn.commit()

In [13]:
# Satellite load: start a new effectivity row for every staged key that has no
# satellite row yet, or whose latest satellite row (highest LoadDate) carries a
# different HashDiff. Existing rows are never ended here.
# NOTE(review): EffectiveStart is stamped by SQLite's datetime(), while LoadDate
# comes from Python's datetime.now() - confirm both clocks are meant to be mixed.
sql_query = """
INSERT INTO rv_s_PhysicalForeignKey
(
    PhysicalForeignKeyHashKey
  , LoadDate
  , RecordSource
  , HashDiff

  , Enforcement
  , EffectiveStart

)
SELECT DISTINCT
    stg.PhysicalForeignKeyHashKey
  , stg.LoadDate
  , stg.RecordSource
  , stg.HashDiff
  
  , stg.CONSTRAINT_ENFORCEMENT
  , datetime()

FROM
  stg_Py_ForeignKeyCsvToPhysicalForeignKey AS stg
  LEFT OUTER JOIN rv_s_PhysicalForeignKey AS sat ON (
    stg.PhysicalForeignKeyHashKey = sat.PhysicalForeignKeyHashKey
    AND sat.LoadDate = (
      SELECT MAX(z.LoadDate)
      FROM rv_s_PhysicalForeignKey AS z
      WHERE z.PhysicalForeignKeyHashKey = sat.PhysicalForeignKeyHashKey
    )
  )
WHERE
  (
    sat.HashDiff != stg.HashDiff
    OR sat.HashDiff IS NULL
  )
"""

conn.execute(sql_query)
conn.commit()

In [15]:
# Link load: insert one row per staged (foreign key, child attribute, parent
# attribute, ordinal position) relation whose link hash key is not in the link
# table yet.
# DISTINCT mirrors the hub and satellite loads, so a relation that appears more
# than once in staging is inserted only once. Every column is qualified with the
# staging alias so the NOT IN test cannot be mistaken for the subquery's column.
sql_query = """
INSERT INTO rv_l_PhysicalForeignKeyPhysicalAttributePhysicalAttribute
(
  PhysicalForeignKeyPhysicalAttributePhysicalAttributeHashKey
  , LoadDate
  , RecordSource
  , PhysicalForeignKeyHashKey
  , PhysicalAttribute_childHashKey
  , PhysicalAttribute_parentHashKey
  , OrdinalPosition
)
SELECT DISTINCT
  stg.PhysicalForeignKeyPhysicalAttributePhysicalAttributeHashKey
  , stg.LoadDate
  , stg.RecordSource
  , stg.PhysicalForeignKeyHashKey
  , stg.PhysicalAttribute_childHashKey
  , stg.PhysicalAttribute_parentHashKey
  , stg.ORDINAL_POSITION
FROM
  stg_Py_ForeignKeyCsvToPhysicalForeignKey stg
WHERE
  stg.PhysicalForeignKeyPhysicalAttributePhysicalAttributeHashKey IS NOT NULL
  AND stg.PhysicalForeignKeyPhysicalAttributePhysicalAttributeHashKey NOT IN (
    SELECT PhysicalForeignKeyPhysicalAttributePhysicalAttributeHashKey
    FROM rv_l_PhysicalForeignKeyPhysicalAttributePhysicalAttribute
  )
"""

conn.execute(sql_query)
conn.commit()