In [1]:
import os
import sqlite3
import sys
from datetime import datetime

import numpy as np
import pandas as pd

sys.path.append('../')
from user_packages import hashing

In [2]:
# Notebook configuration.

# Where the scraped column metadata is read from.
source_file_path = ''
source_file_name = 'oh_columns_expanded.csv'

target_model_name = 'excited.skirt.earth'

# Constant values written onto every staged row.
# TODO: SystemKeyPhrase should come from the System Hub rather than being hard-coded.
SystemKeyPhrase = 'OpenHousing'
record_source = 'SQLServerScrape'

In [3]:
# Open the metadata database; `conn` is shared by every later cell.
# The path is relative, so it depends on the notebook's working directory.
metadata_db_path = '../full_metadata.db'
conn = sqlite3.connect(metadata_db_path)

In [21]:
# Read the scraped column metadata CSV into `df`.
#
# The length/precision columns are read with pandas' nullable 'Int64' dtype
# so rows without a value do not force the whole column to float.
dtypeDict = {
    'CHARACTER_MAXIMUM_LENGTH': 'Int64'
  , 'CHARACTER_OCTET_LENGTH': 'Int64'
  , 'NUMERIC_PRECISION': 'Int64'
  , 'NUMERIC_PRECISION_RADIX': 'Int64'
  , 'NUMERIC_SCALE': 'Int64'
  , 'DATETIME_PRECISION': 'Int64'
}

# Honour source_file_path from the config cell (it was previously ignored).
# With the default '' this resolves to the bare file name, exactly as before.
source_file = os.path.join(source_file_path, source_file_name)

# Tip: pass nrows=10 to read_csv for a quick sample run.
df = pd.read_csv(source_file, dtype = dtypeDict)

# Swap NaN for None (presumably so missing values are stored as NULL
# when the frame is written to SQLite — confirm against to_sql behaviour).
df = df.replace({np.nan: None})

#df

In [22]:
# add extra columns

# Load metadata columns: each value is one scalar repeated on every row.
# LoadDate is taken once, so all rows of this run share the same timestamp.
df = df.assign(
    RecordSource = record_source
  , LoadDate = datetime.now()
  , SystemKeyPhrase = SystemKeyPhrase
)


# derive keyphrase columns
#
# Key phrases are the name parts joined with '.':
#   structure : SERVER.DATABASE.SCHEMA.TABLE
#   attribute : SERVER.DATABASE.SCHEMA.TABLE.COLUMN
#   link      : <structure key phrase>:<attribute key phrase>
#
# They are built with vectorised string concatenation rather than a row-wise
# df.apply(axis=1): it is much faster on tens of thousands of rows and also
# works on an empty frame, where a row-wise apply hands back a DataFrame
# instead of a Series and the column assignment fails.
structure_key_columns = ['SERVER_NAME', 'DATABASE_NAME', 'SCHEMA_NAME', 'TABLE_NAME']
attribute_key_columns = structure_key_columns + ['COLUMN_NAME']

# A missing name part cannot form a valid key phrase. Fail with a clear
# message instead of an opaque TypeError (or a silently null key phrase).
rows_missing_key_part = df[attribute_key_columns].isna().any(axis=1)
if rows_missing_key_part.any():
    raise ValueError(
        f"{int(rows_missing_key_part.sum())} row(s) have a missing value in one of "
        f"{attribute_key_columns}; cannot build key phrases"
    )

df['PhysicalStructureKeyPhrase'] = (
    df['SERVER_NAME']
    + '.' + df['DATABASE_NAME']
    + '.' + df['SCHEMA_NAME']
    + '.' + df['TABLE_NAME']
)
# The attribute key phrase is the structure key phrase plus the column name.
df['PhysicalAttributeKeyPhrase'] = (
    df['PhysicalStructureKeyPhrase'] + '.' + df['COLUMN_NAME']
)
df['PhysicalStructurePhysicalAttributeKeyPhrase'] = (
    df['PhysicalStructureKeyPhrase'] + ':' + df['PhysicalAttributeKeyPhrase']
)


# hash the keyphrases
# Each hash key column is produced from exactly one keyphrase column by
# hashing.add_md5_hash_column (hashing rules live in user_packages.hashing).
keyphrase_by_hash_key = {
    'PhysicalStructureHashKey': 'PhysicalStructureKeyPhrase'
  , 'PhysicalAttributeHashKey': 'PhysicalAttributeKeyPhrase'
  , 'PhysicalStructurePhysicalAttributeHashKey': 'PhysicalStructurePhysicalAttributeKeyPhrase'
}
for hash_key_column, keyphrase_column in keyphrase_by_hash_key.items():
    df = hashing.add_md5_hash_column(
        df
      , md5_column_name = hash_key_column
      , columns = [keyphrase_column]
    )


# hash the payload
# HashDiff is computed over the five name parts followed by every descriptive
# column, in the order listed here; the satellite load compares it to detect changes.
name_part_columns = [
    'SERVER_NAME'
  , 'DATABASE_NAME'
  , 'SCHEMA_NAME'
  , 'TABLE_NAME'
  , 'COLUMN_NAME'
]
descriptive_columns = [
    'ORDINAL_POSITION'
  , 'COLUMN_DEFAULT'
  , 'IS_NULLABLE'
  , 'DATA_TYPE'
  , 'CHARACTER_MAXIMUM_LENGTH'
  , 'CHARACTER_OCTET_LENGTH'
  , 'NUMERIC_PRECISION'
  , 'NUMERIC_PRECISION_RADIX'
  , 'NUMERIC_SCALE'
  , 'DATETIME_PRECISION'
  , 'CHARACTER_SET_CATALOG'
  , 'CHARACTER_SET_SCHEMA'
  , 'CHARACTER_SET_NAME'
  , 'COLLATION_CATALOG'
  , 'COLLATION_SCHEMA'
  , 'COLLATION_NAME'
  , 'DOMAIN_CATALOG'
  , 'DOMAIN_SCHEMA'
  , 'DOMAIN_NAME'
  , 'COLUMN_DESCRIPTION'
]
df = hashing.add_md5_hash_column(
    df
  , md5_column_name = 'HashDiff'
  , columns = name_part_columns + descriptive_columns
)

#df

In [23]:
# Reload staging: empty the table, commit, then append the current frame.
# to_sql stays the last expression so its return value is shown as the cell output.
staging_table = 'stg_Py_SqlServerCsvToPhysicalAttribute'
conn.execute(f"DELETE FROM {staging_table}")
conn.commit()
df.to_sql(name=staging_table, con=conn, if_exists='append', index=False)

39707

In [24]:
# Write to the Hub (PhysicalAttribute)
# Inserts the distinct (hash key, load date, record source, key phrase) rows
# from staging whose PhysicalAttributeHashKey is not already present in
# rv_h_PhysicalAttribute, so re-running the cell adds nothing new.
sql_query = """
INSERT INTO rv_h_PhysicalAttribute
(
    PhysicalAttributeHashKey
  , LoadDate
  , RecordSource
  , PhysicalAttributeKeyPhrase
)
SELECT DISTINCT
    PhysicalAttributeHashKey
  , LoadDate
  , RecordSource
  , PhysicalAttributeKeyPhrase
FROM
  stg_Py_SqlServerCsvToPhysicalAttribute
WHERE
  PhysicalAttributeHashKey NOT IN (SELECT PhysicalAttributeHashKey FROM rv_h_PhysicalAttribute)
""";
# Run the insert and make it durable before the next cell runs.
conn.execute(sql_query)
conn.commit()

In [25]:
# Write to the Satellite (PhysicalAttribute_SqlServerScrape)
sql_query = """
INSERT INTO rv_s_PhysicalAttribute_SqlServerScrape
(
    PhysicalAttributeHashKey
  , LoadDate
  , RecordSource
  , HashDiff

  , SERVER_NAME
  , "DATABASE_NAME"
  , SCHEMA_NAME
  , TABLE_NAME
  , COLUMN_NAME

  , ORDINAL_POSITION
  , COLUMN_DEFAULT
  , IS_NULLABLE
  , DATA_TYPE
  , CHARACTER_MAXIMUM_LENGTH
  , CHARACTER_OCTET_LENGTH
  , NUMERIC_PRECISION
  , NUMERIC_PRECISION_RADIX
  , NUMERIC_SCALE
  , DATETIME_PRECISION
  , CHARACTER_SET_CATALOG
  , CHARACTER_SET_SCHEMA
  , CHARACTER_SET_NAME
  , COLLATION_CATALOG
  , COLLATION_SCHEMA
  , COLLATION_NAME
  , DOMAIN_CATALOG
  , DOMAIN_SCHEMA
  , DOMAIN_NAME
  , COLUMN_DESCRIPTION

)
SELECT DISTINCT
    stg.PhysicalAttributeHashKey
  , stg.LoadDate
  , stg.RecordSource
  , stg.HashDiff
  
  , stg.SERVER_NAME
  , stg."DATABASE_NAME"
  , stg.SCHEMA_NAME
  , stg.TABLE_NAME
  , stg.COLUMN_NAME

  , stg.ORDINAL_POSITION
  , stg.COLUMN_DEFAULT
  , stg.IS_NULLABLE
  , stg.DATA_TYPE
  , stg.CHARACTER_MAXIMUM_LENGTH
  , stg.CHARACTER_OCTET_LENGTH
  , stg.NUMERIC_PRECISION
  , stg.NUMERIC_PRECISION_RADIX
  , stg.NUMERIC_SCALE
  , stg.DATETIME_PRECISION
  , stg.CHARACTER_SET_CATALOG
  , stg.CHARACTER_SET_SCHEMA
  , stg.CHARACTER_SET_NAME
  , stg.COLLATION_CATALOG
  , stg.COLLATION_SCHEMA
  , stg.COLLATION_NAME
  , stg.DOMAIN_CATALOG
  , stg.DOMAIN_SCHEMA
  , stg.DOMAIN_NAME
  , stg.COLUMN_DESCRIPTION

FROM
  stg_Py_SqlServerCsvToPhysicalAttribute AS stg
  LEFT OUTER JOIN rv_s_PhysicalAttribute_SqlServerScrape AS sat ON (
    stg.PhysicalAttributeHashKey = sat.PhysicalAttributeHashKey
    AND sat.LoadDate = (
      SELECT MAX(z.LoadDate)
      FROM rv_s_PhysicalAttribute_SqlServerScrape AS z
      WHERE z.PhysicalAttributeHashKey = sat.PhysicalAttributeHashKey
    )
  )
WHERE
  (
    sat.HashDiff != stg.HashDiff
    OR sat.HashDiff IS NULL
  )
""";
conn.execute(sql_query)
conn.commit()

In [26]:
# Write to the Link (PhysicalStructure - PhysicalAttribute)
# Inserts one link row per staged structure/attribute pair whose link hash key
# is non-null and not already present in rv_l_PhysicalStructurePhysicalAttribute.
# SELECT DISTINCT matches the hub and satellite loads, so duplicate rows in
# staging cannot produce duplicate link rows. Every column is qualified with
# its table alias so the NOT IN filter is unambiguous.
sql_query = """
INSERT INTO rv_l_PhysicalStructurePhysicalAttribute
(
  PhysicalStructurePhysicalAttributeHashKey
  , LoadDate
  , RecordSource
  , PhysicalStructureHashKey
  , PhysicalAttributeHashKey
)
SELECT DISTINCT
  stg.PhysicalStructurePhysicalAttributeHashKey
  , stg.LoadDate
  , stg.RecordSource
  , stg.PhysicalStructureHashKey
  , stg.PhysicalAttributeHashKey
FROM
  stg_Py_SqlServerCsvToPhysicalAttribute stg
WHERE
  stg.PhysicalStructurePhysicalAttributeHashKey IS NOT NULL
  AND stg.PhysicalStructurePhysicalAttributeHashKey NOT IN (
    SELECT lnk.PhysicalStructurePhysicalAttributeHashKey
    FROM rv_l_PhysicalStructurePhysicalAttribute lnk
  )
"""
# Run the insert and commit it.
conn.execute(sql_query)
conn.commit()