In [1]:
import os
import sqlite3
import sys
from datetime import datetime

import numpy as np
import pandas as pd

sys.path.append('../')
from user_packages import hashing


In [2]:
# Notebook configuration - everything a reader might need to change for a new load.

# Where the input sheet lives (an empty path means "next to this notebook").
source_file_path = ""
source_file_name = "OH_repair_tables_analysis.csv"

# Name of the model this load is aimed at.
target_model_name = "excited.skirt.earth"

# Values stamped onto every row that gets loaded.
record_source = "SQLServerAnalysis"
SystemKeyPhrase = "OpenHousing"  # this should come from the System Hub


In [3]:
# db connect
# Opens the SQLite metadata database via a relative path, so it is resolved
# against the kernel's current working directory. `conn` is the single
# connection used by every staging and vault write further down the notebook.
# NOTE(review): the connection is not closed anywhere in the visible cells.
conn = sqlite3.connect('../full_metadata.db')

In [4]:
# read table to df
# Build the location from both configured parts so that `source_file_path` is
# actually honoured; os.path.join('', name) is just `name`, so the default
# (empty) path still reads the file from the current working directory.
df = pd.read_csv(os.path.join(source_file_path, source_file_name))

# Swap NaN for None in every cell.
# NOTE(review): presumably so missing values reach SQLite as NULL - confirm.
df = df.replace({np.nan: None})


#df

In [5]:
# add extra columns

# Load-level values: the same value is written to every row of the sheet.
df['RecordSource'] = record_source
df['LoadDate'] = datetime.now()
df['SystemKeyPhrase'] = SystemKeyPhrase


# Business key of a physical structure: server.database.schema.table.
# Built row by row; a non-string part (e.g. a missing value) raises TypeError.
df['PhysicalStructureKeyPhrase'] = df.apply(
  lambda row: '.'.join(
    row[part]
    for part in ('SERVER_NAME', 'DATABASE_NAME', 'SCHEMA_NAME', 'TABLE_NAME')
  )
  , axis=1
)

# Hash key for the structure, derived from its key phrase.
# NOTE(review): add_md5_hash_column lives in user_packages.hashing and is not
# visible here - presumably it returns the frame with one extra column, named
# by `md5_column_name`, holding an MD5 over the listed columns. Confirm.
df = hashing.add_md5_hash_column(
  df,
  md5_column_name='PhysicalStructureHashKey',
  columns=['PhysicalStructureKeyPhrase'],
)


# Hash of the descriptive payload, used later as HashDiff when comparing a
# staged row against the satellite.
df = hashing.add_md5_hash_column(
  df,
  md5_column_name='HashDiff',
  columns=[
    'SERVER_NAME',
    'DATABASE_NAME',
    'SCHEMA_NAME',
    'TABLE_NAME',
    'ROW_COUNT',
    'TABLE_TYPE',
    'TABLE_TYPE_DESCRIPTION',
    'COLUMN_COUNT',
    'TABLE_DESCRIPTION',
  ],
)

#df

In [6]:
# Frame that feeds the tag staging table: the sheet minus its descriptive
# payload, the individual key parts and the derived helper columns.
# IS_REFERENCE is not in the drop list, so it stays on this frame.
df_reference_tag = df.drop(
  columns=[
    # descriptive payload
    'ROW_COUNT',
    'TABLE_TYPE',
    'TABLE_TYPE_DESCRIPTION',
    'COLUMN_COUNT',
    'TABLE_DESCRIPTION',
    # key parts and derived columns
    'SERVER_NAME',
    'DATABASE_NAME',
    'SCHEMA_NAME',
    'TABLE_NAME',
    'HashDiff',
    'PhysicalStructureKeyPhrase',
    'SystemKeyPhrase',
  ]
)

# Frame that feeds the structure staging table: everything except the flag.
# This rebinds `df`, so the cell cannot be re-run on its own - a second run
# raises KeyError because IS_REFERENCE is already gone.
df = df.drop(columns='IS_REFERENCE')

#df_reference_tag

In [23]:
# Staging is truncate-and-reload: empty the table, commit that, then append
# the current frame (without the DataFrame index) to the now-empty table.
conn.execute("DELETE FROM stg_Py_SqlServerCsvToPhysicalStructure")
conn.commit()
df.to_sql(
  name='stg_Py_SqlServerCsvToPhysicalStructure',
  con=conn,
  if_exists='append',
  index=False,
)

37

In [45]:
# Write to the Hub (PhysicalStructure)
# NO, we are not creating new structures

In [24]:
# Write to the Satellite (PhysicalStructure_SqlServerScrape)
sql_query = """
INSERT INTO rv_s_PhysicalStructure_SqlServerScrape
(
    PhysicalStructureHashKey
  , LoadDate
  , RecordSource
  , HashDiff

  , SERVER_NAME
  , "DATABASE_NAME"
  , SCHEMA_NAME
  , TABLE_NAME
  , ROW_COUNT
  , TABLE_TYPE
  , TABLE_TYPE_DESCRIPTION
  , COLUMN_COUNT
  , TABLE_DESCRIPTION

)
SELECT DISTINCT
    stg.PhysicalStructureHashKey
  , stg.LoadDate
  , stg.RecordSource
  , stg.HashDiff
  
  , stg.SERVER_NAME
  , stg."DATABASE_NAME"
  , stg.SCHEMA_NAME
  , stg.TABLE_NAME
  , stg.ROW_COUNT
  , stg.TABLE_TYPE
  , stg.TABLE_TYPE_DESCRIPTION
  , stg.COLUMN_COUNT
  , stg.TABLE_DESCRIPTION

FROM
  stg_Py_SqlServerCsvToPhysicalStructure AS stg
  LEFT OUTER JOIN rv_s_PhysicalStructure_SqlServerScrape AS sat ON (
    stg.PhysicalStructureHashKey = sat.PhysicalStructureHashKey
    AND sat.LoadDate = (
      SELECT MAX(z.LoadDate)
      FROM rv_s_PhysicalStructure_SqlServerScrape AS z
      WHERE z.PhysicalStructureHashKey = sat.PhysicalStructureHashKey
    )
  )
WHERE
  (
    sat.HashDiff != stg.HashDiff
    OR sat.HashDiff IS NULL
  )
""";
conn.execute(sql_query)
conn.commit()

In [None]:
# No inserting to links - this type of sheet shouldn't have created new Structures

In [30]:
# Staging is truncate-and-reload: empty the tag staging table, commit that,
# then append the tag frame (without the DataFrame index).
conn.execute("DELETE FROM stg_Py_SqlServerAnalysisCsvToPhysicalStructureTag")
conn.commit()
df_reference_tag.to_sql(
  name='stg_Py_SqlServerAnalysisCsvToPhysicalStructureTag',
  con=conn,
  if_exists='append',
  index=False,
)

37

In [None]:
# Then it all becomes a bit of a hack, because what we want to do now isn't really determined by what's in the sheet.
# Plus, we are working with what is effectively an effectivity satellite on a hub...

# Set the 'is reference' tag onto those tables that are reference
#
# A Structure can carry more than one tag in this satellite (this notebook also
# sets 'Repairs'), so "is the tag already active?" has to be answered per
# Structure AND per tag:
#   * current_tags takes the latest row for each (Structure, Tag) pair and keeps
#     it only while it is still open (EndDate IS NULL);
#   * the insert anti-joins on exactly the tag being set, so another active tag
#     on the same Structure can no longer trigger a duplicate insert.
# Only staged rows flagged IS_REFERENCE = 'Y' are considered.
sql_query = """
WITH
current_tags AS ( -- gets the active tags that are on the Structures in the stg table
  SELECT
      sat.PhysicalStructureHashKey
    , sat.Tag
    , sat.StartDate
    , sat.EndDate
  FROM
    rv_s_PhysicalStructure_AnalysisTag sat
    INNER JOIN stg_Py_SqlServerAnalysisCsvToPhysicalStructureTag stg ON (
      sat.PhysicalStructureHashKey = stg.PhysicalStructureHashKey
      AND sat.LoadDate = ( -- latest row for this Structure and this Tag
        SELECT MAX(z.LoadDate)
        FROM rv_s_PhysicalStructure_AnalysisTag AS z
        WHERE z.PhysicalStructureHashKey = sat.PhysicalStructureHashKey
          AND z.Tag = sat.Tag
      )
    )
  WHERE
    sat.EndDate IS NULL
)
INSERT INTO rv_s_PhysicalStructure_AnalysisTag
(
    PhysicalStructureHashKey
  , LoadDate
  , RecordSource

  , Tag
  , StartDate

)
SELECT DISTINCT
    stg.PhysicalStructureHashKey
  , stg.LoadDate
  , stg.RecordSource

  , 'Reference Data'
  , stg.LoadDate

FROM
  stg_Py_SqlServerAnalysisCsvToPhysicalStructureTag AS stg
  LEFT OUTER JOIN current_tags AS sat ON (
    stg.PhysicalStructureHashKey = sat.PhysicalStructureHashKey
    AND sat.Tag = 'Reference Data'
  )
WHERE
  sat.PhysicalStructureHashKey IS NULL -- this tag is not currently active
  AND stg.IS_REFERENCE = 'Y'
"""
# Execution is left disabled, as in the original cell; this cell only builds
# the statement.
#conn.execute(sql_query)
#conn.commit()


In [None]:
# Set the 'repairs' tag on all the tables
#
# A Structure can carry more than one tag in this satellite (this notebook also
# sets 'Reference Data'), so "is the tag already active?" has to be answered
# per Structure AND per tag:
#   * current_tags takes the latest row for each (Structure, Tag) pair and keeps
#     it only while it is still open (EndDate IS NULL);
#   * the insert anti-joins on exactly the tag being set, so another active tag
#     on the same Structure can no longer trigger a duplicate insert.
# Every staged Structure is considered, regardless of IS_REFERENCE.
sql_query = """
WITH
current_tags AS ( -- gets the active tags that are on the Structures in the stg table
  SELECT
      sat.PhysicalStructureHashKey
    , sat.Tag
    , sat.StartDate
    , sat.EndDate
  FROM
    rv_s_PhysicalStructure_AnalysisTag sat
    INNER JOIN stg_Py_SqlServerAnalysisCsvToPhysicalStructureTag stg ON (
      sat.PhysicalStructureHashKey = stg.PhysicalStructureHashKey
      AND sat.LoadDate = ( -- latest row for this Structure and this Tag
        SELECT MAX(z.LoadDate)
        FROM rv_s_PhysicalStructure_AnalysisTag AS z
        WHERE z.PhysicalStructureHashKey = sat.PhysicalStructureHashKey
          AND z.Tag = sat.Tag
      )
    )
  WHERE
    sat.EndDate IS NULL
)
INSERT INTO rv_s_PhysicalStructure_AnalysisTag
(
    PhysicalStructureHashKey
  , LoadDate
  , RecordSource

  , Tag
  , StartDate

)
SELECT DISTINCT
    stg.PhysicalStructureHashKey
  , stg.LoadDate
  , stg.RecordSource

  , 'Repairs'
  , stg.LoadDate

FROM
  stg_Py_SqlServerAnalysisCsvToPhysicalStructureTag AS stg
  LEFT OUTER JOIN current_tags AS sat ON (
    stg.PhysicalStructureHashKey = sat.PhysicalStructureHashKey
    AND sat.Tag = 'Repairs'
  )
WHERE
  sat.PhysicalStructureHashKey IS NULL -- this tag is not currently active
"""
# Execution is left disabled, as in the original cell; this cell only builds
# the statement.
#conn.execute(sql_query)
#conn.commit()