In [1]:
import sqlite3
import pandas as pd
import numpy as np
from datetime import datetime
import os
import sys
sys.path.append('../')
from user_packages import hashing


In [2]:
# Run configuration: which report file to load and how to label its rows.

# Report file to ingest, and the directory it lives in
# (an empty path means the file name is used as given).
source_file_name = "Pro2_IH_column_analysis.rpt"
source_file_path = ""

# Value written to the RecordSource column of every loaded row.
record_source = "SQLServerAnalysis"


In [3]:
# Open the SQLite metadata database that the later cells write to.
metadata_db_path = '../full_metadata.db'
conn = sqlite3.connect(metadata_db_path)

In [4]:
# Read the whole report into memory. The 'utf-8-sig' codec skips a leading BOM if present.
report_path = os.path.join(source_file_path, source_file_name)
with open(report_path, 'r', encoding='utf-8-sig') as file:
    lines = file.readlines()

# Substrings that mark non-data lines in the report.
_NOISE_MARKERS = ('affected)', 'Completion time:', 'Warning: ')


def _is_data_line(raw_line):
    """Return True for a non-blank line that contains none of the noise markers."""
    if not raw_line.strip():
        return False
    return not any(marker in raw_line for marker in _NOISE_MARKERS)


# Keep only the data lines, with surrounding whitespace removed.
filtered_lines = [raw_line.strip() for raw_line in lines if _is_data_line(raw_line)]

# TODO: doing count(*) on columns with a mix of NULL and non-NULL values produces the
# warning below. Rework the query template so it is no longer emitted.
#   Warning: Null value is eliminated by an aggregate or other SET operation.
# Two of the counts can be dropped (which will hopefully remove the NULL warnings),
# because they can be derived from the others:
#   NULL_OR_BLANK_COUNT     = NULL_COUNT + BLANK_COUNT
#   NOT_NULL_OR_BLANK_COUNT = NOT_NULL_COUNT + (NOT_BLANK_COUNT - NULL_COUNT)

#filtered_lines

In [5]:
# Break every retained line into its comma-separated fields. A plain split is
# acceptable here because the values are not expected to contain commas.
data = [text_line.split(',') for text_line in filtered_lines]

# Build the frame and remove exact duplicate rows
# (presumably repeated header lines between result sets - confirm).
df = pd.DataFrame(data).drop_duplicates()

# The first remaining row holds the column names: use it as the header,
# then keep only the rows after it.
header_row = df.iloc[0]
df.columns = header_row
df = df.iloc[1:]
#df

In [6]:
# add extra columns
#
# Adds load metadata, the business-key phrase, and two MD5 columns to df:
#   RecordSource / LoadDate     - same value on every row for this run
#   PhysicalAttributeKeyPhrase  - SERVER.DATABASE.SCHEMA.TABLE.COLUMN
#   PhysicalAttributeHashKey    - hash of the key phrase
#   HashDiff                    - hash of the payload columns listed below

# columns with variable data
df['RecordSource'] = record_source
df['LoadDate'] = datetime.now()  # one timestamp, broadcast to every row


# derive keyphrase columns
key_phrase_columns = [
    'SERVER_NAME'
  , 'DATABASE_NAME'
  , 'SCHEMA_NAME'
  , 'TABLE_NAME'
  , 'COLUMN_NAME'
]

# A short or malformed input line leaves None in the trailing fields. Fail here with a
# clear message instead of producing a broken key phrase (and therefore a broken hash key).
rows_missing_key_part = df[key_phrase_columns].isna().any(axis=1)
if rows_missing_key_part.any():
    raise ValueError(
        f"{int(rows_missing_key_part.sum())} row(s) have a missing value in one of "
        f"{key_phrase_columns}; cannot build PhysicalAttributeKeyPhrase"
    )

# Column-wise string concatenation: same result as a per-row lambda, but it avoids
# a Python call per row and also behaves correctly when df has no rows.
df['PhysicalAttributeKeyPhrase'] = (
    df['SERVER_NAME']
    + '.' + df['DATABASE_NAME']
    + '.' + df['SCHEMA_NAME']
    + '.' + df['TABLE_NAME']
    + '.' + df['COLUMN_NAME']
)


# hash the keyphrases
# (hashing.add_md5_hash_column is a project helper; presumably it returns df with the
#  named MD5 column added - confirm against user_packages.hashing)
df = hashing.add_md5_hash_column(
  df
  , md5_column_name = 'PhysicalAttributeHashKey'
  , columns = ['PhysicalAttributeKeyPhrase']
)


# hash the payload
df = hashing.add_md5_hash_column(
  df
  , md5_column_name = 'HashDiff'
  , columns = [
      'SERVER_NAME'
    , 'DATABASE_NAME'
    , 'SCHEMA_NAME'
    , 'TABLE_NAME'
    , 'COLUMN_NAME'

    , 'SCAN_DATE'
    , 'NULL_COUNT'
    , 'NOT_NULL_COUNT'
    , 'NULL_PERCENT'
    , 'NOT_NULL_PERCENT'
    , 'BLANK_COUNT'
    , 'NOT_BLANK_COUNT'
    , 'BLANK_PERCENT'
    , 'NOT_BLANK_PERCENT'
    , 'NULL_OR_BLANK_COUNT'
    , 'NOT_NULL_OR_BLANK_COUNT'
    , 'NULL_OR_BLANK_PERCENT'
    , 'NOT_NULL_OR_BLANK_PERCENT'
    , 'COUNT_DISTINCT'
    , 'MIN_OF_NUMBER'
    , 'MAX_OF_NUMBER'
    , 'MIN_OF_DATE'
    , 'MAX_OF_DATE'
  ]
)

#df

In [7]:
# Empty the staging table and commit, then reload it from df.
staging_table = 'stg_Py_SqlServerDataAnalysisToPhysicalAttribute'

conn.execute(f"DELETE FROM {staging_table}")
conn.commit()

# Kept as the cell's last expression so that its return value is displayed.
df.to_sql(name=staging_table, con=conn, if_exists='append', index=False)

39707

In [8]:
# Write to the Satellite (rv_s_PhysicalAttribute_SqlServerDataAnalysis)
sql_query = """
INSERT INTO rv_s_PhysicalAttribute_SqlServerDataAnalysis
(
    PhysicalAttributeHashKey
  , LoadDate
  , RecordSource
  , HashDiff

  , SERVER_NAME
  , "DATABASE_NAME"
  , SCHEMA_NAME
  , TABLE_NAME
  , COLUMN_NAME

  , SCAN_DATE
  , NULL_COUNT
  , NOT_NULL_COUNT
  , NULL_PERCENT
  , NOT_NULL_PERCENT
  , BLANK_COUNT
  , NOT_BLANK_COUNT
  , BLANK_PERCENT
  , NOT_BLANK_PERCENT
  , NULL_OR_BLANK_COUNT
  , NOT_NULL_OR_BLANK_COUNT
  , NULL_OR_BLANK_PERCENT
  , NOT_NULL_OR_BLANK_PERCENT
  , COUNT_DISTINCT
  , MIN_OF_NUMBER
  , MAX_OF_NUMBER
  , MIN_OF_DATE
  , MAX_OF_DATE

)
SELECT DISTINCT
    stg.PhysicalAttributeHashKey
  , stg.LoadDate
  , stg.RecordSource
  , stg.HashDiff
  
  , stg.SERVER_NAME
  , stg."DATABASE_NAME"
  , stg.SCHEMA_NAME
  , stg.TABLE_NAME
  , stg.COLUMN_NAME

  , stg.SCAN_DATE
  , stg.NULL_COUNT
  , stg.NOT_NULL_COUNT
  , stg.NULL_PERCENT
  , stg.NOT_NULL_PERCENT
  , stg.BLANK_COUNT
  , stg.NOT_BLANK_COUNT
  , stg.BLANK_PERCENT
  , stg.NOT_BLANK_PERCENT
  , stg.NULL_OR_BLANK_COUNT
  , stg.NOT_NULL_OR_BLANK_COUNT
  , stg.NULL_OR_BLANK_PERCENT
  , stg.NOT_NULL_OR_BLANK_PERCENT
  , stg.COUNT_DISTINCT
  , stg.MIN_OF_NUMBER
  , stg.MAX_OF_NUMBER
  , stg.MIN_OF_DATE
  , stg.MAX_OF_DATE

FROM
  stg_Py_SqlServerDataAnalysisToPhysicalAttribute AS stg
  LEFT OUTER JOIN rv_s_PhysicalAttribute_SqlServerDataAnalysis AS sat ON (
    stg.PhysicalAttributeHashKey = sat.PhysicalAttributeHashKey
    AND sat.LoadDate = (
      SELECT MAX(z.LoadDate)
      FROM rv_s_PhysicalAttribute_SqlServerDataAnalysis AS z
      WHERE z.PhysicalAttributeHashKey = sat.PhysicalAttributeHashKey
    )
  )
WHERE
  (
    sat.HashDiff != stg.HashDiff
    OR sat.HashDiff IS NULL
  )
""";
conn.execute(sql_query)
conn.commit()