This script takes a CSV exported from a completed Dataverse Analysis sheet (of Dataverse entities, derived from out\_DataversePhysicalStructureOutputForAnalysis) and imports it into the Physical Structure tables.

Note: The script uses the Hash/KeyPhrase values that are already in the analysis sheet; it does not generate new ones.

Note: The script splits tags on semicolons. It will not END an existing tag on a table; it only adds new ones.

In [1]:
import sqlite3
import pandas as pd
import numpy as np
from datetime import datetime
import sys
sys.path.append('../')
from user_packages import hashing


In [2]:
# Run configuration for this import.
target_model_name = "net.always.green"  # from bv_model
source_file_name = "ipc_entities_analysis_import.csv"
source_file_path = ""

# value stamped into the RecordSource column of the loaded rows
record_source = "DataverseAnalysis"


In [3]:
# db connect
# The path is relative, so it resolves against the current working directory.
metadata_db_path = '../full_metadata.db'
conn = sqlite3.connect(metadata_db_path)

In [4]:
# read table to df
# Build the file location from BOTH configured values so that setting
# source_file_path actually takes effect. With the default empty path,
# os.path.join returns the bare file name, so existing behaviour is unchanged.
import os

df = pd.read_csv(os.path.join(source_file_path, source_file_name))

# Replace NaN (pandas' marker for missing cells) with None
# (presumably so missing values are stored as NULL in SQLite -- confirm).
df = df.replace({np.nan: None})


#df

In [5]:
# add extra columns

# Load metadata: every row gets the same record source and load timestamp.
df['RecordSource'] = record_source
load_timestamp = datetime.now()
df['LoadDate'] = load_timestamp

# Payload columns that feed HashDiff.
# The entity usage columns are left out, as they shouldn't have changed in analysis.
# The Tags are left out, as they are handled separately.
hash_diff_columns = [
    'SERVER_NAME',
    'Logical Name',
    'Schema Name',
    'Entity',
    'Plural Display Name',
    'Object Type Code',
    'Is Custom Entity',
    'Ownership Type',
    'Description',
]

# hash the payload into a new 'HashDiff' column
df = hashing.add_md5_hash_column(
    df,
    md5_column_name='HashDiff',
    columns=hash_diff_columns,
)

#df

In [7]:
# Empty the staging table, then load the DataFrame rows into it.
staging_cursor = conn.cursor()
staging_cursor.execute("DELETE FROM stg_Py_DataverseAnalysisCsvToPhysicalStructure")
conn.commit()

# Kept as the cell's last expression so the notebook still displays its return value.
df.to_sql(
    name='stg_Py_DataverseAnalysisCsvToPhysicalStructure',
    con=conn,
    if_exists='append',
    index=False,
)

46

In [8]:
# Write to the Satellite (PhysicalStructure_XRMMetadata)
sql_query = """
INSERT INTO rv_s_PhysicalStructure_XRMMetadata
(
    PhysicalStructureHashKey
  , LoadDate
  , RecordSource
  , HashDiff

  , "Entity"
  , "Plural Display Name"
  , "Description"
  , "Schema Name"
  , "Logical Name"
  , "Object Type Code"
  , "Is Custom Entity"
  , "Ownership Type"
  , "SERVER_NAME"

)
SELECT DISTINCT
    stg.PhysicalStructureHashKey
  , stg.LoadDate
  , stg.RecordSource
  , stg.HashDiff
  
  , stg."Entity"
  , stg."Plural Display Name"
  , stg."Description"
  , stg."Schema Name"
  , stg."Logical Name"
  , stg."Object Type Code"
  , stg."Is Custom Entity"
  , stg."Ownership Type"
  , stg."SERVER_NAME"

FROM
  stg_Py_DataverseAnalysisCsvToPhysicalStructure AS stg
  LEFT OUTER JOIN rv_s_PhysicalStructure_XRMMetadata AS sat ON (
    stg.PhysicalStructureHashKey = sat.PhysicalStructureHashKey
    AND sat.LoadDate = (
      SELECT MAX(z.LoadDate)
      FROM rv_s_PhysicalStructure_XRMMetadata AS z
      WHERE z.PhysicalStructureHashKey = sat.PhysicalStructureHashKey
    )
  )
WHERE
  (
    sat.HashDiff != stg.HashDiff
    OR sat.HashDiff IS NULL
  )
""";
conn.execute(sql_query)
conn.commit()

In [None]:
# Get the incoming tags from the staging table,
# split them into a list of tags (one row per tag, split on ';'),
# insert tags, taking out the ones that are currently active.
# NOTE: This does not END any tags.
# NOTE: segments are not trimmed, so 'a; b' yields the tag ' b'.
#
# How the statement works:
#   source_table  - (hash key, raw Tags string) for every staged row.
#   splitstring   - recursive split. Each step peels the text before the first
#                   ';' into `string` and keeps the rest in `remaining_string`.
#                   `remaining_string` becomes NULL once no ';' is left, and the
#                   recursion stops on that NULL. It must NOT stop on an empty
#                   segment: doing so drops every tag that follows a leading ';'
#                   or a ';;' (e.g. 'a;;b' would lose 'b').
#   incoming_tags - the non-empty segments, with the staged LoadDate/RecordSource.
#   current_tags  - tags whose most recent satellite row has no EndDate. The
#                   "most recent" lookup is per (hash key, Tag); taking the max
#                   LoadDate per hash key alone hides active tags that were
#                   loaded in an earlier run and causes them to be re-inserted.
#   INSERT        - incoming tags with no match in current_tags (anti-join).
#
# SUBSTR is used rather than its alias SUBSTRING so the statement also runs on
# older SQLite builds that lack the alias.
sql_query = """
WITH RECURSIVE
source_table AS (
  SELECT
      PhysicalStructureHashKey AS id
    , Tags AS string
  FROM stg_Py_DataverseAnalysisCsvToPhysicalStructure
)
, splitstring (id, string, remaining_string) AS (
  -- anchor: first segment of each Tags value
  SELECT
      a.id
    , CASE
        WHEN a.string LIKE '%;%'
        THEN SUBSTR(a.string, 1, INSTR(a.string, ';') - 1)
        ELSE a.string
      END AS string
    , CASE
        WHEN a.string LIKE '%;%'
        THEN SUBSTR(a.string, INSTR(a.string, ';') + 1)
      END AS remaining_string -- NULL when there is nothing left to split
  FROM source_table AS a
UNION ALL
  -- recursive step: next segment of whatever is still unsplit
  SELECT
      c.id
    , CASE
        WHEN c.remaining_string LIKE '%;%'
        THEN SUBSTR(c.remaining_string, 1, INSTR(c.remaining_string, ';') - 1)
        ELSE c.remaining_string
      END AS string
    , CASE
        WHEN c.remaining_string LIKE '%;%'
        THEN SUBSTR(c.remaining_string, INSTR(c.remaining_string, ';') + 1)
      END AS remaining_string
  FROM splitstring AS c
  WHERE
    c.remaining_string IS NOT NULL
)
, incoming_tags AS (
  SELECT
      s.id AS PhysicalStructureHashKey
    , stg.LoadDate
    , stg.RecordSource

    , s.string AS Tag
    , stg.LoadDate AS StartDate
  FROM
    splitstring AS s
    LEFT JOIN stg_Py_DataverseAnalysisCsvToPhysicalStructure AS stg ON (s.id = stg.PhysicalStructureHashKey)
  WHERE
    s.string <> ''
    AND s.string IS NOT NULL
)
, current_tags AS ( -- gets the active tags that are on the Structures in the stg table
  SELECT
      sat.PhysicalStructureHashKey
    , sat.Tag
  FROM
    rv_s_PhysicalStructure_AnalysisTag AS sat
    INNER JOIN stg_Py_DataverseAnalysisCsvToPhysicalStructure AS stg ON (
      sat.PhysicalStructureHashKey = stg.PhysicalStructureHashKey
      AND sat.LoadDate = (
        SELECT MAX(z.LoadDate)
        FROM rv_s_PhysicalStructure_AnalysisTag AS z
        WHERE z.PhysicalStructureHashKey = sat.PhysicalStructureHashKey
          AND z.Tag = sat.Tag
      )
    )
  WHERE
    sat.EndDate IS NULL
)
INSERT INTO rv_s_PhysicalStructure_AnalysisTag
(
    PhysicalStructureHashKey
  , LoadDate
  , RecordSource

  , Tag
  , StartDate
)
SELECT DISTINCT
    stg.PhysicalStructureHashKey
  , stg.LoadDate
  , stg.RecordSource

  , stg.Tag
  , stg.LoadDate AS StartDate
FROM
  incoming_tags AS stg
  LEFT OUTER JOIN current_tags AS sat ON (
    stg.PhysicalStructureHashKey = sat.PhysicalStructureHashKey
    AND stg.Tag = sat.Tag
  )
-- "Where in incoming_tags and not in current_tags"
WHERE
  sat.PhysicalStructureHashKey IS NULL
  AND sat.Tag IS NULL
"""
# run the insert and persist it
conn.execute(sql_query)
conn.commit()