In [1]:
# Relative path prefix: appended to sys.path below, and prepended to the
# import-folder and database paths built in later cells.
script_depth = '../../'

import sqlite3
import pandas as pd
import numpy as np
from datetime import datetime
import os
import sys
# Extend the module search path with script_depth -- presumably so that the
# project-local `user_packages` import on the next line can be resolved.
sys.path.append(script_depth)
from user_packages import hashing


In [4]:
# set variables

# -- load metadata stamped onto every staged row
target_model_name = 'invite.please.over'
record_source = 'MySQLScrape'
SystemKeyPhrase = 'DRS' # this should come from the System Hub

# -- where the source CSV lives (folder is relative to script_depth)
source_file_path = f'{script_depth}imports/physical/mysql/'
source_file_name = 'mysql_tables.csv'


In [3]:
# db connect
# One shared connection to the metadata database; the load cells below all use it.
metadata_db_path = f'{script_depth}full_metadata.db'
conn = sqlite3.connect(metadata_db_path)

In [5]:
# read table to df
# Load the source CSV and swap every NaN for None in a single chained expression.
source_csv = os.path.join(source_file_path, source_file_name)
df = pd.read_csv(source_csv).replace({np.nan: None})

df

Unnamed: 0,SERVER_NAME,DATABASE_NAME,SCHEMA_NAME,TABLE_NAME,TABLE_TYPE,ENGINE,ROW_COUNT,AVG_ROW_LENGTH,DATA_LENGTH,INDEX_LENGTH,AUTO_INCREMENT,UPDATE_TIME,COLUMN_COUNT,TABLE_COLLATION,TABLE_COMMENT
0,DRS_Core,def,information_schema,ALL_PLUGINS,SYSTEM VIEW,Aria,,0.0,8192.0,8192.0,,15/08/2024 19:06,13,utf8mb3_general_ci,
1,DRS_Core,def,information_schema,APPLICABLE_ROLES,SYSTEM VIEW,MEMORY,,1565.0,0.0,0.0,,,4,utf8mb3_general_ci,
2,DRS_Core,def,information_schema,CHARACTER_SETS,SYSTEM VIEW,MEMORY,,384.0,0.0,0.0,,,4,utf8mb3_general_ci,
3,DRS_Core,def,information_schema,CHECK_CONSTRAINTS,SYSTEM VIEW,Aria,,0.0,8192.0,8192.0,,15/08/2024 19:06,6,utf8mb3_general_ci,
4,DRS_Core,def,information_schema,COLLATIONS,SYSTEM VIEW,MEMORY,,231.0,0.0,0.0,,,6,utf8mb3_general_ci,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
820,DRS_Core,def,_portal,gateway_connection_log,BASE TABLE,InnoDB,1131.0,188.0,212992.0,0.0,,15/08/2024 18:56,8,utf8mb3_general_ci,
821,DRS_Core,def,_portal,gateway_properties,BASE TABLE,InnoDB,5.0,3276.0,16384.0,0.0,,,2,utf8mb3_general_ci,
822,DRS_Core,def,_portal,gateway_server,BASE TABLE,InnoDB,0.0,0.0,16384.0,0.0,,,5,utf8mb3_general_ci,
823,DRS_Core,def,_portal,gateway_user,BASE TABLE,InnoDB,530.0,123.0,65536.0,0.0,,15/08/2024 09:03,2,utf8mb3_general_ci,


In [7]:
# add extra columns
#
# Enriches the staged frame with:
#   1. load metadata (record source, load timestamp, model / system key phrases)
#   2. key phrases built from the table's fully qualified name
#   3. MD5 hash keys for each key phrase, via hashing.add_md5_hash_column
#   4. a HashDiff over the descriptive payload columns

# columns with variable data (the same value is stamped on every row)
df['RecordSource'] = record_source
df['LoadDate'] = datetime.now()  # a single timestamp shared by the whole batch
df['ModelKeyPhrase'] = target_model_name
df['SystemKeyPhrase'] = SystemKeyPhrase


# derive keyphrase columns
business_key_columns = ['SERVER_NAME', 'DATABASE_NAME', 'SCHEMA_NAME', 'TABLE_NAME']

# Fail fast with a clear message when any part of the business key is missing;
# otherwise the vectorized concatenation below would yield a null key phrase.
rows_missing_key_part = df[business_key_columns].isna().any(axis=1)
if rows_missing_key_part.any():
  raise ValueError(
    str(int(rows_missing_key_part.sum()))
    + ' row(s) are missing a business key component ('
    + ', '.join(business_key_columns)
    + '); cannot derive PhysicalStructureKeyPhrase'
  )

# Column-wise string concatenation: same result as a row-wise apply, but it
# also works on an empty frame and avoids a Python-level call per row.
df['PhysicalStructureKeyPhrase'] = (
  df['SERVER_NAME']
  + '.' + df['DATABASE_NAME']
  + '.' + df['SCHEMA_NAME']
  + '.' + df['TABLE_NAME']
)
df['ModelPhysicalStructureKeyPhrase'] = df['ModelKeyPhrase'] + ':' + df['PhysicalStructureKeyPhrase']
df['SystemPhysicalStructureKeyPhrase'] = df['SystemKeyPhrase'] + ':' + df['PhysicalStructureKeyPhrase']

# hash the keyphrases
# (hash column, key phrase column) pairs, kept in the original call order so
# the resulting column order of the frame does not change.
hash_key_sources = [
    ('ModelHashKey', 'ModelKeyPhrase')
  , ('SystemHashKey', 'SystemKeyPhrase')
  , ('PhysicalStructureHashKey', 'PhysicalStructureKeyPhrase')
  , ('ModelPhysicalStructureHashKey', 'ModelPhysicalStructureKeyPhrase')
  , ('SystemPhysicalStructureHashKey', 'SystemPhysicalStructureKeyPhrase')
]
for hash_column, key_phrase_column in hash_key_sources:
  df = hashing.add_md5_hash_column(
    df
    , md5_column_name = hash_column
    , columns = [key_phrase_column]
  )


# hash the payload
# NOTE: the column order below feeds the hash; do not reorder it, or every
# HashDiff will change.
df = hashing.add_md5_hash_column(
  df
  , md5_column_name = 'HashDiff'
  , columns = [
      'SERVER_NAME'
    , 'DATABASE_NAME'
    , 'SCHEMA_NAME'
    , 'TABLE_NAME'

    , 'TABLE_TYPE'
    , 'ENGINE'
    , 'ROW_COUNT'
    , 'AVG_ROW_LENGTH'
    , 'DATA_LENGTH'
    , 'INDEX_LENGTH'
    , 'AUTO_INCREMENT'
    , 'UPDATE_TIME'
    , 'COLUMN_COUNT'
    , 'TABLE_COLLATION'
    , 'TABLE_COMMENT'
  ]
)

df

Unnamed: 0,SERVER_NAME,DATABASE_NAME,SCHEMA_NAME,TABLE_NAME,TABLE_TYPE,ENGINE,ROW_COUNT,AVG_ROW_LENGTH,DATA_LENGTH,INDEX_LENGTH,...,SystemKeyPhrase,PhysicalStructureKeyPhrase,ModelPhysicalStructureKeyPhrase,SystemPhysicalStructureKeyPhrase,ModelHashKey,SystemHashKey,PhysicalStructureHashKey,ModelPhysicalStructureHashKey,SystemPhysicalStructureHashKey,HashDiff
0,DRS_Core,def,information_schema,ALL_PLUGINS,SYSTEM VIEW,Aria,,0.0,8192.0,8192.0,...,DRS,DRS_Core.def.information_schema.ALL_PLUGINS,invite.please.over:DRS_Core.def.information_sc...,DRS:DRS_Core.def.information_schema.ALL_PLUGINS,36182d0eadd7d627494f1ff78ff43521,f1e3446c69e4d87279ff7863482f9dcb,deee9e216db10362f4be1282673d1803,4c01aeda9f8ba88a44c38523a672c61a,074eb805a0cc563adb283526278a5261,e46c5240b3a5c5d70e6e1f43458be68b
1,DRS_Core,def,information_schema,APPLICABLE_ROLES,SYSTEM VIEW,MEMORY,,1565.0,0.0,0.0,...,DRS,DRS_Core.def.information_schema.APPLICABLE_ROLES,invite.please.over:DRS_Core.def.information_sc...,DRS:DRS_Core.def.information_schema.APPLICABLE...,36182d0eadd7d627494f1ff78ff43521,f1e3446c69e4d87279ff7863482f9dcb,59072c3ba12769ad99b420e536413002,6080a4224deea471ec55016cb92292a0,1e47197d056563a82f3b1bf6af3c3443,6cff78986275e1778b20383d3fdefc59
2,DRS_Core,def,information_schema,CHARACTER_SETS,SYSTEM VIEW,MEMORY,,384.0,0.0,0.0,...,DRS,DRS_Core.def.information_schema.CHARACTER_SETS,invite.please.over:DRS_Core.def.information_sc...,DRS:DRS_Core.def.information_schema.CHARACTER_...,36182d0eadd7d627494f1ff78ff43521,f1e3446c69e4d87279ff7863482f9dcb,7b7e1ba8136047c698683f9380dc6d6f,99156bf27e3d421090996ae4f7504683,088525ed449682e43ef233854e978289,bff639ae594709959f318d2335403915
3,DRS_Core,def,information_schema,CHECK_CONSTRAINTS,SYSTEM VIEW,Aria,,0.0,8192.0,8192.0,...,DRS,DRS_Core.def.information_schema.CHECK_CONSTRAINTS,invite.please.over:DRS_Core.def.information_sc...,DRS:DRS_Core.def.information_schema.CHECK_CONS...,36182d0eadd7d627494f1ff78ff43521,f1e3446c69e4d87279ff7863482f9dcb,3b74990f4dbec7c5d044e7bfc9c89652,426ed47f2b726ad5f6d4472b05ed0c4f,1bda964245879c019e5d40289b8cf9bd,9971ef76306b21298ad2bd0cac0c82df
4,DRS_Core,def,information_schema,COLLATIONS,SYSTEM VIEW,MEMORY,,231.0,0.0,0.0,...,DRS,DRS_Core.def.information_schema.COLLATIONS,invite.please.over:DRS_Core.def.information_sc...,DRS:DRS_Core.def.information_schema.COLLATIONS,36182d0eadd7d627494f1ff78ff43521,f1e3446c69e4d87279ff7863482f9dcb,0aa78e120c62e8cba83f5e6b443bd1c1,e67de0b813151f54f6e65531e790f58c,e1135ff40ff401c43ccf1cd709164473,9576a077f3974743739c0836788d6062
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
820,DRS_Core,def,_portal,gateway_connection_log,BASE TABLE,InnoDB,1131.0,188.0,212992.0,0.0,...,DRS,DRS_Core.def._portal.gateway_connection_log,invite.please.over:DRS_Core.def._portal.gatewa...,DRS:DRS_Core.def._portal.gateway_connection_log,36182d0eadd7d627494f1ff78ff43521,f1e3446c69e4d87279ff7863482f9dcb,6908dee978a9df8cbf2a9eda4c3d5374,d272a204e489a7211c461b5fb71cbe42,f376c4c4e6d474a0a6c06a32d54c4299,7add81f7c356b8d6646738aa40c6e7c3
821,DRS_Core,def,_portal,gateway_properties,BASE TABLE,InnoDB,5.0,3276.0,16384.0,0.0,...,DRS,DRS_Core.def._portal.gateway_properties,invite.please.over:DRS_Core.def._portal.gatewa...,DRS:DRS_Core.def._portal.gateway_properties,36182d0eadd7d627494f1ff78ff43521,f1e3446c69e4d87279ff7863482f9dcb,2e9d1868a52257a454cb47f263ba04cc,e3fd10c6f0eea9dc780f74df40973911,98caa75e7bf74ba297f62f8dfd2756d3,70995f3c89e480be471bd07a6a110390
822,DRS_Core,def,_portal,gateway_server,BASE TABLE,InnoDB,0.0,0.0,16384.0,0.0,...,DRS,DRS_Core.def._portal.gateway_server,invite.please.over:DRS_Core.def._portal.gatewa...,DRS:DRS_Core.def._portal.gateway_server,36182d0eadd7d627494f1ff78ff43521,f1e3446c69e4d87279ff7863482f9dcb,de1140cd0828cbfd46d6c56e95b6c0a3,8fb85959bbc1bcc57f356538a4409599,02d22d56b72b6e94820578d85b8129d7,349ff7ae6abc067fd71c2668af97200a
823,DRS_Core,def,_portal,gateway_user,BASE TABLE,InnoDB,530.0,123.0,65536.0,0.0,...,DRS,DRS_Core.def._portal.gateway_user,invite.please.over:DRS_Core.def._portal.gatewa...,DRS:DRS_Core.def._portal.gateway_user,36182d0eadd7d627494f1ff78ff43521,f1e3446c69e4d87279ff7863482f9dcb,9564f1fcbaa4b6e6183596e07f44ddba,e811470376252b501b503cf1806ab602,f619c3d925ed5a499fbddb86c898f238,5d9cf6de588545cec86281a3938987c8


In [9]:
# Clear down and Write to staging
# Step 1: empty the staging table and commit. Step 2: append the prepared frame.
staging_table = 'stg_Py_MysqlCsvToPhysicalStructure'
conn.execute(f"DELETE FROM {staging_table}")
conn.commit()
df.to_sql(staging_table, conn, if_exists='append', index=False)

825

In [10]:
# Write to the Hub (PhysicalStructure)
#
# Inserts every distinct staged PhysicalStructureHashKey that the hub does not
# hold yet.
#
# NOT EXISTS is used rather than NOT IN: with NOT IN, a single NULL hash key in
# the hub makes the predicate NULL for every staged row, so nothing would ever
# be inserted again. The explicit IS NOT NULL guard keeps NULL keys out of the
# hub in the first place, matching the guard used by the link loads.
sql_query = """
INSERT INTO rv_h_PhysicalStructure
(
    PhysicalStructureHashKey
  , LoadDate
  , RecordSource
  , PhysicalStructureKeyPhrase
)
SELECT DISTINCT
    stg.PhysicalStructureHashKey
  , stg.LoadDate
  , stg.RecordSource
  , stg.PhysicalStructureKeyPhrase
FROM
  stg_Py_MysqlCsvToPhysicalStructure AS stg
WHERE
  stg.PhysicalStructureHashKey IS NOT NULL
  AND NOT EXISTS (
    SELECT 1
    FROM rv_h_PhysicalStructure AS hub
    WHERE hub.PhysicalStructureHashKey = stg.PhysicalStructureHashKey
  )
"""
# The connection context manager commits on success and rolls back if the
# insert raises, so a failed load leaves no open transaction behind.
with conn:
  conn.execute(sql_query)

In [11]:
# Write to the Satellite (PhysicalStructure_Mysql)
sql_query = """
INSERT INTO rv_s_PhysicalStructure_Mysql
(
    PhysicalStructureHashKey
  , LoadDate
  , RecordSource
  , HashDiff

  , SERVER_NAME
  , "DATABASE_NAME"
  , SCHEMA_NAME
  , TABLE_NAME

  , TABLE_TYPE
  , ENGINE
  , ROW_COUNT
  , AVG_ROW_LENGTH
  , DATA_LENGTH
  , INDEX_LENGTH
  , AUTO_INCREMENT
  , UPDATE_TIME
  , COLUMN_COUNT
  , TABLE_COLLATION
  , TABLE_COMMENT

)
SELECT DISTINCT
    stg.PhysicalStructureHashKey
  , stg.LoadDate
  , stg.RecordSource
  , stg.HashDiff
  
  , stg.SERVER_NAME
  , stg."DATABASE_NAME"
  , stg.SCHEMA_NAME
  , stg.TABLE_NAME

  , stg.TABLE_TYPE
  , stg.ENGINE
  , stg.ROW_COUNT
  , stg.AVG_ROW_LENGTH
  , stg.DATA_LENGTH
  , stg.INDEX_LENGTH
  , stg.AUTO_INCREMENT
  , stg.UPDATE_TIME
  , stg.COLUMN_COUNT
  , stg.TABLE_COLLATION
  , stg.TABLE_COMMENT

FROM
  stg_Py_MysqlCsvToPhysicalStructure AS stg
  LEFT OUTER JOIN rv_s_PhysicalStructure_Mysql AS sat ON (
    stg.PhysicalStructureHashKey = sat.PhysicalStructureHashKey
    AND sat.LoadDate = (
      SELECT MAX(z.LoadDate)
      FROM rv_s_PhysicalStructure_Mysql AS z
      WHERE z.PhysicalStructureHashKey = sat.PhysicalStructureHashKey
    )
  )
WHERE
  (
    sat.HashDiff != stg.HashDiff
    OR sat.HashDiff IS NULL
  )
""";
conn.execute(sql_query)
conn.commit()

In [12]:
# Write to the Link (Model - PhysicalStructure)
#
# Inserts every distinct staged Model <-> PhysicalStructure pairing whose link
# hash key is not yet present in the link table.
#
# - DISTINCT mirrors the hub and satellite loads: a table staged more than once
#   must still produce a single link row.
# - NOT EXISTS replaces NOT IN, which matches nothing at all once the link
#   table contains a NULL hash key.
# - All columns are qualified so the outer and inner references to
#   ModelPhysicalStructureHashKey cannot be confused.
sql_query = """
INSERT INTO rv_l_ModelPhysicalStructure
(
  ModelPhysicalStructureHashKey
  , LoadDate
  , RecordSource
  , ModelHashKey
  , PhysicalStructureHashKey
)
SELECT DISTINCT
  stg.ModelPhysicalStructureHashKey
  , stg.LoadDate
  , stg.RecordSource
  , stg.ModelHashKey
  , stg.PhysicalStructureHashKey
FROM
  stg_Py_MysqlCsvToPhysicalStructure AS stg
WHERE
  stg.ModelPhysicalStructureHashKey IS NOT NULL
  AND NOT EXISTS (
    SELECT 1
    FROM rv_l_ModelPhysicalStructure AS lnk
    WHERE lnk.ModelPhysicalStructureHashKey = stg.ModelPhysicalStructureHashKey
  )
"""
# The connection context manager commits on success and rolls back if the
# insert raises.
with conn:
  conn.execute(sql_query)

In [13]:
# Write to the Link (System - PhysicalStructure)
#
# Inserts every distinct staged System <-> PhysicalStructure pairing whose link
# hash key is not yet present in the link table.
#
# - DISTINCT mirrors the hub and satellite loads: a table staged more than once
#   must still produce a single link row.
# - NOT EXISTS replaces NOT IN, which matches nothing at all once the link
#   table contains a NULL hash key.
# - All columns are qualified so the outer and inner references to
#   SystemPhysicalStructureHashKey cannot be confused.
sql_query = """
INSERT INTO rv_l_SystemPhysicalStructure
(
  SystemPhysicalStructureHashKey
  , LoadDate
  , RecordSource
  , SystemHashKey
  , PhysicalStructureHashKey
)
SELECT DISTINCT
  stg.SystemPhysicalStructureHashKey
  , stg.LoadDate
  , stg.RecordSource
  , stg.SystemHashKey
  , stg.PhysicalStructureHashKey
FROM
  stg_Py_MysqlCsvToPhysicalStructure AS stg
WHERE
  stg.SystemPhysicalStructureHashKey IS NOT NULL
  AND NOT EXISTS (
    SELECT 1
    FROM rv_l_SystemPhysicalStructure AS lnk
    WHERE lnk.SystemPhysicalStructureHashKey = stg.SystemPhysicalStructureHashKey
  )
"""
# The connection context manager commits on success and rolls back if the
# insert raises.
with conn:
  conn.execute(sql_query)