In [2]:
# Creates new business-level conceptual entities in the DB from a CSV file
import sqlite3
import pandas as pd
import numpy as np
import os
from datetime import datetime
import sys
sys.path.append('../../')
from user_packages import hashing


In [3]:
# Script configuration: everything a reader might need to change lives here.

# Value written to the RecordSource column of every row staged by this notebook.
record_source = 'BusinessCSV'

# Location of the CSV to load (relative path) and the file's name.
source_file_path = '../../imports/business'
source_file_name = 'Org_Dictionary(Organisations).csv'

In [4]:
# Open the SQLite metadata database that the later cells write to.
db_file = '../../full_metadata.db'
conn = sqlite3.connect(db_file)

# NOTE(review): `cur` is not used by any cell visible here (they all call
# conn.execute directly); kept in case other code relies on it.
cur = conn.cursor()

In [15]:
# Read the source CSV into a DataFrame.
df = pd.read_csv(os.path.join(source_file_path, source_file_name))
# Represent missing values as None instead of NaN.
df = df.replace({np.nan: None})

# Build the composite "<ModelKeyPhrase>:<BusinessName>" phrase, one per row.
#
# This is done pairwise instead of with df.apply(..., axis=1) because:
#   * for a CSV with a header but no data rows, apply(axis=1) returns an empty
#     DataFrame rather than a Series, which is not a valid single-column value;
#   * a missing ModelKeyPhrase or BusinessName would raise an opaque TypeError
#     (None + ':'); such rows now get a None phrase instead.
# pd.isna is used (not `is None`) so NaN-backed missing values are caught too.
# NOTE(review): rows with a None phrase rely on hashing.add_md5_hash_column and
# the link load's "ModelBusinessHashKey IS NOT NULL" filter to be skipped --
# confirm how the hashing helper treats None.
df['ModelBusinessKeyPhrase'] = [
    None
    if pd.isna(model_key) or pd.isna(business_name)
    else model_key + ':' + business_name
    for model_key, business_name in zip(df['ModelKeyPhrase'], df['BusinessName'])
]

In [14]:
# Add one hash column per (target column, source columns) pair via
# hashing.add_md5_hash_column, in this order:
#   ModelHashKey / BusinessHashKey  -> keys for the model and business rows
#   HashDiff                        -> change detection on Description
#   ModelBusinessHashKey            -> key for the model-business link
# NOTE(review): presumably each is an MD5 digest of the listed columns; the
# exact normalisation lives in user_packages.hashing -- confirm there.
hash_column_sources = (
    ('ModelHashKey', ['ModelKeyPhrase']),
    ('BusinessHashKey', ['BusinessName']),
    ('HashDiff', ['Description']),
    ('ModelBusinessHashKey', ['ModelBusinessKeyPhrase']),
)
for hash_column, source_columns in hash_column_sources:
    df = hashing.add_md5_hash_column(
        df,
        md5_column_name=hash_column,
        columns=source_columns,
    )

# Stamp every row with the configured record source.
df['RecordSource'] = record_source

In [11]:
# Clear down the staging table so it only holds the rows from this CSV load.
conn.execute("DELETE FROM stg_Py_CSVToBusiness")
conn.commit()

# Append the prepared rows to the (already existing) staging table.
# NOTE(review): no visible cell sets a LoadDate column on df, yet the hub,
# satellite and link loads all read stg.LoadDate -- presumably it comes from
# the CSV itself or from a column default on the staging table; confirm.
# Kept as the cell's last expression so the inserted row count is displayed.
df.to_sql(
    name='stg_Py_CSVToBusiness',
    con=conn,
    if_exists='append',
    index=False,
)

14

In [8]:
# Write to the Hub (Business)
#
# Inserts one hub row per staged business key that the hub does not hold yet.
#
# The anti-join uses NOT EXISTS rather than NOT IN: with NOT IN, a single NULL
# BusinessHashKey already present in rv_h_Business makes the predicate NULL for
# every staged row, so nothing would ever be inserted again. Staged rows whose
# key is NULL are excluded explicitly (NOT IN only let them through when the
# hub was empty), matching the IS NOT NULL guard used by the link load.
sql_query = """
INSERT INTO rv_h_Business
(
    BusinessHashKey
  , LoadDate
  , RecordSource
  , BusinessName
)
SELECT DISTINCT
    stg.BusinessHashKey
  , stg.LoadDate
  , stg.RecordSource
  , stg.BusinessName
FROM
  stg_Py_CSVToBusiness AS stg
WHERE
  stg.BusinessHashKey IS NOT NULL
  AND NOT EXISTS (
    SELECT 1
    FROM rv_h_Business AS hub
    WHERE hub.BusinessHashKey = stg.BusinessHashKey
  )
"""
conn.execute(sql_query)
conn.commit()

In [12]:
# Write to the Satellite (Business)
#
# Inserts a new satellite row for every staged business whose HashDiff differs
# from the most recent satellite row for that key, or that has no satellite
# row yet.
#
# How the query works:
#   * the LEFT JOIN pairs each staged row with its latest satellite row only
#     (the correlated sub-query picks MAX(LoadDate) per BusinessHashKey);
#   * sat.HashDiff IS NULL covers keys without a satellite row;
#   * sat.HashDiff != stg.HashDiff covers keys whose HashDiff has changed.
#
# Staged rows with a NULL BusinessHashKey are skipped: they can never match the
# join, so without the guard they would be inserted again on every run as
# satellite rows without a usable key.
sql_query = """
INSERT INTO rv_s_Business
(
    BusinessHashKey
  , LoadDate
  , RecordSource
  , HashDiff

  , "Description"
)
SELECT DISTINCT
    stg.BusinessHashKey
  , stg.LoadDate
  , stg.RecordSource
  , stg.HashDiff

  , stg."Description"

FROM
  stg_Py_CSVToBusiness AS stg
  LEFT OUTER JOIN rv_s_Business AS sat ON (
    stg.BusinessHashKey = sat.BusinessHashKey
    AND sat.LoadDate = (
      SELECT MAX(z.LoadDate)
      FROM rv_s_Business AS z
      WHERE z.BusinessHashKey = sat.BusinessHashKey
    )
  )
WHERE
  stg.BusinessHashKey IS NOT NULL
  AND (
    sat.HashDiff != stg.HashDiff
    OR sat.HashDiff IS NULL
  )
"""
conn.execute(sql_query)
conn.commit()

In [13]:
# Write to the Link (Model - Business)
#
# Inserts one link row per staged model/business pair that is not linked yet.
#
#   * DISTINCT (as in the hub and satellite loads) collapses repeated CSV rows
#     for the same pair, which would otherwise be inserted more than once in a
#     single run.
#   * Rows without a ModelBusinessHashKey are skipped.
#   * NOT EXISTS replaces NOT IN so that a NULL ModelBusinessHashKey already
#     stored in rv_l_ModelBusiness cannot make the predicate NULL for every
#     staged row and silently block all inserts.
sql_query = """
INSERT INTO rv_l_ModelBusiness
(
  ModelBusinessHashKey
  , LoadDate
  , RecordSource
  , ModelHashKey
  , BusinessHashKey
)
SELECT DISTINCT
  stg.ModelBusinessHashKey
  , stg.LoadDate
  , stg.RecordSource
  , stg.ModelHashKey
  , stg.BusinessHashKey
FROM
  stg_Py_CSVToBusiness AS stg
WHERE
  stg.ModelBusinessHashKey IS NOT NULL
  AND NOT EXISTS (
    SELECT 1
    FROM rv_l_ModelBusiness AS lnk
    WHERE lnk.ModelBusinessHashKey = stg.ModelBusinessHashKey
  )
"""
conn.execute(sql_query)
conn.commit()