In [1]:
# Import packages

import difflib #for string matching
import getpass #for prompting for a password without echoing it

import keyring #for password hiding
import numpy as np
import pandas as pd
import psycopg2 #for Greenplum connection
import psycopg2.extras #for Greenplum connection
import pyodbc #for SPC connection
import xlwt as xl #for writing to excel

In [None]:
# One-time setup: store your TACC server password in the operating system's
# keyring. Keyring keeps the secret outside the notebook and makes it
# retrievable in other sessions, so this cell only needs to be run once.
# In future sessions, skip this cell.
#
# The username and password are prompted for instead of being typed into the
# cell, so real credentials are never saved in the notebook file.
tacc_username = input("TACC username: ")
keyring.set_password("TACC", tacc_username, getpass.getpass("TACC password: "))


In [2]:
# Connect to the local SQL Server (MEDICAID database) over a trusted connection.
# Pre-req: You need to set up the ODBC connection before this cell can be run
try:
    spc = pyodbc.connect(DRIVER="ODBC Driver 17 for SQL Server",
        SERVER="spcdedpwpvs1",
        database="MEDICAID",
        Trusted_Connection="yes")
except pyodbc.Error as err:
    # Show the driver's own message, then re-raise: if the error were swallowed,
    # `spc` would stay undefined and later cells would fail with a NameError
    # that hides the real cause.
    print(f"Connection failed: {err}")
    raise

In [None]:
# Connect to Greenplum on the TACC server.
# The password is read from the OS keyring entry for service "TACC".
try:
    tacc = psycopg2.connect(dbname='uthealth', user='xrzhang',
                host='greenplum01.corral.tacc.utexas.edu',
                password=keyring.get_password("TACC", "xrzhang"))
except psycopg2.Error as err:
    # Show the database's own message, then re-raise: if the error were
    # swallowed, `tacc` would stay undefined and later cells would fail with a
    # NameError that hides the real cause.
    print(f"Connection failed: {err}")
    raise

In [None]:
# Clear all variables except for connections
%reset_selective -f "^((?!spc|tacc).)*$"

In [None]:
#Pull all tables and estimated row counts from Medicaid db
# sys.partitions has one row per index per partition, so the join is limited to
# the heap (index_id 0) or clustered index (index_id 1) and partition rows are
# summed; otherwise a table with several indexes or partitions is listed more
# than once. Grouping includes object_id so same-named tables in different
# schemas are not merged.
spc_tables = pd.read_sql("""select t.name as Tablename, sum(p.[Rows]) as EstRows
        from sys.tables t
            inner join sys.indexes i on t.object_id = i.object_id
            inner join sys.partitions p on i.object_id = p.OBJECT_ID and i.index_id = p.index_id
        where i.index_id in (0, 1)
        --and t.name like 'medicaid.dbo.%'
        group by t.object_id, t.name
        order by Tablename""", con=spc)
spc_tables.head()

In [None]:
# Table names and estimated row counts for the 'medicaid' schema on the TACC
# server, read from pg_stat_user_tables.
tacc_rowcount_sql = """select relname as Tablename, n_live_tup as EstRows
    from pg_stat_user_tables
    where schemaname = 'medicaid'
    order by relname
    """
tacc_tables = pd.read_sql(tacc_rowcount_sql, con=tacc)
tacc_tables.head()

In [None]:
# Save each table listing to its own Excel workbook
spc_workbook_path = "H:\\GitHub\\uthealth-dw\\greenplum\\datawarehouse\\QA\\qa-all-db-checks\\row-counting\\tablenames_spc.xlsx"
tacc_workbook_path = "H:\\GitHub\\uthealth-dw\\greenplum\\datawarehouse\\QA\\qa-all-db-checks\\row-counting\\tablenames_tacc.xlsx"

spc_tables.to_excel(spc_workbook_path, sheet_name='SPC Tables')
tacc_tables.to_excel(tacc_workbook_path, sheet_name='TACC Tables')

In [None]:
# Write both table listings to a single Excel workbook, one sheet per server.
# pandas' ExcelWriter is used because an xlwt sheet's write() accepts only a
# single cell value (not a DataFrame), and xlwt produces the legacy .xls
# format, which does not match the .xlsx file name used here.
with pd.ExcelWriter("H:\\GitHub\\uthealth-dw\\greenplum\\datawarehouse\\QA\\qa-all-db-checks\\row-counting\\tablenames.xlsx") as writer:
    spc_tables.to_excel(writer, sheet_name='SPC Tables')
    tacc_tables.to_excel(writer, sheet_name='TACC Tables')

In [None]:
# Print the first column of spc_tables, one value per line
for table_name in spc_tables.iloc[:, 0]:
    print(table_name)

In [None]:
#Decide what tables to keep



In [None]:
#initialize vector to hold actual row count
# NOTE(review): no cell above this one assigns `df`; the only assignments
# visible in this notebook are in the testing cells further down, so on a
# fresh kernel run top to bottom this cell fails with a NameError. Confirm
# which table listing this is meant to extend.
ActualRows = np.zeros(len(df))  # one zero per row of df
df = df.assign(ActualRows=ActualRows)  # new frame with the ActualRows column added
df

In [None]:
# Unused testing cells after here

In [None]:
# Schema name and table name for every entry in sys.tables on the SPC connection
schema_listing_sql = """select schema_name(schema_id), name
        from sys.tables
        --where name = "admit1820"
        --where schema_id = 1
        """
df = pd.read_sql(schema_listing_sql, con=spc)
df

In [None]:
# Show which database the SPC connection is currently using
current_db_sql = """select db_name() as db
        """
df = pd.read_sql(current_db_sql, con=spc)
df

In [None]:
# Table names in the 'medicaid' schema, read from information_schema.tables.
# Note: this reassigns tacc_tables.
medicaid_tables_sql = """select table_name
    from information_schema.tables
    where table_schema = 'medicaid'
    order by table_name
    """
tacc_tables = pd.read_sql(medicaid_tables_sql, con=tacc)
tacc_tables.head()

In [None]:
# Preview up to 10 rows of dev.xz_dx_codes over the tacc connection
dx_codes_tacc_sql = 'select * from dev.xz_dx_codes limit 10;'
df2 = pd.read_sql(dx_codes_tacc_sql, con=tacc)
df2.head()

In [None]:
# Preview the top 10 rows of work.dbo.xz_dx_codes over the spc connection
dx_codes_spc_sql = 'select top 10 * from work.dbo.xz_dx_codes'
df = pd.read_sql(dx_codes_spc_sql, con=spc)
df.head()

In [None]:
# Same preview query as the earlier dev.xz_dx_codes cell, run again over tacc
dx_codes_preview_sql = 'select * from dev.xz_dx_codes limit 10;'
df2 = pd.read_sql(dx_codes_preview_sql, con=tacc)
df2.head()

In [None]:
# Manually close both database connections so that neither session is left
# open on its server (the SQL Server connection `spc` would otherwise never be
# closed by this notebook).
tacc.close()
spc.close()