<a href="https://colab.research.google.com/github/WalterPaixaoCortes/r3s-scripts/blob/main/notebooks/Data_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Validate Source Database Load

## Importing libraries

In [1]:
import datetime
import gc
import glob
import logging
import os
import sqlite3
import sys
import traceback
from logging.handlers import TimedRotatingFileHandler

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy import event
from sqlalchemy import text

## Declaring auxiliary functions

In [2]:
def count_lines(file_name):
    """Return the number of lines in ``file_name`` minus one.

    The file is read as ISO-8859-1 text. The result equals the zero-based
    index of the last line, i.e. the total line count minus one -- presumably
    so that a header row is not counted as data (TODO confirm with the loader).

    Args:
        file_name: Path of the text file to scan.

    Returns:
        int: Total line count minus one, or ``0`` when the file is empty.
    """
    # The context manager guarantees the handle is closed even if reading fails.
    with open(file_name, 'r', encoding="iso-8859-1") as fp:
        total_lines = sum(1 for _ in fp)
    # Clamp so that an empty file reports 0 instead of failing or returning -1.
    return max(total_lines - 1, 0)


## Load Environment Variables

In [3]:
# Load variables from a .env file into the process environment so that the
# os.getenv() calls in the following cells can read them.
load_dotenv()

True

## Defining the parameters for execution

In [4]:
# NOTE(review): eval() executes whatever Python expression the environment
# variable contains, and raises TypeError when the variable is unset (because
# os.getenv() then returns None). ast.literal_eval would be a safer parser if
# the values are plain literals -- confirm the .env format before switching.

# Truthy -> use the local SQLite file; falsy -> use the SQLAlchemy engine.
use_sqlite = eval(os.getenv('USE_SQLITE3'))
# Tested later with `table_name.lower() in used_source`, so this presumably
# evaluates to a collection of lower-case table names -- TODO confirm.
used_source = eval(os.getenv('USED_SOURCE'))
# When False, the validation cell does nothing.
validate_process = True

## Defining the variables

### Initializing Logger

In [5]:
# TimedRotatingFileHandler opens its log file as soon as it is constructed and
# fails if the parent directory is missing, so create the directory first.
os.makedirs("logs", exist_ok=True)

# File handler that rolls over at midnight; rotated files get a YYYYMMDD suffix.
fhandler = TimedRotatingFileHandler("logs/validate_source_db_load.log", when="midnight", interval=1)
fhandler.suffix = "%Y%m%d"

# Send INFO-and-above records to both the rotating file and stdout.
# NOTE: basicConfig() does nothing if the root logger already has handlers,
# e.g. when this cell is re-run without restarting the kernel.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[fhandler, logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)

### Initializing Variables

In [6]:
# Folders used by this notebook, relative to the working directory.
unzipped_folder, database_folder = "rawfiles", "database"
extension = ".zip"

# Only files with one of these extensions are compared against the database.
allowed_extensions = [".txt", ".csv"]

# Path of the SQLite database file inside the database folder.
database_name = database_folder + "/source.db"

# Placeholder; the connection cell assigns the real connection object.
my_conn = None

## Connecting or Creating database

In [7]:
# Create the connection object used by the validation step: a sqlite3
# connection to database_name when use_sqlite is set, otherwise a SQLAlchemy
# engine built from the PG_DATA_CONN environment variable.
my_conn = None
if use_sqlite:
  my_conn = sqlite3.connect(database_name)
else:
  my_conn = create_engine(os.getenv("PG_DATA_CONN"))
  # The raw PG_DATA_CONN value may embed credentials, so it must not reach the
  # log file or the cell output. repr() of the engine URL masks the password.
  logger.info("Using database engine %r", my_conn.url)
logger.info("Connected to database...")

2022-10-21 16:26:09,077 - INFO - Connected to database...


## Validating Load Process

Here we check, based on line counts, whether the load into the database was successful: the number of lines in each source file is compared with the number of rows in its corresponding table.

In [8]:
# For every raw file whose table is listed in used_source, compare the file's
# line count (as returned by count_lines) with the row count of the table named
# after the file, then write the outcome as two Markdown reports
# (issues.md / success.md) in database_folder.
if validate_process:
  report_columns = ["File", "File Lines", "Table", "Table Rows", "Difference"]
  success_data = {column: [] for column in report_columns}
  error_data = {column: [] for column in report_columns}

  logger.info("Preparing list of files to be processed...")
  # Regular files only, processed from smallest to largest.
  list_of_files = sorted(
    filter(os.path.isfile, glob.glob(unzipped_folder + '/*')),
    key=lambda path: os.stat(path).st_size,
  )
  for file_name in list_of_files:
    try:
      table_name, file_ext = os.path.splitext(os.path.basename(file_name))
      logger.info(f"{file_name} and {table_name} being compared...")
      if file_ext not in allowed_extensions or table_name.lower() not in used_source:
        continue

      file_count = count_lines(file_name)

      # The table name is derived from the file name, but only names listed in
      # used_source reach this point; identifiers cannot be bound as parameters.
      if use_sqlite:
        db_count = my_conn.execute(f"select count(*) from {table_name.lower()}").fetchone()[0]
      else:
        # Engine.execute() with a raw string was removed in SQLAlchemy 2.0;
        # an explicit connection plus text() works on 1.x and 2.x alike.
        with my_conn.connect() as pg_conn:
          db_count = pg_conn.execute(
            text(f"select count(*) from source.{table_name.lower()}")
          ).fetchone()[0]

      diff = file_count - db_count
      # NOTE(review): only a positive difference (file has more lines than the
      # table has rows) is reported as an issue; a table with MORE rows than
      # the file still lands in the success report -- confirm this is intended.
      target = error_data if diff > 0 else success_data
      row = {
        "File": file_name,
        "File Lines": file_count,
        "Table": table_name,
        "Table Rows": db_count,
        "Difference": diff,
      }
      for column, value in row.items():
        target[column].append(value)
    except Exception:
      # Best effort: log the failure and continue with the next file, without
      # swallowing KeyboardInterrupt / SystemExit as a bare `except:` would.
      logger.error(traceback.format_exc())

  logger.info("Saving results as files...")
  # The folder may be absent when the SQLite database is not being used.
  os.makedirs(database_folder, exist_ok=True)

  error_report = pd.DataFrame(error_data)
  error_report.to_markdown(os.path.join(database_folder, "issues.md"))

  success_report = pd.DataFrame(success_data)
  success_report.to_markdown(os.path.join(database_folder, "success.md"))


2022-10-21 16:26:09,122 - INFO - Preparing list of files to be processed...
2022-10-21 16:26:09,138 - INFO - rawfiles\wifia_loans_closed.csv and wifia_loans_closed being compared...
2022-10-21 16:26:09,141 - INFO - rawfiles\wifia_projects_selected.csv and wifia_projects_selected being compared...
2022-10-21 16:26:09,143 - INFO - rawfiles\wifia_letters_submitted.csv and wifia_letters_submitted being compared...
2022-10-21 16:26:09,197 - INFO - rawfiles\ICIS_MASTER_GENERAL_PERMITS.csv and ICIS_MASTER_GENERAL_PERMITS being compared...
2022-10-21 16:26:09,203 - INFO - rawfiles\EPA_INFORMAL_ENFORCEMENT_ACTIONS.csv and EPA_INFORMAL_ENFORCEMENT_ACTIONS being compared...
2022-10-21 16:26:09,265 - INFO - rawfiles\CASE_ENFORCEMENT_CONCLUSION_SEP.csv and CASE_ENFORCEMENT_CONCLUSION_SEP being compared...
2022-10-21 16:26:09,275 - INFO - rawfiles\CASE_RELATED_ACTIVITIES.csv and CASE_RELATED_ACTIVITIES being compared...
2022-10-21 16:26:09,286 - INFO - rawfiles\CASE_RELIEF_SOUGHT.csv and CASE_RELIEF

## Closing the Database Connection

In [9]:
# Release the database resources opened by the connection cell.
if use_sqlite:
  my_conn.close()
else:
  # On this path my_conn is a SQLAlchemy Engine; dispose() closes the
  # connections held by its pool.
  my_conn.dispose()