<a href="https://colab.research.google.com/github/WalterPaixaoCortes/r3s-scripts/blob/main/notebooks/Data_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Source Database Load

## Importing libraries

In [1]:
import ast
import datetime
import gc
import glob
import logging
import os
import sqlite3
import sys
import traceback
from logging.handlers import TimedRotatingFileHandler

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine, event

### Load Environment Variables

In [2]:
# Populate the process environment through python-dotenv. Later cells read
# CREATE_FOLDERS, USE_SQLITE3, CLEAN_DATABASE, USED_SOURCE and PG_DATA_CONN
# with os.getenv, so this cell has to run before them.
load_dotenv()

True

## Defining the parameters for execution

In [3]:
def _env_literal(name):
  """Return the Python literal stored in the environment variable ``name``.

  The value is parsed with ``ast.literal_eval``, so only literals such as
  ``True``, ``False``, numbers, strings, lists, tuples, sets and dicts are
  accepted.

  NOTE(review): this replaces a plain ``eval()`` on the raw environment
  string, which would execute arbitrary code placed in the environment or
  in the .env file. Values that relied on being evaluated as expressions
  are now rejected.

  Raises:
    KeyError: the variable is not set.
    ValueError: the value is not a valid Python literal.
  """
  raw = os.getenv(name)
  if raw is None:
    raise KeyError(f"Required environment variable {name} is not set")
  try:
    return ast.literal_eval(raw.strip())
  except (ValueError, SyntaxError) as exc:
    raise ValueError(
        f"Environment variable {name} must hold a Python literal, got {raw!r}"
    ) from exc


# Switches read from the environment.
create_folders = _env_literal('CREATE_FOLDERS')
use_sqlite = _env_literal('USE_SQLITE3')
clean_database = _env_literal('CLEAN_DATABASE')

# Settings fixed in the notebook itself.
save_to_database = True
commit_size = 10000  # passed to pandas.read_csv as chunksize by the load cell

# Collection of lower-case table names; the load cell keeps a file only when
# its lower-cased base name is found in it.
used_source = _env_literal('USED_SOURCE')

## Defining the variables

### Initializing Logger

In [4]:
# The file handler opens its target as soon as it is constructed, so the
# directory has to exist first; otherwise this cell fails with
# FileNotFoundError on a fresh checkout.
log_file = "logs/source_db_load.log"
os.makedirs(os.path.dirname(log_file), exist_ok=True)

# Rotate the log file at midnight; rotated files get a YYYYMMDD suffix.
fhandler = TimedRotatingFileHandler(log_file, when="midnight", interval=1)
fhandler.suffix = "%Y%m%d"

# Send every record both to the rotating file and to the notebook output.
logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[fhandler, logging.StreamHandler(sys.stdout)],
    )
logger = logging.getLogger(__name__)

### Initializing Variables

In [5]:
# Folders: raw input files are read from the first, the SQLite file lives in the second.
unzipped_folder, database_folder = "rawfiles", "database"

# File-extension settings.
extension = ".zip"
allowed_extensions = [".txt", ".csv"]

# Path of the SQLite database file inside the database folder.
database_name = database_folder + "/source.db"

# Connection handle; it stays None until the connection cell runs.
my_conn = None

## Defining the environment

In [6]:
# Create the database folder only when folder creation is switched on and the
# folder is missing; when it is switched off, just log that nothing is done.
if not create_folders:
  logger.info("Folders already created...")
elif not os.path.exists(database_folder):
  os.mkdir(database_folder)

2022-10-21 15:31:07,158 - INFO - Folders already created...


## Cleaning up database

In [7]:
# The SQLite file is deleted only when both switches are on; in every other
# case the existing database is kept.
if not (use_sqlite and clean_database):
  logger.info("Database will be used as is...")
else:
  logger.info("Cleaning database to restart insert operation...")
  if os.path.exists(database_name):
    # Close a connection left over from an earlier run before removing the file.
    if my_conn:
      my_conn.close()
    os.remove(database_name)

2022-10-21 15:31:07,189 - INFO - Cleaning database to restart insert operation...


## Connecting or Creating database

In [8]:
# Open the handle used by the load cell: a sqlite3 connection to the local
# database file, or a SQLAlchemy engine built from PG_DATA_CONN.
my_conn = None
if use_sqlite:
  my_conn = sqlite3.connect(database_name)
else:
  pg_url = os.getenv("PG_DATA_CONN")
  if not pg_url:
    raise RuntimeError("PG_DATA_CONN is not set; cannot connect to the database")
  # The connection URL may embed a user name and password, so it must not be
  # written to the log file or to the saved notebook output.
  logger.info("Using the database configured in PG_DATA_CONN...")
  my_conn = create_engine(pg_url)
logger.info("Connected to database...")

2022-10-21 15:31:07,841 - INFO - Connected to database...


## Save to database

In [9]:
def _load_delimited_file(file_path, table, conn, sep=",", chunksize=10000):
  """Append the rows of one delimited text file to the table ``table``.

  The file is read in chunks of ``chunksize`` rows with every column as a
  string, column names are lower-cased, and each chunk is appended through
  ``DataFrame.to_sql`` with ``schema="source"``.

  Args:
    file_path: path of the file to read.
    table: name of the destination table.
    conn: sqlite3 connection or SQLAlchemy engine handed to ``to_sql``.
    sep: field delimiter given to ``pandas.read_csv``.
    chunksize: number of rows per chunk.

  Raises:
    Whatever ``pandas.read_csv`` or ``DataFrame.to_sql`` raises; chunks
    appended before the failure are not rolled back here.
  """
  # Using the reader as a context manager closes the file even when a chunk
  # fails half-way through.
  with pd.read_csv(file_path, encoding="iso-8859-1", sep=sep, index_col=False,
                   chunksize=chunksize, dtype=str, on_bad_lines="skip",
                   encoding_errors="replace") as reader:
    for chunk in reader:
      chunk.columns = chunk.columns.str.lower()
      chunk.to_sql(table, conn, schema="source",
                   if_exists="append",
                   index=False)


if save_to_database:
  logger.info("Preparing list of files to be processed...")
  list_of_files = filter(os.path.isfile, glob.glob(unzipped_folder + '/*'))
  # Process the smallest files first.
  list_of_files = sorted(list_of_files, key=lambda x: os.stat(x).st_size)
  files = [os.path.basename(item) for item in list_of_files]

  for item in files:
    table_name, file_ext = os.path.splitext(item)

    if table_name.lower() in used_source:
      if file_ext in allowed_extensions and not table_name.startswith("ResultFile"):
        file_path = os.path.join(unzipped_folder, item)

        # Files whose name starts with "US" skip the comma-separated attempt
        # and are read as tab-separated straight away.
        second = table_name.startswith("US")
        if not second:
          try:
            _load_delimited_file(file_path, table_name.lower(), my_conn,
                                 chunksize=commit_size)
            logger.info(f"File {item} saved on the database...")
          except Exception:
            # NOTE(review): chunks appended before the failure stay in the
            # table, so the tab-separated retry below may add rows on top of
            # a partial load.
            logger.error(traceback.format_exc())
            second = True

        if second:
          try:
            _load_delimited_file(file_path, table_name.lower(), my_conn,
                                 sep="\t", chunksize=commit_size)
            logger.info(f"File {item} saved on the database...")
          except Exception:
            # logger.exception keeps the traceback next to the message.
            logger.exception(f"File {item} not saved on the database...")
    gc.collect()

else:
  logger.info("Database already loaded...")

2022-10-21 15:31:07,879 - INFO - Preparing list of files to be processed...
2022-10-21 15:31:07,929 - INFO - File wifia_loans_closed.csv saved on the database...
2022-10-21 15:31:08,015 - INFO - File wifia_projects_selected.csv saved on the database...
2022-10-21 15:31:08,103 - INFO - File wifia_letters_submitted.csv saved on the database...
2022-10-21 15:31:08,224 - INFO - File ICIS_MASTER_GENERAL_PERMITS.csv saved on the database...
2022-10-21 15:31:08,403 - INFO - File EPA_INFORMAL_ENFORCEMENT_ACTIONS.csv saved on the database...
2022-10-21 15:31:08,540 - INFO - File CASE_ENFORCEMENT_CONCLUSION_SEP.csv saved on the database...
2022-10-21 15:31:08,769 - INFO - File CASE_RELATED_ACTIVITIES.csv saved on the database...
2022-10-21 15:31:09,041 - INFO - File CASE_RELIEF_SOUGHT.csv saved on the database...
2022-10-21 15:31:09,544 - INFO - File CASE_POLLUTANTS.csv saved on the database...
2022-10-21 15:31:10,224 - INFO - File CASE_PENALTIES.csv saved on the database...
2022-10-21 15:31:10,

## Closing the Database Connection

In [10]:
# Release the database handle: a sqlite3 connection is closed, while a
# SQLAlchemy engine is disposed so its pooled connections are released too.
if use_sqlite:
  my_conn.close()
else:
  my_conn.dispose()