<a href="https://colab.research.google.com/github/WalterPaixaoCortes/r3s-scripts/blob/main/notebooks/Data_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Source Database Load

## Importing libraries

In [11]:
import os
import datetime
import traceback
import glob
import gc
import logging
import sqlite3
import sys

from sqlalchemy import event
from sqlalchemy import create_engine
from logging.handlers import TimedRotatingFileHandler
from dotenv import load_dotenv

import pandas as pd

### Load Environment Variables

In [12]:
# Load variables from a .env file (python-dotenv) so the os.getenv() calls in
# the cells below can read them.
load_dotenv()

True

## Defining the parameters for execution

In [13]:
def env_flag(name, default=False):
  """Read environment variable `name` as a boolean.

  The value is compared against a fixed set of "true" spellings instead of
  being passed to eval(), so text placed in the environment is never executed
  and an unset variable does not raise.

  Parameters:
    name: environment variable to read.
    default: value returned when the variable is not set.

  Returns:
    True for "true", "1", "yes" or "on" (case-insensitive, surrounding
    whitespace ignored); `default` when unset; False for anything else.
  """
  raw = os.getenv(name)
  if raw is None:
    return default
  return raw.strip().lower() in ("true", "1", "yes", "on")


# Switches read from the environment (unset means False).
create_folders = env_flag('CREATE_FOLDERS')
use_sqlite = env_flag('USE_SQLITE3')
clean_database = env_flag('CLEAN_DATABASE')

# Fixed settings for this run.
save_to_database = True
commit_size = 10000  # rows per chunk read from a file and appended to the database

# Lower-case table names to load. Hard-coded here; if this is ever driven by
# the USED_SOURCE environment variable, parse it explicitly rather than eval-ing it.
used_source = ['frs_program_links']

## Defining the variables

### Initializing Logger

In [14]:
# The file handler opens its target as soon as it is constructed, so the
# directory has to exist first or a fresh checkout fails with FileNotFoundError.
log_file = "logs/source_db_load.log"
os.makedirs(os.path.dirname(log_file), exist_ok=True)

# Rotate the log file at midnight; rotated files get a YYYYMMDD suffix.
fhandler = TimedRotatingFileHandler(log_file, when="midnight", interval=1)
fhandler.suffix = "%Y%m%d"

# Send every record both to the rotating file and to the notebook output.
logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[fhandler, logging.StreamHandler(sys.stdout)],
    )
logger = logging.getLogger(__name__)

### Initializing Variables

In [15]:
# Folder that is scanned for input files, and folder holding the SQLite file.
database_folder = "database"
unzipped_folder = "rawfiles"
extension = ".zip"

# Only files with one of these extensions are loaded.
allowed_extensions = [".txt", ".csv"]

# Path of the SQLite database file inside `database_folder`.
database_name = database_folder + "/source.db"

# Connection handle; assigned by the connection cell further down.
my_conn = None

## Defining the environment

In [16]:
# Create the database folder only when folder creation is switched on and the
# folder is missing; when it is switched off, just log that fact.
if not create_folders:
  logger.info("Folders already created...")
elif not os.path.exists(database_folder):
  os.mkdir(database_folder)

2022-10-11 07:22:30,575 - INFO - Folders already created...


## Cleaning up database

In [17]:
# The SQLite file is deleted only when SQLite is in use AND a clean start was
# requested; in every other case the existing database is left untouched.
if not (use_sqlite and clean_database):
  logger.info("Database will be used as is...")
else:
  logger.info ("Cleaning database to restart insert operation...")
  if os.path.exists(database_name):
    # Close a connection left open by an earlier run of the notebook before
    # removing the file underneath it.
    if my_conn:
      my_conn.close()
    os.remove(database_name)

2022-10-11 07:22:30,653 - INFO - Database will be used as is...


## Connecting or Creating database

In [18]:
# Open the target database: a local SQLite file, or the PostgreSQL instance
# described by the PG_DATA_CONN environment variable.
my_conn = None
if use_sqlite:
  my_conn = sqlite3.connect(database_name)
else:
  pg_url = os.getenv("PG_DATA_CONN")
  if not pg_url:
    # Fail with a clear message instead of handing None to create_engine().
    raise RuntimeError("PG_DATA_CONN is not set; cannot connect to PostgreSQL.")
  my_conn = create_engine(pg_url)
  # Never log the raw connection string: it contains the password. The repr()
  # of the engine's URL renders the password as "***".
  logger.info("%r", my_conn.url)
logger.info("Connected to database...")

2022-10-11 07:22:30,730 - INFO - postgresql+psycopg2://postgres:***@r3-tableau.cluster-cs2ga4wck7ra.us-east-2.rds.amazonaws.com/postgres
2022-10-11 07:22:30,731 - INFO - Connected to database...


## Save to database

In [19]:
def load_delimited_file(file_path, table_name, conn, sep=",", chunk_rows=10000):
  """Append one delimited text file to a database table, chunk by chunk.

  Every column is read as a string, column names are lower-cased, and each
  chunk is appended to `table_name` in the "source" schema.

  Parameters:
    file_path: path of the file to read.
    table_name: destination table name.
    conn: connection/engine object handed to DataFrame.to_sql.
    sep: field separator passed to pandas.read_csv.
    chunk_rows: number of rows read and written per chunk.

  Raises:
    Whatever pandas or the database driver raises while reading or writing.
  """
  # The context manager closes the underlying file even when a chunk fails.
  with pd.read_csv(file_path, encoding="iso-8859-1", sep=sep, index_col=False,
                   chunksize=chunk_rows, dtype=str, on_bad_lines="skip",
                   encoding_errors="replace") as reader:
    for chunk in reader:
      chunk.columns = chunk.columns.str.lower()
      chunk.to_sql(table_name, conn, schema="source",
                   if_exists="append",
                   index=False)


if save_to_database:
  logger.info("Preparing list of files to be processed...")
  # Smallest files first.
  list_of_files = filter(os.path.isfile, glob.glob(unzipped_folder + '/*'))
  list_of_files = sorted(list_of_files, key=lambda x: os.stat(x).st_size)
  files = [os.path.basename(item) for item in list_of_files]

  for item in files:
    table_name, file_ext = os.path.splitext(item)

    wanted = (
      table_name.lower() in used_source
      and file_ext in allowed_extensions
      and not table_name.startswith("ResultFile")
    )
    if wanted:
      # Files whose name starts with "US" are read as tab-separated only; all
      # others are tried comma-separated first, then tab-separated.
      # NOTE(review): if the comma attempt fails part-way, chunks it already
      # appended appear to stay in the table before the tab retry - confirm.
      separators = ["\t"] if table_name.startswith("US") else [",", "\t"]

      saved = False
      for sep in separators:
        try:
          load_delimited_file(os.path.join(unzipped_folder, item),
                              table_name.lower(), my_conn,
                              sep=sep, chunk_rows=commit_size)
        except Exception:
          # Exception (not a bare except) so Ctrl-C still interrupts the load;
          # the traceback is kept for every failed attempt.
          logger.error(traceback.format_exc())
        else:
          logger.info(f"File {item} saved on the database...")
          saved = True
          break

      if not saved:
        logger.error(f"File {item} not saved on the database...")
    gc.collect()

else:
  logger.info("Database already loaded...")

2022-10-11 07:22:30,845 - INFO - Preparing list of files to be processed...
2022-10-11 07:43:40,386 - INFO - File FRS_PROGRAM_LINKS.csv saved on the database...


## Closing the Database Connection

In [20]:
# Release the database handle. A sqlite3 connection is closed; a SQLAlchemy
# engine is disposed so its pooled connections are released as well.
# Nothing is done if no connection was ever opened.
if my_conn is not None:
  if use_sqlite:
    my_conn.close()
  else:
    my_conn.dispose()