<a href="https://colab.research.google.com/github/WalterPaixaoCortes/r3s-scripts/blob/main/notebooks/Data_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ECPA file download

## Importing libraries

In [15]:
import os
import datetime
import zipfile
import gzip
import traceback
import glob
import logging
import sys

from logging.handlers import TimedRotatingFileHandler
from urllib.parse import urlparse
from dotenv import load_dotenv

import requests as r

from bs4 import BeautifulSoup

## Defining the parameters for execution

In [16]:
# Execution switches: each flag turns one stage of the notebook on or off.
create_folders: bool = False  # when True, create any missing working folder
download_files: bool = True   # when True, fetch every URL in download_urls
unzip_files: bool = True      # when True, extract the downloaded archives

## Defining the variables

### Load Environment Variables

In [17]:
# Read key/value pairs from a local .env file into the process environment
# so that later cells can look them up with os.getenv.
load_dotenv()

True

### Initializing Logger

In [18]:
# Send log records both to a daily-rotated file and to the notebook output.
log_file = "logs/epa_download.log"
# The file handler opens the log file as soon as it is constructed, so the
# folder has to exist first; otherwise a fresh checkout fails right here
# with FileNotFoundError.
os.makedirs(os.path.dirname(log_file), exist_ok=True)

fhandler = TimedRotatingFileHandler(log_file, when="midnight", interval=1)
fhandler.suffix = "%Y%m%d"  # date stamp appended to rotated log files
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[fhandler, logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)

### Build download URLs list

In [19]:
# Fixed set of ECHO archives; they all live under the same download folder,
# so the list is assembled from the folder URL plus each archive name.
echo_download_root = "https://echo.epa.gov/files/echodownloads"
echo_archive_names = (
    "frs_downloads.zip",
    "case_downloads.zip",
    "npdes_downloads.zip",
    "npdes_eff_downloads.zip",
    "npdes_master_general_permits.zip",
    "npdes_outfalls_layer.zip",
    "npdes_limits.zip",
    "SDWA_latest_downloads.zip",
)
download_urls = [
    f"{echo_download_root}/{archive_name}" for archive_name in echo_archive_names
]

For TRI files, we need to add one file per year, starting from 1987 and ending with the previous calendar year.

In [20]:
# Queue one TRI archive URL per year, from tri_start through tri_end.
tri_start = 1987
tri_end = datetime.datetime.now().year - 1  # last year included in the range
tri_url = "https://www3.epa.gov/tri/current/US_%s.zip"
tri_end_url = "https://www3.epa.gov/tri/pds/US_%s.zip"

logger.info(f"Loading URLs for TRI downloads from {tri_start} to {tri_end}...")
year = tri_start
while year <= tri_end:
  # The final year of the range is built from a different URL template.
  url_template = tri_end_url if year == tri_end else tri_url
  url = url_template % year
  download_urls.append(url)
  year += 1

2022-10-07 13:12:07,066 - INFO - Loading URLs for TRI downloads from 1987 to 2021...


For WQI files, we need to detect the correct files in the remote folder listing.

In [21]:
# Scrape the WQI folder listing and queue every link whose target contains
# the "ResultFileToEnd2Output" marker.
base_wqi_url = "https://echo.epa.gov/files/echodownloads/Data-Analytics/WQI"

logger.info("Loading URLs for WQI downloads...")
# A timeout keeps a stalled connection from hanging the notebook forever.
response = r.get(base_wqi_url, timeout=60)
soup = BeautifulSoup(response.content, 'html.parser')
# href=True keeps only anchors that actually carry an href attribute; an
# anchor without one would raise KeyError on item["href"] below.
links = soup.find_all('a', href=True)

for item in links:
  if "ResultFileToEnd2Output" in item["href"]:
    download_urls.append(f'{base_wqi_url}/{item["href"]}')

2022-10-07 13:12:07,100 - INFO - Loading URLs for WQI downloads...


There is a special routine for DMR files as well.

In [22]:
# Scrape the ECHO download folder listing and queue every link whose target
# contains the "npdes_dmrs_" marker.
base_dmr_url = "https://echo.epa.gov/files/echodownloads"

logger.info("Loading URLs for DMR downloads...")
# A timeout keeps a stalled connection from hanging the notebook forever.
response = r.get(base_dmr_url, timeout=60)
soup = BeautifulSoup(response.content, 'html.parser')
# href=True keeps only anchors that actually carry an href attribute; an
# anchor without one would raise KeyError on item["href"] below.
links = soup.find_all('a', href=True)

for item in links:
  if "npdes_dmrs_" in item["href"]:
    download_urls.append(f'{base_dmr_url}/{item["href"]}')

2022-10-07 13:12:07,884 - INFO - Loading URLs for DMR downloads...


### Initializing Variables

In [23]:
# Working folders, given relative to the current working directory.
zipfile_folder: str = "zipfiles"    # downloaded archives are written here
unzipped_folder: str = "rawfiles"   # archives are extracted into this folder
database_folder: str = "database"   # created together with the two folders above
extension: str = ".zip"             # suffix that selects the zip extraction branch

# Not referenced by the cells in this section.
allowed_extensions: list = [".txt", ".csv"]

# Placeholder; nothing in this section assigns a connection to it.
my_conn = None

Now, to avoid downloading files that were already downloaded, let's build a list of the files already present in the zip folder.

In [24]:
# Names of the files already present in the zip folder. Later cells use this
# list to skip archives that were fetched on a previous run.
if os.path.isdir(zipfile_folder):
  # os.listdir already yields bare file names, so they are used as-is;
  # running them through a URL parser could truncate names that contain
  # characters such as '#', '?' or ';'.
  downloaded_files = os.listdir(zipfile_folder)
else:
  # The folder is only created by a later cell, so on a first run it may not
  # exist yet: treat that as "nothing downloaded" instead of failing.
  downloaded_files = []

## Defining the environment

In [25]:
# Create any missing working folder when requested; otherwise only log.
if not create_folders:
  logger.info("Folders already created...")
else:
  for folder in (zipfile_folder, unzipped_folder, database_folder):
    if not os.path.exists(folder):
      os.mkdir(folder)

2022-10-07 13:12:08,786 - INFO - Folders already created...


## Download zip files

In [26]:
# Fetch every queued URL whose file is not already in the zip folder.
if download_files:
  logger.info(f"Starting download process. Total files to be downloaded: {len(download_urls)}...")
  for download_url in download_urls:
    file_name = os.path.basename(urlparse(download_url).path)
    if file_name in downloaded_files:
      continue

    logger.info(f"Downloading file {file_name}...")
    target_path = os.path.join(zipfile_folder, file_name)
    # Download to a temporary name first, so an interrupted or failed
    # transfer never leaves a truncated file under the final name (which a
    # later run would otherwise treat as already downloaded).
    partial_path = target_path + ".part"
    try:
      # stream=True avoids holding a whole archive in memory; the timeout
      # bounds the connect and per-read wait, not the total transfer time.
      with r.get(download_url, allow_redirects=True, stream=True, timeout=120) as response:
        # Fail on 4xx/5xx instead of saving the error page as an archive.
        response.raise_for_status()
        with open(partial_path, "wb") as fw:
          for chunk in response.iter_content(chunk_size=1024 * 1024):
            fw.write(chunk)
      os.replace(partial_path, target_path)
      logger.info(f"--> File {file_name} saved.")
    except Exception:
      # logger.exception keeps the message and adds the traceback, so the
      # cause of the failure is visible in the log.
      logger.exception(f"--> File {file_name} not downloaded.")
    finally:
      # Still present only when the download did not complete.
      if os.path.exists(partial_path):
        os.remove(partial_path)
else:
  logger.info("Files already downloaded...")


2022-10-07 13:12:08,920 - INFO - Starting download process. Total files to be downloaded: 62...
2022-10-07 13:12:08,921 - INFO - Downloading file frs_downloads.zip...
2022-10-07 13:18:42,128 - INFO - --> File frs_downloads.zip saved.
2022-10-07 13:18:50,909 - INFO - Downloading file case_downloads.zip...
2022-10-07 13:18:56,949 - INFO - --> File case_downloads.zip saved.
2022-10-07 13:18:56,953 - INFO - Downloading file npdes_downloads.zip...
2022-10-07 13:19:42,817 - INFO - --> File npdes_downloads.zip saved.
2022-10-07 13:19:51,283 - INFO - Downloading file npdes_eff_downloads.zip...
2022-10-07 13:23:23,299 - INFO - --> File npdes_eff_downloads.zip saved.
2022-10-07 13:24:08,324 - INFO - Downloading file npdes_master_general_permits.zip...
2022-10-07 13:24:09,718 - INFO - --> File npdes_master_general_permits.zip saved.
2022-10-07 13:24:09,722 - INFO - Downloading file npdes_outfalls_layer.zip...
2022-10-07 13:24:21,973 - INFO - --> File npdes_outfalls_layer.zip saved.
2022-10-07 13:

## Unzip the files

In [27]:
# Extract every archive in the zip folder that was not already there before
# this run (files listed in downloaded_files are skipped).
if unzip_files:
  for item in os.listdir(zipfile_folder):
    file_name = os.path.abspath(os.path.join(zipfile_folder, item))
    is_new = item not in downloaded_files
    if item.endswith(extension) and is_new:
      logger.info(f"Unzipping file {item}...")
      try:
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(file_name) as zip_ref:
          zip_ref.extractall(unzipped_folder)
        logger.info(f"--> File {item} unzipped.")
      except Exception:
        logger.exception(f"--> File {item} not unzipped.")
    elif item.endswith(".gz") and is_new:
      logger.info(f"Decompressing file {item}...")
      try:
        # Strip only the trailing ".gz"; str.replace would also remove any
        # earlier occurrence inside the name.
        new_file_name = os.path.abspath(os.path.join(unzipped_folder, item[:-len(".gz")]))
        with open(file_name, 'rb') as fr:
          file_out = gzip.decompress(fr.read())
        # Written only after decompression succeeded, so a corrupt archive
        # does not leave a partial output file behind.
        with open(new_file_name, 'wb') as fw:
          fw.write(file_out)
        logger.info(f"File {new_file_name} decompressed and saved...")
      except Exception:
        logger.exception(f"--> File {item} not decompressed.")
    else:
      logger.info(f"Skipping file {item}.")
else:
  logger.info("Files already unzipped...")

2022-10-07 14:28:42,430 - INFO - Unzipping file case_downloads.zip...
2022-10-07 14:28:46,358 - INFO - --> File case_downloads.zip unzipped.
2022-10-07 14:28:46,360 - INFO - Unzipping file frs_downloads.zip...
2022-10-07 14:28:56,746 - INFO - --> File frs_downloads.zip unzipped.
2022-10-07 14:28:56,748 - INFO - Unzipping file npdes_dmrs_fy2009.zip...
2022-10-07 14:29:23,815 - INFO - --> File npdes_dmrs_fy2009.zip unzipped.
2022-10-07 14:29:23,816 - INFO - Unzipping file npdes_dmrs_fy2010.zip...
2022-10-07 14:29:49,252 - INFO - --> File npdes_dmrs_fy2010.zip unzipped.
2022-10-07 14:29:49,253 - INFO - Unzipping file npdes_dmrs_fy2011.zip...
2022-10-07 14:30:14,237 - INFO - --> File npdes_dmrs_fy2011.zip unzipped.
2022-10-07 14:30:14,238 - INFO - Unzipping file npdes_dmrs_fy2012.zip...
2022-10-07 14:30:42,132 - INFO - --> File npdes_dmrs_fy2012.zip unzipped.
2022-10-07 14:30:42,133 - INFO - Unzipping file npdes_dmrs_fy2013.zip...
2022-10-07 14:31:10,633 - INFO - --> File npdes_dmrs_fy2013

## Removing files not used

In [28]:
# Delete extracted files whose base name is not listed in USED_SOURCE.
logger.info("Preparing list of files to be processed...")
list_of_files = filter(os.path.isfile, glob.glob(unzipped_folder + '/*'))
files = [os.path.basename(item) for item in list_of_files]

# Read the variable once; os.getenv returns None when it is not set.
used_source = os.getenv('USED_SOURCE')
if used_source is None:
  # Without the list there is no way to tell which files are needed, so
  # nothing is deleted (the membership test below would raise TypeError).
  logger.warning("USED_SOURCE is not set; no files were removed.")
else:
  for item in files:
    table_name = os.path.splitext(item)[0]
    # NOTE(review): this is a substring test against the raw variable value,
    # so a short name that appears inside a longer listed name is kept too.
    # Confirm the expected format of USED_SOURCE before tightening it.
    if table_name.lower() not in used_source:
      os.remove(os.path.join(unzipped_folder, item))


2022-10-07 14:43:43,501 - INFO - Preparing list of files to be processed...
