First, we are going to import the libraries needed to perform data ingestion.

In [1]:
# Import libraries
from pathlib import Path
import typer
from loguru import logger
from tqdm import tqdm
import os
import urllib.request
import requests
import datetime
import pandas as pd

Imports the predefined paths of the "PROCESSED_DATA_DIR" and "RAW_DATA_DIR" folders, configured in the config.py file

In [2]:
from modules.config import PROCESSED_DATA_DIR, RAW_DATA_DIR

[32m2024-09-30 12:01:51.482[0m | [1mINFO    [0m | [36mmodules.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /mnt/Data/Documents/Unison/1 Semestre/Ingeniería de Características/Projects/Sonora-River-Farming/Data-Science-Project[0m


In [7]:
RAW_DATA_DIR

PosixPath('/mnt/Data/Documents/Unison/1 Semestre/Ingeniería de Características/Projects/Sonora-River-Farming/Data-Science-Project/data/raw')

The download_file function downloads a file from an external link and saves it to a specific path. It will first check if the file does not exist, and will download it, and a text file will be generated with a brief description of the data source, the download date and the data download link.

In [19]:
def download_file(url: str, output_path: Path):
    """
    Downloads a file from an external link and saves it to a specific path.
    """
    SOURCE = url
    SUBDIR = output_path.parent
    FILE_NAME = output_path.name

    logger.info(f"Starting download from {SOURCE}")
    response = requests.get(SOURCE, stream=True)
    total_size = int(response.headers.get('content-length', 0))    

    # Check if the file already exists
    if not os.path.exists(output_path):
        logger.info(f"File {FILE_NAME} not found. Starting download...")

        # Create subdirectory if it does not exist
        if not os.path.exists(SUBDIR):
            os.makedirs(SUBDIR)
            logger.info(f"Created directory {SUBDIR}")

        # Download the file from the link provided
        urllib.request.urlretrieve(SOURCE, output_path)
        logger.success(f"Download completed: {output_path}")

        with open(output_path, 'wb') as file, tqdm(
            desc="Downloading",
            total=total_size,
            unit='B',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in response.iter_content(1024):
                bar.update(len(data))
                file.write(data)

        logger.success(f"Download completed: {output_path}")

        INFO_FILE_NAME = FILE_NAME.split('.')[0] + ".txt"
        INFO_FILE_PATH = os.path.join(SUBDIR, INFO_FILE_NAME)
        logger.info(f"Creating {INFO_FILE_NAME} file with dataset details")

        with open(INFO_FILE_PATH, 'w') as f:
            f.write("Information from water quality monitoring sites operated by Conagua throughout the country\n\n")
            info = (
                "The information includes data on lotic, lentic, coastal, and underground water bodies, covering physicochemical "
                "and microbiological parameters according to the type of water body. These data are organized in an Excel file "
                "with three spreadsheets.\n\n"

                "First sheet: Contains details about the monitoring sites, such as key, name, aquifer, state, municipality, type "
                "of water body, latitude, longitude, among others.\n\n"

                "Second sheet: Presents the results of the monitoring, grouped by site, type of water body, date of completion, "
                "and the physicochemical and microbiological parameters recorded.\n\n"

                "Third sheet: Offers a dictionary that describes each parameter, indicating its key, name, and unit of measurement.\n\n"

                "The data was obtained from the National Water Commission (https://www.gob.mx/conagua/articulos/calidad-del-agua) "
                "dated August 6, 2024."
            )
            f.write(info + '\n')
            f.write("Downloaded on " + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "\n")
            f.write("From: " + SOURCE + "\n")
            f.write("Name: " + FILE_NAME + "\n")
        logger.success(f"Info file {INFO_FILE_NAME} created at {SUBDIR}")
    else:
        logger.info(f"File {FILE_NAME} already exists in the directory {SUBDIR}. Skipping download. ")



In [15]:
SOURCE = "https://files.conagua.gob.mx/aguasnacionales/TODOS%20LOS%20MONITOREOS.xlsb"
OUTPUT_PATH = RAW_DATA_DIR / 'water_quality_data.xlsb'
download_file(url = SOURCE, output_path=OUTPUT_PATH)

[32m2024-09-30 12:40:32.898[0m | [1mINFO    [0m | [36m__main__[0m:[36mdownload_file[0m:[36m9[0m - [1mStarting download from https://files.conagua.gob.mx/aguasnacionales/TODOS%20LOS%20MONITOREOS.xlsb[0m
[32m2024-09-30 12:40:33.447[0m | [1mINFO    [0m | [36m__main__[0m:[36mdownload_file[0m:[36m15[0m - [1mFile water_quality_data.xlsb not found. Starting download...[0m
[32m2024-09-30 12:40:46.539[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mdownload_file[0m:[36m24[0m - [32m[1mDownload completed: /mnt/Data/Documents/Unison/1 Semestre/Ingeniería de Características/Projects/Sonora-River-Farming/Data-Science-Project/data/raw/water_quality_data.xlsb[0m


Downloading: 100%|██████████| 97.4M/97.4M [00:22<00:00, 4.63MB/s]


[32m2024-09-30 12:41:08.876[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mdownload_file[0m:[36m37[0m - [32m[1mDownload completed: /mnt/Data/Documents/Unison/1 Semestre/Ingeniería de Características/Projects/Sonora-River-Farming/Data-Science-Project/data/raw/water_quality_data.xlsb[0m
[32m2024-09-30 12:41:08.877[0m | [1mINFO    [0m | [36m__main__[0m:[36mdownload_file[0m:[36m41[0m - [1mCreating water_quality_data.txt file with dataset details[0m
[32m2024-09-30 12:41:08.880[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mdownload_file[0m:[36m65[0m - [32m[1mInfo file water_quality_data.txt created at /mnt/Data/Documents/Unison/1 Semestre/Ingeniería de Características/Projects/Sonora-River-Farming/Data-Science-Project/data/raw[0m
