# Introducción

Este script hace la ejecución de Extracción, Procesamiento y Carga (ETL) de los archivos del sitio [Quién es Quién en los precios](https://datos.profeco.gob.mx/datos_abiertos/qqp.php).

Una vez procesados estarán disponibles para la parte A y B del trabajo.

In [1]:
# Library installation of resources not in conda environment "arquitectura"
%pip install awswrangler
%pip install boto3
%pip install rarfile
%pip install selenium
%pip install tqdm
%pip install unidecode

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import libraries
import awswrangler as wr
import boto3
import csv
import matplotlib.pyplot as plt
import os
import pandas as pd
import rarfile
import re
import requests
import seaborn as sns
import subprocess
import time
from bs4 import BeautifulSoup
from tqdm import tqdm
from unidecode import unidecode

### Carga y Preparacion de datos

Descargar los archivos desde el sitio [Quién es Quién en los precios](https://datos.profeco.gob.mx/datos_abiertos/qqp.php) y descomprimirlos en la carpeta `data/`.

In [3]:
# Function to ensure the data directory exists
def ensure_dir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

# Base URL for making complete links
base_url = "https://datos.profeco.gob.mx/datos_abiertos/"

# URL of the page to scrape
url = "https://datos.profeco.gob.mx/datos_abiertos/qqp.php"

# Ensure the data directory exists
ensure_dir('data')

# Send HTTP GET request to the URL
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all <li> tags that contain <a> tags within them
    links = soup.find_all('li')
    file_links = []
    
    for link in links:
        a_tag = link.find('a', href=True)
        if a_tag and 'file.php?t=' in a_tag['href']:
            # Create the complete URL for the link
            complete_url = base_url + a_tag['href']
            file_links.append((complete_url, a_tag.text))

    # Visit each link and download the file
    for file_link, name in file_links:
        try:
            # Make the request
            response = requests.get(file_link)
            # Save the content to a file
            if response.status_code == 200:
                file_path = os.path.join('data', name.replace('/', '_') + '.rar')  # Replace slashes just in case
                with open(file_path, 'wb') as file:
                    file.write(response.content)
                print(f"File saved: {file_path}")
            else:
                print(f"Failed to download the file from {file_link}. Status code: {response.status_code}")
        except Exception as e:
            print(f"An error occurred while downloading {file_link}: {str(e)}")
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)

File saved: data\Base de Datos Histórica Quién es Quién en los Precios 2024.rar
File saved: data\Base de Datos Histórica Quién es Quién en los Precios 2023.rar
File saved: data\Base de Datos Histórica Quién es Quién en los Precios 2022.rar
File saved: data\Base de Datos Histórica Quién es Quién en los Precios 2021.rar
File saved: data\Base de Datos Histórica Quién es Quién en los Precios 2020.rar
File saved: data\Base de Datos Histórica Quién es Quién en los Precios 2019.rar
File saved: data\Base de Datos Histórica Quién es Quién en los Precios 2018.rar
File saved: data\Base de Datos Histórica Quién es Quién en los Precios 2017.rar
File saved: data\Base de Datos Histórica Quién es Quién en los Precios 2016.rar
File saved: data\Base de Datos Histórica Quién es Quién en los Precios 2015.rar


Desempacar los archivos

In [4]:
def unpack_rar_files(directory):
    # Change this path to your 7-Zip executable if it's not in the PATH
    seven_zip_path = "7z"
    
    # List all files in the given directory
    files = os.listdir(directory)
    
    # Filter for .rar files
    rar_files = [file for file in files if file.endswith('.rar')]
    
    # Extract each .rar file
    for rar in rar_files:
        # Construct the full file path
        file_path = os.path.join(directory, rar)
        # Command to extract the files
        command = [seven_zip_path, 'x', file_path, '-o' + directory]
        # Run the command
        subprocess.run(command, check=True)

# Replace 'data' with your directory path if different
unpack_rar_files('data')

Esquema de los archivos

In [None]:
# Define the expected columns based on the provided schema
expected_columns = [
    'producto', 'presentacion', 'marca', 'categoria', 'catalogo', 'precio',
    'fecha_registro', 'cadena_comercial', 'giro', 'nombre_comercial', 'direccion',
    'estado', 'municipio', 'latitud', 'longitud'
]

Directorio que contiene los archivos

In [None]:
# Directory containing the CSV files
data_dir = 'data'  # Adjust this path as needed in your local setup

Encontrar todos los archivos en el directorio "data"

In [None]:
# Function to find all CSV files in directory and subdirectories
def find_csv_files(directory):
    csv_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.csv'):
                csv_files.append(os.path.join(root, file))
    return csv_files

# Get all CSV files
csv_files = find_csv_files(data_dir)

Leer todos los archivos para generar uno solo de la categoría indicada (en este caso, "Material escolar")

In [None]:
# Initialize an empty DataFrame for valid data and one for errors
all_data = pd.DataFrame(columns=['filename'] + expected_columns)
errors = pd.DataFrame(columns=['filename', 'error_row'])

# Function to clean and transform data
def clean_data(data):
    # Remove accents
    for column in data.columns:
        data[column] = data[column].apply(lambda x: unidecode(str(x)) if isinstance(x, str) and pd.notnull(x) else x)
    # Convert to lowercase
    data = data.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
    # Change commas to pipes in 'direccion', considering complex rules
    data['direccion'] = data['direccion'].apply(lambda x: re.sub(r',(?![^"]*"(?:(?:[^"]*"){2})*[^"]*$)', '|', x) if pd.notnull(x) else x)
    return data

# Read each file and validate rows
for file_path in tqdm(csv_files):
    try:
        # Read the file assuming no headers and using the expected columns
        data = pd.read_csv(file_path, header=None, names=expected_columns)

        # Filter data for 'categoria' equals 'material escolar'
        filtered_data = data[data['categoria'] == 'MATERIAL ESCOLAR']

        # Clean and transform data
        data = clean_data(data)

        # Add filename column to the DataFrame
        data['filename'] = os.path.basename(file_path)

        # Ensure filename is the first column
        data = data[['filename'] + expected_columns]

        # Drop rows and columns that are completely NA
        valid_data = filtered_data.dropna(how='all').dropna(axis=1, how='all')
        
        # Concatenate data while checking for non-empty DataFrame to avoid FutureWarning
        if not valid_data.empty:
            all_data = pd.concat([all_data, valid_data], ignore_index=True)
        
        # Identify invalid rows and add them to the errors DataFrame
        invalid_rows = data[data.isna().any(axis=1)]
        if not invalid_rows.empty:
            invalid_row_str = invalid_rows[expected_columns].apply(lambda x: ','.join(x.fillna('').map(str)), axis=1)
            errors_df = pd.DataFrame({
                'filename': os.path.basename(file_path),
                'error_row': invalid_row_str
            })
            errors = pd.concat([errors, errors_df], ignore_index=True)
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")

# Output results
print(f"Total valid rows loaded: {len(all_data)}")
print(f"Total invalid rows found: {len(errors)}")


  all_data = pd.concat([all_data, valid_data], ignore_index=True)
100%|██████████| 450/450 [49:45<00:00,  6.64s/it]

Total valid rows loaded: 5189153
Total invalid rows found: 527494





Exportar los datos y los errores a la carpeta indicada ya comprimidos

In [None]:
# Export errors df to a compressed csv file using gzip
errors.to_csv('./data_clean/errors.csv.gz', index=False, compression='gzip')

# Export the cleaned data to a compressed CSV file using gzip
all_data.to_csv('./data_clean/all_data.csv.gz', index=False, compression='gzip')

Cargar los archivos a S3

In [None]:
# Initialize a session using a specific profile
session = boto3.Session(profile_name='arquitectura_AWS_ITAM_2024', region_name='us-east-1')

# Create an S3 client from this session
s3 = session.client('s3')

In [None]:
# File paths to upload
error_file_path = './data_clean/errors.csv.gz'
data_file_path = './data_clean/all_data.csv.gz'

In [None]:
# Bucket name
bucket_name = 'mdge-e3-2024'

In [None]:
# Upload files
try:
    s3.upload_file(error_file_path, bucket_name, 'errors.csv.gz')
    s3.upload_file(data_file_path, bucket_name, 'all_data.csv.gz')
    print("Files uploaded successfully")
except boto3.exceptions.S3UploadFailedError as e:
    print("Failed to upload: ", e)
except Exception as e:
    print("An error occurred: ", e)

Files uploaded successfully


In [None]:
# list  all distinct values in filename column
test = all_data['filename'].unique()

test

array([nan], dtype=object)