# Introducción

Este script ejecuta el proceso de Extracción, Transformación y Carga (ETL) de los archivos del sitio [Quién es Quién en los precios](https://datos.profeco.gob.mx/datos_abiertos/qqp.php).

Una vez procesados, los archivos estarán disponibles para las partes A y B del trabajo.

In [1]:
# Install the packages that are not part of the conda environment "arquitectura".
# NOTE(review): rarfile and selenium are installed here but are not imported by
# the cells visible in this notebook — confirm they are still needed.
%pip install awswrangler
%pip install boto3
%pip install rarfile
%pip install selenium
%pip install tqdm
%pip install unidecode

In [None]:
# Import libraries
import awswrangler as wr
import boto3
import csv
import matplotlib.pyplot as plt
import os
import pandas as pd
import re
import requests
import seaborn as sns
import subprocess
import time
from bs4 import BeautifulSoup
from tqdm import tqdm
from unidecode import unidecode

### Carga y Preparación de datos

Descargar los archivos desde el sitio [Quién es Quién en los precios](https://datos.profeco.gob.mx/datos_abiertos/qqp.php) y descomprimirlos en la carpeta `data/`.

In [None]:
# Function to ensure the data directory exists
def ensure_dir(directory):
    """Create *directory* (and any missing parents) if it does not exist yet.

    Calling it on a directory that already exists is a no-op.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If *directory* exists but is not a directory.
        OSError: If the directory cannot be created.
    """
    # exist_ok=True lets the OS decide atomically; a separate
    # os.path.exists() check could race with another process creating the
    # directory between the check and the makedirs() call.
    os.makedirs(directory, exist_ok=True)

# Base URL for making complete links
base_url = "https://datos.profeco.gob.mx/datos_abiertos/"

# URL of the page to scrape
url = "https://datos.profeco.gob.mx/datos_abiertos/qqp.php"

# Seconds to wait for the server before giving up. Without an explicit
# timeout, requests.get() can block indefinitely on an unresponsive server.
request_timeout = 60

# Ensure the data directory exists
ensure_dir('data')

# Send HTTP GET request to the URL
response = requests.get(url, timeout=request_timeout)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all <li> tags; only those whose first <a> points to a download
    # link ('file.php?t=...') are kept
    links = soup.find_all('li')
    file_links = []

    for link in tqdm(links):
        a_tag = link.find('a', href=True)
        if a_tag and 'file.php?t=' in a_tag['href']:
            # Create the complete URL for the link
            complete_url = base_url + a_tag['href']
            file_links.append((complete_url, a_tag.text))

    # Visit each link and download the file
    for file_link, name in tqdm(file_links):
        try:
            # Make the request
            response = requests.get(file_link, timeout=request_timeout)
            # Save the content to a file
            if response.status_code == 200:
                # The link text becomes the file name: drop surrounding
                # whitespace and replace slashes so it stays a single
                # path component inside 'data'
                safe_name = name.strip().replace('/', '_')
                file_path = os.path.join('data', safe_name + '.rar')
                with open(file_path, 'wb') as file:
                    file.write(response.content)
                print(f"File saved: {file_path}")
            else:
                print(f"Failed to download the file from {file_link}. Status code: {response.status_code}")
        except Exception as e:
            # Best effort: report the failure and continue with the next file
            print(f"An error occurred while downloading {file_link}: {str(e)}")
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)

Descomprimir los archivos

In [None]:
def unpack_rar_files(directory):
    """Extract every ``.rar`` archive located directly inside *directory*.

    Each archive is extracted with the 7-Zip command line tool
    (``7z x <archive> -o<directory>``), so the contents end up in the same
    directory as the archives.

    Args:
        directory: Folder that holds the ``.rar`` files and receives the
            extracted contents.

    Raises:
        subprocess.CalledProcessError: If 7-Zip exits with a non-zero status
            for an archive (``check=True``); remaining archives are not
            processed.
    """
    # Name or full path of the 7-Zip executable; the bare name relies on PATH
    seven_zip_path = "7z"

    # Only entries whose name ends in '.rar' are treated as archives
    archive_names = [entry for entry in os.listdir(directory) if entry.endswith('.rar')]

    # 7-Zip expects the output directory glued to the '-o' switch
    output_switch = '-o' + directory

    for archive_name in tqdm(archive_names):
        archive_path = os.path.join(directory, archive_name)
        subprocess.run([seven_zip_path, 'x', archive_path, output_switch], check=True)

# Extract the archives stored in 'data' into that same folder.
# Replace 'data' with your directory path if different
unpack_rar_files('data')

Esquema de los archivos

In [None]:
# Define the expected columns based on the provided schema.
# The order matters: the CSV files are read without a header row and these
# names are assigned to their columns positionally.
expected_columns = [
    'producto',
    'presentacion',
    'marca',
    'categoria',
    'catalogo',
    'precio',
    'fecha_registro',
    'cadena_comercial',
    'giro',
    'nombre_comercial',
    'direccion',
    'estado',
    'municipio',
    'latitud',
    'longitud'
]

Directorio que contiene los archivos

In [None]:
# Root directory that is searched (including subdirectories) for the CSV files
data_dir = 'data'  # Adjust this path as needed in your local setup

Encontrar todos los archivos CSV en el directorio "data"

In [None]:
# Function to find all CSV files in directory and subdirectories
def find_csv_files(directory):
    """Return the paths of all ``.csv`` files found under *directory*.

    The tree is traversed with ``os.walk``, so files in nested
    subdirectories are included. Matching is done on the literal,
    case-sensitive ``.csv`` suffix.

    Args:
        directory: Root folder to search.

    Returns:
        A list of paths, each one built by joining the folder where the
        file was found with the file name.
    """
    return [
        os.path.join(current_dir, file_name)
        for current_dir, _subdirs, file_names in os.walk(directory)
        for file_name in file_names
        if file_name.endswith('.csv')
    ]

# Collect the paths of every CSV file under data_dir (subdirectories included)
csv_files = find_csv_files(data_dir)

In [None]:
# Matches a comma that lies outside double quotes: the negative lookahead
# rejects any comma that is followed by an odd number of '"' characters
# before the end of the string. Compiled once because it runs on every row.
_UNQUOTED_COMMA = re.compile(r',(?![^"]*"(?:(?:[^"]*"){2})*[^"]*$)')


def _normalize_text(value):
    """Return *value* without accents and in lowercase; non-strings pass through."""
    if isinstance(value, str):
        return unidecode(value).lower()
    return value


def _commas_to_pipes(value):
    """Replace commas outside double quotes with '|'; non-strings pass through."""
    if isinstance(value, str):
        return _UNQUOTED_COMMA.sub('|', value)
    return value


# Function to clean and transform data
def clean_data(data):
    """Return a cleaned copy of *data*.

    Every string value is transliterated to ASCII with ``unidecode`` and
    lowercased. In the ``direccion`` column, commas that are not inside
    double quotes are then replaced with ``|``.

    Values that are not strings (numbers, NaN, ...) are left untouched.
    Normalising element by element avoids ``Series.str.lower()``, which
    turns the non-string values of a mixed-type object column into NaN.

    The input frame is not modified; a new DataFrame is returned.

    Args:
        data: DataFrame to clean. Must contain a ``direccion`` column.

    Returns:
        A new DataFrame with the same index and columns as *data*.

    Raises:
        KeyError: If *data* has no ``direccion`` column.
    """
    # Remove accents and convert to lowercase in a single pass per column
    cleaned = data.apply(lambda column: column.map(_normalize_text))
    # Change commas to pipes in 'direccion', leaving quoted sections alone
    cleaned['direccion'] = cleaned['direccion'].map(_commas_to_pipes)
    return cleaned

# Ensure the 'data_clean' folder exists; it receives the cleaned,
# gzip-compressed copy of every CSV file
clean_folder = 'data_clean'
os.makedirs(clean_folder, exist_ok=True)

# Read each file and process
# NOTE(review): output files are named after the source file's basename only,
# so two CSV files with the same name in different subdirectories would
# overwrite each other in 'data_clean' — confirm names are unique.
for file_path in tqdm(csv_files):
    try:
        # Read the file assuming no headers and using the expected columns
        data = pd.read_csv(file_path, header=None, names=expected_columns)

        # Clean and transform data
        cleaned_data = clean_data(data)

        # Record which source file every row came from
        source_name = os.path.basename(file_path)
        cleaned_data['filename'] = source_name

        # Ensure filename is the first column
        cleaned_data = cleaned_data[['filename'] + [col for col in expected_columns if col in cleaned_data.columns]]

        # Save the cleaned data to a new gzip file in the 'data_clean' directory.
        # '.gz' is appended to the name: str.replace('.csv', '.csv.gz') would
        # rewrite every '.csv' occurrence in the name, not just the extension.
        clean_file_path = os.path.join(clean_folder, source_name + '.gz')
        cleaned_data.to_csv(clean_file_path, index=False, compression='gzip')

    except Exception as e:
        # Best effort: report the failing file and continue with the rest
        print(f"Error processing {file_path}: {str(e)}")

print("Data processing completed.")

Cargar los archivos a S3

In [None]:
# Initialize a boto3 session that uses the named AWS credentials profile
# and the us-east-1 region
session = boto3.Session(profile_name='arquitectura_AWS_ITAM_2024', region_name='us-east-1')

In [None]:
# Create an S3 client from this session; upload_files_to_s3 reads this
# module-level name
s3 = session.client('s3')

In [None]:
# Local folder whose files are uploaded, and the destination S3 bucket
folder_path = 'data_clean'
bucket_name = 'mdge-e3-2024'

In [None]:
# Function to upload files to S3
def upload_files_to_s3(folder_path, bucket_name):
    """Upload every regular file directly inside *folder_path* to an S3 bucket.

    Each file is uploaded through the module-level ``s3`` client, using its
    file name as the object key. Subdirectories are skipped. A failed upload
    is reported with ``print`` and does not stop the remaining uploads.

    Args:
        folder_path: Local folder containing the files to upload.
        bucket_name: Name of the destination S3 bucket.
    """
    for filename in tqdm(os.listdir(folder_path)):
        local_path = os.path.join(folder_path, filename)

        # Skip anything that is not a regular file (e.g. subdirectories)
        if not os.path.isfile(local_path):
            continue

        try:
            s3.upload_file(local_path, bucket_name, filename)
            print(f"Uploaded {filename} to S3 bucket {bucket_name}")
        except Exception as e:
            print(f"Failed to upload {filename}: {str(e)}")

In [None]:
# Upload every file in folder_path to the bucket named by bucket_name
upload_files_to_s3(folder_path, bucket_name)