# Pipeline de Análisis de Datos de Celulares
Este notebook demuestra el flujo completo del pipeline de análisis de datos para el proyecto de celulares.

## 1. Importación de Librerías

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import re
from pymongo import MongoClient
import requests
from bs4 import BeautifulSoup

## 2. Carga de Datos

### Funciones utilitarias

In [14]:
def get_html(url, timeout=10):
    """Download a page and return its raw body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could block indefinitely on an unresponsive host.

    Returns:
        The response body as bytes, or ``None`` when the request fails or the
        server answers with a status code other than 200.
    """
    # The request is sent with a browser-like User-Agent header.
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as error:
        # Connection errors and timeouts are reported like a bad status:
        # print a message and hand back None so the caller's check still works.
        print(f'Error accesing site: {error}')
        return None
    if response.status_code != 200:
        print(f'Error accesing site: {response.status_code}')
        return None
    return response.content

def extract_product_data(soup, product_class, name_class, price_class, offer_price_class, bold_price_class, brand_class):
    """Collect name, brand and price for every product anchor found in ``soup``.

    Args:
        soup: Parsed document; only its ``find_all`` method is used.
        product_class: ``class`` value of the ``<a>`` elements, one per product.
        name_class: ``class`` value of the ``<p>`` that holds the product name.
        price_class: ``class`` value of a price ``<span>`` (second choice).
        offer_price_class: ``class`` value of a price ``<span>`` (first choice).
        bold_price_class: ``class`` value of a price ``<span>`` (last choice).
        brand_class: ``class`` value of the ``<p>`` that holds the brand.

    Returns:
        A list of dicts with the keys ``product``, ``price`` and ``brand``.
        A missing name becomes ``"Unknown"``, a missing brand ``"Unknown Brand"``
        and a missing price ``0.0``. A product whose processing raises is
        reported with ``print`` and left out. The list is empty when no
        product anchors are found.
    """
    anchors = soup.find_all('a', class_=product_class)
    if not anchors:
        print("No products were found")
        return []

    # Price spans are probed in this order; the first one found is used.
    price_classes = (offer_price_class, price_class, bold_price_class)

    rows = []
    for anchor in anchors:
        try:
            name_tag = anchor.find('p', class_=name_class)
            if name_tag:
                name = name_tag.text.strip()
            else:
                name = "Unknown"

            brand_tag = anchor.find('p', class_=brand_class)
            if brand_tag:
                brand = brand_tag.text.strip()
            else:
                brand = "Unknown Brand"

            price_tag = None
            for css_class in price_classes:
                price_tag = anchor.find('span', class_=css_class)
                if price_tag:
                    break

            price = 0.0
            if price_tag:
                # Strip the currency sign and thousands separators before parsing.
                digits = price_tag.text.strip().replace('₡', '').replace(',', '')
                price = float(digits)

            rows.append({'product': name, 'price': price, 'brand': brand})
        except Exception as e:
            print(f"Error: {e}")
    return rows

def save_raw_data(product_data, file_name='UnimartCellphoneData.csv'):
    """Write the scraped records as CSV under ``../Data/raw``.

    Args:
        product_data: Records to store; anything ``pd.DataFrame`` accepts.
        file_name: Name of the CSV file created inside the raw-data folder.

    Returns:
        None. When ``product_data`` is empty nothing is written and a message
        is printed instead.
    """
    if product_data:
        frame = pd.DataFrame(product_data)
        # The folder is created on demand, relative to the working directory.
        target_dir = os.path.join("..", "Data", "raw")
        os.makedirs(target_dir, exist_ok=True)
        destination = os.path.join(target_dir, file_name)
        frame.to_csv(destination, index=False)
        print(f"Data stored: {destination}")
    else:
        print("Could not extract products")

### Parámetros y ejecución

In [15]:
# Page that is downloaded and scraped.
url = 'https://www.unimart.com/collections/celulares'
# `class` attribute values handed to extract_product_data to locate each piece
# of a product entry. They are matched against the downloaded markup.
# NOTE(review): presumably these stop matching if the site changes its CSS classes — confirm before re-running.
product_class = 'product-item'
name_class = 'font-normal text-body leading-5 mb-1 order-4'
price_class = 'money line-through text-unimart-gray-200 text-xs'
offer_price_class = 'money text-base font-semibold mr-2 text-accent-red'
bold_price_class = 'money text-black text-base font-semibold'
brand_class = 'font-normal text-[10px] uppercase text-unimart-gray-200 mb-1 order-3'

html_content = get_html(url)

# get_html returns None when the status code is not 200; in that case nothing
# is parsed or saved.
if html_content:
    soup = BeautifulSoup(html_content, 'html.parser')
    product_data = extract_product_data(soup, product_class, name_class, price_class, offer_price_class, bold_price_class, brand_class)
    save_raw_data(product_data, 'UnimartCellphoneData.csv')

Data stored: ..\Data\raw\UnimartCellphoneData.csv


## 3. Limpieza y Preprocesamiento de Datos

### Funciones de limpieza

In [16]:
def clean_product_name(name):
    """Reduce a scraped product title to its core name.

    Three things are removed, in order: everything from a ``+`` onward, the
    phrase ``Teléfono Celular``, and everything from the first comma onward.
    Surrounding whitespace is trimmed after each step.

    Args:
        name: Product title as a string.

    Returns:
        The shortened title (possibly an empty string).
    """
    # Anything after a '+' is discarded.
    trimmed = re.sub(r'\+.*', '', name)
    trimmed = trimmed.strip().replace('Teléfono Celular', '')
    # Anything after the first comma is discarded as well.
    trimmed = re.sub(r',.*', '', trimmed.strip())
    return trimmed.strip()

def extract_memory(name):
    """Return the first memory-size token found in a product title.

    A token is a run of digits followed by ``GB`` or ``TB``. The unit may be
    separated from the digits by whitespace and may be written in any case;
    the result is always normalised to the compact upper-case form
    (``'128 gb'`` -> ``'128GB'``).

    Args:
        name: Product title. Non-string values (for example the NaN pandas
            produces for an empty CSV cell) are accepted and treated as
            having no memory information.

    Returns:
        The token such as ``'128GB'`` or ``'1TB'``, or ``'Unknown'`` when the
        title contains none.

    NOTE(review): only the first match is used, so a title that lists RAM
    before storage yields the RAM figure — confirm this is acceptable.
    """
    if not isinstance(name, str):
        return 'Unknown'
    match = re.search(r'(\d+)\s*(GB|TB)', name, flags=re.IGNORECASE)
    if match is None:
        return 'Unknown'
    amount, unit = match.groups()
    return f'{amount}{unit.upper()}'

### Limpieza principal y ejecución

In [17]:
def clean_dataset(input_file='../Data/raw/UnimartCellphoneData.csv', output_file='../Data/clean/UnimartCellphoneData.csv'):
    """Build the clean dataset from the raw scrape.

    Rows with any missing value are dropped, a ``memory`` column is derived
    from the original product title, and the title itself is shortened.

    Args:
        input_file: Path of the raw CSV; it must contain a ``product`` column.
        output_file: Path of the CSV to write. Its parent folder is created
            when needed.
    """
    raw_df = pd.read_csv(input_file)
    # Incomplete rows are removed before the text helpers run: an empty
    # product cell is read back as NaN, which the regex-based helpers cannot
    # process.
    clean_df = raw_df.dropna().copy()
    # Memory must be extracted before the title is shortened, because the
    # shortening can cut the memory token off.
    clean_df['memory'] = clean_df['product'].apply(extract_memory)
    clean_df['product'] = clean_df['product'].apply(clean_product_name)

    # os.makedirs('') raises, so only create a folder when the path has one.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    clean_df.to_csv(output_file, index=False)
    print(f"Clean Dataset stored: {output_file}")

clean_dataset()

Clean Dataset stored: ../Data/clean/UnimartCellphoneData.csv


## 4. Análisis Exploratorio de Datos (EDA)

### Gráfico: Precio promedio por marca

In [18]:
def plot_avg_price_per_brand(input_file='../Data/clean/UnimartCellphoneData.csv', output_file='../Data Analysis/Charts/average_cost_by_brand.png'):
    """Save a bar chart of the mean price per brand, sorted ascending.

    Args:
        input_file: CSV with at least the columns ``brand`` and ``price``.
        output_file: Image path to write. Its parent folder is created when
            the path contains one.
    """
    df = pd.read_csv(input_file)
    avg_price = df.groupby('brand')['price'].mean().sort_values()

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        avg_price.plot(kind='bar', color='skyblue', edgecolor='black', ax=ax)
        ax.set_title('Average Cost per Brand')
        ax.set_xlabel('Brand')
        ax.set_ylabel('Average Cost (₡)')
        ax.tick_params(axis='x', labelrotation=45)

        # os.makedirs('') raises, so skip it for a bare file name.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        # 'tight' keeps the rotated tick labels inside the saved image.
        fig.savefig(output_file, bbox_inches='tight')
    finally:
        # Always release the figure, even when saving fails.
        plt.close(fig)
    print(f'Plot saved inside folder:  {output_file}')

### Gráfico: Top 10 celulares más caros

In [19]:
def plot_top_10_most_expensive_phones(input_file='../Data/clean/UnimartCellphoneData.csv', output_file='../Data Analysis/Charts/top_10_most_expensive_phones.png'):
    """Save a horizontal bar chart of the ten highest-priced rows.

    Args:
        input_file: CSV with at least the columns ``product`` and ``price``.
        output_file: Image path to write. Its parent folder is created when
            the path contains one.

    NOTE(review): rows sharing the same product name presumably end up on the
    same bar, so fewer than ten bars may be visible — confirm with real data.
    """
    df = pd.read_csv(input_file)
    top_10 = df.nlargest(10, 'price')[['product', 'price']]

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        ax.barh(top_10['product'], top_10['price'], color='red')
        ax.set_xlabel('Price (₡)')
        ax.set_ylabel('Product')
        ax.set_title('Top 10 Most Expensive Phones')
        # Put the most expensive phone at the top.
        ax.invert_yaxis()

        # os.makedirs('') raises, so skip it for a bare file name.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        # 'tight' keeps long product names inside the saved image.
        fig.savefig(output_file, bbox_inches='tight')
    finally:
        # Always release the figure, even when saving fails.
        plt.close(fig)
    print(f'Plot saved inside folder:  {output_file}')

### Gráfico: Distribución de precios por marca

In [20]:
def plot_price_distribution_per_brand(input_file='../Data/clean/UnimartCellphoneData.csv', output_file='../Data Analysis/Charts/price_distribution_by_brand.png'):
    """Save a bar chart with one bar per product, coloured by brand.

    Rows are sorted by brand and then by price, and each brand is drawn as its
    own bar series so that it gets a legend entry.

    Args:
        input_file: CSV with the columns ``brand``, ``product`` and ``price``.
        output_file: Image path to write. Its parent folder is created when
            the path contains one.
    """
    df = pd.read_csv(input_file)
    df = df.sort_values(by=['brand', 'price'])

    fig, ax = plt.subplots(figsize=(12, 6))
    try:
        for brand, data in df.groupby('brand'):
            ax.bar(data['product'], data['price'], label=brand)

        ax.set_xlabel('Model')
        ax.set_ylabel('Price (₡)')
        ax.set_title('Price Distribution per brand')
        ax.tick_params(axis='x', labelrotation=90, labelsize=8)
        ax.legend(title='Brand')

        # os.makedirs('') raises, so skip it for a bare file name.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        fig.savefig(output_file, bbox_inches='tight')
    finally:
        # Always release the figure, even when saving fails.
        plt.close(fig)
    print(f'Plot saved inside folder:  {output_file}')

### Gráfico: Precio promedio por cantidad de memoria

In [21]:
def plot_avg_price_per_memory(input_file='../Data/clean/UnimartCellphoneData.csv', output_file='../Data Analysis/Charts/average_price_per_memory_amount.png', min_memory_gb=64):
    """Save a bar chart of the mean price per memory size.

    The ``memory`` column holds labels such as ``'128GB'``, ``'1TB'`` or
    ``'Unknown'``. Labels are converted to whole gigabytes (1 TB is counted
    as 1024 GB); rows whose label cannot be converted are ignored.

    Args:
        input_file: CSV with at least the columns ``memory`` and ``price``.
        output_file: Image path to write. Its parent folder is created when
            the path contains one.
        min_memory_gb: Sizes below this many GB are left out of the chart.
    """
    def _memory_to_gb(label):
        # Convert '128GB' / '1TB' to an integer number of GB; None otherwise.
        match = re.fullmatch(r'\s*(\d+)\s*(GB|TB)\s*', str(label), flags=re.IGNORECASE)
        if match is None:
            return None
        amount = int(match.group(1))
        return amount * 1024 if match.group(2).upper() == 'TB' else amount

    df = pd.read_csv(input_file)
    # Work on a fresh frame instead of assigning into a filtered view.
    sized = df.assign(memory=df['memory'].map(_memory_to_gb)).dropna(subset=['memory'])
    sized = sized.astype({'memory': int})
    sized = sized[sized['memory'] >= min_memory_gb]
    if sized.empty:
        # Nothing to draw; plotting an empty series would raise.
        print('No memory data available to plot')
        return
    avg_price_memory = sized.groupby('memory')['price'].mean().sort_values()

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        avg_price_memory.plot(kind='bar', color='purple', edgecolor='black', ax=ax)
        ax.set_title('Average Cost per Memory Amount')
        ax.set_xlabel('Memory Amount (GB)')
        ax.set_ylabel('Average Cost (₡)')
        ax.tick_params(axis='x', labelrotation=45)

        # os.makedirs('') raises, so skip it for a bare file name.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        # 'tight' keeps the rotated tick labels inside the saved image.
        fig.savefig(output_file, bbox_inches='tight')
    finally:
        # Always release the figure, even when saving fails.
        plt.close(fig)
    print(f'Plot saved inside folder:  {output_file}')

### Ejecución de los gráficos

In [22]:
# Render every chart with its default input and output paths, in this order.
for render_chart in (
    plot_avg_price_per_brand,
    plot_top_10_most_expensive_phones,
    plot_price_distribution_per_brand,
    plot_avg_price_per_memory,
):
    render_chart()

Plot saved inside folder:  ../Data Analysis/Charts/average_cost_by_brand.png
Plot saved inside folder:  ../Data Analysis/Charts/top_10_most_expensive_phones.png
Plot saved inside folder:  ../Data Analysis/Charts/price_distribution_by_brand.png
Plot saved inside folder:  ../Data Analysis/Charts/average_price_per_memory_amount.png


## 5. Respaldo en base de datos NoSQL (MongoDB)

### Conexión a MongoDB

In [None]:
# Check if MongoDB is up and running.
# The connection string is read from the MONGODB_URI environment variable so
# that real credentials never have to be written into the notebook. When the
# variable is not set, the placeholder URI below is used; its <user> and
# <password> parts are not real credentials, so authentication fails until a
# real URI is supplied.

mongo_uri = os.environ.get(
    "MONGODB_URI",
    "mongodb+srv://<user>:<password>@datalab.i76dg.mongodb.net/?retryWrites=true&w=majority&appName=DataLab",
)
client = MongoClient(mongo_uri)
try:
    # Listing the databases forces a round trip to the server.
    print(client.list_database_names())
    print("Conexión exitosa.")
except Exception as e:
    print("Error de conexión:", e)

# Database handle used by the upload cells below.
db = client["DataLab"]

Error de conexión: bad auth : Authentication failed., full error: {'ok': 0, 'errmsg': 'bad auth : Authentication failed.', 'code': 8000, 'codeName': 'AtlasError'}


### Subir datos limpios a MongoDB

In [None]:
collection = db["CleanData"]

# Read from the same location clean_dataset() writes to by default
# ('../Data/clean/...'), not from a 'data/clean' folder next to the notebook.
df = pd.read_csv("../Data/clean/UnimartCellphoneData.csv")
data_dict = df.to_dict(orient="records")
if data_dict:
    collection.insert_many(data_dict)
    print("Clean Data has been uploaded to MongoDB succesfully.")
else:
    # insert_many rejects an empty list, so an empty dataset is reported instead.
    print("No clean data to upload.")

### Subir imágenes de visualizaciones a MongoDB

In [None]:
collection = db["Visualizations"]
# Same folder the plotting functions save to by default
# ('../Data Analysis/Charts'), relative to the notebook's working directory.
image_folder = "../Data Analysis/Charts"

for image_name in sorted(os.listdir(image_folder)):
    image_path = os.path.join(image_folder, image_name)
    # os.listdir also returns sub-directories, which cannot be opened as files.
    if not os.path.isfile(image_path):
        continue
    with open(image_path, "rb") as image_file:
        image_data = image_file.read()
    # Each chart is stored as one document: file name plus raw image bytes.
    collection.insert_one({"filename": image_name, "image": image_data})
print("Charts uploaded to MongoDB succesfully.")