# Import Required Libraries
Import libraries needed for web scraping, data processing, and database operations, including requests_html, BeautifulSoup, pymongo, json, re, and csv.

In [1]:
# Import Required Libraries

from requests_html import HTMLSession
from bs4 import BeautifulSoup
from pymongo import MongoClient
import json
import re
import csv
import requests

# Extract Laptop Links
Use requests_html HTMLSession to navigate through pagination and collect all laptop URLs from appinformatica.com/ordenadores/ordenadores-portatiles, similar to the PCBox implementation.

In [2]:
# Extract Laptop Links

session = HTMLSession()
laptop_links = []

for i in range(1, 21):
    print(f'Pagina {i}')
    r = session.get(url=f"https://www.appinformatica.com/ordenadores/ordenadores-portatiles?page={i}")
    
    # Get all links with the specified class
    links = r.html.find('a.vtex-product-summary-2-x-clearLink.vtex-product-summary-2-x-clearLink--main-product-summary.h-100.flex.flex-column')
    # Extract absolute URLs and remove duplicates
    links = list(set([link.absolute_links.pop() for link in links]))
    laptop_links.extend(links)

laptop_links

Pagina 1
Pagina 2
Pagina 3
Pagina 4
Pagina 5
Pagina 6
Pagina 7
Pagina 8
Pagina 9
Pagina 10
Pagina 11
Pagina 12
Pagina 13
Pagina 14
Pagina 15
Pagina 16
Pagina 17
Pagina 18
Pagina 19
Pagina 20


['https://www.appinformatica.com/e1504fa-nj158w-portatil-asus-e1504fa-nj158w-amd-r5-7520u-4-3ghz--8gb--512gb-ssd/p',
 'https://www.appinformatica.com/578b1ea-abe-15s-fq4038ns-i5-1155g7-512gb-8gb-15-6in-w-11/p',
 'https://www.appinformatica.com/9rc55kf5feja01es000-portatil-gigabyte-g5-kf5-53es354sd-i5-13500h-4060-16gb-ddr5/p',
 'https://www.appinformatica.com/82r4001csp-portatil-lenovo-ideapad-1-r5-8gb-512ssd-15-6--w11h/p',
 'https://www.appinformatica.com/9rx6l9sgdsjhhdes000-gigabyte-portatil-aorus-16x-9sg-43esc64sh-i7-13650hx-32gb-d/p',
 'https://www.appinformatica.com/nh-qpneb-009-portatil-gaming-acer-predator-helios-neo-16--nh-qpneb-009---core-u/p',
 'https://www.appinformatica.com/9j1e1ea-abe-t-15-fd0031ns-ci3-n305-8gb-256gb-w11h/p',
 'https://www.appinformatica.com/9rc6l9kghsja01es000-portatil-gigabyte-g6x-9kg-43es854sd-i7-136250hx-4060-16gb-d/p',
 'https://www.appinformatica.com/83em00d6sp-ips3-15irh8-15-6fhd-tn-ag-250n-n-core-i7-13620h-2-4g-10c-16t--1tb-ss/p',
 'https://www.appi

# Scrape Laptop Specifications
Implement a function to extract detailed specifications from each laptop page, parsing HTML tables and collecting data like processor, memory, storage, graphics, display, operating system, and price.

In [4]:
def scrape_laptop_specs(url):
    try:
        # Realiza la petición HTTP
        response = requests.get(url)
        response.raise_for_status()  # Raises HTTPError for bad responses (4xx, 5xx)

    except requests.RequestException as e:
        print(f"Error al realizar la petición HTTP: {e}")
        return None

    try:
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Características del portátil
        sections = {}

        # URL del producto
        sections["URL"] = url
        
        # Título del portátil
        title = soup.find("h1", class_="vtex-store-components-3-x-productNameContainer vtex-store-components-3-x-productNameContainer--quickview mv0 t-heading-4")
        if title:
            sections["Titulo"] = title.text
        
        # Marca
        marca = soup.find("span", class_="ticnova-commons-components-0-x-ProductInfo c-action-primary ml2")
        if marca:
            sections['Marca'] = marca.text

        # Se buscan todas las tablas con la clase utilizada en las especificaciones
        tables = soup.find_all("table", class_="vtex-table-description-extended_carac")
        
        # Iteramos sobre cada tabla
        for table in tables:
            current_section = None
            # Iteramos sobre las filas que contienen la clase indicativa
            for row in table.find_all("tr", class_="vtex-table-description-row"):
                try:
                    # Si la fila tiene un <td> con colspan="2" y un div de título, es un encabezado de sección
                    header_td = row.find("td", colspan="2")
                    if header_td:
                        title_div = header_td.find("div", class_="vtex-table-description-title")
                        if title_div:
                            current_section = title_div.get_text(strip=True)
                            sections.setdefault(current_section, {})
                            continue

                    # Si la fila contiene clave y valor, los extraemos
                    key_td = row.find("td", class_="vtex-table-description-key")
                    value_td = row.find("td", class_="vtex-table-description-value")
                    if key_td and value_td and current_section:
                        key = key_td.get_text(strip=True)
                        value = value_td.get_text(strip=True)
                        sections[current_section][key] = value
                except Exception as e:
                    print(f"Error al procesar una fila de la tabla: {e}")
                    continue

        # Mapeamos las secciones extraídas a los campos requeridos:
        mapping = {
            "Procesador": "Procesador",
            "Memoria": "RAM",
            "Medios de almacenaje": "Almacenamiento",
            "Gráficos": "Graficos",
            "Exhibición": "Pantalla",
            "Software": "Sistema Operativo",
            "Batería": "Bateria"
        }
        
        try:
            price = soup.find('div', class_='ticnova-commons-components-0-x-price')
            if price:
                sections['Precio'] = price.text.replace('\xa0', '').replace('€', '').strip()
        except Exception as e:
            print(f"Error al extraer el precio: {e}")

        result = {}
        for seccion_original, campo in mapping.items():
            result[campo] = sections.get(seccion_original, {})

        return sections

    except Exception as e:
        print(f"Error general al procesar la página: {e}")
        return None

# Example usage
laptop_specs = scrape_laptop_specs(laptop_links[0])
laptop_specs

{'URL': 'https://www.appinformatica.com/e1504fa-nj158w-portatil-asus-e1504fa-nj158w-amd-r5-7520u-4-3ghz--8gb--512gb-ssd/p',
 'Titulo': 'PORTATIL ASUS E1504FA-NJ158W AMD R5-7520U 4.3GHZ/ 8GB/ 512GB SSD/ 15.6" FHD/ W11 ',
 'Marca': 'Asus',
 'Procesador': {'Fabricante de procesador': 'AMD',
  'Familia de procesador': 'AMD Ryzen™ 5',
  'Modelo del procesador': '7520U',
  'Frecuencia del procesador': '2,8 GHz',
  'Frecuencia del procesador turbo': '4,3 GHz',
  'Caché del procesador': '6 MB',
  'Tipo de cache en procesador': 'L2 & L3',
  'Número de núcleos de procesador': '4'},
 'Memoria': {'Memoria interna': '8 GB',
  'Tipo de memoria interna': 'LPDDR5-SDRAM',
  'Forma de factor de memoria': 'Incorporado',
  'Memoria interna máxima': '8 GB'},
 'Medios de almacenaje': {'Capacidad total de almacenaje': '512 GB',
  'Unidad de almacenamiento': 'SSD',
  'Capacidad total de SSD': '512 GB',
  'Número de unidades SSD instalados': '1',
  'SDD, capacidad': '512 GB',
  'Interfaces del SDD': 'PCI Expre

# Insert Data into MongoDB
Connect to MongoDB and insert the scraped laptop specifications, creating a collection for appinformatica laptops similar to the pcbox database structure.

In [5]:
# Insert Data into MongoDB

# Connect to MongoDB
client = MongoClient('mongodb://localhost:27017/')
db = client['appinformatica']
collection = db['portatiles']

# Parse each specs_json into a dict and insert into MongoDB
all_laptops_specs = []

for link in laptop_links:
    if any(json.loads(specs_str).get("URL") == link for specs_str in all_laptops_specs):
        print(f"Skipping {link} - already processed")
        continue
    try:
        specs = scrape_laptop_specs(link)
        if specs is not None:
            try:
                specs_json = json.dumps(specs, indent=4, ensure_ascii=False)
                all_laptops_specs.append(specs_json)
            except json.JSONEncodeError as e:
                print(f"Error encoding JSON for {link}: {e}")
    except Exception as e:
        print(f"Error processing link {link}: {e}")
        continue

# Insert documents into MongoDB
for laptop_specs_json in all_laptops_specs:
    # Parse JSON string into dict if needed
    if isinstance(laptop_specs_json, str):
        laptop_specs_dict = json.loads(laptop_specs_json)
    else:
        laptop_specs_dict = laptop_specs_json
    
    # Insert document
    collection.insert_one(laptop_specs_dict)

# Simplify Specifications
Create helper functions to simplify and standardize the raw specifications data, extracting relevant information like processor model, RAM capacity, storage size, and screen dimensions.

In [7]:
# Simplify Specifications

def simplify_procesador(proc_dict):
    fabricante = proc_dict.get("Fabricante de procesador", "").strip()
    familia = proc_dict.get("Familia de procesador", "").strip()
    modelo = proc_dict.get("Modelo del procesador", "").strip()
    # Eliminar símbolos de marca (® y ™)
    familia_clean = re.sub(r"[®™]", "", familia).strip()
    # Si el fabricante no está incluido, lo anteponemos
    if fabricante and fabricante not in familia_clean:
        familia_clean = f"{fabricante} {familia_clean}".strip()
    # Si el modelo empieza con el mismo token final de la familia, lo eliminamos para evitar duplicados
    last_token = familia_clean.split()[-1]
    if modelo.startswith(last_token):
        new_model = modelo[len(last_token):]
        # Si new_model comienza con guión, lo concatenamos directamente
        if new_model.startswith("-"):
            final = f"{familia_clean}{new_model}"
        else:
            final = f"{familia_clean}-{modelo}"
    else:
        final = f"{familia_clean}-{modelo}"
    return final

def simplify_ram(ram_dict):
    # Extrae el número de GB desde "Memoria interna" (por ejemplo, "8 GB")
    ram_value = ram_dict.get("Memoria interna", "")
    match = re.search(r"(\d+)", ram_value)
    return match.group(1) if match else ""

def simplify_tipo_ram(ram_dict):
    tipo_ram = ram_dict.get("Tipo de memoria interna", "")
    # Extrae la parte DDR (por ejemplo, "DDR4" de "DDR4-SDRAM")
    match = re.search(r"(DDR\d+)", tipo_ram, re.IGNORECASE)
    return match.group(1).upper() if match else ""

def simplify_almacenamiento(alm_dict):
    # Se prefiere "Capacidad total de SSD", pero si no existe se usa "SDD, capacidad"
    valor = alm_dict.get("Capacidad total de SSD", "") or alm_dict.get("SDD, capacidad", "")
    match = re.search(r"(\d+)", valor)
    return match.group(1) if match else ""

def simplify_graficos(graf_dict):
    return graf_dict.get("Modelo de adaptador gráfico incorporado", "").strip()

def simplify_pantalla(pant_dict):
    diag = pant_dict.get("Diagonal de la pantalla", "")
    # Se extrae la medida en pulgadas que aparece entre paréntesis, por ejemplo: (15.6")
    match = re.search(r"\(([\d\.]+)\"", diag)
    return match.group(1) if match else ""

def simplify_resolucion(pant_dict):
    resol = pant_dict.get("Resolución de la pantalla", "")
    # Se buscan los dos números (ancho y alto)
    numbers = re.findall(r"(\d+)", resol)
    if len(numbers) >= 2:
        return f"{numbers[0]}x{numbers[1]}"
    return ""

def simplify_sistema(soft_dict):
    return soft_dict.get("Sistema operativo instalado", "").strip()

def simplify_bateria(bat_dict):
    bat = bat_dict.get("Capacidad de batería", "")
    match = re.search(r"(\d+)", bat)
    return match.group(1) if match else ""

def simplify_specs(specs):
    simplified = {}
    simplified["Procesador"] = simplify_procesador(specs.get("Procesador", {}))
    simplified["RAM"] = simplify_ram(specs.get("Memoria", {}))
    simplified["Tipo RAM"] = simplify_tipo_ram(specs.get("Memoria", {}))
    simplified["Almacenamiento"] = simplify_almacenamiento(specs.get("Medios de almacenaje", {}))
    simplified["Graficos"] = simplify_graficos(specs.get("Gráficos", {}))
    simplified["Pantalla"] = simplify_pantalla(specs.get("Exhibición", {}))
    simplified["Resolucion"] = simplify_resolucion(specs.get("Exhibición", {}))
    simplified["Sistema Operativo"] = simplify_sistema(specs.get("Software", {}))
    simplified["Bateria"] = simplify_bateria(specs.get("Batería", {}))
    simplified["Precio"] = specs.get("Precio", "")
    return simplified

# Export to CSV
Generate a CSV file with the simplified laptop specifications, ensuring consistent column naming and data formatting to match the PCBox dataset structure.

In [9]:
import os

# Export to CSV

# Use the dataset_portatils list that already contains simplified specifications
# No need to process all_laptops_specs again

# Define the columns for the CSV in the desired order
fieldnames = [
    "Procesador",
    "RAM",
    "Tipo RAM",
    "Almacenamiento",
    "Graficos",
    "Pantalla",
    "Resolucion",
    "Sistema Operativo",
    "Bateria",
    "Precio"
]

# Check if file exists and load existing data to avoid duplicates
existing_data = []
try:
    if os.path.exists("specs_simplified.csv"):
        with open("specs_simplified.csv", "r", encoding="utf-8") as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                existing_data.append(row)
        
        print(f"Loaded {len(existing_data)} existing entries from CSV")
    else:
        print("No existing CSV file found, will create new file")
except Exception as e:
    print(f"Error loading existing data: {e}")
    existing_data = []

# Identify unique entries by comparing processors and other key specs
unique_specs = {}
try:
    # Process existing entries first
    for laptop in existing_data:
        # Create a unique key based on multiple fields to identify duplicates
        key = f"{laptop.get('Procesador', '')}-{laptop.get('RAM', '')}-{laptop.get('Almacenamiento', '')}-{laptop.get('Pantalla', '')}"
        unique_specs[key] = laptop
    
    # Add new entries from dataset_portatils
    for laptop in dataset_portatils:
        # Create a unique key based on multiple fields to identify duplicates
        key = f"{laptop.get('Procesador', '')}-{laptop.get('RAM', '')}-{laptop.get('Almacenamiento', '')}-{laptop.get('Pantalla', '')}"
        unique_specs[key] = laptop

    print(f"Total unique laptops: {len(unique_specs)}")
    print(f"New entries added: {len(unique_specs) - len(existing_data)}")
except Exception as e:
    print(f"Error processing laptop data: {e}")

# Write all unique specs to CSV
try:
    with open("specs_simplified.csv", "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for laptop in unique_specs.values():
            writer.writerow(laptop)
    print(f"Successfully exported {len(unique_specs)} laptop specifications to specs_simplified.csv")
except Exception as e:
    print(f"Error writing CSV file: {e}")

Loaded 247 existing entries from CSV
Total unique laptops: 183
New entries added: -64
Successfully exported 183 laptop specifications to specs_simplified.csv
