# Web scraping
- Extrae de la web de Clicars (clicars.com) toda la información disponible sobre los coches

In [10]:
import os
import csv
import requests
from bs4 import BeautifulSoup

# Listing URL to scrape; the page number is appended as an extra query parameter.
base_url = "https://www.clicars.com/coches-segunda-mano-ocasion?yearMin=2010"

# Upper bound on the number of result pages requested (adjust as needed).
max_pages = 500

# CSV file that holds the previous snapshot and receives the updated one.
csv_file = 'car_data_with_urls.csv'

# Rows are kept as tuples in a set so that identical rows collapse automatically.
unique_rows = set()

# Seed the set with the rows saved by a previous run, if the file exists.
if os.path.exists(csv_file):
    # newline='' leaves line-ending handling to the csv module, so quoted
    # fields that contain line breaks are read back unchanged.
    with open(csv_file, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header row
        for row in reader:
            # Blank lines are returned as empty lists; they carry no data and
            # would otherwise end up in the set as an empty tuple.
            if row:
                unique_rows.add(tuple(row))

# Scrape the result pages and merge the listings found into ``unique_rows``.
request_timeout = 30  # Seconds; without a timeout a stalled connection would block forever

# Freshly scraped rows keyed by product URL. They are merged after the loop so
# that a re-scraped listing replaces its older row instead of coexisting with it.
scraped_by_url = {}

for page in range(1, max_pages + 1):
    print(f"Scraping page {page}...")
    try:
        response = requests.get(f"{base_url}&page={page}", timeout=request_timeout)
    except requests.RequestException as exc:
        # Stop on network errors but keep everything collected so far.
        print(f"Error al cargar la página {page}: {exc}")
        break
    if response.status_code != 200:
        print(f"Error al cargar la página {page}: {response.status_code}")
        break

    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): each field is collected as an independent list and the
    # lists are paired by position below. If a listing lacks one of these
    # elements, later values shift onto the wrong car; parsing per listing
    # container would be more robust but needs knowledge of the page markup.
    makers = soup.find_all('h2', class_='maker ellipsis')
    versions = soup.find_all('span', class_='version ellipsis')
    prices = soup.find_all('span', class_='priceweb')
    infos = soup.find_all('span', class_='info ellipsis')
    fuels = soup.find_all('span', class_='fuelName')

    # Product URLs: relative links are prefixed with the site origin. A link
    # without href keeps its slot as "N/A" instead of raising KeyError.
    product_urls = []
    for link in soup.find_all('a', class_='analytics-list-click-car'):
        href = link.get('href')
        if not href:
            product_urls.append("N/A")
        elif href.startswith("http"):
            product_urls.append(href)
        else:
            product_urls.append("https://www.clicars.com" + href)

    # Image URLs: an <img> without src keeps its slot as "N/A" so the
    # following images stay aligned with their cars.
    image_urls = [
        img.get('src') or "N/A"
        for img in soup.find_all('img', class_='vehicle-img')
    ]

    # Number of rows to build for this page: the longest of the field lists.
    max_length = max(
        len(makers), len(versions), len(prices), len(infos),
        len(fuels), len(product_urls), len(image_urls),
    )

    if max_length == 0:
        # Nothing matched on this page, so further pages are not requested.
        print(f"No se encontraron coches en la página {page}; fin del scraping.")
        break

    for i in range(max_length):
        strong = makers[i].find('strong') if i < len(makers) else None
        maker_name = strong.text.strip() if strong is not None else "N/A"
        version = versions[i].text.strip() if i < len(versions) else "N/A"
        price = prices[i].text.strip() if i < len(prices) else "N/A"
        info = infos[i].text.strip() if i < len(infos) else "N/A"
        fuel = fuels[i].text.strip() if i < len(fuels) else "N/A"
        product_url = product_urls[i] if i < len(product_urls) else "N/A"
        image_url = image_urls[i] if i < len(image_urls) else "N/A"

        # Rows are tuples so they can live in a set / be compared for equality.
        row = (maker_name, version, price, info, fuel, product_url, image_url)

        if product_url == "N/A":
            unique_rows.add(row)
        else:
            # The most recent sighting of a URL wins.
            scraped_by_url[product_url] = row

# Drop previously known rows whose product URL (column 5) was scraped again,
# then add the fresh versions.
if scraped_by_url:
    stale_rows = {
        old for old in unique_rows
        if len(old) > 5 and old[5] in scraped_by_url
    }
    unique_rows.difference_update(stale_rows)
    unique_rows.update(scraped_by_url.values())

# Keep a single row per product URL ("URL Producto", column index 5).
unique_by_url = {}
# Iterate in sorted order so that, when several rows share a URL, the row that
# survives is the same on every run (set iteration order can differ between runs).
for row in sorted(unique_rows):
    # Rows too short to contain a product URL cannot be keyed; skip them
    # instead of failing with IndexError.
    if len(row) <= 5:
        continue
    product_url = row[5]
    # Rows without a real product URL are discarded.
    if product_url != "N/A":
        unique_by_url[product_url] = row

# Back to a set of rows, now unique by product URL.
unique_rows = set(unique_by_url.values())

# Overwrite the CSV with the header followed by the rows in sorted order.
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Maker", "Version", "Price", "Info", "Fuel", "URL Producto", "URL Imagen"])  # Header row
    writer.writerows(sorted(unique_rows))

print(f"Datos actualizados y guardados en '{csv_file}'")


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Scraping 