In [75]:
import requests
import re
from bs4 import BeautifulSoup
import time
import pandas as pd
import json
from urllib.parse import unquote
import os

In [46]:
def scrap_trocvelo(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)"
    }

    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Erreur {response.status_code} - L'URL semble invalide")

    soup = BeautifulSoup(response.content, 'html.parser')

    # Titre
    title_tag = soup.find("h1")
    title = title_tag.text.strip() if title_tag else None

    # Description
    desc_container = soup.find_all("div", class_="Product_text__sDr5U")
    desc_container = next((div for div in soup.find_all("div", class_="Product_text__sDr5U") if div.find("p")),None)
    description = desc_container.get_text(strip=True) if desc_container else None
    description = description.replace('\xa0', ' ')

    # Prix
    price_tag = soup.find("span", class_="Price_priceText__y6tqK")
    print(price_tag)
    price = price_tag.text.strip() if price_tag else None

    # Images
    image_urls = []
    image_tags = soup.find_all("img")
    for img in image_tags:
        src = img.get("src", "")
        if "media-cdn.troc-velo.com" in src and src not in image_urls:
            image_urls.append(src)

    # Autres features
    other_features = soup.find_all("div", class_="Info_info__mklBc")

    keys = []
    values = []

    for feature in other_features:
        label = feature.find("span", class_="Info_label__WumoC")
        value = feature.find("span", class_="Info_value__SiW4P")

        if label and value:
            keys.append(label.get_text(strip=True))
            values.append(value.get_text(strip=True))
    other_features=dict(zip(keys, values))

    return {
        "title": title,
        "description": description,
        "price": price,
        "image_urls": image_urls,
        "other_features": other_features
    }

In [47]:
base_url = "https://www.troc-velo.com/fr-fr/categorie/velos-complets/route"
urls = get_trocvelo_urls_js(base_url, max_pages=100)

full_scrap = []
for url in urls:
    try:
        print(f"Scraping {url}")
        data = scrap_trocvelo(url)
        full_scrap.append(data)
        time.sleep(1)
    except Exception as e:
        print(f"❌ Erreur sur {url}: {e}")

🔎 Page 1: https://www.troc-velo.com/fr-fr/categorie/velos-complets/route?p=1
✅ 50 annonces trouvées
🔎 Page 2: https://www.troc-velo.com/fr-fr/categorie/velos-complets/route?p=2
✅ 50 annonces trouvées
🔎 Page 3: https://www.troc-velo.com/fr-fr/categorie/velos-complets/route?p=3
✅ 50 annonces trouvées
🔎 Page 4: https://www.troc-velo.com/fr-fr/categorie/velos-complets/route?p=4
✅ 50 annonces trouvées
🔎 Page 5: https://www.troc-velo.com/fr-fr/categorie/velos-complets/route?p=5
✅ 50 annonces trouvées
🔎 Page 6: https://www.troc-velo.com/fr-fr/categorie/velos-complets/route?p=6
✅ 50 annonces trouvées
🔎 Page 7: https://www.troc-velo.com/fr-fr/categorie/velos-complets/route?p=7
✅ 50 annonces trouvées
🔎 Page 8: https://www.troc-velo.com/fr-fr/categorie/velos-complets/route?p=8
✅ 18 annonces trouvées
🔎 Page 9: https://www.troc-velo.com/fr-fr/categorie/velos-complets/route?p=9
✅ 48 annonces trouvées
🔎 Page 10: https://www.troc-velo.com/fr-fr/categorie/velos-complets/route?p=10
✅ 50 annonces trouvée

In [48]:
len(full_scrap)

2334

In [52]:
pd.DataFrame(full_scrap).to_csv("trocvelo.csv", index=False)

In [53]:
pd.DataFrame(urls).to_csv("trocvelo_urls.csv", index=False)

In [54]:
df_full_scrap = pd.DataFrame(full_scrap)

In [56]:
df_full_scrap['image_urls'][100]

['/_next/image?url=https%3A%2F%2Fmedia-cdn.troc-velo.com%2Ffull%2Fmain%2FbikeAds%2F308%2F85308%2F3985308%2F3985308-peugeot-aubisque-t53-1743585654.jpg&w=1920&q=75',
 '/_next/image?url=https%3A%2F%2Fmedia-cdn.troc-velo.com%2Ffull%2Fmain%2FbikeAds%2F308%2F85308%2F3985308%2F3985308-peugeot-aubisque-t53-1743585655.jpg&w=1920&q=75',
 '/_next/image?url=https%3A%2F%2Fmedia-cdn.troc-velo.com%2Ffull%2Fmain%2FbikeAds%2F308%2F85308%2F3985308%2F3985308-peugeot-aubisque-t53-1743585657.jpg&w=1920&q=75']

In [95]:
rows = []
img_count = 0
img_folder = '/Users/vloi/code/vloi/helperz/data_viz/data/4/img_tvlo'
for i, data, in enumerate(full_scrap):
    row = {"id": f"{i:05d}",
        "title": data.get("title"),
        "description": data.get("description"),
        "price": data.get("price"),
        "other_features": data.get("other_features")
    }
    for j,url in enumerate(data["image_urls"]):
        url = url.replace('/_next/image?url=','')
        if '.jpg' in url:
            url = url.split('.jpg')[0] + '.jpg'
        url = unquote(url)
        filename = f'{img_folder}/tvlo_{i:05d}_{j}.jpg'
        try:
            r = requests.get(url, stream=True)
            if r.status_code == 200:
                with open(filename, 'wb') as f:
                    f.write(r.content)
                    f.flush()
            row[f"image_{j+1}"] = filename
            img_count +=1
        except Exception as e:
            print(f"Erreur lors du téléchargement de l'image {filename}: {e}")
            row[f"image_{j+1}"] = None
        print(f'🌆 got {img_count} images')
    rows.append(row)


🌆 got 1 images
🌆 got 2 images
🌆 got 3 images
🌆 got 4 images
🌆 got 5 images
🌆 got 6 images
🌆 got 7 images
🌆 got 8 images
🌆 got 9 images
🌆 got 10 images
🌆 got 11 images
🌆 got 12 images
🌆 got 13 images
🌆 got 14 images
🌆 got 15 images
🌆 got 16 images
🌆 got 17 images
🌆 got 18 images
🌆 got 19 images
🌆 got 20 images
🌆 got 21 images
🌆 got 22 images
🌆 got 23 images
🌆 got 24 images
🌆 got 25 images
🌆 got 26 images
🌆 got 27 images
🌆 got 28 images
🌆 got 29 images
🌆 got 30 images
🌆 got 31 images
🌆 got 32 images
🌆 got 33 images
🌆 got 34 images
🌆 got 35 images
🌆 got 36 images
🌆 got 37 images
🌆 got 38 images
🌆 got 39 images
🌆 got 40 images
🌆 got 41 images
🌆 got 42 images
🌆 got 43 images
🌆 got 44 images
🌆 got 45 images
🌆 got 46 images
🌆 got 47 images
🌆 got 48 images
🌆 got 49 images
🌆 got 50 images
🌆 got 51 images
🌆 got 52 images
🌆 got 53 images
🌆 got 54 images
🌆 got 55 images
🌆 got 56 images
🌆 got 57 images
🌆 got 58 images
🌆 got 59 images
🌆 got 60 images
🌆 got 61 images
🌆 got 62 images
🌆 got 63 images
🌆

In [126]:
data['image_paths'][1]

KeyError: 'image_paths'

In [141]:
trocvelo_df = pd.read_csv("trocvelo.csv")
trocvelo_full_scrap_df = pd.DataFrame(rows)

rows.to_csv("trocvelo_full_scrap.csv", index=False)

AttributeError: 'list' object has no attribute 'to_csv'

In [151]:
df = pd.DataFrame(rows)

In [155]:
df['id'] = 'i_'+df['id'].astype(str)

In [158]:
df.to_csv("trocvelo_full_scrap.csv", index=False)

In [161]:
pd.read_csv("trocvelo_full_scrap.csv").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2334 entries, 0 to 2333
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              2334 non-null   object
 1   title           2334 non-null   object
 2   description     2330 non-null   object
 3   price           2334 non-null   object
 4   other_features  2334 non-null   object
 5   image_1         2278 non-null   object
 6   image_2         1887 non-null   object
 7   image_3         1689 non-null   object
 8   image_4         1285 non-null   object
 9   image_5         997 non-null    object
 10  image_6         705 non-null    object
 11  image_7         531 non-null    object
 12  image_8         405 non-null    object
 13  image_9         303 non-null    object
 14  image_10        157 non-null    object
 15  image_11        2 non-null      object
dtypes: object(16)
memory usage: 291.9+ KB


In [None]:
df[['']]

'/Users/vloi/code/vloi/helperz/data_viz/data/4/img_tvlo/tvlo_00001_0.jpg'

In [196]:
df_features = pd.json_normalize(df["other_features"])

In [200]:
df_clean = pd.concat([df, df_features], axis=1)

In [205]:
df_clean.to_csv('full_scrap_clean.csv', index=False)

In [206]:
df_clean.columns

Index(['id', 'title', 'description', 'price', 'other_features', 'image_1',
       'image_2', 'image_3', 'image_4', 'image_5', 'image_6', 'image_7',
       'image_8', 'image_9', 'image_10', 'image_11', 'Année', 'État', 'Groupe',
       'Taille du cadre', 'Diametre roues', 'Matière principale',
       'Couleurs principales', 'Type de freinage', 'Pratique',
       'Nombres de vitesses', 'Groupe électrique (AXS, Di2, eTap, ..)',
       'Poids', 'Type d'étrier de frein', 'Type de fixation du disque',
       'Type de pneumatique', 'Type de (corps) roue libre', 'Type(s) d'axe(s)',
       'Hauteur jante (en mm)', 'Largeur jante (en mm)', 'Largeur du cintre'],
      dtype='object')

In [232]:
df_features_only = df_clean.loc[:, ~df_clean.columns.str.contains('image')]
df_images_only = df_clean.loc[:, df_clean.columns.str.contains('image') | (df_clean.columns == 'id')]

In [233]:
df_features_only.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2334 entries, 0 to 2333
Data columns (total 25 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   id                                      2334 non-null   object
 1   title                                   2334 non-null   object
 2   description                             2334 non-null   object
 3   price                                   2334 non-null   object
 4   other_features                          2334 non-null   object
 5   Année                                   2032 non-null   object
 6   État                                    2334 non-null   object
 7   Groupe                                  2178 non-null   object
 8   Taille du cadre                         2246 non-null   object
 9   Diametre roues                          1960 non-null   object
 10  Matière principale                      2233 non-null   object
 11  Coul

In [234]:
df_features_only = df_features_only.loc[:,df_clean.isnull().mean()<0.8]
df_features_only.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2334 entries, 0 to 2333
Data columns (total 16 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   id                                      2334 non-null   object
 1   title                                   2334 non-null   object
 2   description                             2334 non-null   object
 3   price                                   2334 non-null   object
 4   other_features                          2334 non-null   object
 5   Année                                   2032 non-null   object
 6   État                                    2334 non-null   object
 7   Groupe                                  2178 non-null   object
 8   Taille du cadre                         2246 non-null   object
 9   Diametre roues                          1960 non-null   object
 10  Matière principale                      2233 non-null   object
 11  Coul

In [235]:
df_features_only.to_csv('full_scrap_clean_features_only.csv', index=False)
df_images_only.to_csv('full_scrap_clean_images_only.csv', index=False)

In [241]:
df_images_only.info()
df_features_only.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2334 entries, 0 to 2333
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        2334 non-null   object
 1   image_1   2278 non-null   object
 2   image_2   1887 non-null   object
 3   image_3   1689 non-null   object
 4   image_4   1285 non-null   object
 5   image_5   997 non-null    object
 6   image_6   705 non-null    object
 7   image_7   531 non-null    object
 8   image_8   405 non-null    object
 9   image_9   303 non-null    object
 10  image_10  157 non-null    object
 11  image_11  2 non-null      object
dtypes: object(12)
memory usage: 218.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2334 entries, 0 to 2333
Data columns (total 16 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   id                                      2334 non-null   object 
 1  

In [240]:
df_features_only['price'] = df_features_only['price'].str.replace('€', '').str.replace(' ', '').astype(float)