In [12]:
from datasets import load_dataset
import pandas as pd
import os
import requests
from tqdm import tqdm


In [3]:
# Pull the Chanel (Germany) product-price dataset from the Hugging Face Hub.
# The bare name on the last line makes the notebook render the DatasetDict summary.
dataset = load_dataset(path="DBQ/Chanel.Product.prices.Germany")
dataset

Generating train split: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1428/1428 [00:00<00:00, 50764.64 examples/s]


DatasetDict({
    train: Dataset({
        features: ['website_name', 'competence_date', 'country_code', 'currency_code', 'brand', 'category1_code', 'category2_code', 'category3_code', 'product_code', 'title', 'itemurl', 'imageurl', 'full_price', 'price', 'full_price_eur', 'price_eur', 'flg_discount'],
        num_rows: 1428
    })
})

In [16]:
# Download every product image referenced in the dataset, keep only the rows
# whose image is available locally, and export the result to data_clean.csv.
#
# NOTE: this cell needs network access and is slow (about 6 minutes for the
# 1428 URLs on the recorded run). Images already present in `output_dir` are
# reused, so re-running the cell only retries the missing ones.

# Build the working frame here so the cell also runs top-to-bottom on a fresh
# kernel (the exploration section further down re-creates the same `df`).
df = dataset["train"].to_pandas()

url_column = "imageurl"      # change the column name here if needed

# 2. Create the folder that will hold the images
output_dir = "Images"
os.makedirs(output_dir, exist_ok=True)

valid_rows = []        # index labels of the rows whose image is available locally
image_paths = []       # local image path of each valid row (same order as valid_rows)
failed_downloads = []  # (index label, url, error) for every download that failed

# 3. Download each image
# A single session reuses the underlying connections across requests.
with requests.Session() as session:
    for idx, url in tqdm(df[url_column].items(), total=len(df), desc="T√©l√©chargement images"):

        if pd.isna(url):
            continue

        # NOTE(review): the payload is stored with a .jpg extension without
        # checking its actual format - confirm the host always serves JPEG.
        filename = os.path.join(output_dir, f"{idx:05d}.jpg")

        # Cache hit: a non-empty file from a previous run is reused as-is.
        if os.path.isfile(filename) and os.path.getsize(filename) > 0:
            valid_rows.append(idx)
            image_paths.append(filename)
            continue

        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()

            # Write to a temporary name first so that an interrupted write never
            # leaves a truncated file that the cache check would later accept.
            partial_path = filename + ".part"
            with open(partial_path, "wb") as f:
                f.write(response.content)
            os.replace(partial_path, filename)

        except Exception as exc:
            # Best effort: a failing image must not stop the loop, but the failure
            # is recorded for inspection. Catching Exception (rather than a bare
            # except) lets KeyboardInterrupt / SystemExit stop the cell.
            failed_downloads.append((idx, url, repr(exc)))
            continue

        # Valid row: remember both the row label and the image path
        valid_rows.append(idx)
        image_paths.append(filename)

# 4. Cleaning: keep only the rows that have an image
df_clean = df.loc[valid_rows].reset_index(drop=True)

# Add the image_path column (aligned with the freshly reset index)
df_clean["image_path"] = image_paths

# 5. Save the cleaned dataset
df_clean.to_csv("data_clean.csv", index=False)

print("‚úî T√©l√©chargement termin√©")
print(f"‚úî Lignes valides conserv√©es : {len(df_clean)}")
print(f"Images en erreur : {len(failed_downloads)}")
print("‚úî Chemins d'images ajout√©s dans la colonne 'image_path'")
print("‚úî Dataset nettoy√© : data_clean.csv")

T√©l√©chargement images: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1428/1428 [06:22<00:00,  3.73it/s]

‚úî T√©l√©chargement termin√©
‚úî Lignes valides conserv√©es : 900
‚úî Chemins d'images ajout√©s dans la colonne 'image_path'
‚úî Dataset nettoy√© : data_clean.csv





## Partie 1 : Analyse approfondie du jeu de données

### 1. Exploration Initiale

In [4]:
# Materialise the "train" split as a pandas DataFrame for the analysis below.
train_split = dataset["train"]
df = train_split.to_pandas()
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,website_name,competence_date,country_code,currency_code,brand,category1_code,category2_code,category3_code,product_code,title,itemurl,imageurl,full_price,price,full_price_eur,price_eur,flg_discount
0,Chanel,2023-11-17,DEU,EUR,CHANEL,FASHION,HANDBAGS,N.A.,PAS4152B13455,Tweed & goldfarbenes Metall Schwarz Rosa & B...,https://www.chanel.com/de/mode/p/AS4152B13455N...,https://res.cloudinary.com/dwayrkp5z/image/fet...,5200.0,5200.0,5200.0,5200.0,0
1,Chanel,2023-11-17,DEU,EUR,CHANEL,FASHION,HANDBAGS,N.A.,PA67085Y09953,n.a.,https://www.chanel.com/de/mode/p/A67085Y099539...,https://res.cloudinary.com/dwayrkp5z/image/fet...,5750.0,5750.0,5750.0,5750.0,0
2,Chanel,2023-11-17,DEU,EUR,CHANEL,EYEWEAR,BLUE LIGHT GLASSES,N.A.,PA71441,Schwarz,https://www.chanel.com/de/brillen/p/A71441X081...,https://res.cloudinary.com/dwayrkp5z/image/fet...,420.0,420.0,420.0,420.0,0
3,Chanel,2023-11-17,DEU,EUR,CHANEL,EYEWEAR,SUNGLASSES,N.A.,PA71449,Silberfarben. Gl√§ser: Braun Mit Verlauf,https://www.chanel.com/de/brillen/p/A71449X099...,https://res.cloudinary.com/dwayrkp5z/image/fet...,580.0,580.0,580.0,580.0,0
4,Chanel,2023-11-17,DEU,EUR,CHANEL,EYEWEAR,SUNGLASSES,N.A.,PA71466,Blau & Rosa. Gl√§ser: Grau Mit Verlauf,https://www.chanel.com/de/brillen/p/A71466X010...,https://res.cloudinary.com/dwayrkp5z/image/fet...,480.0,480.0,480.0,480.0,0


In [13]:
# Print the column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1428 entries, 0 to 1427
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   website_name     1428 non-null   object 
 1   competence_date  1428 non-null   object 
 2   country_code     1428 non-null   object 
 3   currency_code    1428 non-null   object 
 4   brand            1428 non-null   object 
 5   category1_code   1428 non-null   object 
 6   category2_code   1428 non-null   object 
 7   category3_code   1428 non-null   object 
 8   product_code     1428 non-null   object 
 9   title            1428 non-null   object 
 10  itemurl          1428 non-null   object 
 11  imageurl         1428 non-null   object 
 12  full_price       1428 non-null   float64
 13  price            1428 non-null   float64
 14  full_price_eur   1428 non-null   float64
 15  price_eur        1428 non-null   float64
 16  flg_discount     1428 non-null   int64  
dtypes: float64(4),

In [None]:
# Structural overview: dtypes and non-null counts per column.
df.info()

# Share of NaN cells per column, expressed as a percentage rounded to 2 decimals.
missing_pct = df.isna().mean().mul(100).round(2)
print("\nTaux de valeurs manquantes par colonne (%):")
print(missing_pct)


Ceci nous donne le type de chaque colonne, combien de valeurs non nulles, et le pourcentage de NaN par colonne.

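Cependant, on remarque que les résultats sont à 0 partout, alors qu'on a vu que certaines colonnes contiennent des valeurs manquantes (par exemple « N.A. » dans category3_code ou « n.a. » dans title) : elles sont encodées sous forme de chaînes de caractères, que `isna()` ne détecte pas. Il faut donc explorer cela.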

In [18]:
# Summary statistics for every column; include='all' also covers the
# non-numeric (object) columns, not only the numeric ones.
df.describe(include='all')

Unnamed: 0,website_name,competence_date,country_code,currency_code,brand,category1_code,category2_code,category3_code,product_code,title,itemurl,imageurl,full_price,price,full_price_eur,price_eur,flg_discount
count,1428,1428,1428,1428,1428,1428,1428,1428,1428,1428,1428,1428,1428.0,1428.0,1428.0,1428.0,1428.0
unique,1,1,1,1,1,7,51,1,1428,824,1428,1383,,,,,
top,Chanel,2023-11-17,DEU,EUR,CHANEL,FASHION,READY-TO-WEAR,N.A.,PAS4152B13455,18 Karat Wei√ügold Diamanten,https://www.chanel.com/de/mode/p/AS4152B13455N...,https://res.cloudinary.com/dwayrkp5z/image/fet...,,,,,
freq,1428,1428,1428,1428,1428,550,243,1428,1,68,1,46,,,,,
mean,,,,,,,,,,,,,6076.253725,6076.253725,6076.253725,6076.253725,0.0
std,,,,,,,,,,,,,15219.005999,15219.005999,15219.005999,15219.005999,0.0
min,,,,,,,,,,,,,23.0,23.0,23.0,23.0,0.0
25%,,,,,,,,,,,,,195.0,195.0,195.0,195.0,0.0
50%,,,,,,,,,,,,,1415.0,1415.0,1415.0,1415.0,0.0
75%,,,,,,,,,,,,,5800.0,5800.0,5800.0,5800.0,0.0


Ici, on remarque plusieurs choses intéressantes :

Les colonnes website_name, competence_date, country_code, currency_code, brand, category3_code ne sont pas très utiles : elles ont la même valeur sur toutes les lignes, sans exception.
Tous les product_code et itemurl sont bien différents.
Il y a des titres en commun, dont "18 Karat Weißgold Diamanten" qui revient jusqu'à 68 fois.
Il y a également des imageurl qui reviennent plusieurs fois (jusqu'à 46 fois).

In [17]:
# Display the cleaned frame produced by the image-download cell
# (original columns plus the added image_path column).
df_clean

Unnamed: 0,website_name,competence_date,country_code,currency_code,brand,category1_code,category2_code,category3_code,product_code,title,itemurl,imageurl,full_price,price,full_price_eur,price_eur,flg_discount,image_path
0,Chanel,2023-11-17,DEU,EUR,CHANEL,FASHION,HANDBAGS,N.A.,PAS4152B13455,Tweed & goldfarbenes Metall Schwarz Rosa & B...,https://www.chanel.com/de/mode/p/AS4152B13455N...,https://res.cloudinary.com/dwayrkp5z/image/fet...,5200.0,5200.0,5200.0,5200.0,0,Images/00000.jpg
1,Chanel,2023-11-17,DEU,EUR,CHANEL,EYEWEAR,BLUE LIGHT GLASSES,N.A.,PA71441,Schwarz,https://www.chanel.com/de/brillen/p/A71441X081...,https://res.cloudinary.com/dwayrkp5z/image/fet...,420.0,420.0,420.0,420.0,0,Images/00002.jpg
2,Chanel,2023-11-17,DEU,EUR,CHANEL,EYEWEAR,SUNGLASSES,N.A.,PA71449,Silberfarben. Gl√§ser: Braun Mit Verlauf,https://www.chanel.com/de/brillen/p/A71449X099...,https://res.cloudinary.com/dwayrkp5z/image/fet...,580.0,580.0,580.0,580.0,0,Images/00003.jpg
3,Chanel,2023-11-17,DEU,EUR,CHANEL,EYEWEAR,SUNGLASSES,N.A.,PA71466,Blau & Rosa. Gl√§ser: Grau Mit Verlauf,https://www.chanel.com/de/brillen/p/A71466X010...,https://res.cloudinary.com/dwayrkp5z/image/fet...,480.0,480.0,480.0,480.0,0,Images/00004.jpg
4,Chanel,2023-11-17,DEU,EUR,CHANEL,EYEWEAR,SUNGLASSES,N.A.,PA71481,Schwarz. Gl√§ser: Grau Mit Verlauf,https://www.chanel.com/de/brillen/p/A71481X021...,https://res.cloudinary.com/dwayrkp5z/image/fet...,620.0,620.0,620.0,620.0,0,Images/00005.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,Chanel,2023-11-17,DEU,EUR,CHANEL,FASHION,READY-TO-WEAR,N.A.,PP75365K10782,Kaschmir & Mohair Beige Schwarz & Korallenrosa,https://www.chanel.com/de/mode/p/P75365K10782N...,https://res.cloudinary.com/dwayrkp5z/image/fet...,2750.0,2750.0,2750.0,2750.0,0,Images/01421.jpg
896,Chanel,2023-11-17,DEU,EUR,CHANEL,MAKEUP,BRUSHES AND ACCESSORIES,N.A.,PP137500,TASCHENSPIEGEL MIT ZWEI FACETTEN,https://www.chanel.com/de/make-up/p/137500/mir...,https://res.cloudinary.com/dwayrkp5z/image/fet...,35.0,35.0,35.0,35.0,0,Images/01422.jpg
897,Chanel,2023-11-17,DEU,EUR,CHANEL,MAKEUP,BRUSHES AND ACCESSORIES,N.A.,PP138848,PR√ÑZISER PUDER-PINSEL,https://www.chanel.com/de/make-up/p/138848/pin...,https://res.cloudinary.com/dwayrkp5z/image/fet...,50.0,50.0,50.0,50.0,0,Images/01423.jpg
898,Chanel,2023-11-17,DEU,EUR,CHANEL,SKINCARE,TONERS & LOTIONS,N.A.,PP140650,POLSTERT AUF ‚Äì GLEICHT AUS ‚Äì SCHENKT STRAHLKRAFT,https://www.chanel.com/de/hautpflege/p/140650/...,https://res.cloudinary.com/dwayrkp5z/image/fet...,92.0,92.0,92.0,92.0,0,Images/01425.jpg
