# Amazon Data Processing

In [40]:
import os
import pandas as pd
import requests
from rich.progress import Progress, TimeRemainingColumn, TimeElapsedColumn, BarColumn, TextColumn
from time import time

In [4]:
df = pd.read_csv("../../dataset/Amazon Dataset/amz_uk_processed_data.csv")
print(df.head())


         asin                                              title  \
0  B09B96TG33  Echo Dot (5th generation, 2022 release) | Big ...   
1  B01HTH3C8S  Anker Soundcore mini, Super-Portable Bluetooth...   
2  B09B8YWXDF  Echo Dot (5th generation, 2022 release) | Big ...   
3  B09B8T5VGV  Echo Dot with clock (5th generation, 2022 rele...   
4  B09WX6QD65  Introducing Echo Pop | Full sound compact Wi-F...   

                                              imgUrl  \
0  https://m.media-amazon.com/images/I/71C3lbbeLs...   
1  https://m.media-amazon.com/images/I/61c5rSxwP0...   
2  https://m.media-amazon.com/images/I/61j3SEUjMJ...   
3  https://m.media-amazon.com/images/I/71yf6yTNWS...   
4  https://m.media-amazon.com/images/I/613dEoF9-r...   

                               productURL  stars  reviews  price  \
0  https://www.amazon.co.uk/dp/B09B96TG33    4.7    15308  21.99   
1  https://www.amazon.co.uk/dp/B01HTH3C8S    4.7    98099  23.99   
2  https://www.amazon.co.uk/dp/B09B8YWXDF    4.7  

In [6]:
df["categoryName"].unique()

array(['Hi-Fi Speakers', 'CD, Disc & Tape Players', 'Wearable Technology',
       'Light Bulbs', 'Bathroom Lighting',
       'Heating, Cooling & Air Quality', 'Coffee & Espresso Machines',
       'Lab & Scientific Products', 'Smart Speakers',
       'Motorbike Clothing', 'Motorbike Accessories',
       'Motorbike Batteries', 'Motorbike Boots & Luggage',
       'Motorbike Chassis', 'Handmade Home & Kitchen Products',
       'Hardware', 'Storage & Home Organisation',
       'Fireplaces, Stoves & Accessories', 'PC Gaming Accessories',
       'USB Gadgets', 'Blank Media Cases & Wallets', 'Car & Motorbike',
       'Boys', 'Sports & Outdoors', 'Microphones', 'String Instruments',
       'Karaoke Equipment', 'PA & Stage',
       'General Music-Making Accessories', 'Wind Instruments',
       'Handmade Gifts', 'Fragrances', 'Calendars & Personal Organisers',
       'Furniture & Lighting', 'Computer Printers', 'Ski Goggles',
       'Snowboards', 'Skiing Poles', 'Downhill Ski Boots',
       'Hiki

In [35]:
# List of categories to sample from
fashion_categories = [
    "Motorbike Clothing", "Women's Sports & Outdoor Shoes", "Tennis Shoes", "Boating Footwear",
    "Cycling Shoes", "Ski Clothing", "Beauty", "Make-up", "Basketball Footwear",
    "Ballet & Dancing Footwear", "Cricket Shoes", "Golf Shoes", "Boxing Shoes", "Men",
    "Handmade Clothing, Shoes & Accessories", "Handmade Jewellery", "Hockey Shoes",
    "Climbing Footwear", "Equestrian Sports Boots", "Hair Care", "Women"
]

# Filter DataFrame for the selected categories
df_filtered = df[df['categoryName'].isin(fashion_categories)]

# Sample up to 100 items per category
sampled_df = (
    df_filtered.groupby('categoryName', group_keys=False)
    .apply(lambda x: x.sample(n=min(100, len(x)), random_state=42)[x.columns])
)

sampled_df.shape

  .apply(lambda x: x.sample(n=min(100, len(x)), random_state=42)[x.columns])


(2100, 10)

In [37]:
sampled_df.to_csv("sampled_fashion_categories.csv", index=False)

In [44]:
df_extracted = sampled_df[['asin', 'imgUrl']].dropna()

df_extracted.shape

(2100, 2)

In [49]:
df_extracted.to_csv("extracted_fashion_categories.csv", index=False)

In [41]:
# Create directory for images
save_dir = "../../dataset/Amazon Dataset/images"
os.makedirs(save_dir, exist_ok=True)

In [48]:
# Initialize progress bar
total_images = len(df_extracted)
start_time = time()

with Progress(
    TextColumn("[bold blue]{task.description}"),
    BarColumn(),
    TextColumn("{task.completed}/{task.total}"),
    TimeElapsedColumn(),
    TimeRemainingColumn(),
    transient=False  # Keeps progress visible after completion
) as progress:
    task = progress.add_task("Downloading Images...", total=total_images)

    for index, row in df_extracted.iterrows():
        asin = row['asin']
        url = row['imgUrl']
        filename = os.path.join(save_dir, f"{asin}.jpg")

        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                with open(filename, "wb") as file:
                    file.write(response.content)
        except requests.RequestException:
            print(f"[red]Failed to download:[/red] {url}")

        progress.update(task, advance=1)

print("[green]Download completed![/green]")

Output()

[green]Download completed![/green]


In [53]:
failed_rows = df_extracted[df_extracted['imgUrl'] == "https://m.media-amazon.com/images/I/61NQ-oaR4WL._AC_UL320_.jpg"]

failed_rows.head()

Unnamed: 0,asin,imgUrl
1530964,B0CHRVR8GG,https://m.media-amazon.com/images/I/61NQ-oaR4W...
