# Amazon photo downloader

In [22]:
from exceptiongroup import catch

REVIEW_PATH = './Sports_and_Outdoors.jsonl'
META_PATH = './meta_Sports_and_Outdoors.jsonl'
SAVE_PATH = './test.jsonl'
SUBSET = 0.1

## Product filtering

In [2]:
import pandas as pd
import json

data = []
with open(REVIEW_PATH) as f:
    for line in f:
        entry = json.loads(line)

        entry = {
            'parent_asin': entry['parent_asin']
        }

        data.append(entry)

df = pd.DataFrame(data)
df.head(5)

Unnamed: 0,parent_asin
0,B0BGFR76CF
1,B00NXQLFQQ
2,B0957WLR63
3,B00IET8S80
4,B01C2SW7XA


In [3]:
item_counts = df.groupby("parent_asin").size().reset_index(name="review_count")
item_counts.head(5)

Unnamed: 0,parent_asin,review_count
0,0007325614,1
1,0201397544,7
2,030827962X,4
3,0316287229,11
4,0318279622,7


In [4]:
item_counts.sort_values(by=['review_count'], ascending=False, inplace=True)

count = int(item_counts.shape[0] * 0.1)
item_subset = item_counts.iloc[:count]

item_subset.head(10)

Unnamed: 0,parent_asin,review_count
542655,B00NWXLQD2,30369
992373,B07BQRWTDJ,23638
1578374,B0C5RBPW2Y,20298
788846,B01L6RE7Z4,16590
1468642,B09MJKJYLQ,15679
1524787,B0B7J8Y581,14565
1578780,B0C5XW2T2N,14478
1466402,B09LW2KHPM,14029
1532183,B0BBFB48YQ,13986
1561750,B0BTNZ41Y7,13856


In [5]:
df = df[df['parent_asin'].isin(item_subset['parent_asin'])]
df.head(5)

Unnamed: 0,parent_asin
0,B0BGFR76CF
1,B00NXQLFQQ
3,B00IET8S80
5,B09NQK7MH9
6,B01DFBQEKA


## (item_id, url) preparation

In [6]:
data = []
with open(META_PATH) as f:
    for line in f:
        entry = json.loads(line)

        images_list = entry['images']

        if len(images_list) == 0:
            continue

        entry = {
            'parent_asin': entry['parent_asin'],
            'url': images_list[0].get('thumb', None),
        }

        data.append(entry)

df = pd.DataFrame(data)
df.head(5)

Unnamed: 0,parent_asin,url
0,B01HDXC8AG,https://m.media-amazon.com/images/I/510tgKWHp2...
1,B07R5BQ4YD,https://m.media-amazon.com/images/I/31tHUCk7ls...
2,B003K8GZ7G,https://m.media-amazon.com/images/I/41RVlcUTMs...
3,B08GC4GBWB,https://m.media-amazon.com/images/I/41bDwvySkD...
4,B07BYV947H,https://m.media-amazon.com/images/I/41FrdMQKGi...


In [7]:
df = df[df['parent_asin'].isin(item_subset['parent_asin'])]
df.head(5)

Unnamed: 0,parent_asin,url
12,B07BN4PZQF,https://m.media-amazon.com/images/I/41iyo2i0n7...
15,B0BZM3PWZ9,https://m.media-amazon.com/images/I/41fHl4HumY...
16,B07KT27VCZ,https://m.media-amazon.com/images/I/41whDEgmGr...
19,B07GCNXDNV,https://m.media-amazon.com/images/I/51AK7r-KqW...
24,B08C2F6CBW,https://m.media-amazon.com/images/I/41HM4GY2DI...


## Parallel requests with exponential backoff

In [28]:
WINDOW_SIZE = 100
BASE_DELAY = 1
PATIENCE = 3
CONCURRENCY = 5
USER_AGENTS = [
    # Chrome (Windows/Mac/Linux)
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",

    # Firefox (Windows/Mac/Linux)
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14.5; rv:127.0) Gecko/20100101 Firefox/127.0",
    "Mozilla/5.0 (X11; Linux i686; rv:127.0) Gecko/20100101 Firefox/127.0",

    # Safari (Mac/iOS)
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Safari/605.1.15",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Mobile/15E148 Safari/604.1",

    # Edge (Windows)
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0",

    # Mobile Devices (Android/Samsung)
    "Mozilla/5.0 (Linux; Android 14; SM-S928U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.147 Mobile Safari/537.36",
    "Mozilla/5.0 (Linux; Android 14; Pixel 8 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.147 Mobile Safari/537.36",

    # Tablets
    "Mozilla/5.0 (iPad; CPU OS 17_5 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (Linux; Android 14; SM-X810) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.147 Safari/537.36",

    # Less Common Browsers
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Vivaldi/6.6",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 OPR/102.0.0.0"
]

In [29]:
import asyncio
import aiohttp
from tqdm import tqdm
import random
import uuid
import base64

data = []
async def make_request(item_id, url, session, semaphore):
    global data
    for i in range(PATIENCE):
        try:
            async with semaphore:
                headers = {
                    "User-Agent": random.choice(USER_AGENTS),
                    "Accept-Language": random.choice(["en-US", "en-GB", "en-CA"])
                }
                cookies={
                    "session-id": str(uuid.uuid4()),
                    "ubid-acbca": random.randint(1000000000, 9999999999)
                }

                async with session.get(url, headers=headers, cookies=cookies) as response:
                    if response.status == 200: # OK
                        photo_data = await response.read()

                        data.append({
                            'parent_asin': item_id,
                            'photo_bytes': base64.b64encode(photo_data)
                        })

                    elif response.status in (429, 503): # backoff
                        retry_after = response.headers.get("Retry-After")
                        wait_time = float(retry_after) if retry_after else BASE_DELAY * (2 ** i)
                        await asyncio.sleep(wait_time)

                    elif 500 <= response.status < 600:
                        wait_time = BASE_DELAY * (2 ** i)
                        await asyncio.sleep(wait_time)

                    else: # Something bad happened
                        break

        except (aiohttp.ClientError, asyncio.TimeoutError): # backoff
            wait_time = BASE_DELAY * (2 ** i) * (1 + random.random())
            await asyncio.sleep(wait_time)

    data.append({'parent_asin': item_id, 'photo_bytes': None})


async def handle_batch(batch, semaphore):
    global data
    async with aiohttp.ClientSession() as session:
        tasks = [make_request(item_id, url, session, semaphore) for item_id, url in batch if url]

        await asyncio.gather(*tasks)

        photo_df = pd.DataFrame(data)

        photo_df.to_csv(SAVE_PATH, index=False, mode='a', header=False, sep='\t')

        data = []


async def download():
    semaphore = asyncio.Semaphore(CONCURRENCY)

    for i in tqdm(range(0, count, WINDOW_SIZE)):
        batch = df.iloc[i:i+WINDOW_SIZE]

        item_ids = batch['parent_asin'].tolist()
        urls = batch['url'].tolist()

        await handle_batch(zip(item_ids, urls), semaphore)
        await asyncio.sleep(BATCH_DELAY)

In [30]:
await download()

  7%|▋         | 107/1588 [06:43<1:33:01,  3.77s/it]


CancelledError: 

In [None]:
df = pd.DataFrame(data)
df.head(5)

## Photo embedding

## Incremental PCA for dimention reduction

## Save embeddings