In [52]:
import pandas as pd 
import numpy as np 
import json
import time
import tqdm

import requests
import cloudscraper

from curl_params import HEADERS, COOKIES, get_params

SLEEP_TIME = 20

# Parse men shoes

In [53]:
cloudscraper = cloudscraper.create_scraper()

with open('raw_data_urls.json', "r") as file:
    data = json.load(file)

page_list = [str(page_number) for page_number in range(1, 42)]
page_list = list(set(page_list) - set(list(data['men']['pages'].keys())))

for page_number in tqdm.tqdm(page_list):
    response = cloudscraper.get('https://www.ssense.com/en-id/men/shoes.json',
                         params=get_params(page_number=int(page_number)),
                         cookies=COOKIES,
                         headers=HEADERS)
    if response.status_code == 200:
        with open('raw_data_urls.json', "r") as file:
            data = json.load(file)
            if page_number not in data['men']['pages'].keys():
                data['men']['pages'][page_number] = response.json()['products']
                with open('raw_data_urls.json', "w") as file:
                    json.dump(data, file)
    else:
        print(f"response failed with status code {response.status_code}, page_number {page_number}")
    time.sleep(SLEEP_TIME)

100%|██████████| 41/41 [14:28<00:00, 21.19s/it]


# Parse women shoes

In [55]:
cloudscraper = cloudscraper.create_scraper()

with open('raw_data_urls.json', "r") as file:
    data = json.load(file)

page_list = [str(page_number) for page_number in range(1, 48)]
page_list = list(set(page_list) - set(list(data['women']['pages'].keys())))

for page_number in tqdm.tqdm(page_list):
    response = cloudscraper.get('https://www.ssense.com/en-id/women/shoes.json',
                         params=get_params(page_number=int(page_number)),
                         cookies=COOKIES,
                         headers=HEADERS)
    if response.status_code == 200:
        with open('raw_data_urls.json', "r") as file:
            data = json.load(file)
            if page_number not in data['women']['pages'].keys():
                data['women']['pages'][page_number] = response.json()['products']
                with open('raw_data_urls.json', "w") as file:
                    json.dump(data, file)
    else:
        print(f"response failed with status code {response.status_code}, page_number {page_number}")
    time.sleep(SLEEP_TIME)

100%|██████████| 47/47 [17:07<00:00, 21.86s/it]


In [80]:
def extract_brand(product: dict) -> str: 
    """extract brand from json

    Args:
        product (dict): 

    Returns:
        str: 
    """
    return product['brand']['name']['en']
    

def extract_model(product: dict) -> str: 
    """extract brand from json

    Args:
        product (dict): 

    Returns:
        str: 
    """
    return product['name']['en']
    

def extract_photos(product: dict) -> str: 
    """extract photos from json

    Args:
        product (dict): 

    Returns:
        str: 
    """
    IMAGE_PARAMS = "b_white,c_lpad,g_center,h_1412,w_940/c_scale,h_960/f_auto,dpr_2.0"
    ID = product['image'][0]
    ID = ID[67:-6]
    return [f"https://res.cloudinary.com/ssenseweb/image/upload/{IMAGE_PARAMS}/{ID}_{x}.jpg"
            for x in range(1, 5)]
    

In [81]:
extract_photos(response.json()['products'][0])

['https://res.cloudinary.com/ssenseweb/image/upload/b_white,c_lpad,g_center,h_1412,w_940/c_scale,h_960/f_auto,dpr_2.0/241129F122007_1.jpg',
 'https://res.cloudinary.com/ssenseweb/image/upload/b_white,c_lpad,g_center,h_1412,w_940/c_scale,h_960/f_auto,dpr_2.0/241129F122007_2.jpg',
 'https://res.cloudinary.com/ssenseweb/image/upload/b_white,c_lpad,g_center,h_1412,w_940/c_scale,h_960/f_auto,dpr_2.0/241129F122007_3.jpg',
 'https://res.cloudinary.com/ssenseweb/image/upload/b_white,c_lpad,g_center,h_1412,w_940/c_scale,h_960/f_auto,dpr_2.0/241129F122007_4.jpg']

# Dataset from raw data

In [87]:
with open('raw_data_urls.json', "r") as file:
    data = json.load(file)

In [97]:
df = pd.DataFrame()

for gender in data.keys():
    for page_number in tqdm.tqdm(data[gender]['pages'].keys()):
        for product in data[gender]['pages'][page_number]:
            brand = extract_brand(product=product)
            model = extract_model(product=product)
            photos = extract_photos(product=product)
            tmp_df = pd.DataFrame({"brand": [brand]*4,
                                   "model": [model]*4,
                                   "url": photos})
            df = pd.concat([df, tmp_df])

100%|██████████| 41/41 [00:23<00:00,  1.72it/s]
100%|██████████| 47/47 [00:33<00:00,  1.39it/s]


In [98]:
df

Unnamed: 0,brand,model,url
0,Vans,Navy Old Skool 36 Sneakers,https://res.cloudinary.com/ssenseweb/image/upl...
1,Vans,Navy Old Skool 36 Sneakers,https://res.cloudinary.com/ssenseweb/image/upl...
2,Vans,Navy Old Skool 36 Sneakers,https://res.cloudinary.com/ssenseweb/image/upl...
3,Vans,Navy Old Skool 36 Sneakers,https://res.cloudinary.com/ssenseweb/image/upl...
0,Vans,Off-White Old Skool 36 LX Sneakers,https://res.cloudinary.com/ssenseweb/image/upl...
...,...,...,...
3,Proenza Schouler,Red Square Flat Strappy Sandals,https://res.cloudinary.com/ssenseweb/image/upl...
0,TOTEME,Black 'The Suede Tie' Sandals,https://res.cloudinary.com/ssenseweb/image/upl...
1,TOTEME,Black 'The Suede Tie' Sandals,https://res.cloudinary.com/ssenseweb/image/upl...
2,TOTEME,Black 'The Suede Tie' Sandals,https://res.cloudinary.com/ssenseweb/image/upl...


In [99]:
df.to_csv('url_dataset.csv', index=None)