# load libraries

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from bs4 import BeautifulSoup
import time
import matplotlib.pyplot as plt
import csv
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
from IPython.display import display
import re
from tqdm.notebook import tqdm
import networkx as nx
import numpy as np
from datetime import datetime, timedelta
import os

# Get images

## load countries

In [5]:
countries = pd.read_csv('country_links_pages.csv', index_col=0)
countries.head()

Unnamed: 0,url,parent_countries,continent,total_pages
0,/en/stamps/Aaland/,['Finland'],Europe,12.0
1,/en/stamps/Aegean-Islands/,"['Italy', 'Italian Occupations', 'Aegean Islan...",Europe,4.0
2,/en/stamps/Calimno/,"['Italy', 'Italian Occupations', 'Aegean Islan...",Europe,1.0
3,/en/stamps/Caso/,"['Italy', 'Italian Occupations', 'Aegean Islan...",Europe,1.0
4,/en/stamps/Castelrosso/,"['Italy', 'Italian Occupations', 'Aegean Islan...",Europe,1.0


## functions

In [6]:
def login(driver, url):
    driver.get(url)

    # switch to the iframe
    iframe = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.ID, 'sp_message_iframe_1117340')))
    driver.switch_to.frame(iframe)

    # accept the cookies
    try:
        wait = WebDriverWait(driver, 20)  # increase wait time to 20 seconds
        button = wait.until(EC.presence_of_element_located((By.XPATH, '//button[@title="Accept" and @aria-label="Accept"]')))  # use more specific XPath
        button.click()
    except TimeoutException:
        print("Could not find the Accept button.")

    driver.switch_to.default_content()

    time.sleep(1)
    # click the login button
    driver.find_element(By.ID, 'signIn').click()
    time.sleep(1)
    # enter username
    driver.find_element(By.ID, 'username').send_keys('timo1024')
    # enter password
    driver.find_element(By.ID, 'password').send_keys('StampworldSecure')
    # click the login button
    driver.find_element(By.ID, 'loginBtn').click()
    time.sleep(1)

def get_country_name(driver):
    time.sleep(1)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    country_name = soup.find('h6', {'id': 'topBlueBarGroupsCatalogLabel'}).text.split("(")[0].strip()
    return country_name

def get_image_links(driver):
    image_links = []
    try:
        group_boxes = driver.find_elements(By.CSS_SELECTOR, "div[id^='group_box_']")
        for group_box in group_boxes:
            group_id = group_box.get_attribute("id").split("_")[-1]
            image_container = group_box.find_element(By.CLASS_NAME, "images_container")
            imgs = image_container.find_elements(By.TAG_NAME, "img")
            for img in imgs:
                image_links.append((group_id, img.get_attribute("src")))
    except NoSuchElementException:
        pass
    return image_links

def save_image(url, path):
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(path, 'wb') as file:
            for chunk in response:
                file.write(chunk)
    else:
        print(f"Could not download image from {url}")

## main

In [9]:
# Split country rows into 62 separate dataframes
countries_split = np.array_split(countries, 62)

# For single indices
indeces = list(range(3, 62))
# indeces = [2]
counter = 1

# Loop over all dataframes
for i in indeces:
    print(f"Processing countries subset nr. {i}")
    countries_subset = countries_split[i]
    try:
        options = Options()
        options.headless = False
        driver = webdriver.Firefox(options=options)
        driver.set_page_load_timeout(120)

        # Login
        login(driver, 'https://www.stampworld.com/en/')
        time.sleep(1)

        for index, row in tqdm(countries_subset.iterrows(), total=countries_subset.shape[0], desc="Processing"):
            link = f"https://www.stampworld.com{row.url}"
            try:
                driver.get(link)
            except TimeoutException:
                print("TimeoutException")
                continue
            country_name = get_country_name(driver)

            # Create country directory if it doesn't exist
            country_path = os.path.join("./images", country_name)
            os.makedirs(country_path, exist_ok=True)

            # Loop over all pages from 1 to the last page
            for page in range(1, int(row.total_pages) + 1):
                # If page is multiple of 40 relogin to website (close driver and open new one)
                if counter % 40 == 0:
                    driver.quit()
                    driver = webdriver.Firefox(options=options)
                    driver.set_page_load_timeout(60)
                    login(driver, 'https://www.stampworld.com/en/')

                time.sleep(1)
                # Get the link of the page
                page_link = f"https://www.stampworld.com{row.url}?page={page}"

                # Set a timeout for the operation
                timeout = timedelta(minutes=1)
                start_time = datetime.now()

                while datetime.now() - start_time < timeout:
                    try:
                        driver.get(page_link)
                        time.sleep(1)
                        # Scroll down the page
                        # driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                        # time.sleep(2)  # Allow time for the page to load after scrolling
                        break  # If successful, break the loop
                    except TimeoutException:
                        print("TimeoutException, retrying...")
                        continue

                image_links = get_image_links(driver)
                x = 0
                for group_id, img_src in image_links:
                    group_path = os.path.join(country_path, group_id)
                    os.makedirs(group_path, exist_ok=True)
                    img_path = os.path.join(group_path, f"{x}.jpg")
                    save_image(img_src, img_path)
                    x += 1

        driver.quit()
    except Exception as e:
        print(e)
        try:
            driver.quit()
        except:
            print("Could not quit driver.")
        continue

Processing countries subset nr. 3


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

Processing countries subset nr. 4


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

TimeoutException, retrying...
Processing countries subset nr. 5


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

Processing countries subset nr. 6


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

Processing countries subset nr. 7


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

Processing countries subset nr. 8


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

Processing countries subset nr. 9


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

'NoneType' object has no attribute 'text'
Processing countries subset nr. 10


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

Processing countries subset nr. 11


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

Processing countries subset nr. 12


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

Processing countries subset nr. 13


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

Processing countries subset nr. 14


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

Processing countries subset nr. 15


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

Processing countries subset nr. 16


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

TimeoutException, retrying...
Message: out of memory
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:193:5
UnknownError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:832:5
wrap@chrome://remote/content/shared/webdriver/Errors.sys.mjs:139:12
receiveMessage@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:182:34

Processing countries subset nr. 17


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

TimeoutException, retrying...
TimeoutException, retrying...
Processing countries subset nr. 18


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

Processing countries subset nr. 19


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

TimeoutException, retrying...
TimeoutException, retrying...
TimeoutException, retrying...
Processing countries subset nr. 20


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

TimeoutException, retrying...
TimeoutException, retrying...
TimeoutException, retrying...
Processing countries subset nr. 21


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

Processing countries subset nr. 22


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

TimeoutException, retrying...
Processing countries subset nr. 23


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

TimeoutException, retrying...
TimeoutException, retrying...
TimeoutException, retrying...
TimeoutException, retrying...
TimeoutException, retrying...
Processing countries subset nr. 24


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

HTTPSConnectionPool(host='www.stampworld.com', port=443): Max retries exceeded with url: /media/catalogue/Redonda/Postage-stamps/V-s.jpg (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000029704C7CE90>, 'Connection to www.stampworld.com timed out. (connect timeout=None)'))
Processing countries subset nr. 25


Error sending stats to Plausible: error sending request for url (https://plausible.io/api/event)


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

TimeoutException, retrying...
Processing countries subset nr. 26


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

TimeoutException, retrying...
TimeoutException, retrying...
TimeoutException, retrying...
Processing countries subset nr. 27


Processing:   0%|          | 0/19 [00:00<?, ?it/s]

TimeoutException, retrying...
Message: InactiveActor: Actor is no longer active
Stacktrace:
receiveMessage@chrome://remote/content/marionette/actors/MarionetteCommandsChild.sys.mjs:64:13

Processing countries subset nr. 28
Message: 
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:193:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:511:5
dom.find/</<@chrome://remote/content/shared/DOM.sys.mjs:136:16

Processing countries subset nr. 29
Message: 
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:193:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:511:5
dom.find/</<@chrome://remote/content/shared/DOM.sys.mjs:136:16

Processing countries subset nr. 30
Message: 
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:

KeyboardInterrupt: 

# get watermarks

In [3]:
stamps = pd.read_csv('all_countries.csv', index_col=0)

  stamps = pd.read_csv('all_countries.csv', index_col=0)


In [36]:
# get all unique values in the 'Watermark' (except NaN) for each country (column 'Country')
watermarks = stamps.groupby('Country')['Watermark'].unique().apply(lambda x: [i for i in x if i == i]).reset_index()
# convert to integers
watermarks['Watermark'] = watermarks['Watermark'].apply(lambda x: [int(i) for i in x])

In [37]:
# get all watermark images from "https://www.stampworld.com/media/catalogue/[Country with - instead of space]/Postage-stamps/wm[watermark].gif" and save them if it worked in ./images/[country with spaces]/watermarks/[watermark].gif
for index, row in tqdm(watermarks.iterrows(), total=watermarks.shape[0], desc="Processing"):
    country = row['Country']
    watermarks = row['Watermark']
    country_path = os.path.join("./images", country)
    watermarks_path = os.path.join(country_path, "watermarks")
    os.makedirs(watermarks_path, exist_ok=True)
    for watermark in watermarks:
        watermark_path = os.path.join(watermarks_path, f"{watermark}.gif")
        # if watermark is just one digit, add a 0 in front of it
        if watermark < 10:
            watermark = f"0{watermark}"
        watermark_url = f"https://www.stampworld.com/media/catalogue/{country.replace(' ', '-')}/Postage-stamps/wm{watermark}.gif"
        save_image(watermark_url, watermark_path)

Processing:   0%|          | 0/1125 [00:00<?, ?it/s]