In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import time
import matplotlib.pyplot as plt
import csv
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
from IPython.display import display
import re
from tqdm.notebook import tqdm
import networkx as nx
import numpy as np
from datetime import datetime, timedelta

# functions

In [12]:
def login(driver, url):
    driver.get(url)

    # switch to the iframe
    iframe_title = "SP Consent Message"
    iframe = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, f"//iframe[@title='{iframe_title}']")))
    # iframe = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.ID, 'sp_message_iframe_1117340')))
    driver.switch_to.frame(iframe)

    # accept the cookies
    try:
        wait = WebDriverWait(driver, 20)  # increase wait time to 20 seconds
        button = wait.until(EC.presence_of_element_located((By.XPATH, '//button[@title="Accept" and @aria-label="Accept"]')))  # use more specific XPath
        button.click()
    except TimeoutException:
        print("Could not find the Accept button.")

    driver.switch_to.default_content()

    time.sleep(1)
    # click the login button
    driver.find_element(By.ID, 'signIn').click()
    time.sleep(1)
    # enter username
    driver.find_element(By.ID, 'username').send_keys('timo1024')
    # enter password
    driver.find_element(By.ID, 'password').send_keys('StampworldSecure')
    # click the login button
    driver.find_element(By.ID, 'loginBtn').click()
    time.sleep(1)

def getCountryLinks(driver, url):
    driver.get(url)
    time.sleep(1)
    # get the page source
    page_source = driver.page_source
    # parse the page source
    soup = BeautifulSoup(page_source, 'html.parser')
    # get the first table in div with class 'sitemap'
    table = soup.find('div', class_='sitemap').find('table')
    # get all the links in the table
    links = table.find_all('a')
    # get the href attribute of each link
    country_links = [link['href'] for link in links]
    return country_links

def getAllLinksPerCountry(driver, url):
    driver.get(url)
    time.sleep(1)
    # get the page source
    page_source = driver.page_source
    # parse the page source
    soup = BeautifulSoup(page_source, 'html.parser')
    # get all a tags in div class 'sitemap'
    links = soup.find('div', class_='sitemap').find_all('a')
    
    return links

# create empty data frame

In [27]:
df = pd.DataFrame(columns=['link', 'country', 'year', 'category'])

options = Options()
options.headless = False
driver = webdriver.Firefox(options=options)

# login
login(driver, 'https://www.stampworld.com/en/')
time.sleep(1)

sitemap_url = 'https://www.stampworld.com/en/sitemap/'
country_links = getCountryLinks(driver, sitemap_url)

# loop over all country links
for country_link in tqdm(country_links):
    print(country_link)
    country_url = f'https://www.stampworld.com{country_link}'

    # get the all individual links [link, country, year, category][]
    all_links_per_country = getAllLinksPerCountry(driver, country_url)

    # add data to df
    for link in all_links_per_country:
        link_url = f'https://www.stampworld.com{link["href"]}'
        # title="Aaland - Franking labels - 1984"
        country = link['title'].split(' - ')[0]
        year = link['title'].split(' - ')[2]
        category = link['title'].split(' - ')[1]
        df = pd.concat([df, pd.DataFrame([[link_url, country, year, category]], columns=['link', 'country', 'year', 'category'])])

driver.quit()

# save as csv
df.to_csv('./csv_files/all_links_all_countries.csv', index=False)

  0%|          | 0/1210 [00:00<?, ?it/s]

/en/sitemap/catalogue/Aaland/
/en/sitemap/catalogue/Aberdeen/
/en/sitemap/catalogue/Abingdon/
/en/sitemap/catalogue/Abkhazia/
/en/sitemap/catalogue/Abu-Dhabi/
/en/sitemap/catalogue/Achtyrka/
/en/sitemap/catalogue/Aden/
/en/sitemap/catalogue/Aegean-Islands/
/en/sitemap/catalogue/Afars-and-Issas/
/en/sitemap/catalogue/Afghanistan/
/en/sitemap/catalogue/Agion-Oras-Athos/
/en/sitemap/catalogue/Aguera,-La/
/en/sitemap/catalogue/Aitutaki/
/en/sitemap/catalogue/Ajman/
/en/sitemap/catalogue/Alabama/
/en/sitemap/catalogue/Alaouites/
/en/sitemap/catalogue/Albania/
/en/sitemap/catalogue/Albany/
/en/sitemap/catalogue/Alderney/
/en/sitemap/catalogue/Alexandretta/
/en/sitemap/catalogue/Alexandria/
/en/sitemap/catalogue/Algeria/
/en/sitemap/catalogue/Allenstein/
/en/sitemap/catalogue/Allied-Military-Government-Italy/
/en/sitemap/catalogue/Allied-Occupation/
/en/sitemap/catalogue/Alwar/
/en/sitemap/catalogue/American-Samoa/
/en/sitemap/catalogue/AMG-Naples/
/en/sitemap/catalogue/AMG-Sicily/
/en/sitema

In [32]:
# get unique categories
categories = df['category'].unique()
categories

array(['Franking labels', 'Postage stamps', 'Parcel Post Stamps',
       'Postage-due stamps', 'Official stamps', 'Parcel post stamps',
       'Tax stamps', 'Telegraph stamps', 'Postage due stamps',
       'Air tax stamps', 'Tax due stamps', 'Newspaper Revenue stamps',
       'Postal labels', 'Unissued', 'Postage-due Stamps',
       'Italien-Issues', 'Montenegro-Issues', 'Romanian-Issues',
       'Serbian-Issues', 'Parcel-post stamps', 'Postage due tax stamps',
       'Officail stamps', 'Airmail stamps', 'Government service stamps',
       'Parcel post', 'Eupen and Malmedy', 'Military Post',
       'Military post in Rheinland', 'Newspaper postage-due stamps',
       'Railway parcelpost stamps', 'Fee stamps', 'Postal tax stamps',
       'Revenue stamps', 'Military post', 'Postal Tax stamps', 'Airmail',
       'Revenue Stamps', 'Military stamps', 'Service stamps',
       'Ribon stamps', 'Official Stamps', 'Heraklion', 'Rethymnon',
       'Revolutionary issues', 'Christmas stamps', 'Posta