In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import time
import matplotlib.pyplot as plt
import csv
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
from IPython.display import display
import re
from tqdm.notebook import tqdm
import networkx as nx
import numpy as np
from datetime import datetime, timedelta

# functions

In [26]:
def login(driver, url, username=None, password=None):
    """Open `url`, accept the cookie consent dialog and sign in.

    Credentials are never stored in the notebook. They are taken from the
    `username` / `password` arguments or, when those are omitted, from the
    environment variables STAMPWORLD_USERNAME and STAMPWORLD_PASSWORD.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session used for every page interaction.
    url : str
        Page to open before logging in.
    username, password : str, optional
        Account credentials; fall back to the environment variables above.

    Raises
    ------
    RuntimeError
        If no username or password is available. Raised before the browser
        is touched, so a misconfigured run fails fast.
    TimeoutException
        If the consent iframe does not appear within 20 seconds.
    """
    import os  # local import so this function does not depend on the import cell

    username = username or os.environ.get('STAMPWORLD_USERNAME')
    password = password or os.environ.get('STAMPWORLD_PASSWORD')
    if not username or not password:
        raise RuntimeError(
            "Missing credentials: pass username/password to login() or set "
            "the STAMPWORLD_USERNAME and STAMPWORLD_PASSWORD environment variables."
        )

    driver.get(url)
    wait = WebDriverWait(driver, 20)

    # The consent dialog lives in its own iframe; switch into it first.
    iframe_title = "SP Consent Message"
    iframe = wait.until(EC.presence_of_element_located((By.XPATH, f"//iframe[@title='{iframe_title}']")))
    driver.switch_to.frame(iframe)

    # Accept the cookies. A missing button is only reported, not fatal.
    try:
        button = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[@title="Accept" and @aria-label="Accept"]')))
        button.click()
    except TimeoutException:
        print("Could not find the Accept button.")
    finally:
        # Always leave the iframe, even if the click itself failed, so the
        # driver is not left pointing at the consent frame.
        driver.switch_to.default_content()

    time.sleep(1)
    # Open the login form.
    driver.find_element(By.ID, 'signIn').click()
    time.sleep(1)
    # Fill in the credentials and submit.
    driver.find_element(By.ID, 'username').send_keys(username)
    driver.find_element(By.ID, 'password').send_keys(password)
    driver.find_element(By.ID, 'loginBtn').click()
    time.sleep(1)

def _labelled_value(lines, label):
    """Return the text after the first word of the first line containing `label`, or ''."""
    for line in lines:
        if label in line:
            # Drop the first space-separated token (the label) and keep the rest.
            return ' '.join(line.split(' ')[1:]).strip()
    return ''


def _column_names(group_div):
    """Return one name per `th scope="col"` header of a group, skipping the first header."""
    names = []
    for th in group_div.find_all('th', {'scope': 'col'})[1:]:
        img = th.find('img')
        text = th.text.strip()
        if img is not None:
            # Icon headers carry their name in the image's alt attribute.
            names.append(img['alt'])
        elif text != '':
            names.append(text)
        else:
            names.append('null')
    return names


def getInfo(soup, country, year, category):
    """Extract one row per stamp from a parsed catalogue page.

    Every div whose id contains "group_box_" is treated as one stamp set.
    Set-level details (description, date, watermark, ...) are read from the
    set's `table_header` div and repeated on each stamp row of that set.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page source.
    country, year, category :
        Copied unchanged into the Country / Year / Category columns.

    Returns
    -------
    pandas.DataFrame
        One row per `tr.stamp_tr`, with the columns listed in `columns`
        below; empty (but with those columns) if the page has no groups.

    Raises
    ------
    IndexError, ValueError, AttributeError, TypeError, KeyError
        If a group does not have the expected header/table layout.
    """
    columns = ['No', 'ID', 'SetDescription', 'Date', 'Year', 'Watermark', 'Sheetsize', 'Design', 'Engraving', 'Perforation', 'Type', 'Value', 'Color', 'Description', 'StampsIssued', 'Mint', 'Unused', 'Used', 'LetterFDC', 'Currency', 'GroupID', 'Country', 'Category']

    # (output column, table header name) for every per-stamp cell we read.
    stamp_fields = (
        ('Type', 'Type'),
        ('Value', 'D'),
        ('Color', 'Color'),
        ('Description', 'Description'),
        ('StampsIssued', 'Stamps Issued'),
        ('Mint', 'Mint Condition'),
        ('Unused', 'Unused'),
        ('Used', 'Used'),
        ('LetterFDC', 'Letter/FDC'),
        ('Currency', 'Currency'),
    )

    # Rows are collected in a list and turned into a DataFrame once at the
    # end; growing a DataFrame with pd.concat per row is quadratic.
    records = []

    for group_div in soup.find_all('div', {'id': re.compile(r'group_box_')}):
        GroupID = group_div['id'].split('_')[-1]

        header = group_div.find('div', {'class': 'table_header'})
        SetDescription = header.find('a').text.strip()

        # Non-empty, stripped lines of the header paragraph.
        info_lines = [line.strip() for line in header.find('p').text.strip().split('\n') if line.strip() != '']
        # The first line is taken as the issue date; tolerate an empty paragraph.
        Date = info_lines[0] if info_lines else ''
        # NOTE(review): labels are matched as substrings of any line.
        set_values = {
            'Watermark': _labelled_value(info_lines, 'WM'),
            'Sheetsize': _labelled_value(info_lines, 'Sheetsize'),
            'Design': _labelled_value(info_lines, 'Design'),
            'Engraving': _labelled_value(info_lines, 'Engraving'),
            'Perforation': _labelled_value(info_lines, 'Perforation'),
        }

        colnames = _column_names(group_div)
        # The fifth header from the end has no name of its own; it is the
        # currency column.
        colnames[-5] = 'Currency'
        cell_index = {field: colnames.index(header_name) for field, header_name in stamp_fields}

        for stamp in group_div.find_all('tr', {'class': 'stamp_tr'}):
            number_cell = stamp.find('th')
            tds = stamp.find_all('td')

            record = {
                'No': number_cell.text.strip(),
                # The anchor inside the first th carries the stamp's id.
                'ID': number_cell.find('a')['id'],
                'SetDescription': SetDescription,
                'Date': Date,
                'Year': year,
                'GroupID': GroupID,
                'Country': country,
                'Category': category,
            }
            record.update(set_values)
            for field, position in cell_index.items():
                record[field] = tds[position].text.strip()
            records.append(record)

    return pd.DataFrame(records, columns=columns)

def getImageLinks(soup, country):
    """Collect the image URLs of every stamp group on a parsed catalogue page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page source.
    country :
        Copied unchanged into the Country column.

    Returns
    -------
    pandas.DataFrame
        Columns GroupID, Country, Url; one row per `img` found inside a
        group's `images_container` div. Groups without such a container
        contribute no rows.

    Raises
    ------
    KeyError
        If an `img` inside an images container has no `src` attribute.
    """
    columns = ['GroupID', 'Country', 'Url']

    # Collect plain rows and build the DataFrame once; pd.concat per row is
    # quadratic in the number of images.
    records = []

    for group_div in soup.find_all('div', {'id': re.compile(r'group_box_')}):
        # The group id is the last underscore-separated part of the div id.
        GroupID = group_div['id'].split('_')[-1]

        image_div = group_div.find('div', {'class': 'images_container'})
        if image_div is None:
            # A group without images must not abort the whole page.
            continue

        for img in image_div.find_all('img'):
            records.append([GroupID, country, img['src']])

    return pd.DataFrame(records, columns=columns)

In [11]:
# Load the table of catalogue page links from all_links_all_countries.csv.
# The scraping loop further down reads the link, country, year and category
# columns of each row.
df = pd.read_csv('./csv_files/all_links_all_countries.csv')
df

Unnamed: 0,link,country,year,category
0,https://www.stampworld.com/en/stamps/Aaland/Fr...,Aaland,1984,Franking labels
1,https://www.stampworld.com/en/stamps/Aaland/Fr...,Aaland,1988,Franking labels
2,https://www.stampworld.com/en/stamps/Aaland/Fr...,Aaland,1989,Franking labels
3,https://www.stampworld.com/en/stamps/Aaland/Fr...,Aaland,1990,Franking labels
4,https://www.stampworld.com/en/stamps/Aaland/Fr...,Aaland,1993,Franking labels
...,...,...,...,...
29312,https://www.stampworld.com/en/stamps/Zululand/...,Zululand,1888,Postage stamps
29313,https://www.stampworld.com/en/stamps/Zululand/...,Zululand,1894,Postage stamps
29314,https://www.stampworld.com/en/stamps/Zululand/...,Zululand,1891,Revenue stamps
29315,https://www.stampworld.com/en/stamps/Zurich/Po...,Zurich,1843,Postage stamps


In [33]:
# Column layouts of the two result tables (same as produced by getInfo /
# getImageLinks); used when nothing could be scraped at all.
INFO_COLUMNS = ['No', 'ID', 'SetDescription', 'Date', 'Year', 'Watermark', 'Sheetsize', 'Design', 'Engraving', 'Perforation', 'Type', 'Value', 'Color', 'Description', 'StampsIssued', 'Mint', 'Unused', 'Used', 'LetterFDC', 'Currency', 'GroupID', 'Country', 'Category']
IMAGE_COLUMNS = ['GroupID', 'Country', 'Url']

# Debug filter: only the row of `df` with this index label is scraped.
# Set to None to scrape every row.
ONLY_INDEX = 7811

# How long to keep retrying a page whose load times out.
PAGE_LOAD_TIMEOUT = timedelta(minutes=1)


def _load_page(driver, url, index):
    """Open `url` and scroll its 'main' element to the bottom.

    driver.get() is retried on TimeoutException until PAGE_LOAD_TIMEOUT has
    elapsed. A failure while scrolling is only reported.

    Returns True once the page was opened, False if every attempt timed out.
    """
    deadline = datetime.now() + PAGE_LOAD_TIMEOUT
    while datetime.now() < deadline:
        try:
            driver.get(url)
        except TimeoutException:
            print("TimeoutException, retrying...")
            continue

        time.sleep(1)
        try:
            # Scroll the div with id 'main' completely to the bottom.
            driver.execute_script("document.getElementById('main').scrollTo(0, document.getElementById('main').scrollHeight);")
            time.sleep(1)
        except Exception as e:
            print(f"Error while scrolling at index {index}: {e}")
        return True
    return False


# Start a headless Firefox. The `options.headless` attribute is deprecated in
# Selenium 4; the command-line flag works in every version.
options = Options()
options.add_argument('-headless')
driver = webdriver.Firefox(options=options)

# Per-page results are collected here and concatenated once after the loop.
info_frames = []
image_frames = []

try:
    # login to stampworld
    login(driver, 'https://www.stampworld.com/en/')
    time.sleep(1)

    # iterate over all rows in df
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        if ONLY_INDEX is not None and index != ONLY_INDEX:
            continue

        try:
            if not _load_page(driver, row['link'], index):
                # Without this check the browser would still show the
                # previous page, and its stamps would be stored under this
                # row's country/year/category.
                print(f"Error at index {index}: page did not load within {PAGE_LOAD_TIMEOUT}")
                continue

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            country = row['country']

            page_info = getInfo(soup, country, row['year'], row['category'])
            if not page_info.empty:
                info_frames.append(page_info)

            page_images = getImageLinks(soup, country)
            if not page_images.empty:
                image_frames.append(page_images)

        except Exception as e:
            # One broken page must not stop the whole crawl.
            print(f"Error at index {index}: {e}")
finally:
    # Always close the browser, also when login fails or the run is interrupted.
    driver.quit()

dfInfo = pd.concat(info_frames, ignore_index=True) if info_frames else pd.DataFrame(columns=INFO_COLUMNS)
dfImageLinks = pd.concat(image_frames, ignore_index=True) if image_frames else pd.DataFrame(columns=IMAGE_COLUMNS)

dfInfo.to_csv('./csv_files/all_stamps_info_2.csv', index=False)
dfImageLinks.to_csv('./csv_files/all_images_url_2.csv', index=False)



  0%|          | 0/29317 [00:00<?, ?it/s]

In [30]:
# Preview the last 10 rows of the scraped stamp table.
dfInfo.tail(10)

Unnamed: 0,No,ID,SetDescription,Date,Year,Watermark,Sheetsize,Design,Engraving,Perforation,...,Description,StampsIssued,Mint,Unused,Used,LetterFDC,Currency,GroupID,Country,Category
51,41‑44,a_s_0041-44,2021 \n Franking Labels ...,2. February,2021,,,Julia Perander,,14¼,...,,,5.0,-,5.0,-,EUR,266841,Aaland,Franking labels
52,1,a_s_0001,1984 \n Fishing Boat,1. March,1984,,100.0,Pirkko Vahtero,,11,...,Ships,(2 mill),0.25,-,0.25,-,EUR,8936,Aaland,Postage stamps
53,2,a_s_0002,1984 \n Fishing Boat,1. March,1984,,100.0,Pirkko Vahtero,,11,...,Ships,(2 mill),0.25,-,0.25,-,EUR,8936,Aaland,Postage stamps
54,3,a_s_0003,1984 \n Fishing Boat,1. March,1984,,100.0,Pirkko Vahtero,,11,...,Ships,"(1,5 mill)",0.5,-,0.5,-,EUR,8936,Aaland,Postage stamps
55,1‑3,a_s_0001-3,1984 \n Fishing Boat,1. March,1984,,100.0,Pirkko Vahtero,,11,...,,,1.0,-,1.0,-,EUR,8936,Aaland,Postage stamps
56,4,a_s_0004,1984 \n National Symbols,1. March,1984,,40.0,Pentti Rahikainen and Eeva Oivo,(Feuille de 40 timbres + 10 vignettes),13,...,Perf: 14,(3 mill),0.5,-,0.5,-,EUR,9870,Aaland,Postage stamps
57,5,a_s_0005,1984 \n National Symbols,1. March,1984,,40.0,Pentti Rahikainen and Eeva Oivo,(Feuille de 40 timbres + 10 vignettes),13,...,,"(1,5 mill)",1.0,-,1.0,-,EUR,9870,Aaland,Postage stamps
58,6,a_s_0006,1984 \n National Symbols,1. March,1984,,40.0,Pentti Rahikainen and Eeva Oivo,(Feuille de 40 timbres + 10 vignettes),13,...,Royal,(1 mill),3.0,-,3.0,-,EUR,9870,Aaland,Postage stamps
59,4‑6,a_s_0004-6,1984 \n National Symbols,1. March,1984,,40.0,Pentti Rahikainen and Eeva Oivo,(Feuille de 40 timbres + 10 vignettes),13,...,,,4.5,-,4.5,-,EUR,9870,Aaland,Postage stamps
60,7,a_s_0007,1984 \n Navigation,1. March,1984,,40.0,Eeva Oivo,(Feuille de 40 timbres + 10 vignettes),14,...,,(1 mill),2.5,-,2.5,-,EUR,8938,Aaland,Postage stamps


In [31]:
# Preview the last 10 rows of the scraped image-URL table.
dfImageLinks.tail(10)

Unnamed: 0,GroupID,Country,Url
42,266841,Aaland,https://www.stampworld.com/media/catalogue/Aal...
43,266841,Aaland,https://www.stampworld.com/media/catalogue/Aal...
44,266841,Aaland,https://www.stampworld.com/media/catalogue/Aal...
45,8936,Aaland,https://www.stampworld.com/media/catalogue/Aal...
46,8936,Aaland,https://www.stampworld.com/media/catalogue/Aal...
47,8936,Aaland,https://www.stampworld.com/media/catalogue/Aal...
48,9870,Aaland,https://www.stampworld.com/media/catalogue/Aal...
49,9870,Aaland,https://www.stampworld.com/media/catalogue/Aal...
50,9870,Aaland,https://www.stampworld.com/media/catalogue/Aal...
51,8938,Aaland,https://www.stampworld.com/media/catalogue/Aal...
