# Find the cards

In [3]:
# Uncomment to install Beautiful Soup into the kernel's environment:
# %pip install beautifulsoup4

In [66]:
import os
import pickle
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [38]:
# Read every saved set page found in sets/ into memory.
# contents maps the set name (file name without its extension) to the text of
# the file.
contents = {}
for set_file in os.listdir('sets/'):
    set_path = os.path.join('sets', set_file)
    # Skip sub-directories (e.g. .ipynb_checkpoints): open() would fail on them.
    if not os.path.isfile(set_path):
        continue
    # splitext removes the extension whatever its length, instead of assuming
    # it is exactly four characters long.
    set_name = os.path.splitext(set_file)[0]
    with open(set_path) as f:
        # Lines keep their trailing newline and are joined with a single space.
        contents[set_name] = ' '.join(f)

In [45]:
# Build, for every set, the list of its cards as {"name", "url"} dicts, read
# from the rows of the first table of the saved set page.
# Rows that cannot be parsed are collected in errors as
# (row HTML, error message) tuples instead of stopping the loop.
set_cards = {}
errors = []
for set_name in contents:
    set_cards[set_name] = []
    # Name the parser explicitly, as the later cells do, so the result does not
    # depend on which optional parsers are installed.
    soup = BeautifulSoup(contents[set_name], 'html.parser')
    table = soup.find('table')
    if table is None:
        # Nothing to extract from this page; record it and move on.
        errors.append((set_name, 'no <table> found'))
        continue
    # The markup may not contain an explicit <tbody>; fall back to the table.
    table_body = table.find('tbody')
    rows = (table if table_body is None else table_body).find_all('tr')
    for row in rows:
        try:
            # The card link is the first anchor of the third cell of the row.
            anchor = row.find_all('td')[2].find('a')
            card_name = anchor['title']
            card_url = 'https://bulbapedia.bulbagarden.net' + anchor['href']
            set_cards[set_name].append({"name": card_name, "url": card_url})
        except (IndexError, KeyError, TypeError) as exc:
            # IndexError: fewer than three cells; TypeError: no anchor in the
            # cell; KeyError: anchor without a title or href attribute.
            errors.append((str(row), str(exc)))

# Get the image URLs

In [61]:
# Set name (as used for the keys of set_cards) -> tag that is searched for in
# the href of an image link to recognise the card scan of that set.
set_codes = dict(
    base_set='BaseSet',
    base_set_2='BaseSet',
    base_set_expansion='BaseSet',
    fossil='Fossil',
    jungle='Jungle',
    team_rocket='TeamRocket',
)

In [93]:
# For every card, fetch its Bulbapedia page and collect the links to its image
# pages: the card scan for this set plus the TCG1 and TCG2 images.
# card_images maps set name -> card name -> (card_link, tcg1_link, tcg2_link),
# where a missing link is the empty string. Cards with neither a TCG1 nor a
# TCG2 link, or whose page could not be fetched, go to not_found instead.
base_url = 'https://bulbapedia.bulbagarden.net'
request_timeout = 30  # seconds; without a timeout a stalled request blocks the cell
card_images = {}
not_found = []
with requests.Session() as session:  # one connection pool for all page requests
    for set_name in set_cards:
        print(set_name)
        set_code = set_codes[set_name]
        card_images[set_name] = {}

        for card in tqdm(set_cards[set_name]):
            # Links are matched on the first word of the card name only.
            pokemon = card['name'].split(' ')[0]

            try:
                response = session.get(card['url'], timeout=request_timeout)
                response.raise_for_status()
            except requests.RequestException:
                # A failed page must not abort the crawl and lose all progress.
                not_found.append(card)
                continue
            soup = BeautifulSoup(response.text, 'html.parser')

            card_link, tcg1_link, tcg2_link = '', '', ''
            for link in soup.find_all('a'):
                # Only anchors that wrap an image are candidates.
                if link.find('img') is None:
                    continue
                # Anchors without an href are ignored rather than raising KeyError.
                href = link.get('href', '')
                if pokemon not in href:
                    continue
                if set_code in href and card_link == '':
                    # Keep the first card scan found for this set.
                    card_link = base_url + href
                elif 'TCG1' in href:
                    tcg1_link = base_url + href
                elif 'TCG2' in href:
                    tcg2_link = base_url + href

            if tcg1_link or tcg2_link:
                card_images[set_name][card['name']] = (card_link, tcg1_link, tcg2_link)
            else:
                not_found.append(card)

fossil


  0%|          | 0/62 [00:00<?, ?it/s]

base_set


  0%|          | 0/102 [00:00<?, ?it/s]

base_set_expansion


  0%|          | 0/102 [00:00<?, ?it/s]

base_set_2


  0%|          | 0/130 [00:00<?, ?it/s]

jungle


  0%|          | 0/64 [00:00<?, ?it/s]

team_rocket


  0%|          | 0/83 [00:00<?, ?it/s]

In [98]:
# Show the collected links: set name -> card name -> (card_link, tcg1_link, tcg2_link).
card_images

{'fossil': {'Aerodactyl (Fossil 1)': ('https://bulbapedia.bulbagarden.net/wiki/File:AerodactylFossil16.jpg',
   'https://bulbapedia.bulbagarden.net/wiki/File:TCG1_C34_Aerodactyl.png',
   'https://bulbapedia.bulbagarden.net/wiki/File:TCG2_G33_Aerodactyl.png'),
  'Articuno (Fossil 2)': ('https://bulbapedia.bulbagarden.net/wiki/File:ArticunoFossil17.jpg',
   'https://bulbapedia.bulbagarden.net/wiki/File:TCG1_C21_Articuno.png',
   'https://bulbapedia.bulbagarden.net/wiki/File:TCG2_C29_Articuno.png'),
  'Dragonite (Fossil 4)': ('https://bulbapedia.bulbagarden.net/wiki/File:DragoniteFossil4.jpg',
   'https://bulbapedia.bulbagarden.net/wiki/File:TCG1_C46_Dragonite.png',
   'https://bulbapedia.bulbagarden.net/wiki/File:TCG2_B52_Dragonite.png'),
  'Gengar (Fossil 5)': ('https://bulbapedia.bulbagarden.net/wiki/File:GengarFossil5.jpg',
   'https://bulbapedia.bulbagarden.net/wiki/File:TCG1_B36_Gengar.png',
   'https://bulbapedia.bulbagarden.net/wiki/File:TCG2_C45_Gengar.png'),
  'Haunter (Fossil 6

In [94]:
# Report how many cards of each set ended up in card_images, then the overall
# total across all sets.
found_per_set = {name: len(card_images[name]) for name in set_cards}
for name, found in found_per_set.items():
    print(name, ':', found)
total = sum(found_per_set.values())
print('total', ':', total)

fossil : 59
base_set : 93
base_set_expansion : 94
base_set_2 : 119
jungle : 60
team_rocket : 78
total : 503


In [95]:
# Peek at the first three cards for which no TCG1/TCG2 image link was collected.
not_found[:3]

[{'name': 'Ditto (Fossil 3)',
  'url': 'https://bulbapedia.bulbagarden.net/wiki/Ditto_(Fossil_3)'},
 {'name': 'Ditto (Fossil 18)',
  'url': 'https://bulbapedia.bulbagarden.net/wiki/Ditto_(Fossil_18)'},
 {'name': 'Mr. Fuji (Fossil 58)',
  'url': 'https://bulbapedia.bulbagarden.net/wiki/Mr._Fuji_(Fossil_58)'}]

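Problem with special names: the lookup only uses the first word of the card name (e.g. `Mr.` for `Mr. Fuji (Fossil 58)`), so cards whose name has several words are not matched. Other misses, such as the Ditto cards, still need to be investigated. To be continued (TBC).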

# Download the images

In [99]:
# Download every image referenced in card_images into images/.
# cards_mapping maps set name -> card name -> {'artwork' | 'tcg1' | 'tcg2':
# local file path}; a key is only present when the file was downloaded.
# Downloads that fail are listed in failed_downloads as
# (set name, card name, key, reason) instead of stopping the loop.
keys = ['artwork', 'tcg1', 'tcg2']  # same order as the tuples stored in card_images
request_timeout = 30  # seconds; without a timeout a stalled request blocks the cell
os.makedirs('images', exist_ok=True)  # the target folder may not exist yet
cards_mapping = {}
failed_downloads = []
with requests.Session() as session:  # one connection pool for all requests
    for set_name in card_images:
        cards_mapping[set_name] = {}
        print(set_name)
        for card_name in tqdm(card_images[set_name]):
            cards_mapping[set_name][card_name] = {}
            for key, image in zip(keys, card_images[set_name][card_name]):
                if image == '':
                    continue
                try:
                    # image is the URL of a file page, not of the picture itself.
                    response = session.get(image, timeout=request_timeout)
                    response.raise_for_status()
                    soup = BeautifulSoup(response.text, 'html.parser')
                    # The first anchor wrapping an image is used as the file
                    # link (presumably the full-size picture).
                    image_anchors = [a for a in soup.find_all('a', href=True) if a.find('img')]
                    if not image_anchors:
                        failed_downloads.append((set_name, card_name, key, 'no image link on page'))
                        continue
                    # urljoin resolves protocol-relative, relative and absolute
                    # hrefs alike against the page URL.
                    link = urljoin(image, image_anchors[0]['href'])
                    download = session.get(link, timeout=request_timeout)
                    # Check the status so an error page is never saved as an image.
                    download.raise_for_status()
                except requests.RequestException as exc:
                    failed_downloads.append((set_name, card_name, key, str(exc)))
                    continue
                local_path = 'images/' + os.path.basename(link)
                with open(local_path, "wb") as f:
                    f.write(download.content)
                cards_mapping[set_name][card_name][key] = local_path

fossil


  0%|          | 0/59 [00:00<?, ?it/s]

base_set


  0%|          | 0/93 [00:00<?, ?it/s]

base_set_expansion


  0%|          | 0/94 [00:00<?, ?it/s]

base_set_2


  0%|          | 0/119 [00:00<?, ?it/s]

jungle


  0%|          | 0/60 [00:00<?, ?it/s]

team_rocket


  0%|          | 0/78 [00:00<?, ?it/s]

In [100]:
# Save cards_mapping to cards_mapping.pickle.
# The with-block closes the file once the dump is done; pickle is imported in
# the imports cell at the top of the notebook.
with open('cards_mapping.pickle', 'wb') as f:
    pickle.dump(cards_mapping, f)