# Limitless Data Extraction

In [1]:
import requests
import bs4 as bs4
import json
import time

from random import choice

## Configuration and Shared State

In [2]:
# Root of the site being scraped; relative links found on pages are appended to it.
BASE_URL = "https://pocket.limitlesstcg.com"

# Accumulators filled by the cells below:
#   set_list  - one {"url", "name"} record per set
#   set_keys  - set URLs already recorded (used to skip duplicates)
#   card_list - one entry per card link found on a set page
#   dataset   - fully parsed card records
set_list, set_keys, card_list, dataset = [], [], [], []

# A single session is shared by every request so the custom User-Agent
# header below is sent with each of them.
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"

## Extracting Set URLs

In [3]:
# Download the set index page and record every set link (URL + display name)
# into set_list, using set_keys to skip URLs that were already recorded.
response = session.get(f"{BASE_URL}/cards")
if response.status_code != 200:
    # Raise instead of calling exit(1): exit() is a helper for the interactive
    # shell, whereas an exception stops this cell (and "Run All") with a
    # visible traceback.
    raise RuntimeError(
        f"Failed to retrieve sets data from {BASE_URL}. Status code: {response.status_code}"
    )

content = bs4.BeautifulSoup(response.content, "html.parser")

table = content.find("table", {"class": "sets-table"})
if table is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise RuntimeError(f"No 'sets-table' table found on {BASE_URL}/cards.")

# `set_link` (not `set`) so the builtin `set` is not shadowed for later cells.
for set_link in table.find_all("a"):
    href = set_link.get("href")
    if not href:
        # An anchor without href would otherwise yield a URL ending in "None".
        continue

    record = {
        "url": f"{BASE_URL}{href}",
        # The link text spans several lines; join the trimmed lines with spaces.
        "name": " ".join([x.strip() for x in set_link.text.strip().split("\n")]),
    }

    if record["url"] not in set_keys:
        set_keys.append(record["url"])
        set_list.append(record)

print(f"Found {len(set_list)} sets.")

Found 8 sets.


## Extracting Cards from Sets

In [4]:
# Visit every set page and collect one entry per card link into card_list.

# Pause between set pages, in seconds.
SET_PAGE_DELAY_SECONDS = 20

# Start from an empty list so re-running this cell does not append duplicates.
card_list = []

for set_record in set_list:
    response = session.get(set_record["url"])
    if response.status_code == 200:
        content = bs4.BeautifulSoup(response.content, "html.parser")
        grid = content.find("div", {"class": "card-search-grid"})
        # find() returns None when the grid is absent; treat that as "no cards".
        cards = grid.find_all("a") if grid is not None else []

        retrieved = 0
        for row in cards:
            href = row.get("href")
            if not href:
                # Without an href there is neither a card URL nor a key.
                continue
            img = row.img.get("src") if row.img is not None else None
            card_list.append(
                {
                    # The whole set record (url + name) is stored with each card.
                    "set": set_record,
                    "img": img,
                    "card": f"{BASE_URL}{href}",
                    "key": href,
                }
            )
            retrieved += 1

        print(f"Retrieved {retrieved} cards from {set_record['name']}")

    else:
        print(
            f"Failed to retrieve cards data from set {set_record['name']}. "
            f"Status code: {response.status_code}"
        )

    time.sleep(SET_PAGE_DELAY_SECONDS)

Retrieved 103 cards from Extradimensional Crisis A3a
Retrieved 239 cards from Celestial Guardians A3
Retrieved 111 cards from Shining Revelry A2b
Retrieved 96 cards from Triumphant Light A2a
Retrieved 207 cards from Space-Time Smackdown A2
Retrieved 86 cards from Mythical Island A1a
Retrieved 286 cards from Genetic Apex A1
Retrieved 83 cards from Promo-A P-A


## Extracting Card Details

In [5]:
# Download every card page listed in card_list, parse it, and collect the
# resulting records into dataset. Parsing is split into small helpers so each
# field's rules can be read (and adjusted) on their own.


def _split_title(text):
    """Split a card title line into ``(name, type, hp)``.

    The two rightmost dashes are used as separators so hyphenated names
    (e.g. "Ho-Oh") stay intact. If the text does not yield three fields whose
    last one mentions "HP", the whole text is returned as the name and type/hp
    are ``None``.
    """
    fields = [field.strip() for field in text.rsplit("-", 2)]
    if len(fields) == 3 and "HP" in fields[2]:
        return fields[0], fields[1], fields[2]
    return text.strip(), None, None


def _split_type_line(text):
    """Split a card type line into ``(card_type, stage, evolve)``.

    At most two splits are made from the left, so any dash inside the third
    field (e.g. a hyphenated name after "Evolves from") is preserved. Missing
    fields are returned as ``None``.
    """
    fields = [field.strip() for field in text.split("-", 2)]
    fields += [None] * (3 - len(fields))
    return tuple(fields)


def _is_damage(token):
    """Return True for a numeric token, optionally suffixed with +, x, × or -."""
    return token.rstrip("+x×-").isnumeric()


def _parse_ability(card):
    """Return ``{"name", "text"}`` for the card's ability, or ``None`` if it has none."""
    ability = card.find("div", {"class": "card-text-ability"})
    if ability is None:
        return None
    info = ability.find("p", {"class": "card-text-ability-info"})
    effect = ability.find("p", {"class": "card-text-ability-effect"})
    if info is None or effect is None:
        # Ability block present but incomplete: keep the entry, with empty fields.
        return {"name": None, "text": None}
    return {"name": info.text.strip(), "text": effect.text.strip()}


def _parse_attack(attack):
    """Return ``{"cost", "name", "damage", "effect"}`` for one attack block.

    NOTE(review): assumes the first whitespace-separated token of the info
    line is the energy cost - confirm against the page markup.
    """
    info = attack.find("p", {"class": "card-text-attack-info"})
    effect = attack.find("p", {"class": "card-text-attack-effect"})

    tokens = info.text.split() if info is not None else []
    cost = tokens[0] if tokens else None
    name_tokens = tokens[1:]

    # A trailing damage token ("40", "50+", "30x") is split off from the name.
    damage = None
    if name_tokens and _is_damage(name_tokens[-1]):
        damage = name_tokens.pop()

    return {
        "cost": cost,
        "name": " ".join(name_tokens),
        "damage": damage,
        "effect": effect.text.strip() if effect is not None else "",
    }


def _parse_extras(card):
    """Return ``(weakness, retreat_cost)`` from the first "card-text-wrr" block.

    The second and third lines of that block are used; either value is
    ``None`` when the block or the corresponding line is missing.
    """
    extras = card.find_all("p", {"class": "card-text-wrr"})
    if not extras:
        return None, None
    lines = [line.strip() for line in extras[0].text.split("\n")]
    weakness = lines[1].replace("Weakness: ", "") if len(lines) > 1 else None
    retreat = lines[2].replace("Retreat: ", "") if len(lines) > 2 else None
    return weakness, retreat


def parse_card(card, item):
    """Build the dataset record for one card.

    ``card`` is the "card-details-main" element of the card page and ``item``
    is the matching entry of card_list. Returns ``None`` when the title or
    type line is missing from the page.
    """
    title = card.find("p", {"class": "card-text-title"})
    type_line = card.find("p", {"class": "card-text-type"})
    if title is None or type_line is None:
        return None

    name, ptype, hp = _split_title(title.text)
    card_type, stage, evolve = _split_type_line(type_line.text)
    weakness, retreat_cost = _parse_extras(card)

    return {
        "id": item["key"].replace("/cards", ""),
        "name": name,
        "type": ptype,
        "hp": hp.replace("HP", "").strip() if hp else None,
        "card_type": card_type,
        "stage": stage,
        # The evolve text can span several lines; join the trimmed lines.
        "evolve": " ".join([x.strip() for x in evolve.split("\n")]) if evolve else None,
        "ability": _parse_ability(card),
        "attacks": [
            _parse_attack(attack)
            for attack in card.find_all("div", {"class": "card-text-attack"})
        ],
        "weakness": weakness,
        "retreat_cost": retreat_cost,
        "img": item["img"],
        "set": item["set"],
    }


# Start from an empty list so re-running this cell does not append duplicates.
dataset = []

# enumerate(start=1) keeps the progress count equal to the number of cards
# actually requested so far.
for processed, item in enumerate(card_list, start=1):
    response = session.get(item["card"])

    record = None
    if response.status_code == 200:
        card_page = bs4.BeautifulSoup(response.content, "html.parser")
        card = card_page.find("div", {"class": "card-details-main"})
        if card is not None:
            record = parse_card(card, item)

    if record is None:
        # Report and move on instead of aborting the whole scrape on one bad page.
        print(
            f"Skipping {item['card']}: page could not be parsed "
            f"(status code {response.status_code})."
        )
    else:
        dataset.append(record)

    if processed % 10 == 0:
        delayed = choice([1, 3, 5, 7])
        print(f"Processed {processed} cards, sleeping for {delayed} seconds to avoid rate limiting.")
        time.sleep(delayed)

Processed 10 cards, sleeping for 3 seconds to avoid rate limiting.
Processed 20 cards, sleeping for 5 seconds to avoid rate limiting.
Processed 30 cards, sleeping for 5 seconds to avoid rate limiting.
Processed 40 cards, sleeping for 1 seconds to avoid rate limiting.
Processed 50 cards, sleeping for 7 seconds to avoid rate limiting.
Processed 60 cards, sleeping for 7 seconds to avoid rate limiting.
Processed 70 cards, sleeping for 5 seconds to avoid rate limiting.
Processed 80 cards, sleeping for 3 seconds to avoid rate limiting.
Processed 90 cards, sleeping for 1 seconds to avoid rate limiting.
Processed 100 cards, sleeping for 1 seconds to avoid rate limiting.
Processed 110 cards, sleeping for 3 seconds to avoid rate limiting.
Processed 120 cards, sleeping for 5 seconds to avoid rate limiting.
Processed 130 cards, sleeping for 7 seconds to avoid rate limiting.
Processed 140 cards, sleeping for 1 seconds to avoid rate limiting.
Processed 150 cards, sleeping for 3 seconds to avoid rate

In [None]:
# Write the collected card records to disk as indented JSON.
output_path = "../data/cards.json"

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=4)

# Report the path that was actually written (the previous message named a
# different file).
print(f"Data saved to {output_path}")
print(f"Total cards retrieved: {len(dataset)}")

Data saved to limitless_cards.json
Total cards retrieved: 1211
