In [1]:
import os  
import requests
import numpy as np
import pandas as pd
import regex
import json
from dotenv import load_dotenv
from time import sleep

load_dotenv()


True

In [2]:
# API configuration. The key is read from the environment so it never has to be
# written into the notebook itself.
API_KEY = os.getenv("JUSTTCG_API_KEY")
BASE_URL = "https://api.justtcg.com/v1"
HEADERS = {"x-api-key": API_KEY}
# Report only whether a key was found: printing the key would store the secret in
# the notebook's saved output.
print("JUSTTCG_API_KEY loaded" if API_KEY else "JUSTTCG_API_KEY is not set")

tcg_[REDACTED]


In [3]:
# Connection settings for the JustTCG API; the key is taken from the environment.
BASE_URL = "https://api.justtcg.com/v1"
API_KEY = os.getenv("JUSTTCG_API_KEY")
HEADERS = {"x-api-key": API_KEY}

# Games to download. Each entry is sent to the API as the `game` parameter and,
# lower-cased, selects the local '<game>_sets_current.json' file.
games_to_pull_from = [
    'Pokemon', 'Disney Lorcana', 'mtg',
    'YuGiOh', 'one-piece-card-game', 'digimon-card-game',
]

def get_current_year_set_name(game: str, min_year: int = 2024):
    """Return (set name, set id) pairs for a game's recently released sets.

    Reads '<game lower-cased>_sets_current.json' from the working directory, takes
    the list stored under its 'data' key, and keeps the sets whose 'release_date'
    year is at least ``min_year``, newest release first.

    Args:
        game: Game name; lower-cased to build the JSON file name.
        min_year: Earliest release year to keep. Defaults to 2024, the cut-off
            that used to be hard-coded.

    Returns:
        List of ``(name, id)`` tuples ordered by release date, newest first.
        Empty when the file lists no sets or none has a usable release date.

    Raises:
        FileNotFoundError: If the sets file for ``game`` does not exist.
        KeyError: If the file has no top-level 'data' key, or the set records
            lack 'name' / 'id'.
    """
    sets_file_path = f'{game.lower()}_sets_current.json'
    with open(sets_file_path, 'r', encoding='utf-8') as file:
        set_records = json.load(file)['data']

    sets_df = pd.json_normalize(set_records)
    # An empty 'data' list produces a frame with no columns; return early rather
    # than raising KeyError on 'release_date'.
    if sets_df.empty or 'release_date' not in sets_df.columns:
        return []

    dated_sets = (
        sets_df
        .dropna(subset=['release_date'])
        .sort_values(by='release_date', ascending=False)
    )
    # The year is the run of digits before the first '-' of the release date.
    # Values that do not start with digits become NaN and are filtered out below.
    release_year = pd.to_numeric(
        dated_sets['release_date'].astype(str).str.extract(r'^(\d+)', expand=False),
        errors='coerce',
    )
    recent_sets = dated_sets.loc[release_year >= min_year]
    return list(zip(recent_sets['name'].tolist(), recent_sets['id'].tolist()))

def download_cards_for_set(game: str, set_name: str, set_id: str, limit: int = 100, time_period: str = None):
    """Fetch one page of cards for a set from the `/cards` endpoint.

    Args:
        game: Game name, sent as the `game` query parameter.
        set_name: Set name (accepted for symmetry with callers; not sent to the API).
        set_id: Set ID, sent as the `set` query parameter.
        limit: Maximum number of cards to request.
        time_period: Statistics window ('7d', '30d', '90d', '1y'); when given it is
            sent as `include_statistics`. Omitted from the request when falsy.

    Returns:
        The decoded JSON response, or None when the request fails, the body
        cannot be decoded, or the response carries no cards. Failures are printed,
        never raised.
    """
    url = f"{BASE_URL}/cards"

    query = {
        "game": game,
        "set": set_id,
        "limit": limit,
        "include_price_history": True,
    }
    if time_period:
        query["include_statistics"] = time_period

    try:
        resp = requests.get(url, headers=HEADERS, params=query, timeout=30)

        # Show what the server said before raise_for_status() turns it into an exception
        if resp.status_code != 200:
            print(f"Status code: {resp.status_code}")
            print(f"    Response: {resp.text[:200]}")
        resp.raise_for_status()

        payload = resp.json()
        if payload.get('data'):
            return payload

        # Successful response but nothing in the 'data' array
        print("No cards returned (empty data array)")
        return None
    except requests.exceptions.Timeout:
        print("Timeout error - API took too long to respond")
        return None
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}")
        failed_response = getattr(e, 'response', None)
        if failed_response is not None:
            print(f"Response text: {failed_response.text[:200]}")
        return None
    except json.JSONDecodeError as e:
        print(f"JSON decode error: {e}")
        return None
    except Exception as e:
        print(f" Unexpected error: {e}")
        return None

def download_cards_all_timeframes(game: str, set_name: str, set_id: str, limit: int = 100):
    """Fetch a set's cards once per statistics window and collect the responses.

    One request is made for each of '7d', '30d', '90d' and '1y', with a 0.5 s
    pause after every request.

    Returns:
        Dict mapping each timeframe that succeeded to its raw response, or None
        when every request failed.
    """
    periods = ['7d', '30d', '90d', '1y']
    print(f"    Fetching data for timeframes: {', '.join(periods)}")

    results_by_period = {}
    for period in periods:
        print(f"      - {period}...", end=" ")
        payload = download_cards_for_set(game, set_name, set_id, limit, time_period=period)

        if not payload:
            print("failed")
        else:
            results_by_period[period] = payload
            print(f"({len(payload.get('data', []))} cards)")

        # Pause between requests to stay clear of the API rate limit
        sleep(0.5)

    return results_by_period or None

def save_raw_json(data, game: str, set_name: str, output_dir: str = "raw_card_data"):
    """Write a raw API response to '<output_dir>/<game>_<set_name>.json'.

    Both name parts are sanitised the same way, so a ':' or path separator in
    either one cannot produce an invalid or nested path. An empty part is left
    out, so no dangling '_' appears in the file name.

    Args:
        data: JSON-serialisable object to store.
        game: First part of the file name.
        set_name: Second part of the file name; may be empty.
        output_dir: Directory to write into; created if missing.

    Returns:
        Path of the file that was written.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Path separators plus the characters Windows rejects in file names.
    unsafe_chars = str.maketrans({ch: '-' for ch in '<>:"/\\|?*'})
    name_parts = [part.translate(unsafe_chars) for part in (game, set_name) if part]

    filepath = os.path.join(output_dir, "_".join(name_parts) + ".json")

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)

    return filepath

def _timeframe_columns(variant, timeframe: str) -> dict:
    """Build the ten '<timeframe>_*' statistic columns for one variant payload.

    ``variant`` may be None (that timeframe was not downloaded for the variant);
    every column is then None, so all rows share the same set of keys.
    """
    stats = variant or {}
    prefix = f'{timeframe}_'
    return {
        f'{prefix}priceChange': stats.get(f'priceChange{timeframe}'),
        f'{prefix}avgPrice': stats.get(f'avgPrice{timeframe}'),
        f'{prefix}minPrice': stats.get(f'minPrice{timeframe}'),
        f'{prefix}maxPrice': stats.get(f'maxPrice{timeframe}'),
        f'{prefix}stddevPopPrice': stats.get(f'stddevPopPrice{timeframe}'),
        f'{prefix}covPrice': stats.get(f'covPrice{timeframe}'),
        f'{prefix}iqrPrice': stats.get(f'iqrPrice{timeframe}'),
        f'{prefix}trendSlope': stats.get(f'trendSlope{timeframe}'),
        f'{prefix}priceChangesCount': stats.get(f'priceChangesCount{timeframe}'),
        f'{prefix}priceRelativeToRange': stats.get(f'priceRelativeTo{timeframe}Range'),
    }


def flatten_card_data_multi_timeframe(combined_data: dict, game: str, set_name: str):
    """Flatten per-timeframe card responses into one row per card variant.

    Args:
        combined_data: Dict keyed by timeframe ('7d', '30d', '90d', '1y') whose
            values are raw responses holding a 'data' list of cards, each with a
            'variants' list.
        game: Game name copied into every row.
        set_name: Set name copied into every row.

    Returns:
        List of dicts, one per distinct variant id. Each row holds the card
        fields, the variant fields, ten '<timeframe>_*' statistics for each of
        the four timeframes (None where that timeframe is missing), and the
        all-time min/max fields read from the '1y' response (None without it).
        Card and variant fields come from the first timeframe, in
        ``combined_data`` order, in which the variant appears.
    """
    if not combined_data:
        return []

    # variant id -> {'card_info': owning card, 'timeframes': {timeframe: variant}}
    variant_lookup = {}
    for timeframe, payload in combined_data.items():
        # `or` guards: tolerate a null payload, a null 'data' and a null 'variants'
        for card in (payload or {}).get('data') or []:
            for variant in card.get('variants') or []:
                entry = variant_lookup.setdefault(
                    variant.get('id'), {'card_info': card, 'timeframes': {}}
                )
                entry['timeframes'][timeframe] = variant

    flattened_rows = []
    for variant_id, entry in variant_lookup.items():
        card = entry['card_info']
        per_timeframe = entry['timeframes']
        # Variant attributes are taken from whichever timeframe was seen first
        first_variant = next(iter(per_timeframe.values()))

        # Key order below determines the column order of the resulting DataFrame
        row = {
            'game': game,
            'set_name': set_name,
            'card_id': card.get('id'),
            'card_name': card.get('name'),
            'number': card.get('number'),
            'rarity': card.get('rarity'),
            'set': card.get('set'),
            'set_name_api': card.get('set_name'),
            'tcgplayerId': card.get('tcgplayerId'),
            'details': card.get('details'),
            'variant_id': variant_id,
            'condition': first_variant.get('condition'),
            'printing': first_variant.get('printing'),
            'language': first_variant.get('language'),
            'tcgplayerSkuId': first_variant.get('tcgplayerSkuId'),
            'price': first_variant.get('price'),
            'lastUpdated': first_variant.get('lastUpdated'),
            'priceChange24hr': first_variant.get('priceChange24hr'),
        }

        for timeframe in ('7d', '30d', '90d', '1y'):
            row.update(_timeframe_columns(per_timeframe.get(timeframe), timeframe))

        # All-time extremes are read from the 1y response only
        all_time_source = per_timeframe.get('1y') or {}
        row.update({
            'minPriceAllTime': all_time_source.get('minPriceAllTime'),
            'minPriceAllTimeDate': all_time_source.get('minPriceAllTimeDate'),
            'maxPriceAllTime': all_time_source.get('maxPriceAllTime'),
            'maxPriceAllTimeDate': all_time_source.get('maxPriceAllTimeDate'),
        })

        flattened_rows.append(row)

    return flattened_rows

def download_all_cards(games_list, cards_per_set: int = 100, output_dir: str = "raw_card_data", save_interval: int = 5, csv_filename: str = "all_cards_master_data.csv"):
    """Download cards for every set of every game, save the raw JSON and build a master DataFrame.

    For each game the set list comes from get_current_year_set_name(); each set is
    downloaded for all timeframes, every raw response is written to
    '<output_dir>/<game>_<set name>_<timeframe>.json', and the flattened variant
    rows are accumulated. Errors for a set or a game are printed and skipped so
    one failure does not abort the run.

    Args:
        games_list: List of game names to download.
        cards_per_set: Number of cards to request per set.
        output_dir: Directory to save raw JSON files.
        save_interval: Write the rows collected so far to the CSV every N
            successful sets (prevents data loss). 0 or None disables the
            intermediate saves.
        csv_filename: Path of the master CSV.

    Returns:
        DataFrame with one row per card variant; an empty DataFrame when nothing
        was downloaded.
    """
    all_card_rows = []
    total_sets = 0
    successful_downloads = 0

    print("Starting card data download...")
    print("="*60)

    for game in games_list:
        print(f"\nProcessing game: {game}")
        print("-"*60)

        try:
            set_data = get_current_year_set_name(game)
            print(f"Found {len(set_data)} sets for {game}")

            for set_name, set_id in set_data:
                total_sets += 1
                print(f"\n  [{total_sets}] Downloading: {set_name} (ID: {set_id})")

                try:
                    # Download card data for all timeframes
                    combined_data = download_cards_all_timeframes(game, set_name, set_id, limit=cards_per_set)

                    if combined_data:
                        # Save raw JSON for each timeframe. The game and the set
                        # name go into their own slots so save_raw_json sanitises
                        # the set name (it may contain ':' or '/').
                        for timeframe, data in combined_data.items():
                            save_raw_json(data, game, f"{set_name}_{timeframe}", output_dir)

                        # Card count of the first timeframe that was downloaded
                        num_cards = len(next(iter(combined_data.values())).get('data', []))
                        print(f"Saved {num_cards} cards across {len(combined_data)} timeframes")

                        # Flatten and add to master list with all timeframe data
                        flattened = flatten_card_data_multi_timeframe(combined_data, game, set_name)
                        all_card_rows.extend(flattened)
                        successful_downloads += 1

                        print(f"Added {len(flattened)} variant rows with multi-timeframe data")

                        # Periodic save to prevent data loss; skipped when the
                        # interval is 0/None (also avoids a modulo-by-zero)
                        if save_interval and successful_downloads % save_interval == 0:
                            pd.DataFrame(all_card_rows).to_csv(csv_filename, index=False)
                            print(f"Progress saved ({successful_downloads} sets)")
                    else:
                        print("Failed to download data")

                except Exception as e:
                    print(f"Error downloading set: {e}")
                finally:
                    # Pause after every set, including failed ones, so an error
                    # does not cause the next request to be sent immediately
                    sleep(0.8)

        except Exception as e:
            print(f"Error processing game {game}: {e}")
            continue

    # Create final master DataFrame
    print("\n" + "="*60)
    print("Creating final DataFrame...")

    if not all_card_rows:
        print("No data was downloaded")
        return pd.DataFrame()

    df = pd.DataFrame(all_card_rows)

    # Save final CSV
    df.to_csv(csv_filename, index=False)

    print(f"\nMaster CSV saved: {csv_filename}")
    print(f"  Total rows: {len(df)}")
    print(f"  Total games: {len(games_list)}")
    print(f"  Sets processed: {successful_downloads}/{total_sets}")
    print(f"  Unique cards: {df['card_id'].nunique()}")

    return df

# Preview the download plan: list every (game, set) pair that will be fetched.
# download_cards_all_timeframes sends one request per timeframe (7d, 30d, 90d, 1y),
# so the number of API queries is four times the number of sets.
REQUESTS_PER_SET = 4

counter = 0  # number of sets across all games
for game in games_to_pull_from:
    set_data = get_current_year_set_name(game)
    for set_name, set_id in set_data:
        print(f'{game}: {set_name} ({set_id})')
        counter += 1

print(f'Total sets: {counter}')
print(f'Total queries needed: {counter * REQUESTS_PER_SET}')

Pokemon: ME02: Phantasmal Flames (me02-phantasmal-flames-pokemon)
Pokemon: ME: Mega Evolution Promo (me-mega-evolution-promo-pokemon)
Pokemon: ME01: Mega Evolution (me01-mega-evolution-pokemon)
Pokemon: MEE: Mega Evolution Energies (mee-mega-evolution-energies-pokemon)
Pokemon: SV: White Flare (sv-white-flare-pokemon)
Pokemon: SV: Black Bolt (sv-black-bolt-pokemon)
Pokemon: SV10: Destined Rivals (sv10-destined-rivals-pokemon)
Pokemon: SV09: Journey Together (sv09-journey-together-pokemon)
Pokemon: McDonald's Promos 2024 (mcdonald-s-promos-2024-pokemon)
Pokemon: SV: Prismatic Evolutions (sv-prismatic-evolutions-pokemon)
Pokemon: SV08: Surging Sparks (sv08-surging-sparks-pokemon)
Pokemon: SV07: Stellar Crown (sv07-stellar-crown-pokemon)
Pokemon: Trick or Trade BOOster Bundle 2024 (trick-or-trade-booster-bundle-2024-pokemon)
Pokemon: SV: Shrouded Fable (sv-shrouded-fable-pokemon)
Pokemon: Battle Academy 2024 (battle-academy-2024-pokemon)
Pokemon: SV06: Twilight Masquerade (sv06-twilight-m

In [4]:
# TEST RUN: download a single game with a small per-set limit before the full run.
# Note: download_all_cards writes its CSV to "all_cards_master_data.csv" by default,
# so this trial overwrites any master CSV already in the working directory.
test_games = ['digimon-card-game']  # one game only, to keep the trial short
test_df = download_all_cards(test_games, cards_per_set=20)  # request at most 20 cards per set
print("\nTest completed! If this worked, proceed with full download.")

Starting card data download...

Processing game: digimon-card-game
------------------------------------------------------------
Found 33 sets for digimon-card-game

  [1] Downloading: Starter Deck 22: Amethyst Mandala Advanced Deck Set (ID: starter-deck-22-amethyst-mandala-advanced-deck-set-digimon-card-game)
    Fetching data for timeframes: 7d, 30d, 90d, 1y
      - 7d... (16 cards)
      - 30d... (16 cards)
      - 90d... (16 cards)
      - 1y... (16 cards)
Saved 16 cards across 4 timeframes
Added 16 variant rows with multi-timeframe data

  [2] Downloading: Limited Card Pack -Billion Bullet- (ID: limited-card-pack-billion-bullet-digimon-card-game)
    Fetching data for timeframes: 7d, 30d, 90d, 1y
      - 7d... (20 cards)
      - 30d... (20 cards)
      - 90d... (20 cards)
      - 1y... (20 cards)
Saved 20 cards across 4 timeframes
Added 25 variant rows with multi-timeframe data

  [3] Downloading: Hackers' Slumber (ID: hackers-slumber-digimon-card-game)
    Fetching data for timefr

In [None]:
# Download all cards and create master dataset
# (one download_all_cards run over every game in games_to_pull_from, requesting 100 cards per set)

cards_df = download_all_cards(games_to_pull_from, cards_per_set=100)

Starting card data download...

Processing game: Pokemon
------------------------------------------------------------
Found 18 sets for Pokemon

  [1] Downloading: ME02: Phantasmal Flames (ID: me02-phantasmal-flames-pokemon)
    Fetching data for timeframes: 7d, 30d, 90d, 1y
      - 7d... (100 cards)
      - 30d... (100 cards)
      - 90d... (100 cards)
      - 1y... (100 cards)
Saved 100 cards across 4 timeframes
Added 295 variant rows with multi-timeframe data

  [2] Downloading: ME: Mega Evolution Promo (ID: me-mega-evolution-promo-pokemon)
    Fetching data for timeframes: 7d, 30d, 90d, 1y
      - 7d... (38 cards)
      - 30d... (38 cards)
      - 90d... (38 cards)
      - 1y... 

In [None]:
# DIAGNOSTIC: Test API connection and check what's failing
print("=== API DIAGNOSTICS ===\n")

# Check API key. Only the first four characters and the length are shown: echoing
# the whole key would store the secret in the notebook's saved output.
print(f"1. API Key loaded: {'Yes' if API_KEY else 'No'}")
if API_KEY:
    print(f"Key preview: {API_KEY[:4]}... ({len(API_KEY)} characters)")

# Test basic connection
print("\n2. Testing basic API connection...")
try:
    test_response = requests.get(f"{BASE_URL}/sets", headers=HEADERS, params={"game": "Pokemon"}, timeout=10)
    print(f"   Status: {test_response.status_code}")
    if test_response.status_code == 200:
        print("API connection working")
    else:
        print(f"API returned error: {test_response.text[:200]}")
except Exception as e:
    print(f"Connection failed: {e}")

# Test card download for one set
print("\n3. Testing card download for a single set...")
try:
    # Get first Pokemon set (read from the local pokemon_sets_current.json file)
    set_data = get_current_year_set_name('Pokemon')
    if set_data:
        test_set_name, test_set_id = set_data[0]
        print(f"Testing with: {test_set_name} (ID: {test_set_id})")

        test_cards = download_cards_for_set('Pokemon', test_set_name, test_set_id, limit=5)
        if test_cards:
            print(f"Successfully downloaded {len(test_cards.get('data', []))} cards")

            # Check API limits reported in the response's '_metadata' section
            meta = test_cards.get('_metadata', {})
            if meta:
                print(f"\n4. API Usage Stats:")
                print(f"   Requests remaining: {meta.get('apiRequestsRemaining', 'N/A')}")
                print(f"   Daily requests remaining: {meta.get('apiDailyRequestsRemaining', 'N/A')}")
                print(f"   Rate limit: {meta.get('apiRateLimit', 'N/A')} requests/second")
                print(f"   Plan: {meta.get('apiPlan', 'N/A')}")
        else:
            print("Card download failed")
    else:
        print("No sets found")
except Exception as e:
    print(f"Test failed: {e}")
    import traceback
    traceback.print_exc()

print("\n" + "="*60)

=== API DIAGNOSTICS ===

1. API Key loaded: ✓ Yes
Key preview: tcg_[REDACTED]...

2. Testing basic API connection...
   Status: 200
   ✓ API connection working

3. Testing card download for a single set...
   Testing with: ME02: Phantasmal Flames (ID: me02-phantasmal-flames-pokemon)
   Status: 200
   ✓ API connection working

3. Testing card download for a single set...
   Testing with: ME02: Phantasmal Flames (ID: me02-phantasmal-flames-pokemon)
   ✓ Successfully downloaded 5 cards

4. API Usage Stats:
   Requests remaining: 9998
   Daily requests remaining: 998
   Rate limit: 50 requests/second
   Plan: Starter Plan

   ✓ Successfully downloaded 5 cards

4. API Usage Stats:
   Requests remaining: 9998
   Daily requests remaining: 998
   Rate limit: 50 requests/second
   Plan: Starter Plan



In [None]:
# Preview the master dataset
if 'cards_df' in locals():
    # Header followed by the basic shape/column/game summary
    print("Dataset Overview:\n" + "="*60)
    print(f"Shape: {cards_df.shape}")
    print(f"\nColumns: {list(cards_df.columns)}")
    print(f"\nGames included: {cards_df['game'].unique().tolist()}")

    # Rich display of the first rows
    print("\nFirst few rows:")
    display(cards_df.head())

    # Column dtypes, then the number of distinct cards per game
    print("\nData types:", cards_df.dtypes, sep="\n")
    print("\nCards per game:", cards_df.groupby('game')['card_id'].nunique(), sep="\n")

Dataset Overview:
Shape: (48112, 28)

Columns: ['game', 'set_name', 'card_id', 'card_name', 'number', 'rarity', 'set', 'set_name_api', 'tcgplayerId', 'details', 'variant_id', 'condition', 'printing', 'language', 'tcgplayerSkuId', 'price', 'lastUpdated', 'priceChange24hr', 'priceChange30d', 'avgPrice30d', 'minPrice30d', 'maxPrice30d', 'stddevPopPrice30d', 'covPrice30d', 'iqrPrice30d', 'trendSlope30d', 'priceChangesCount30d', 'priceRelativeTo30dRange']

Games included: ['Pokemon', 'Disney Lorcana', 'mtg', 'YuGiOh', 'one-piece-card-game', 'digimon-card-game']

First few rows:


Unnamed: 0,game,set_name,card_id,card_name,number,rarity,set,set_name_api,tcgplayerId,details,...,priceChange30d,avgPrice30d,minPrice30d,maxPrice30d,stddevPopPrice30d,covPrice30d,iqrPrice30d,trendSlope30d,priceChangesCount30d,priceRelativeTo30dRange
0,Pokemon,ME02: Phantasmal Flames,pokemon-me02-phantasmal-flames-phantasmal-flam...,Phantasmal Flames Booster Box Case,,,me02-phantasmal-flames-pokemon,ME02: Phantasmal Flames,655281,,...,-18.66,1948.94,1702.81,2094.23,137.24,0.070418,261.25,-14.570886,47,0.001456
1,Pokemon,ME02: Phantasmal Flames,pokemon-me02-phantasmal-flames-phantasmal-flam...,Phantasmal Flames Booster Bundle Case,,,me02-phantasmal-flames-pokemon,ME02: Phantasmal Flames,654162,,...,-21.01,1850.14,1655.9,2096.41,91.29,0.049344,84.19,-8.934438,12,0.0
2,Pokemon,ME02: Phantasmal Flames,pokemon-me02-phantasmal-flames-phantasmal-flam...,Phantasmal Flames Elite Trainer Box Case,,,me02-phantasmal-flames-pokemon,ME02: Phantasmal Flames,654170,,...,-18.68,1623.47,1402.67,1724.96,116.2,0.071577,209.64,-11.873617,7,0.0
3,Pokemon,ME02: Phantasmal Flames,pokemon-me02-phantasmal-flames-phantasmal-flam...,Phantasmal Flames Sleeved Booster Case,,,me02-phantasmal-flames-pokemon,ME02: Phantasmal Flames,655282,,...,-7.78,1385.99,1291.09,1437.37,39.91,0.028799,0.0,-2.619082,11,0.0
4,Pokemon,ME02: Phantasmal Flames,pokemon-me02-phantasmal-flames-mega-charizard-...,Mega Charizard X ex - 125/094,125/094,Special Illustration Rare,me02-phantasmal-flames-pokemon,ME02: Phantasmal Flames,662184,,...,-7.8,838.82,731.7,967.72,88.35,0.105332,171.38,-38.402264,13,0.220236



Data types:
game                        object
set_name                    object
card_id                     object
card_name                   object
number                      object
rarity                      object
set                         object
set_name_api                object
tcgplayerId                 object
details                     object
variant_id                  object
condition                   object
printing                    object
language                    object
tcgplayerSkuId              object
price                      float64
lastUpdated                  int64
priceChange24hr            float64
priceChange30d             float64
avgPrice30d                float64
minPrice30d                float64
maxPrice30d                float64
stddevPopPrice30d          float64
covPrice30d                float64
iqrPrice30d                float64
trendSlope30d              float64
priceChangesCount30d         int64
priceRelativeTo30dRange    float64
dtype: 

In [None]:
# List the distinct values of the 'condition' column in the downloaded data
cards_df['condition'].unique()

array(['Sealed', 'Near Mint', 'Damaged', 'Lightly Played',
       'Moderately Played', 'Heavily Played'], dtype=object)