# Exploratory Data Analysis

In [16]:
import requests
import json
from collections import Counter

In [2]:
url = "https://api.scryfall.com/bulk-data"
resp = requests.get(url)
print(resp)
j = resp.json()
print(j.keys())
for d in j["data"]:
    print(d)

<Response [200]>
dict_keys(['object', 'has_more', 'data'])
{'object': 'bulk_data', 'id': '27bf3214-1271-490b-bdfe-c0be6c23d02e', 'type': 'oracle_cards', 'updated_at': '2023-09-12T21:02:07.525+00:00', 'uri': 'https://api.scryfall.com/bulk-data/27bf3214-1271-490b-bdfe-c0be6c23d02e', 'name': 'Oracle Cards', 'description': 'A JSON file containing one Scryfall card object for each Oracle ID on Scryfall. The chosen sets for the cards are an attempt to return the most up-to-date recognizable version of the card.', 'size': 130951843, 'download_uri': 'https://data.scryfall.io/oracle-cards/oracle-cards-20230912210207.json', 'content_type': 'application/json', 'content_encoding': 'gzip'}
{'object': 'bulk_data', 'id': '6bbcf976-6369-4401-88fc-3a9e4984c305', 'type': 'unique_artwork', 'updated_at': '2023-09-12T21:03:37.489+00:00', 'uri': 'https://api.scryfall.com/bulk-data/6bbcf976-6369-4401-88fc-3a9e4984c305', 'name': 'Unique Artwork', 'description': 'A JSON file of Scryfall card objects that toget

In [3]:
download_uri = j["data"][3]["download_uri"]
print(download_uri)
resp = requests.get(download_uri)
print(resp)

https://data.scryfall.io/all-cards/all-cards-20230912212341.json
<Response [200]>


In [4]:
filename = download_uri.rsplit("/", 1)[-1]
with open(f"data/{filename}", "w", encoding="UTF-8") as f:
    json.dump(resp.json(), f)

In [5]:
with open(f"data/{filename}", "r", encoding="UTF-8") as f:
    cards = json.load(f)

In [107]:
def get_attribute(list_of_dicts: list, attribute_name: str) -> list:
    """Return a list containing the values of the attributes"""
    values = list()
    missing_count = 0
    # if not all([type(d) is dict for d in list_of_dicts]):
    #     print(type(list_of_dicts[0]))
    #     raise Exception
    for d in list_of_dicts:
        if attribute_name in d:
            values.append(d[attribute_name])
        else:
            missing_count += 1
    # print(f"{missing_count} cards were missing the attribute {attribute_name}.")
    return (values, missing_count)

In [32]:
def get_lengths(iterable: list) -> set:
    """Return a set containing all possible lengths for the values in iterable"""
    lengths = set()
    for e in iterable:
        lengths.add(len(e))
    return lengths

In [33]:
def get_types(iterable: list) -> set:
    """Return a set containing all possible data types for the values in iterable"""
    types = set()
    for e in iterable:
        types.add(type(e))
    return types

## Metadata

In [77]:
len(cards)

430878

In [78]:
attributes = set()
for card in cards:
    for key in card.keys():
        attributes.add(key)
attributes

{'all_parts',
 'arena_id',
 'artist',
 'artist_ids',
 'attraction_lights',
 'booster',
 'border_color',
 'card_back_id',
 'card_faces',
 'cardmarket_id',
 'cmc',
 'collector_number',
 'color_identity',
 'color_indicator',
 'colors',
 'digital',
 'edhrec_rank',
 'finishes',
 'flavor_name',
 'flavor_text',
 'foil',
 'frame',
 'frame_effects',
 'full_art',
 'games',
 'hand_modifier',
 'highres_image',
 'id',
 'illustration_id',
 'image_status',
 'image_uris',
 'keywords',
 'lang',
 'layout',
 'legalities',
 'life_modifier',
 'loyalty',
 'mana_cost',
 'mtgo_foil_id',
 'mtgo_id',
 'multiverse_ids',
 'name',
 'nonfoil',
 'object',
 'oracle_id',
 'oracle_text',
 'oversized',
 'penny_rank',
 'power',
 'preview',
 'prices',
 'printed_name',
 'printed_text',
 'printed_type_line',
 'prints_search_uri',
 'produced_mana',
 'promo',
 'promo_types',
 'purchase_uris',
 'rarity',
 'related_uris',
 'released_at',
 'reprint',
 'reserved',
 'rulings_uri',
 'scryfall_set_uri',
 'scryfall_uri',
 'security_s

In [79]:
print(len(attributes))

85


In [115]:
def get_metadata(list_of_objs: list, attributes: set):
    """Return a dictionary containing metadata about the list_of_objs passed to it"""
    metadata = dict()
    for attribute in attributes:
        data = dict()
        attribute_values, missing_count = get_attribute(list_of_objs, attribute)
        data["missing_count"] = missing_count
        types = [str(t) for t in get_types(attribute_values)]
        data["types"] = types
        if types == ["<class 'str'>"]:
            data["lengths"] = list(get_lengths(attribute_values))
        else:
            data["lengths"] = None
        try:
            distinct_count = len(set(attribute_values))
        except TypeError:
            distinct_count = None
        data["distinct_value_count"] = distinct_count
        metadata[attribute] = data
    return metadata

In [116]:
metadata = get_metadata(cards, attributes)
metadata

{'type_line': {'missing_count': 18,
  'types': ["<class 'str'>"],
  'lengths': [4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   40,
   41,
   42,
   43,
   44,
   45,
   46,
   47,
   48,
   49,
   50,
   51,
   52,
   53,
   54,
   55,
   56,
   57,
   58,
   59,
   60,
   61,
   62,
   63,
   64,
   65,
   66,
   67,
   68,
   69,
   70,
   71,
   72,
   73,
   74,
   79,
   84,
   86],
  'distinct_value_count': 3497},
 'id': {'missing_count': 0,
  'types': ["<class 'str'>"],
  'lengths': [36],
  'distinct_value_count': 430878},
 'component': {'missing_count': 430878,
  'types': [],
  'lengths': None,
  'distinct_value_count': 0},
 'object': {'missing_count': 0,
  'types': ["<class 'str'>"],
  'lengths': [4],
  'distinct_value_count': 1},
 'name': {'missing_count': 0,
  'types': ["<class '

In [117]:
with open("data/metadata.json", "w") as f:
    json.dump(metadata, f, indent=4)

Take the time to view the generated metadata.json file. It dictates the data types for the database.

## released_at

In [118]:
released_ats = get_attribute(cards, "released_at")
released_ats[:5]

(['2006-10-06',
  '2009-10-02',
  '2017-08-25',
  '2015-05-22',
  '2017-09-29',
  '2002-05-27',
  '1994-04-01',
  '2016-09-30',
  '2005-10-07',
  '2021-09-24',
  '2021-06-18',
  '2021-11-19',
  '2012-10-05',
  '2005-10-07',
  '2001-10-01',
  '2022-11-18',
  '2022-04-29',
  '2023-09-08',
  '2023-06-23',
  '2019-10-04',
  '2015-01-17',
  '2019-07-12',
  '2007-10-12',
  '2018-07-13',
  '2020-07-03',
  '2020-11-20',
  '2020-07-17',
  '2017-04-28',
  '1997-10-14',
  '2022-09-09',
  '2015-07-17',
  '2021-06-21',
  '2011-01-10',
  '2021-03-19',
  '2000-06-05',
  '2022-04-29',
  '2014-05-02',
  '2021-02-05',
  '2013-09-27',
  '2023-08-04',
  '2011-09-30',
  '2011-07-15',
  '2015-11-13',
  '2021-11-19',
  '1995-06-03',
  '2018-08-09',
  '2020-09-25',
  '2017-09-29',
  '2018-08-09',
  '1997-03-24',
  '2008-01-01',
  '2012-07-13',
  '2009-02-06',
  '1995-04-01',
  '1998-06-24',
  '2018-01-19',
  '2006-10-06',
  '2013-05-03',
  '2023-04-21',
  '2011-05-13',
  '1993-12-10',
  '2019-10-04',
  '2017-

The released_at field in the data is saved as a string but will be converted to a data in the database.

## collector_number

In [119]:
collector_numbers, _ = get_attribute(cards, "collector_number")
collector_numbers[:5]

['157', '21', '73', '5', '78']

It seems like the collector_number is usually an integer yet they were saved as strings. Why?

In [120]:
distinct_collector_numbers = set(collector_numbers)
distinct_collector_numbers

{'1437',
 '832',
 '81940',
 'et15sb',
 '867',
 '889',
 '1618',
 'et364',
 '98127',
 '910',
 'cr292',
 '160',
 '91263',
 '1464',
 '95411',
 '570',
 '80861',
 '107★',
 'ml24',
 '91353',
 '103406',
 '129★',
 'jk0b',
 '32559',
 '88',
 '218b',
 'shr103',
 '20p',
 'K25',
 '105666',
 '99731',
 '55898',
 '82d',
 '32571',
 '86064',
 '36298',
 '79963',
 '76★',
 '1668',
 'jf134',
 '36180',
 '91361',
 '86182',
 '334',
 '64420',
 '133',
 '99745',
 '1u',
 '32017',
 '17b',
 '105718',
 '39628',
 'js438',
 '875',
 '92838',
 '175z',
 '157',
 '118†',
 '86280',
 '105668',
 '202★',
 '72231',
 '99837',
 'J45',
 '81874',
 '67f',
 '232',
 '1433',
 '65795',
 '1320',
 '431',
 'A-130',
 'js0b',
 '2021-6',
 '230a',
 '310',
 '95379',
 '242★',
 '230f',
 'H40',
 '79895',
 'A-69',
 'bl328',
 '98055',
 '138★',
 '44315',
 '339',
 '78p',
 '103448',
 '88398',
 '57592',
 '83762',
 '204b',
 '79855',
 '72239',
 '245★',
 '70653',
 '246★',
 '93922',
 '97961',
 '33c',
 '13d',
 '58895',
 '204c',
 '195p',
 'B30',
 '89982',
 '49d

collector_number is not always an integer.

## life_modifier

In [121]:
life_modifers, _ = get_attribute(cards, "life_modifier")
print(len(life_modifers))
print(set(life_modifers))

119
{'+8', '+15', '+3', '-4', '-6', '-8', '+6', '+0', '+9', '+7', '+10', '+18', '+2', '+30', '+5', '+1', '-2', '+12', '+4', '-5', '-3', '-7', '-1'}


## cmc

In [122]:
cmcs, _ = get_attribute(cards, "cmc")
print(set(cmcs))

{0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 0.5, 1000000.0}


## mana_cost

In [123]:
mana_costs, _ = get_attribute(cards, "mana_cost")
for mana_cost in mana_costs:
    if len(mana_cost) == 46:   # from metadata.json
        print(mana_cost)

{X}{W} // {2}{R} // {2}{U} // {3}{B} // {1}{G}
{X}{W} // {2}{R} // {2}{U} // {3}{B} // {1}{G}


## frame

In [124]:
frames, _ = get_attribute(cards, "frame")
print(set(frames))

{'2015', '1997', '2003', '1993', 'future'}


## all_parts

In [125]:
all_parts, _ = get_attribute(cards, "all_parts")
all_parts[:5]   # Currently a list of lists of dictionaries

[[{'object': 'related_card',
   'id': '4d8542f6-ee34-42c6-acd5-07b0c7cc2f63',
   'component': 'combo_piece',
   'name': 'Funeral Pyre',
   'type_line': 'Instant',
   'uri': 'https://api.scryfall.com/cards/4d8542f6-ee34-42c6-acd5-07b0c7cc2f63'},
  {'object': 'related_card',
   'id': '66210a3f-010b-4a9b-a08f-97d3ca962b0c',
   'component': 'combo_piece',
   'name': 'Haunted Dead',
   'type_line': 'Creature — Zombie',
   'uri': 'https://api.scryfall.com/cards/66210a3f-010b-4a9b-a08f-97d3ca962b0c'},
  {'object': 'related_card',
   'id': 'd333e35c-ca90-4aaa-950a-48b5623c31a6',
   'component': 'combo_piece',
   'name': 'Blessed Defiance',
   'type_line': 'Instant',
   'uri': 'https://api.scryfall.com/cards/d333e35c-ca90-4aaa-950a-48b5623c31a6'},
  {'object': 'related_card',
   'id': 'f0ad0796-0357-4e74-9d65-c7761a3f223c',
   'component': 'combo_piece',
   'name': "Slayer's Plate",
   'type_line': 'Artifact — Equipment',
   'uri': 'https://api.scryfall.com/cards/f0ad0796-0357-4e74-9d65-c7761a3

In [126]:
get_types(all_parts)

{list}

In [127]:
get_lengths(all_parts)

{2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 23,
 24,
 25,
 26,
 29,
 30,
 32,
 33,
 34,
 36,
 37,
 38,
 39,
 42,
 43,
 45,
 47,
 50,
 53,
 55,
 60,
 65,
 68,
 70,
 80,
 90,
 95,
 97,
 191,
 223}

In [128]:
parts = list()
for part_grouping in all_parts:
    for part in part_grouping:
        parts.append(part)
parts[:5]

[{'object': 'related_card',
  'id': '4d8542f6-ee34-42c6-acd5-07b0c7cc2f63',
  'component': 'combo_piece',
  'name': 'Funeral Pyre',
  'type_line': 'Instant',
  'uri': 'https://api.scryfall.com/cards/4d8542f6-ee34-42c6-acd5-07b0c7cc2f63'},
 {'object': 'related_card',
  'id': '66210a3f-010b-4a9b-a08f-97d3ca962b0c',
  'component': 'combo_piece',
  'name': 'Haunted Dead',
  'type_line': 'Creature — Zombie',
  'uri': 'https://api.scryfall.com/cards/66210a3f-010b-4a9b-a08f-97d3ca962b0c'},
 {'object': 'related_card',
  'id': 'd333e35c-ca90-4aaa-950a-48b5623c31a6',
  'component': 'combo_piece',
  'name': 'Blessed Defiance',
  'type_line': 'Instant',
  'uri': 'https://api.scryfall.com/cards/d333e35c-ca90-4aaa-950a-48b5623c31a6'},
 {'object': 'related_card',
  'id': 'f0ad0796-0357-4e74-9d65-c7761a3f223c',
  'component': 'combo_piece',
  'name': "Slayer's Plate",
  'type_line': 'Artifact — Equipment',
  'uri': 'https://api.scryfall.com/cards/f0ad0796-0357-4e74-9d65-c7761a3f223c'},
 {'object': 're

In [129]:
# Get attributes of the all_parts object
attributes = set()
for part_grouping in all_parts:
    # print(type(part_grouping))
    # print(part_grouping)
    for part in part_grouping:
        for key in part.keys():
            attributes.add(key)
attributes

{'component', 'id', 'name', 'object', 'type_line', 'uri'}

In [131]:
parts_metadata = get_metadata(parts, attributes)

{'type_line': {'missing_count': 0,
  'types': ["<class 'str'>"],
  'lengths': [4,
   5,
   6,
   7,
   8,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   25,
   26,
   27,
   28,
   29,
   30,
   31,
   32,
   33,
   34,
   35,
   36,
   37,
   38,
   39,
   40,
   41,
   42,
   43,
   44,
   45,
   46,
   47,
   48,
   49,
   51,
   52,
   53,
   54,
   55,
   56,
   57,
   58,
   59,
   60,
   61,
   62,
   63,
   64,
   65,
   66,
   68,
   69,
   72,
   73,
   74,
   80],
  'distinct_value_count': 1255},
 'id': {'missing_count': 0,
  'types': ["<class 'str'>"],
  'lengths': [36],
  'distinct_value_count': 12768},
 'component': {'missing_count': 0,
  'types': ["<class 'str'>"],
  'lengths': [9, 11, 5],
  'distinct_value_count': 4},
 'object': {'missing_count': 0,
  'types': ["<class 'str'>"],
  'lengths': [12],
  'distinct_value_count': 1},
 'name': {'missing_count': 0,
  'types': ["<class 'str'>"],
  'lengths': [2,
   3,
 

In [134]:
with open("data/all_parts_metadata.json", "w") as f:
    json.dump(parts_metadata, f, indent=4)