In [None]:
import functools
import json
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from tqdm.notebook import tqdm
from transformers import GPT2Tokenizer
# Show every column and the full cell text whenever a DataFrame is displayed.
pd.set_option('display.max_columns', None)  # None = no column limit (or cap it, e.g. 1000)
pd.set_option('display.max_colwidth', None)  # None = no truncation of cell text (or cap it, e.g. 199)

In [None]:
from mtg_ai import MTGColorCombo, MTG_STANDARD_COLOR_ORDER
from mtg_ai import data_processing
from mtg_combos import Combos
# Instantiate the project-local Combos helper.
# NOTE(review): `combos` is not referenced by any other cell shown here — confirm it is still needed.
combos = Combos()

In [None]:
# Alternative import that was tried instead of tqdm.notebook (left disabled):
# from tqdm.auto import tqdm
# Hook tqdm into pandas (presumably to enable progress_apply — no visible cell uses it; confirm).
tqdm.pandas()

In [None]:

# Columns that process() removes from the raw card dump before any cleaning happens.
drop_columns = [
    "life_modifier",
    "hand_modifier",
    "attraction_lights",
    "object",
    "multiverse_ids",
    "mtgo_id",
    "mtgo_foil_id",
    "tcgplayer_id",
    "cardmarket_id",
    "uri",
    "scryfall_uri",
    "highres_image",
    "image_status",
    "image_uris",
    "reserved",
    "foil",
    "nonfoil",
    "finishes",
    "oversized",
    "promo",
    "reprint",
    "variation",
    "set_uri",
    "set_search_uri",
    "scryfall_set_uri",
    "prints_search_uri",
    "flavor_text",
    "artist_ids",
    "illustration_id",
    "border_color",
    "frame",
    "textless",
    "booster",
    "story_spotlight",
    "related_uris",
    "purchase_uris",
    "security_stamp",
    "preview",
    "penny_rank",
    "frame_effects",
    "watermark",
    "card_faces",
    "tcgplayer_etched_id",
    "promo_types",
    "prices",
    "artist",
    "digital",
    "games",
    "set_id",
    "set",
    "set_name",
    "collector_number",
    "full_art",
    "all_parts",
    "arena_id",
    "released_at",
    "content_warning",
    "card_back_id",
    "lang",
    "id",
]

# Column layout of the cleaned frame: process() validates the cleaned columns
# against this list and then reindexes the frame to this order.
column_order = ["oracle_id", "name", "rarity", "mana_cost", "cmc", "colors", "color_identity",  "type_line", "power", "toughness", "loyalty", "produced_mana", "keywords", "set_type", "oracle_text", "layout", "edhrec_rank", "color_indicator",]

def process():
    """Load the oracle-card dump and build the cleaned and tokenized card tables.

    Pipeline: read the JSON dump, drop the columns named in ``drop_columns``,
    keep only the cards legal in "modern", clean the remaining columns, put
    them in ``column_order`` and tokenize the text columns.

    Returns:
        tuple: ``(df, tokenized_df)``. ``df`` is the cleaned frame indexed by
        ``oracle_id``; ``tokenized_df`` is the result of ``tokenize`` and keeps
        the positional index the frame had before ``oracle_id`` became the index.

    Raises:
        ValueError: if the cleaned frame's columns are not exactly the ones
            listed in ``column_order``; the message names the missing and the
            unexpected columns.
    """
    mtg_data_path = Path("./data/oracle-cards-20231121100139.json")
    df: pd.DataFrame = pd.read_json(mtg_data_path)
    # errors="ignore": a dump that lacks some of the optional columns is still usable.
    df = df.drop(columns=drop_columns, errors="ignore")
    # expand_legalities_column also removes the "legalities" column from df in place.
    legalities = expand_legalities_column(df)
    df = filter_legalities(df, legalities, "modern")
    df = clean_columns(df)

    # Compare names, not just counts: reindex() would silently create all-NaN
    # columns for names that are absent and discard the ones it was not given.
    expected, actual = set(column_order), set(df.columns)
    if expected != actual:
        raise ValueError(
            "Cleaned columns do not match column_order: "
            f"missing={sorted(expected - actual, key=str)}, "
            f"unexpected={sorted(actual - expected, key=str)}"
        )
    df = df.reindex(columns=column_order)
    # Tokenize while oracle_id is still a regular column and the index is positional.
    tokenized_df = tokenize(df)
    df = df.set_index("oracle_id")

    return df, tokenized_df

def _clean_empty(df, column: str, replace_value):
    df[column].fillna(replace_value, inplace=True)

def _sort_color_strings(df, column):
    """Rewrite ``df[column]`` with every value passed through ``MTGColorCombo._sort_multicolor_str``.

    The frame is updated in place; nothing is returned.
    """
    sorter = MTGColorCombo._sort_multicolor_str
    sorted_values = df[column].apply(sorter)
    df[column] = sorted_values

def clean_columns(df):
    """Normalise the card columns and return the type-converted frame.

    The passed frame is modified in place by the cleaning steps; the returned
    frame is the result of ``convert_types`` applied to it.

    Steps, in order:
      * strip the braces from ``mana_cost`` and upper-case it;
      * fill missing values (colour columns, power/toughness/loyalty, edhrec_rank);
      * casefold ``name`` and ``oracle_text``;
      * join ``produced_mana`` into a single string and cast ``edhrec_rank`` to int;
      * sort the colour strings of the four colour columns;
      * apply ``convert_types``.
    """
    # regex=False: the braces are literal characters, so the result must not
    # depend on the pandas version's default for ``regex``.
    df["mana_cost"] = (
        df["mana_cost"]
        .str.replace("{", "", regex=False)
        .str.replace("}", "", regex=False)
        .str.upper()
    )

    fill_values = {
        "colors": MTGColorCombo.COLORLESS,
        "color_identity": "",
        "produced_mana": "",
        "color_indicator": "",
        "power": "<NaN>",
        "toughness": "<NaN>",
        "loyalty": "<NaN>",
        "edhrec_rank": 0,
    }
    for column, fill_value in fill_values.items():
        _clean_empty(df, column, fill_value)

    df["name"] = df["name"].str.casefold()
    df["oracle_text"] = df["oracle_text"].str.casefold()
    df["produced_mana"] = df["produced_mana"].str.join("")
    # Safe only because missing ranks were filled with 0 above.
    df["edhrec_rank"] = df["edhrec_rank"].astype("int")

    for column in ("colors", "color_identity", "color_indicator", "produced_mana"):
        _sort_color_strings(df, column)

    return convert_types(df)

def calculate_columns(df):
    """Add a boolean ``legendary`` column and return the same frame.

    A card is flagged when its ``type_line`` contains the literal, case-sensitive
    text "Legendary". Rows with a missing ``type_line`` are flagged ``False``.

    Args:
        df: frame with a ``type_line`` column; modified in place.

    Returns:
        The frame that was passed in, with the ``legendary`` column added.
    """
    # na=False keeps the column strictly boolean instead of letting NaN leak in;
    # regex=False because the needle is plain text.
    df["legendary"] = df["type_line"].str.contains("Legendary", regex=False, na=False)
    return df

def filter_legalities(df: pd.DataFrame, legalities: pd.DataFrame, format: str = "modern"):
    """Keep the rows of ``df`` that are flagged in the ``format`` column of ``legalities``.

    Args:
        df: card frame to filter.
        legalities: boolean frame with one column per format.
        format: name of the column of ``legalities`` used as the row mask.

    Returns:
        The selected rows with a fresh positional index.
    """
    legal_mask = legalities[format]
    legal_cards = df.loc[legal_mask]
    return legal_cards.reset_index(drop=True)

def convert_types(df: pd.DataFrame):
    """Return a copy of ``df`` with the text columns cast to ``str`` and ``cmc`` to ``int``."""
    dtype_by_column = {
        "name": "str",
        "oracle_text": "str",
        "rarity": "str",
        "cmc": "int",
    }
    converted = df.astype(dtype_by_column)
    return converted


def expand_prices(df: pd.DataFrame):
    """Flatten the nested ``prices`` column of ``df`` into its own frame, one column per key."""
    price_records = df["prices"]
    return pd.json_normalize(price_records)

def expand_legalities_column(df: pd.DataFrame):
    """Turn the nested ``legalities`` column into a boolean table, one column per format.

    A cell is ``True`` only when the format's status is exactly ``"legal"``;
    every other status, including a missing one, becomes ``False``.

    Side effect: the ``legalities`` column is dropped from ``df`` in place.

    Args:
        df: frame whose ``legalities`` column holds one mapping per row.

    Returns:
        The boolean legality frame, row-aligned with ``df``.
    """
    # One vectorised comparison replaces the per-column apply(lambda ...).
    legalities = pd.json_normalize(df["legalities"]).eq("legal")
    df.drop(columns="legalities", inplace=True)
    return legalities

def merge(main_df: pd.DataFrame, *dfs: pd.DataFrame):
    """Merge every frame in ``dfs`` onto ``main_df`` by index, left to right.

    With no extra frames ``main_df`` itself is returned.
    """
    def _merge_on_index(merged, other):
        return merged.merge(other, left_index=True, right_index=True)

    return functools.reduce(_merge_on_index, dfs, main_df)

def tokenize(df: pd.DataFrame):
    """Encode the text columns of ``df`` with the pretrained "gpt2" tokenizer.

    Returns:
        A new frame holding, for each of ``oracle_text``, ``name``, ``rarity``,
        ``type_line`` and ``colors``, the ``tokenizer.encode`` result per row.
    """
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    columns_to_encode = ["oracle_text", "name", "rarity", "type_line", "colors"]

    def _encode(text):
        return tokenizer.encode(text, add_special_tokens=True)

    encoded = pd.DataFrame()
    progress = tqdm(columns_to_encode, desc="Tokenizing Columns")
    for column_name in progress:
        # Show which column is currently being encoded next to the bar.
        progress.set_postfix_str(f"column: {column_name}")
        encoded[column_name] = df[column_name].apply(_encode)
    return encoded

# Run the notebook-local pipeline defined in this cell.
# NOTE(review): the next cell rebinds both names from data_processing.process().
df, tokenized_df = process()

In [None]:
# Build both frames with the packaged pipeline from mtg_ai.data_processing;
# this rebinds the df / tokenized_df names used by the cells below.
df, tokenized_df = data_processing.process()


In [None]:
# Display the processed card table.
df

In [None]:
def non_empty_columns(column: str, frame=None):
    """Return the values of one column that are not the empty string.

    Args:
        column: name of the column to inspect.
        frame: frame to read from. Defaults to the notebook-level ``df``, so
            existing single-argument calls behave as before.

    Returns:
        The column restricted to the rows whose value differs from ``""``.
    """
    # An explicit frame removes the dependency on whichever ``df`` the kernel holds.
    source = df if frame is None else frame
    values = source[column]
    return values.loc[values != ""]

In [None]:
# Show only the cards whose color_indicator is not the empty string.
non_empty_columns("color_indicator")

In [None]:
# Inspect the rulings_uri values, i.e. the URLs the download cell below requests.
column = "rulings_uri"
df[column]

In [None]:
# Download the rulings payload for every card in df, one request per rulings_uri.
# NOTE(review): these URLs presumably point at the Scryfall API, which asks
# clients to leave 50-100 ms between requests — confirm and tune the delay.
REQUEST_DELAY_SECONDS = 0.1
REQUEST_TIMEOUT_SECONDS = 30

rulings_data = []
# A single Session reuses the HTTP connection instead of reconnecting per card.
with requests.Session() as session:
    for url in tqdm(df.rulings_uri, desc="Fetching rulings"):
        # The timeout keeps one stalled connection from hanging the whole run.
        ruling = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        # Stop on an HTTP error rather than storing a body that has no rulings.
        ruling.raise_for_status()
        rulings_data.append(ruling.json())
        time.sleep(REQUEST_DELAY_SECONDS)


In [None]:
# Preview a few payloads only; displaying the whole list writes every
# downloaded response into the notebook output.
rulings_data[:3]

In [None]:
# Collapse each downloaded payload into one record per card:
# {"oracle_id": ..., "rulings": [comment, ...]}. Cards without rulings are skipped.
result = []
for item in rulings_data:
    # .get() tolerates a payload that has no "data" list (e.g. an error body).
    card_rulings = item.get("data", [])
    # The id is read from the card's last ruling; no rulings means no id.
    oracle_id = card_rulings[-1]["oracle_id"] if card_rulings else ""
    if not oracle_id:
        continue
    # `entry` (not `ruling`) so the download loop's `ruling` response is not overwritten.
    rules = [entry["comment"] for entry in card_rulings]
    result.append({"oracle_id": oracle_id, "rulings": rules})

In [None]:
# One row per card, indexed by oracle_id, with the list of ruling comments.
rulings_df = pd.DataFrame(result).set_index("oracle_id")

In [None]:
# Persist the rulings table as JSON next to the other data files.
output = Path("./data/wotc_rulings.json")
rulings_df.to_json(output, orient="columns")

In [None]:
# Preview the cards joined to their rulings by index; the result is displayed, not stored.
# NOTE(review): assumes df is indexed by oracle_id like rulings_df — confirm.
df.merge(rulings_df, left_index=True, right_index=True)

In [None]:
# Take the last downloaded payload as a sample. Reading it from rulings_data
# works on a top-to-bottom run regardless of what `ruling` is bound to by now.
data = rulings_data[-1]

In [None]:
# Inspect the sample rulings payload.
data

In [None]:
# Scratch check of the per-card extraction on the sample payload.
# NOTE(review): the right-hand side mirrors the extraction loop above (the id
# comes from the payload's rulings, "" when there are none) — confirm the intent.
oracle_id = data["data"][-1]["oracle_id"] if data["data"] else ""
[i["comment"] for i in data["data"]]

In [None]:
# Inspect the sample rulings payload again.
data