In [13]:
from FlagEmbedding import BGEM3FlagModel # type: ignore
import json
import os
import numpy as np
import pandas as pd
import faiss
useful_features = ["card_faces" , "cmc", "color_identity", "defense", "edhrec_rank", "game_changer", 
                   "keywords",  "life_modifier", "loyalty", "mana_cost", "name", "oracle_text", 
                   "power", "produced_mana", "toughness", "type_line", "rulings"]

saved_embeddings = "Embedded_Magic_cards.pkl"
redo_embeddings = True

In [None]:
#TODO # Need to be able to define the importance of words. 
# For example we get a lot of cards that are similar because they share the same "name " for example all of the urzas are more similar than other cards
# I want cards to similar soly their attributes. 
# (It may be better for now to just do the oracle text and ignore everything else? But these you still ahve the problem where cards reference themselves and the model thinks that means they are similar)
# Which yes they are however the name of the card should not matter. Only Some aspects of card text we care about
def gen_text_cards(df):
    samples = []
    for card_name, row in df.iterrows():
        row_strings = [f"{card_name};; "]
        for column in df.columns:
            if column != 'name':
                value = row[column]
                if value == '':
                    continue
                row_strings.append(f"{column}; {value}")
        # Combine into single string
        sample = f"{card_name}; " + ", ".join(row_strings)
        samples.append(sample)
    
    return samples

In [15]:
if not os.path.exists(saved_embeddings) or True:
    print(f"Building data frame and doing embeddings")
    with open("Data/oracle-cards-20250419090233.json", "r") as file:
        data = json.load(file)
    with open("Data/rulings-20250419210039.json", "r") as file:
        rulings = json.load(file)

    oracle_id_loc = {}
    for i in range(len(data)):
        oracle_id_loc[data[i]["oracle_id"]] = i
    
    for r in rulings:
        if 'rulings' not in data[oracle_id_loc[r["oracle_id"]]]:
            data[oracle_id_loc[r["oracle_id"]]]['rulings'] = r['comment'] 
        else: 
            data[oracle_id_loc[r["oracle_id"]]]['rulings'] + " " + r['comment']
    #Process the data
    df = pd.DataFrame(data, columns=useful_features)
    df = df[df['type_line'] != "Card // Card"]
    df = df.fillna('')
    df.set_index('name', inplace=True)
else:
    print(f"Dataframe read from {saved_embeddings}")
    df = pd.read_pickle(saved_embeddings)

Building data frame and doing embeddings


In [16]:
if redo_embeddings:
    #Create Text card object to encode as sentences 
    text_cards = gen_text_cards(df)
    embedding_model = BGEM3FlagModel('BAAI/bge-m3') 
    #card_embeddings = embedding_model.encode(df.index.tolist(), return_dense=True)
    card_embeddings = embedding_model.encode(text_cards, return_dense=True)
    print(card_embeddings)
    print(df.shape, card_embeddings['dense_vecs'].shape)
    print(card_embeddings['dense_vecs'][-1])

    df['embeddings'] = card_embeddings['dense_vecs'].tolist()
    df.to_pickle(saved_embeddings)
    print(f"Build data frame and saved do {saved_embeddings}")

Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 299593.14it/s]
pre tokenize: 100%|██████████| 128/128 [00:01<00:00, 70.37it/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Inference Embeddings: 100%|██████████| 128/128 [01:00<00:00,  2.11it/s]


{'dense_vecs': array([[-0.0621  ,  0.012245, -0.0032  , ...,  0.02725 ,  0.05362 ,
        -0.01979 ],
       [-0.02542 ,  0.01863 , -0.0348  , ..., -0.04245 ,  0.0245  ,
        -0.01683 ],
       [-0.05505 ,  0.00622 , -0.0319  , ..., -0.02635 , -0.02461 ,
        -0.004234],
       ...,
       [-0.04327 ,  0.0317  , -0.0303  , ..., -0.00509 ,  0.003313,
        -0.00922 ],
       [-0.03735 ,  0.001635, -0.03058 , ..., -0.03848 ,  0.0333  ,
        -0.01534 ],
       [-0.07263 ,  0.0237  , -0.02298 , ..., -0.009926,  0.0214  ,
        -0.006805]], dtype=float16), 'lexical_weights': None, 'colbert_vecs': None}
(32662, 16) (32662, 1024)
[-0.07263   0.0237   -0.02298  ... -0.009926  0.0214   -0.006805]
Build data frame and saved do Embedded_Magic_cards.pkl


In [17]:
embeddings = np.array(df["embeddings"].tolist()).astype('float32')
string_to_int_id = {str_id: i for i, str_id in enumerate(df.index)}
#int_ids = np.array([string_to_int_id[idx] for idx in df.index]).astype('int64')

index_flat = faiss.IndexFlatL2(embeddings.shape[1])
index_flat.add(embeddings)

res = faiss.StandardGpuResources()
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)

In [18]:
print(embeddings.shape)
print(embeddings[200])

(32662, 1024)
[ 0.00566483  0.02557373 -0.03842163 ... -0.00123692  0.01812744
 -0.02024841]


In [19]:
print(df.head())

                         card_faces  cmc color_identity defense edhrec_rank  \
name                                                                          
Nissa, Worldsoul Speaker             4.0            [G]              9466.0   
Static Orb                           3.0             []              4776.0   
Sensory Deprivation                  1.0            [U]             26645.0   
Road of Return                       2.0            [G]              8185.0   
Storm Crow                           2.0            [U]             18277.0   

                          game_changer    keywords life_modifier loyalty  \
name                                                                       
Nissa, Worldsoul Speaker         False  [Landfall]                         
Static Orb                       False          []                         
Sensory Deprivation              False   [Enchant]                         
Road of Return                   False   [Entwine]                

In [20]:
# Compute the most similar cards.
# Cosing similarity A*B/||A||*||B||. 
#   Dot product over the product of the maginitudes of the two vectors

#Inorder to find the most similar cards. \
#   compute the dot product of the query vector to every vector

# Recursive definition? can maybe you can compute the k most similar and then use the intersection to compute the similarity between all cards. might have a better run time. interesting experiment

# Better to compute as a matrix instead
from sklearn.metrics.pairwise import cosine_similarity
def cosine_similarity_search(df, card_name, all_embeddings):
    if card_name not in df.index:
        print(f"{card_name} is not a card present within dataframe")
        return None
    else:
        target_embedding = np.array(df.loc[card_name, "embeddings"]).reshape(1, -1)
        similarities = cosine_similarity(target_embedding, all_embeddings)[0]

        similarities_df = pd.DataFrame({'card_name': df.index,
                                        'similarity': similarities})
        similarities_df = similarities_df[similarities_df['card_name'] != card_name]

        return similarities_df.sort_values(by='similarity', ascending=False)

In [None]:
all_embeddings = np.stack(df["embeddings"].values)
print(all_embeddings)
print(all_embeddings.shape)

[[-0.06210327  0.01224518 -0.00320053 ...  0.0272522   0.05361938
  -0.01979065]
 [-0.02542114  0.01863098 -0.03479004 ... -0.04244995  0.02450562
  -0.01683044]
 [-0.05505371  0.00622177 -0.03189087 ... -0.02635193 -0.02461243
  -0.00423431]
 ...
 [-0.04327393  0.03170776 -0.03030396 ... -0.00508881  0.00331306
  -0.00921631]
 [-0.03735352  0.0016346  -0.03057861 ... -0.03848267  0.03329468
  -0.01534271]
 [-0.07263184  0.0236969  -0.02297974 ... -0.00992584  0.02139282
  -0.00680542]]
(32662, 1024)


In [25]:
#cos_search_results = cosine_similarity_search(df, "K'rrik, Son of Yawgmoth", all_embeddings)
cos_search_results = cosine_similarity_search(df, "Urza, Lord High Artificer", all_embeddings)
if cos_search_results is not None:
    print("Top 5 most similar cards to search")
    print(cos_search_results.head(10))
    for k in cos_search_results.head(10)["card_name"]:
        print(k + ":\n", df.loc[k]["oracle_text"], df.loc[k])


Top 5 most similar cards to search
                         card_name  similarity
20082        Urza, Chief Artificer    0.860714
15776   A-Urza, Powerstone Prodigy    0.795338
9695                   Urza's Saga    0.783761
13491        Urza, Prince of Kroog    0.780918
10439          Audacious Reshapers    0.779171
32170  Muzzio, Visionary Architect    0.772876
3080                 Urza's Bauble    0.770881
1068                 Treasure Mage    0.769854
13904              Artificer Class    0.769338
24735                 Tribute Mage    0.768341
Urza, Chief Artificer:
 Affinity for artifact creatures (This spell costs {1} less to cast for each artifact creature you control.)
Artifact creatures you control have menace.
At the beginning of your end step, create a 0/0 colorless Construct artifact creature token with "This token gets +1/+1 for each artifact you control." card_faces                                                         
cmc                                                 

In [23]:
print(df.index.tolist)

<bound method IndexOpsMixin.tolist of Index(['Nissa, Worldsoul Speaker', 'Static Orb', 'Sensory Deprivation',
       'Road of Return', 'Storm Crow', 'Snarlfang Vermin', 'Walking Sponge',
       'Ravnica at War', 'Greta, Sweettooth Scourge', 'Torrent of Fire',
       ...
       'Freyalise's Winds', 'Clearwater Goblet',
       'Young Blue Dragon // Sand Augury', 'Quarry Beetle', 'Devoted Hero',
       'Without Weakness', 'Firesong and Sunspeaker',
       'Toralf, God of Fury // Toralf's Hammer', 'Samut, the Tested',
       'Sinew Sliver'],
      dtype='object', name='name', length=32662)>


In [24]:

#import matplotlib.pyplot as plt
#import numpy as np
#import hdbscan
#import faiss
#from mpl_toolkits.mplot3d import Axes3D
#from sklearn.decomposition import PCA

#num_random = 1000
#random_samp = [0]
#possible_indices = np.arange(1, embeddings.shape[0])
#random_indices = np.random.choice(possible_indices, size=num_random, replace=False)
#random_samp.extend(random_indices)

#r_embeddings = embeddings[random_samp]


#r_embeddings = embeddings[:500]

#index_flat = faiss.IndexFlatL2(r_embeddings.shape[1])
#index_flat.add(r_embeddings)

#res = faiss.StandardGpuResources()
#gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)

#query_index = 0  
#query_vector_original = r_embeddings[query_index].reshape(1, -1)

##knn
#k = 20  
#distances, indices = gpu_index_flat.search(query_vector_original, k)
#nearest_neighbor_vectors_original = r_embeddings[indices[0]]

## Apply PCA for Dimensionality Reduction to 3D
#n_components = 3
#pca = PCA(n_components=n_components)
#reduced_embeddings = pca.fit_transform(r_embeddings)
#reduced_query_vector = pca.transform(query_vector_original)[0]
#reduced_nearest_neighbors = pca.transform(nearest_neighbor_vectors_original)

##HDBScan
#clusterer = hdbscan.HDBSCAN(min_cluster_size=3)
#clusters = clusterer.fit_predict(reduced_embeddings)
##clusters = clusterer.fit_predict(r_embeddings)

#fig = plt.figure(figsize=(10, 12))
#ax = fig.add_subplot(111, projection='3d')

##ax.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], reduced_embeddings[:, 2], color='black', label='Data Points (PCA Reduced)', alpha=0.4)
#ax.scatter(reduced_query_vector[0], reduced_query_vector[1], reduced_query_vector[2], color='red', marker='x', s=100, label="Nissa, Worldsoul Speaker")
#ax.scatter(reduced_nearest_neighbors[:, 0], reduced_nearest_neighbors[:, 1], reduced_nearest_neighbors[:, 2], color='orange', s=80, label='Nearest Neighbors (PCA Reduced)', alpha=0.5)
#unique_clusters = np.unique(clusters)
##for label, color in zip(np.unique(unique_clusters), colors):
##    cluster_mask = (clusters == label)
##    ax.scatter(reduced_embeddings[cluster_mask, 0],
##               reduced_embeddings[cluster_mask, 1],
##               reduced_embeddings[cluster_mask, 2],
##               c=[color],  # Use a list to ensure color is applied correctly
##               marker='o',
##               alpha=1,)
#unique_labels = np.unique(clusters)
#cmap = plt.cm.viridis
## Plot each cluster
#for label in unique_labels:
    #cluster_mask = (clusters == label)
    #if label == -1:
        ## Outliers: make them black
        #ax.scatter(reduced_embeddings[cluster_mask, 0],
                   #reduced_embeddings[cluster_mask, 1],
                   #reduced_embeddings[cluster_mask, 2],
                   #c='black',
                   #marker='o',
                   #alpha=1,
                   #label='Outliers')
    #else:
        ## Cluster members: use the colormap
        ## Find the index of the current label in the sorted unique labels (excluding -1)
        #sorted_labels = np.sort(unique_labels[unique_labels != -1])
        #if len(sorted_labels) > 0:
            #color_index = np.where(sorted_labels == label)[0][0]
            #color = cmap(color_index / (len(sorted_labels) - 1) if len(sorted_labels) > 1 else cmap(0))
        #else:
            #color = cmap(0) # Default color if no clusters other than outliers

        #ax.scatter(reduced_embeddings[cluster_mask, 0],
                   #reduced_embeddings[cluster_mask, 1],
                   #reduced_embeddings[cluster_mask, 2],
                   #c=[color],
                   #marker='o',
                   #alpha=1
                   #)
#ax.set_title('3D Visualization of Vector Database Search (PCA Reduced)')
#ax.legend(loc='lower right')

## Annotate the nearest neighbors (optional)
##for i, neighbor in enumerate(reduced_nearest_neighbors):
##    ax.text(neighbor[0], neighbor[1], neighbor[2], f'NN {i+1}', fontsize=9)

## Show the plot
#plt.show()