In [None]:
from FlagEmbedding import BGEM3FlagModel # type: ignore
import json
import os
import numpy as np
import pandas as pd
import faiss
useful_features = ["card_faces" , "cmc", "color_identity", "defense", "edhrec_rank", "game_changer", 
                   "keywords",  "life_modifier", "loyalty", "mana_cost", "name", "oracle_text", 
                   "power", "produced_mana", "toughness", "type_line", "rulings"]

saved_embeddings = "Embedded_Magic_cards.pkl"
redo_embeddings = True

In [4]:
def gen_text_cards(df):
    samples = []
    for card_name, row in df.iterrows():
        row_strings = [f"{card_name};; "]
        for column in df.columns:
            if column != 'name':
                value = row[column]
                if value == '':
                    continue
                row_strings.append(f"{column}; {value}")
        # Combine into single string
        sample = f"{card_name}; " + ", ".join(row_strings)
        samples.append(sample)
    
    return samples

In [10]:
if not os.path.exists(saved_embeddings) or True:
    print(f"Building data frame and doing embeddings")
    with open("Data/oracle-cards-20250419090233.json", "r") as file:
        data = json.load(file)
    with open("Data/rulings-20250419210039.json", "r") as file:
        rulings = json.load(file)

    oracle_id_loc = {}
    for i in range(len(data)):
        oracle_id_loc[data[i]["oracle_id"]] = i
    
    for r in rulings:
        if 'rulings' not in data[oracle_id_loc[r["oracle_id"]]]:
            data[oracle_id_loc[r["oracle_id"]]]['rulings'] = r['comment'] 
        else: 
            data[oracle_id_loc[r["oracle_id"]]]['rulings'] + " " + r['comment']
    #Process the data
    df = pd.DataFrame(data, columns=useful_features)
    df = df[df['type_line'] != "Card // Card"]
    df = df.fillna('')
    df.set_index('name', inplace=True)
else:
    print(f"Dataframe read from {saved_embeddings}")
    df = pd.read_pickle(saved_embeddings)

Building data frame and doing embeddings


In [None]:
if redo_embeddings:
    #Create Text card object to encode as sentences 
    text_cards = gen_text_cards(df)
    embedding_model = BGEM3FlagModel('BAAI/bge-m3') 
    #card_embeddings = embedding_model.encode(df.index.tolist(), return_dense=True)
    card_embeddings = embedding_model.encode(text_cards, return_dense=True)
    print(card_embeddings)
    print(df.shape, card_embeddings['dense_vecs'].shape)
    print(card_embeddings['dense_vecs'][-1])

    df['embeddings'] = card_embeddings['dense_vecs'].tolist()
    df.to_pickle(saved_embeddings)
    print(f"Build data frame and saved do {saved_embeddings}")

Dataframe read from Embedded_Magic_cards.pkl
card_faces                                                         
cmc                                                             4.0
color_identity                                                  [G]
defense                                                            
edhrec_rank                                                  9466.0
game_changer                                                  False
keywords                                                 [Landfall]
life_modifier                                                      
loyalty                                                            
mana_cost                                                    {3}{G}
oracle_text       Landfall — Whenever a land you control enters,...
power                                                             3
produced_mana                                                      
toughness                                                         3
typ

In [96]:
embeddings = np.array(df["embeddings"].tolist()).astype('float32')
string_to_int_id = {str_id: i for i, str_id in enumerate(df.index)}
#int_ids = np.array([string_to_int_id[idx] for idx in df.index]).astype('int64')

index_flat = faiss.IndexFlatL2(embeddings.shape[1])
index_flat.add(embeddings)

res = faiss.StandardGpuResources()
gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)

In [97]:
print(embeddings.shape)
print(embeddings[200])

(32662, 1024)
[-0.00668335  0.01739502 -0.04266357 ... -0.00151634  0.02044678
 -0.01010895]


In [98]:
print(df.head())

                         card_faces  cmc color_identity defense edhrec_rank  \
name                                                                          
Nissa, Worldsoul Speaker             4.0            [G]              9466.0   
Static Orb                           3.0             []              4776.0   
Sensory Deprivation                  1.0            [U]             26645.0   
Road of Return                       2.0            [G]              8185.0   
Storm Crow                           2.0            [U]             18277.0   

                          game_changer    keywords life_modifier loyalty  \
name                                                                       
Nissa, Worldsoul Speaker         False  [Landfall]                         
Static Orb                       False          []                         
Sensory Deprivation              False   [Enchant]                         
Road of Return                   False   [Entwine]                

In [99]:
# Compute the most similar cards.
# Cosing similarity A*B/||A||*||B||. 
#   Dot product over the product of the maginitudes of the two vectors

#Inorder to find the most similar cards. \
#   compute the dot product of the query vector to every vector

# Recursive definition? can maybe you can compute the k most similar and then use the intersection to compute the similarity between all cards. might have a better run time. interesting experiment

# Better to compute as a matrix instead
from sklearn.metrics.pairwise import cosine_similarity
def cosine_similarity_search(df, card_name, all_embeddings):
    if card_name not in df.index:
        print(f"{card_name} is not a card present within dataframe")
        return None
    else:
        target_embedding = np.array(df.loc[card_name, "embeddings"]).reshape(1, -1)
        similarities = cosine_similarity(target_embedding, all_embeddings)[0]

        similarities_df = pd.DataFrame({'card_name': df.index,
                                        'similarity': similarities})
        similarities_df = similarities_df[similarities_df['card_name'] != card_name]

        return similarities_df.sort_values(by='similarity', ascending=False)

In [100]:
all_embeddings = np.stack(df["embeddings"].values)
print(all_embeddings)
print(all_embeddings.shape)
cos_search_results = cosine_similarity_search(df, "K'rrik, Son of Yawgmoth", all_embeddings)

[[-0.06173706  0.01956177  0.00497055 ...  0.0186615   0.06030273
  -0.01419067]
 [-0.01577759  0.00817108 -0.0329895  ... -0.03729248  0.02243042
  -0.01500702]
 [-0.05505371  0.00622177 -0.03189087 ... -0.02635193 -0.02461243
  -0.00423431]
 ...
 [-0.04327393  0.03170776 -0.03030396 ... -0.00508881  0.00331306
  -0.00921631]
 [-0.05020142 -0.00331116 -0.0345459  ... -0.04309082  0.03317261
  -0.02444458]
 [-0.06896973  0.01503754 -0.02912903 ... -0.00394821  0.02574158
  -0.00110626]]
(32662, 1024)


In [101]:
if cos_search_results is not None:
    print("Top 5 most similar cards to search")
    print(cos_search_results.head(10))
    for k in cos_search_results.head(10)["card_name"]:
        print(k + ":\n", df.loc[k]["oracle_text"], df.loc[k])


Top 5 most similar cards to search
                    card_name  similarity
25830      Priest of Yawgmoth    0.752767
15920    Phyrexian Broodlings    0.741464
26935   Phyrexian Fleshgorger    0.733331
2288   Red Priest of Yawgmoth    0.723851
25178          Child of Night    0.719800
32456  Mondrak, Glory Dominus    0.714601
4760                 Myr Sire    0.714166
18645   Gix, Yawgmoth Praetor    0.713682
12492            Pith Driller    0.713655
31613      Yawgmoth's Bargain    0.713465
Priest of Yawgmoth:
 {T}, Sacrifice an artifact: Add an amount of {B} equal to the sacrificed artifact's mana value. card_faces                                                         
cmc                                                             2.0
color_identity                                                  [B]
defense                                                            
edhrec_rank                                                 17948.0
game_changer                                  

In [102]:
print(df.index.tolist)

<bound method IndexOpsMixin.tolist of Index(['Nissa, Worldsoul Speaker', 'Static Orb', 'Sensory Deprivation',
       'Road of Return', 'Storm Crow', 'Snarlfang Vermin', 'Walking Sponge',
       'Ravnica at War', 'Greta, Sweettooth Scourge', 'Torrent of Fire',
       ...
       'Freyalise's Winds', 'Clearwater Goblet',
       'Young Blue Dragon // Sand Augury', 'Quarry Beetle', 'Devoted Hero',
       'Without Weakness', 'Firesong and Sunspeaker',
       'Toralf, God of Fury // Toralf's Hammer', 'Samut, the Tested',
       'Sinew Sliver'],
      dtype='object', name='name', length=32662)>


In [103]:

#import matplotlib.pyplot as plt
#import numpy as np
#import hdbscan
#import faiss
#from mpl_toolkits.mplot3d import Axes3D
#from sklearn.decomposition import PCA

#num_random = 1000
#random_samp = [0]
#possible_indices = np.arange(1, embeddings.shape[0])
#random_indices = np.random.choice(possible_indices, size=num_random, replace=False)
#random_samp.extend(random_indices)

#r_embeddings = embeddings[random_samp]


#r_embeddings = embeddings[:500]

#index_flat = faiss.IndexFlatL2(r_embeddings.shape[1])
#index_flat.add(r_embeddings)

#res = faiss.StandardGpuResources()
#gpu_index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)

#query_index = 0  
#query_vector_original = r_embeddings[query_index].reshape(1, -1)

##knn
#k = 20  
#distances, indices = gpu_index_flat.search(query_vector_original, k)
#nearest_neighbor_vectors_original = r_embeddings[indices[0]]

## Apply PCA for Dimensionality Reduction to 3D
#n_components = 3
#pca = PCA(n_components=n_components)
#reduced_embeddings = pca.fit_transform(r_embeddings)
#reduced_query_vector = pca.transform(query_vector_original)[0]
#reduced_nearest_neighbors = pca.transform(nearest_neighbor_vectors_original)

##HDBScan
#clusterer = hdbscan.HDBSCAN(min_cluster_size=3)
#clusters = clusterer.fit_predict(reduced_embeddings)
##clusters = clusterer.fit_predict(r_embeddings)

#fig = plt.figure(figsize=(10, 12))
#ax = fig.add_subplot(111, projection='3d')

##ax.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], reduced_embeddings[:, 2], color='black', label='Data Points (PCA Reduced)', alpha=0.4)
#ax.scatter(reduced_query_vector[0], reduced_query_vector[1], reduced_query_vector[2], color='red', marker='x', s=100, label="Nissa, Worldsoul Speaker")
#ax.scatter(reduced_nearest_neighbors[:, 0], reduced_nearest_neighbors[:, 1], reduced_nearest_neighbors[:, 2], color='orange', s=80, label='Nearest Neighbors (PCA Reduced)', alpha=0.5)
#unique_clusters = np.unique(clusters)
##for label, color in zip(np.unique(unique_clusters), colors):
##    cluster_mask = (clusters == label)
##    ax.scatter(reduced_embeddings[cluster_mask, 0],
##               reduced_embeddings[cluster_mask, 1],
##               reduced_embeddings[cluster_mask, 2],
##               c=[color],  # Use a list to ensure color is applied correctly
##               marker='o',
##               alpha=1,)
#unique_labels = np.unique(clusters)
#cmap = plt.cm.viridis
## Plot each cluster
#for label in unique_labels:
    #cluster_mask = (clusters == label)
    #if label == -1:
        ## Outliers: make them black
        #ax.scatter(reduced_embeddings[cluster_mask, 0],
                   #reduced_embeddings[cluster_mask, 1],
                   #reduced_embeddings[cluster_mask, 2],
                   #c='black',
                   #marker='o',
                   #alpha=1,
                   #label='Outliers')
    #else:
        ## Cluster members: use the colormap
        ## Find the index of the current label in the sorted unique labels (excluding -1)
        #sorted_labels = np.sort(unique_labels[unique_labels != -1])
        #if len(sorted_labels) > 0:
            #color_index = np.where(sorted_labels == label)[0][0]
            #color = cmap(color_index / (len(sorted_labels) - 1) if len(sorted_labels) > 1 else cmap(0))
        #else:
            #color = cmap(0) # Default color if no clusters other than outliers

        #ax.scatter(reduced_embeddings[cluster_mask, 0],
                   #reduced_embeddings[cluster_mask, 1],
                   #reduced_embeddings[cluster_mask, 2],
                   #c=[color],
                   #marker='o',
                   #alpha=1
                   #)
#ax.set_title('3D Visualization of Vector Database Search (PCA Reduced)')
#ax.legend(loc='lower right')

## Annotate the nearest neighbors (optional)
##for i, neighbor in enumerate(reduced_nearest_neighbors):
##    ax.text(neighbor[0], neighbor[1], neighbor[2], f'NN {i+1}', fontsize=9)

## Show the plot
#plt.show()