In [115]:
import os
import pickle
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from string import punctuation
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import MinMaxScaler
from gensim.models import Word2Vec
import torch
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
DATA_PATH = os.path.abspath(os.path.join('..','data'))
CARDS_PATH = os.path.join(DATA_PATH, "cards_unique.pkl")

ModuleNotFoundError: No module named 'bodo'

In [51]:
def load_data(fp):
    """
    Reads in data.

    :param fp: filepath of data stored in pickle
    :returns: data stored in pickle
    """
    with open(fp, "rb") as f:
        data = pickle.load(f)
    return data

def tokenize(text):
    """
    Tokenizes text.

    :param text: text to tokenize
    :returns: tokenized text
    """
    to_remove = stopwords.words("english")+list(punctuation)
    return [x for x in word_tokenize(text.lower()) if x not in to_remove]


In [52]:
def clean_data(cards):
    """
    Performs multiple transformations on data, such as filtering, tokenizing text, and extracting keywords.
    
    :param cards: DataFrame containing information of each non-commander card, such as name, text, and color
    :param commanders: DataFrame containing information of each commander card, such as name, text, and color
    :returns: tuple containing all cleaned data, cleaned non-commander card data, and cleaned commander data
    """
    # filtering out non-legal cards in commander
    legal = pd.read_csv('../data/cardLegalities.csv').loc[:,['commander', 'uuid']]
    cards = cards.merge(legal,on='uuid')
    cards = cards[cards['commander'] == 'Legal']

    cards_clean = cards.loc[cards["text"].apply(lambda x: not (isinstance(x, float) and np.isnan(x))), ["name", "text", "colorIdentity", "keywords", "type"]]
    cards_clean["color"] = cards_clean["colorIdentity"].str.split(", ")
    # tokenize text
    cards_clean["tokenized"] = cards_clean["text"].apply(tokenize)
    # normalized text length
    cards_clean["textLength"] = cards_clean["text"].str.len()
    cards_clean["textLength"] = MinMaxScaler().fit_transform(cards_clean[["textLength"]])
    # keyword list
    cards_clean["keyword_list"] = cards_clean["keywords"].str.split(", ")


    return cards_clean

def train_model(cards_clean):
    """
    Trains Word2Vec model on card text.
    
    :param cards_clean: cleaned DataFrame containing information on all cards
    :returns: trained Word2Vec model
    """
    return Word2Vec(sentences=cards_clean["tokenized"])

In [55]:
cards = load_data(CARDS_PATH)
cards_clean = clean_data(cards)

sentences = ["This is an example sentence", "Each sentence is converted"]


In [108]:

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
embeddings = model.encode(sentences)
def cosine_similarity(vector1, vector2):
    """
    Computes the cosine similarity between two vectors.
    
    Args:
    vector1 (torch.Tensor): A tensor representing the first vector.
    vector2 (torch.Tensor): A tensor representing the second vector.
    
    Returns:
    float: The cosine similarity between vector1 and vector2.
    """
    # Ensure the vectors are 1-dimensional
    
    # Compute the dot product between the two vectors
    dot_product = np.dot(vector1, vector2)
    
    # Compute the magnitudes (norms) of the vectors
    norm1 = np.linalg.norm(vector1)
    norm2 = np.linalg.norm(vector2)
    
    # Compute the cosine similarity
    cos_similarity = dot_product / (norm1 * norm2)
    
    return cos_similarity.item()



In [93]:
model.encode(['Flying, trample Ward—Discard a card. Creatures your opponents control have base toughness 1.', 'Certainly! Here is the same method to compute the cosine similarity between two vectors using NumPy:'])

array([[-0.00234556, -0.08480919, -0.0134825 , ...,  0.01781378,
         0.03595351, -0.0002674 ],
       [-0.07560935,  0.02839662, -0.00189859, ..., -0.02056374,
         0.03101758, -0.00616321]], dtype=float32)

In [102]:
def find_sim(colors,text):
    #tokenized = tokenize(text)

    scores = []
    for card_idx, card_row in tqdm(cards_clean.iterrows(), total= cards_clean.shape[0]):
        # null-color cards can go into any deck
        # null-color commanders can only take null-color cards  
        if (not isinstance(card_row["color"], list)) or ((isinstance(colors, list)) and (all([x in colors for x in card_row["color"]]))):
            scores.append((cosine_similarity(model.encode([text, card_row['text']])), card_row["name"], card_row["type"]))
    # sorts scores
    sorted_scores = sorted(scores)[::-1]
    #results_base_all[row["name"]] = [x[1] for x in sorted_scores]
    return sorted_scores

In [109]:
def find_sim(colors, text):
    def compute_similarity(row):
        if not isinstance(row["color"], list) or (isinstance(colors, list) and all([x in colors for x in row["color"]])):
            similarity = cosine_similarity(model.encode(text), model.encode(row['text']))
            return similarity, row["name"], row["type"]
        return None

    tqdm.pandas()  # Enable tqdm progress bar for pandas
    scores = cards_clean.progress_apply(compute_similarity, axis=1)
    scores = [score for score in scores if score is not None]
    sorted_scores = sorted(scores, key=lambda x: x[0], reverse=True)
    
    return sorted_scores

In [113]:
def find_sim(colors, text):
    # Create a boolean mask based on the condition
    mask = np.where(
        ~cards_clean['color'].apply(lambda x: isinstance(x, list)) | 
        (cards_clean['color'].apply(lambda x: isinstance(x, list) and all([color in colors for color in x]))),
        True, False
    )

    # Apply the mask to filter the DataFrame
    filtered_cards = cards_clean[mask]

    # Define the function to compute similarity
    def compute_similarity(row):
        similarity = cosine_similarity(model.encode(text), model.encode(row['text']))
        return similarity, row["name"], row["type"]

    tqdm.pandas()  # Enable tqdm progress bar for pandas
    scores = filtered_cards.progress_apply(compute_similarity, axis=1)
    sorted_scores = sorted(scores, key=lambda x: x[0], reverse=True)
    
    return sorted_scores

In [114]:
find_sim(['B'], 'Flying, trample Ward—Discard a card. Creatures your opponents control have base toughness 1.')

  0%|          | 0/6352 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
model.encode([" ".join(cards_clean.iloc[0]['tokenized']), 'Flying, trample Ward—Discard a card. Creatures your opponents control have base toughness 1.'])

array([[ 0.02259655, -0.07929838,  0.01353117, ...,  0.0015718 ,
        -0.00592575, -0.01060783],
       [-0.00234556, -0.08480919, -0.01348252, ...,  0.01781374,
         0.03595351, -0.0002674 ]], dtype=float32)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[[ 0.02250258 -0.07829181 -0.02303076 ... -0.00827928  0.02652692
  -0.00201897]
 [ 0.04170238  0.0010974  -0.01553418 ... -0.02181626 -0.0635936
  -0.00875283]]


In [99]:
cards_clean['text']

0        First strike (This creature deals combat damag...
1        Flying\nWhen Angel of Mercy enters the battlef...
2        Target creature gets +3/+3 and gains flying un...
3        Whenever a creature enters the battlefield und...
4           Defender (This creature can't attack.)\nFlying
                               ...                        
28775    Kicker {4} (You may pay an additional {4} as y...
28776    Spare Supplies enters the battlefield tapped.\...
28777    Stonework Packbeast is also a Cleric, Rogue, W...
28778    When Utility Knife enters the battlefield, att...
28779    Base Camp enters the battlefield tapped.\n{T}:...
Name: text, Length: 26416, dtype: object