This time I am going to use qwen3-Embedding-0.6B again but give it a different instruction. I want it to focus more on the gameplay and less on the setting/story of the game.

In [1]:
import pandas as pd

In [3]:
sg_df_clean = pd.read_csv("sg_df_clean.csv")

In [4]:
from sentence_transformers import SentenceTransformer
import torch
import gc

# Load the model 
model = SentenceTransformer(
    "Qwen/Qwen3-Embedding-0.6B", 
    device="cuda", 
    model_kwargs={"torch_dtype": torch.bfloat16}
)

# Set the max sequence length 
model.max_seq_length = 512 

# Format the text with the instruction
print("Preparing text for Qwen3...")
instruction = "Represent this game primarily by its core gameplay mechanics and functional loops, treating the setting and narrative theme as secondary features for similarity matching\nQuery: "
texts = [instruction + str(text) for text in sg_df_clean['about_the_game']]

# Generate Embeddings
print("Starting Qwen3 Encoding in BF16 (Turbo Mode)...")
embeddings = model.encode(
    texts, 
    batch_size=64, 
    show_progress_bar=True, 
    normalize_embeddings=True
)

# Save to your DataFrame
sg_df_clean['embeddings_qwen'] = list(embeddings)

# Immediate VRAM Cleanup
del model
gc.collect()
torch.cuda.empty_cache()

print("\nEncoding complete. GPU VRAM has been cleared.")

`torch_dtype` is deprecated! Use `dtype` instead!


Preparing text for Qwen3...
Starting Qwen3 Encoding in BF16 (Turbo Mode)...


Batches:   0%|          | 0/1131 [00:00<?, ?it/s]


Encoding complete. GPU VRAM has been cleared.


In [6]:
sg_df_clean['embeddings_qwen']

0        [0.0046081543, -0.12011719, -0.010375977, -0.0...
1        [0.037109375, -0.08496094, -0.0034942627, -0.0...
2        [0.04272461, -0.11376953, -0.0046691895, -0.06...
3        [0.068359375, -0.140625, -0.0054626465, -0.044...
4        [0.009338379, -0.08105469, -0.007659912, -0.05...
                               ...                        
72366    [0.018798828, -0.08886719, -0.00970459, -0.017...
72367    [0.049804688, -0.07324219, -0.0065307617, -0.0...
72368    [0.04321289, -0.06347656, -0.006958008, -0.113...
72369    [0.068847656, -0.09423828, -0.0071411133, -0.0...
72370    [-0.016235352, -0.11816406, -0.007446289, -0.0...
Name: embeddings_qwen, Length: 72371, dtype: object

In [7]:
import numpy as np

In [8]:
embedding_matrix = np.array(sg_df_clean["embeddings_qwen"].to_list()) #we need to "unpack" the column to create a matrix to use for cosine similarity

__Cosine Similarity__

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
sim_matrix_embeddings = cosine_similarity(embedding_matrix) #cosine similarity for the new embeddings

In [11]:
tfidf_matrix = pd.read_pickle("TF-IDF_V1.pkl")

In [12]:
sim_matrix_tfidf = cosine_similarity(tfidf_matrix) #cosine similarity for tf-idf

In [13]:
del tfidf_matrix
del embedding_matrix

In [14]:
import gc
gc.collect() 

0

In [15]:
del sg_df_clean

In [16]:
sim_matrix_embeddings = sim_matrix_embeddings.astype(np.float32)

In [17]:
sim_matrix_tfidf = sim_matrix_tfidf.astype(np.float32)

In [18]:
tfidf_w = 0.3
embeddings_w = 0.7

In [19]:
weighted_matrix_1 = tfidf_w * sim_matrix_tfidf

In [20]:
del sim_matrix_tfidf
gc.collect()

0

In [21]:
weighted_matrix_2 =embeddings_w * sim_matrix_embeddings

In [22]:
del sim_matrix_embeddings

In [23]:
gc.collect()

0

In [24]:
final_matrix = weighted_matrix_1 + weighted_matrix_2

In [25]:
del weighted_matrix_1 
del weighted_matrix_2

In [26]:
pd.to_pickle(final_matrix, "Full_cosine_matrix_Qwen-3-6B-V2.pkl")