In [1]:
import json
import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
import string

import chromadb
from chromadb.config import Settings
from chromadb.utils.batch_utils import create_batches

from sentence_transformers import SentenceTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Для препроцессинга
stop_words = set(stopwords.words('russian') + stopwords.words('english'))
emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F700-\U0001F77F"  # alchemical symbols
    u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    u"\U0001FA00-\U0001FA6F"  # Chess Symbols
    u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    u"\U00002702-\U000027B0"  # Dingbats
    u"\U000024C2-\U0001F251" 
    "]+", flags=re.UNICODE)
pattern = r'&[a-zA-Z0-9#]+;'

In [3]:
def clear_text(x):
    '''
    Функция для очистки текста перед созданием эмбеддингов
    '''
    x = re.sub(re.compile('<.*?>'), ' ', x)
    x = emoji_pattern.sub(r'', x)
    x = re.sub(r'\r\n|\r|\n', '', x)
    x = re.sub(pattern, '', x)
    x = re.sub(r'http\S+', '', x)
    x = x.translate(str.maketrans('', '', string.punctuation))
    x = re.sub(' +', ' ', x)
    x = x.lower()
    x = ' '.join([i for i in x.split(' ') if i not in stop_words])
    return x

In [4]:
# Загрузка модели и эмбедингов

description_embeddings = np.load('/Users/mossyhead/ds_bootcamp/GameExplorer/model/proj/description_embeddings.npy')
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
model.to('cpu');

In [5]:
# Понижаем размерность эмбедингов

lsa = make_pipeline(TruncatedSVD(n_components=128), Normalizer(copy=False))
fited_description_embeddings = lsa.fit_transform(description_embeddings)
explained_variance = lsa[0].explained_variance_ratio_.sum()
print(f"Explained variance of the SVD step: {explained_variance * 100:.1f}%")

fited_description_embeddings = fited_description_embeddings / np.linalg.norm(fited_description_embeddings, axis=1, keepdims=True)

Explained variance of the SVD step: 87.3%


In [6]:
# Полный датафрейм (со всей нашей загруженной инфой)
# df = pd.read_csv('preprocessed_df.csv')
# df = df.set_index('steam_appid')

final_df = pd.read_csv('/Users/mossyhead/ds_bootcamp/GameExplorer/model/proj/filter_df.csv') 
# Это укороченная версия, где есть только id, название и бинарные признаки. Его размерность совпадает c df, который выше, так что их удобно соединять
final_df = final_df.set_index('steam_appid')

In [7]:
# Создаем базу данных

client = chromadb.Client()
main_collection = client.create_collection("main_collection")
# client = chromadb.PersistentClient(path="/Users/mossyhead/ds_bootcamp/GameExplorer/model/proj/chromadb.db")
# main_collection = client.get_collection("main_collection")

In [None]:
# client = chromadb.PersistentClient(path="/Users/mossyhead/ds_bootcamp/GameExplorer/model/proj/chromadb.db")
# main_collection = client.get_collection("main_collection")

In [8]:
# Наполняем базу данных

ids = [str(game_id) for game_id in final_df.index]
embeddings = [embedding.tolist() for embedding in fited_description_embeddings]
metadatas = [
    {
        "steam_appid": game_id,
        "name": final_df.loc[game_id, 'name'],
        "single_player": int(final_df.loc[game_id, 'Для одного игрока']),
        "family_library": int(final_df.loc[game_id, 'Family Library Sharing']),
        "MMO": int(final_df.loc[game_id, 'MMO']),
        "action": int(final_df.loc[game_id, 'Экшены']),
        "indie": int(final_df.loc[game_id, 'Инди']),
        "simulator": int(final_df.loc[game_id, 'Симуляторы']),
        "strategy": int(final_df.loc[game_id, 'Стратегии']),
        "casual": int(final_df.loc[game_id, 'Казуальные игры']),
        "adventure": int(final_df.loc[game_id, 'Приключенческие игры']),
        "RPG": int(final_df.loc[game_id, 'Ролевые игры']),
        "VR": int(final_df.loc[game_id, 'VR']),
        "share/split_screen": int(final_df.loc[game_id, 'Share/Split Screen']),
        "f2p": int(final_df.loc[game_id, 'f2p']),
        "coop": int(final_df.loc[game_id, 'Co-op']),
        "multiplayer": int(final_df.loc[game_id, 'Multiplayer']),
        "racing/sport": int(final_df.loc[game_id, 'Racing/Sport']),
    }
    for game_id in final_df.index
]

batches = create_batches(api=client, ids=ids, embeddings=embeddings, metadatas=metadatas)

for batch in batches:
    main_collection.add(ids=batch[0], embeddings=batch[1], metadatas=batch[2])

In [9]:
# Функция, которая преобразует список пользовательских фильтров в нужный формат
def make_fil(l):
    conditions = [{field: {'$eq': 1}} for field in l]
    return {'$and': conditions}

In [13]:
filters =[]


In [15]:
filters = [
        'single_player', 'family_library', 'MMO', 'action', 'indie', 'simulator',
        'strategy', 'casual', 'adventure', 'RPG', 'VR', 'share/split_screen', 
        'f2p', 'coop', 'multiplayer', 'racing/sport'
    ]

In [16]:
make_fil(filters)

{'$and': [{'single_player': {'$eq': 1}},
  {'family_library': {'$eq': 1}},
  {'MMO': {'$eq': 1}},
  {'action': {'$eq': 1}},
  {'indie': {'$eq': 1}},
  {'simulator': {'$eq': 1}},
  {'strategy': {'$eq': 1}},
  {'casual': {'$eq': 1}},
  {'adventure': {'$eq': 1}},
  {'RPG': {'$eq': 1}},
  {'VR': {'$eq': 1}},
  {'share/split_screen': {'$eq': 1}},
  {'f2p': {'$eq': 1}},
  {'coop': {'$eq': 1}},
  {'multiplayer': {'$eq': 1}},
  {'racing/sport': {'$eq': 1}}]}

In [10]:
# Пользовательские фильтры и пользовательский запрос
filters = ['single_player', 'strategy', 'indie']
user_query = "medieval strategy with roman and german units"

In [31]:
def make_fil(filters):
    if not filters:
        # Если фильтры пусты, используйте все фильтры
        filters = [
            'single_player', 'family_library', 'MMO', 'action', 'indie', 'simulator',
            'strategy', 'casual', 'adventure', 'RPG', 'VR', 'share/split_screen', 
            'f2p', 'coop', 'multiplayer', 'racing/sport'
        ]
    conditions = [{field: {'$eq': 1}} for field in filters]
    # if len(conditions) == 1:
    #     return conditions[0]  # Возвращаем одно условие
    # else:
    #     return {'$and': conditions} 
    
    if len(conditions) == 1:
        return conditions[0]  # Возвращаем одно условие
    if len(conditions) == 1:
        return {conditions[0]: {'$eq': 1}}  # Возвращаем одно условие

filters = []
make_fil(filters)

In [34]:
def make_fil(filters):
    if not filters:
        # Если фильтры пусты, используйте все фильтры
        filters = [
            'single_player', 'family_library', 'MMO', 'action', 'indie', 'simulator',
            'strategy', 'casual', 'adventure', 'RPG', 'VR', 'share/split_screen', 
            'f2p', 'coop', 'multiplayer', 'racing/sport'
        ]
    conditions = [{field: {'$eq': 1}} for field in filters]
    # if len(conditions) == 1:
    #     return conditions[0]  # Возвращаем одно условие
    # else:
    #     return {'$and': conditions} 
    if len(conditions) == 1:
        return conditions[0]  # Возвращаем одно условие
    if len(conditions) == 1:
        return {conditions[0]: {'$eq': 1}} 

filters = []
user_query = "medieval strategy with roman and german units"


# Применение фильтров + поиск
def search(user_query, filters):
    user_query = clear_text(user_query)
    query_embedding = model.encode([user_query], convert_to_tensor=True).cpu().numpy()
    query_embedding = lsa.transform(query_embedding)
    print("Query Embedding Before Normalization:", query_embedding)
    norm = np.linalg.norm(query_embedding, axis=1, keepdims=True)
    print("norm:", norm)
    query_embedding = query_embedding / norm  # Нормализация
    print("Query Embedding After Normalization:", query_embedding)
    results = main_collection.query(
        query_embeddings=query_embedding.tolist(),
        n_results=10,
        where=make_fil(filters)
    )
    result_ids = [int(x) for x in results['ids'][0]]
    return result_ids
final_df.loc[search(user_query, filters),:]

Query Embedding Before Normalization: [[ 2.57737666e-01  3.05311149e-03  3.26621890e-01  1.72085777e-01
   6.48513883e-02  9.95513350e-02 -1.23875938e-01 -8.63163248e-02
   2.22578257e-01 -1.27950609e-01 -2.51858294e-01  1.50817886e-01
   1.70280725e-01 -2.50186533e-01  1.31302282e-01 -2.68879328e-02
  -6.39273226e-03 -1.22177340e-01 -1.30301937e-02  1.43122569e-01
  -1.99645031e-02  1.36884660e-01  1.63359359e-01 -4.87129297e-03
   3.11542992e-02  1.06674530e-01 -5.27785867e-02 -4.47666971e-03
  -1.44999355e-01 -1.22254863e-01  9.16832611e-02 -3.30279656e-02
  -5.29477224e-02  1.35869190e-01  9.91782323e-02 -1.59214735e-01
   1.26390508e-03 -9.17834491e-02 -9.94594395e-02  2.63222251e-02
   6.66044429e-02 -4.72412258e-02  1.53906143e-03  4.06872015e-03
   1.54073104e-01 -6.68678656e-02  1.54328439e-02  5.89109734e-02
  -1.07779138e-01 -5.38532510e-02  5.19452197e-03  2.70117365e-04
  -4.70537953e-02 -6.38768896e-02  1.18562849e-02  7.68293664e-02
  -2.37816852e-02 -6.86166957e-02 -5.3

Unnamed: 0_level_0,name,Для одного игрока,Family Library Sharing,MMO,Экшены,Инди,Симуляторы,Стратегии,Казуальные игры,Приключенческие игры,Ролевые игры,VR,Share/Split Screen,f2p,Co-op,Multiplayer,Racing/Sport
steam_appid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
801920,Field of Glory II: Age of Belisarius,1,1,0,0,0,1,1,0,0,0,0,1,0,1,1,0
1665841,Wars Across The World: Cortenuova 1237,1,1,0,0,1,0,1,0,0,0,0,1,0,1,1,0
2517030,Wars Across The World: Dacia 101,1,1,0,0,1,0,1,0,0,0,0,1,0,1,1,0
345260,Medieval: Total War™ - Collection,1,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0
1640470,Steel Division 2 - Tribute to the Liberation o...,1,1,0,1,1,1,1,0,0,0,0,0,0,1,1,0
998830,The Last Roman Village,1,1,0,0,1,0,1,1,1,0,0,0,0,0,0,0
554371,Wars Across the World: Carrhae 53,1,1,0,0,1,0,1,0,0,0,0,1,0,1,1,0
1884780,Unity of Command II - Desert Rats,1,1,0,0,1,1,1,0,0,0,0,1,0,1,1,0
1389240,Age of Empires II: Definitive Edition - Lords ...,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0
1646380,Comrades and Barons: Gates of Freedom,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0


In [11]:
# Применение фильтров + поиск
def search(user_query, filters):
    user_query = clear_text(user_query)
    query_embedding = model.encode([user_query], convert_to_tensor=True).cpu().numpy()
    query_embedding = lsa.transform(query_embedding)
    query_embedding = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)  # Нормализация
    results = main_collection.query(
        query_embeddings=query_embedding.tolist(),
        n_results=10,
        where=make_fil(filters)
    )
    result_ids = [int(x) for x in results['ids'][0]]
    return result_ids

In [12]:
# Пример использования
final_df.loc[search(user_query, filters),:]

Unnamed: 0_level_0,name,Для одного игрока,Family Library Sharing,MMO,Экшены,Инди,Симуляторы,Стратегии,Казуальные игры,Приключенческие игры,Ролевые игры,VR,Share/Split Screen,f2p,Co-op,Multiplayer,Racing/Sport
steam_appid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1665841,Wars Across The World: Cortenuova 1237,1,1,0,0,1,0,1,0,0,0,0,1,0,1,1,0
2517030,Wars Across The World: Dacia 101,1,1,0,0,1,0,1,0,0,0,0,1,0,1,1,0
2293520,Roman Empire vs. Barbarians,1,1,0,1,1,0,1,1,1,1,0,0,0,0,0,0
1640470,Steel Division 2 - Tribute to the Liberation o...,1,1,0,1,1,1,1,0,0,0,0,0,0,1,1,0
998830,The Last Roman Village,1,1,0,0,1,0,1,1,1,0,0,0,0,0,0,0
314970,Age of Conquest IV,1,0,0,0,1,0,1,0,0,0,0,1,1,1,1,0
554371,Wars Across the World: Carrhae 53,1,1,0,0,1,0,1,0,0,0,0,1,0,1,1,0
1884780,Unity of Command II - Desert Rats,1,1,0,0,1,1,1,0,0,0,0,1,0,1,1,0
1646380,Comrades and Barons: Gates of Freedom,1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0
1911150,Unity of Command II - Desert Fox,1,1,0,0,1,1,1,0,0,0,0,1,0,1,1,0
