In [1]:
# Print the NVIDIA driver / GPU status of the machine running this notebook.
!nvidia-smi

Sat Nov 25 00:25:11 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.147.05   Driver Version: 525.147.05   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  On   | 00000000:8C:00.0 Off |                    0 |
| N/A   28C    P0    26W / 250W |      4MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Библиотеки

In [2]:
import pickle

import numpy as np
import pandas as pd
import polars as pl

import faiss

from tqdm.auto import tqdm

In [3]:
import torch

from make_embeds import get_embs

# Данные

In [4]:
# Publication timestamps from the features table, de-duplicated on
# (video_id, v_pub_datetime).
features = (
    pl.read_parquet('./features.parquet', columns=['video_id', 'v_pub_datetime'])
    .sort('v_pub_datetime')
    .unique()
)

# Video titles, left-joined with the features timestamps; the joined timestamp
# column arrives as `v_pub_datetime_right`. Rows missing either timestamp are
# dropped and the result is ordered by video_id.
videos = (
    pl.read_parquet('./videos.parquet', columns=['video_id', 'video_title', 'v_pub_datetime'])
    .sort('v_pub_datetime')
    .join(features, on='video_id', how='left')
    .filter(
        pl.col('v_pub_datetime').is_not_null()
        & pl.col('v_pub_datetime_right').is_not_null()
    )
    .sort('video_id')
)
videos

video_id,video_title,v_pub_datetime,v_pub_datetime_right
str,str,"datetime[ns, +00:00]",str
"""video_0""","""Bonus 3-6 in 3…",2023-05-05 10:43:01 +00:00,"""2023-05-05 10:…"
"""video_1""","""Sean Paul feat…",2023-05-03 01:15:49 +00:00,"""2023-05-03 01:…"
"""video_10""","""Пластилиновый …",2023-05-08 15:19:34 +00:00,"""2023-05-08 15:…"
"""video_100""","""Суп из хребтов…",2023-05-09 07:32:25 +00:00,"""2023-05-09 07:…"
"""video_1000""","""Майский вечер.…",2023-05-23 20:26:25 +00:00,"""2023-05-23 20:…"
"""video_10000""","""SEKIRO Пожило…",2022-10-25 14:45:08 +00:00,"""2022-10-25 14:…"
"""video_10000000…","""В ЗАРЕВЕ ЗАКАТ…",2014-03-25 09:42:49 +00:00,"""2014-03-25 09:…"
"""video_10000001…","""горнолыжный тр…",2014-03-17 22:03:12 +00:00,"""2014-03-17 22:…"
"""video_10000002…","""ՆՈՐ ԵՐԳ! Karen…",2014-03-26 15:41:15 +00:00,"""2014-03-26 15:…"
"""video_10000003…","""ԱՆՑՈՒԴԱՐՁ 27.0…",2014-03-30 08:10:02 +00:00,"""2014-03-30 08:…"


In [5]:
# Only the video ids of the automarkup table are needed at this stage.
automarkup = pl.read_parquet(
    './automarkup.parquet',
    columns=['video_id'],
)
automarkup.head()

video_id
str
"""video_3049590"""
"""video_3049590"""
"""video_3049590"""
"""video_31592942…"
"""video_31592942…"


In [6]:
# Candidate ids: the last 1,000,000 rows of `videos` plus every id that occurs
# in automarkup, de-duplicated and sorted.
video_ids = sorted(
    set(videos["video_id"].tail(1_000_000).to_list())
    | set(automarkup["video_id"].to_list())
)
len(video_ids)

1381401

In [None]:
# Select the candidate rows once and derive both arrays from that single frame,
# so titles and ids are aligned row-for-row. (The identical `is_in` filter over
# the full id list used to be evaluated twice.)
candidate_videos = videos.filter(pl.col('video_id').is_in(video_ids))

# Titles are lower-cased and stripped before being embedded.
sentences = np.array([title.lower().strip() for title in candidate_videos["video_title"].to_list()])

# From here on `video_ids` is a NumPy array whose position i corresponds to
# sentences[i]; ids absent from `videos` are no longer part of it.
# NOTE(review): the ids keep the same lower()/strip() normalisation as before;
# the evaluation cells compare them against raw automarkup ids -- confirm the
# ids never contain upper-case characters or surrounding whitespace.
video_ids = np.array([video_id.lower().strip() for video_id in candidate_videos["video_id"].to_list()])

# Получение векторов для названий

In [7]:
# Embed every candidate title with the LaBSE en-ru checkpoint on the GPU,
# 128 titles per batch.
title_labse_768 = get_embs(
    "cointegrated/LaBSE-en-ru",
    batch_size=128,
    device=torch.device("cuda"),
    sentences=sentences,
)

  0%|          | 0/10769 [00:00<?, ?it/s]

In [8]:
# Turn the value returned by get_embs into a NumPy array (the name is rebound).
title_labse_768 = np.array(title_labse_768)

# Построение Faiss

In [9]:
# Build an exact (flat) inner-product index over the title embeddings.
# The dimensionality is read from the embedding matrix instead of being
# hard-coded: the previous literal (1024) contradicted the 768 noted beside it,
# and faiss refuses vectors whose width differs from the index dimension.
assert title_labse_768.ndim == 2, "expected a 2-D (n_titles, dim) embedding matrix"
d = title_labse_768.shape[1]
index = faiss.IndexFlatIP(d)
# faiss stores float32 vectors; this is a no-op when the matrix already is
# C-contiguous float32.
index.add(np.ascontiguousarray(title_labse_768, dtype=np.float32))

In [42]:
# Persist the index to disk; the search section reloads it from this path
# with faiss.read_index.
faiss.write_index(index, 'labse_candidates.index')

# Получение векторов для запросов

In [10]:
# Reload the full automarkup table (all columns this time), order it by
# `datetime`, keep only the last row seen for each distinct query, and retain
# the final 300,000 rows.
automarkup = (
    pl.read_parquet('./automarkup.parquet')
    .sort('datetime')
    .unique(subset='query', keep='last', maintain_order=True)
    .tail(300_000)
)

In [11]:
# Embed the queries with the same checkpoint and the same text normalisation
# (lower-case + strip) that was applied to the titles.
query_labse_768 = get_embs(
    "cointegrated/LaBSE-en-ru",
    batch_size=128,
    device=torch.device("cuda"),
    sentences=[text.lower().strip() for text in automarkup["query"].to_list()],
)

  0%|          | 0/2344 [00:00<?, ?it/s]

In [12]:
# Turn the value returned by get_embs into a NumPy array (the name is rebound).
query_labse_768 = np.array(query_labse_768)

In [10]:
# Cache the query embeddings on disk; the search section reads them back from
# the same path.
with open('query_labse_768.pickle', 'wb') as f:
    pickle.dump(query_labse_768, f)

# Получение ближайших названий для запросов

In [11]:
# Reload the index from the path it was written to with faiss.write_index.
index = faiss.read_index('labse_candidates.index')

In [12]:
# Reload the cached query embeddings.
# NOTE: pickle.load can execute arbitrary code -- only load files that were
# produced by this notebook.
with open('query_labse_768.pickle', 'rb') as f:
    query_labse_768 = pickle.load(f)

In [15]:
# Create faiss GPU resources and copy the CPU index to GPU device 0.
res = faiss.StandardGpuResources()
index_gpu = faiss.index_cpu_to_gpu(res, 0, index)

In [20]:
# Fetch the 100 nearest index entries for every query embedding.
# I holds row positions in the index (the evaluation cells use them to index
# video_ids); D holds the matching scores -- presumably inner products, given
# the IndexFlatIP built earlier (confirm for the reloaded index).
D, I = index_gpu.search(query_labse_768, 100)

In [None]:
# Persist both search outputs (scores D and index positions I), each to its
# own pickle file.
for _payload, _target in (
    (D, 'D_query_labse_768.pickle'),
    (I, 'I_query_labse_768.pickle'),
):
    with open(_target, 'wb') as f:
        pickle.dump(_payload, f)

# Тестирование

In [22]:
# Hit rate: the share of queries whose labelled video id appears anywhere among
# the retrieved candidates (row i of I belongs to query i).
_expected = automarkup["video_id"].to_list()
q = len(_expected)
p = sum(
    expected_id in video_ids[I[row]]
    for row, expected_id in enumerate(tqdm(_expected))
)
p / q  # author's recorded value: 0.34075666666666665

  0%|          | 0/300000 [00:00<?, ?it/s]

0.3381133333333333

In [23]:
# Mean reciprocal rank: each query contributes 1 / (1-based rank of its
# labelled video among the retrieved candidates), or nothing when it is absent.
p, q = 0, 0
for row, expected_id in enumerate(tqdm(automarkup["video_id"].to_list())):
    ranked = enumerate(video_ids[I[row]], start=1)
    # First matching position wins; a miss adds 0.
    p += next((1 / pos for pos, found in ranked if found == expected_id), 0)
    q += 1
p / q  # author's recorded value: 0.11580963784777484

  0%|          | 0/300000 [00:00<?, ?it/s]

0.1326422903107275