In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import DataCollatorWithPadding
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import cupy as cp

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
%%time
# loading the contents and topics
contents = pd.read_csv('./data/content.csv')
topics = pd.read_csv('./data/topics.csv')

CPU times: user 4.68 s, sys: 430 ms, total: 5.11 s
Wall time: 8.06 s


In [33]:
# =========================================================================================
# Configurations
# =========================================================================================
class CFG:
    # Load model from HuggingFace Hub
    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
    model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
    num_workers = 28
    batch_size = 64
    top_n = 10
    seed = 42

In [18]:
# =========================================================================================
# Data Loading
# =========================================================================================
def read_data(cfg):
    contents = pd.read_csv('./data/content.csv')
    topics = pd.read_csv('./data/topics.csv')
    correlations = pd.read_csv('./data/correlations.csv')
    # Fillna titles
    topics['title'].fillna("", inplace = True)
    contents['title'].fillna("", inplace = True)
    # Fillna descriptions
    topics['description'].fillna("", inplace = True)
    contents['description'].fillna("", inplace = True)
    # Sort by title length to make inference faster
    topics['length'] = topics['title'].apply(lambda x: len(x))
    contents['length'] = contents['title'].apply(lambda x: len(x))
    topics.sort_values('length', inplace = True)
    contents.sort_values('length', inplace = True)
    # Drop cols
    topics.drop(['description', 'channel', 'category', 'level', 'language', 'parent', 'has_content', 'length'], axis = 1, inplace = True)
    contents.drop(['description', 'kind', 'language', 'text', 'copyright_holder', 'license', 'length'], axis = 1, inplace = True)
    # Reset index
    topics.reset_index(drop = True, inplace = True)
    contents.reset_index(drop = True, inplace = True)
    print(' ')
    print('-' * 50)
    print(f"topics.shape: {topics.shape}")
    print(f"content.shape: {contents.shape}")
    print(f"correlations.shape: {correlations.shape}")
    return topics, contents, correlations

In [6]:
# =========================================================================================
# Prepare input, tokenize
# =========================================================================================
def prepare_input(text, cfg):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors = None, 
        add_special_tokens = True, 
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype = torch.long)
    return inputs

In [9]:
# =========================================================================================
# Unsupervised dataset
# =========================================================================================
class uns_dataset(Dataset):
    
    def __init__(self, df, cfg):
        self.cfg = cfg
        self.texts = df['title'].values
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, item):
        inputs = prepare_input(self.texts[item], self.cfg)
        return inputs

In [29]:
# =========================================================================================
# Unsupervised model
# =========================================================================================
class uns_model(nn.Module):
    
    def __init__(self, pre_trained_model):
        super().__init__()
        self.model = pre_trained_model
    
    def pooling(self, inputs):
        # get model output
        model_output = self.model(**inputs)
        attention_mask = inputs['attention_mask']
        #First element of model_output contains all token embeddings
        token_embeddings = model_output[0] 
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    
    def forward(self, inputs):
        return self.pooling(inputs)
    
# =========================================================================================
# Get embeddings
# =========================================================================================
def get_embeddings(loader, model, device):
    model.eval()
    preds = []
    for step, inputs in enumerate(tqdm(loader)):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.to('cpu').numpy())
    preds = np.concatenate(preds)
    return preds

In [35]:
%%time
# Read data
topics, contents, correlations = read_data(CFG)
# Create topics dataset
topics_dataset = uns_dataset(topics, CFG)
# Create content dataset
content_dataset = uns_dataset(contents, CFG)
# Create topics and content dataloaders
topics_loader = DataLoader(
    topics_dataset, 
    batch_size = CFG.batch_size, 
    shuffle = False, 
    collate_fn = DataCollatorWithPadding(tokenizer = CFG.tokenizer, padding = 'longest'),
    num_workers = CFG.num_workers, 
    pin_memory = True, 
    drop_last = False
)
content_loader = DataLoader(
    content_dataset, 
    batch_size = CFG.batch_size, 
    shuffle = False, 
    collate_fn = DataCollatorWithPadding(tokenizer = CFG.tokenizer, padding = 'longest'),
    num_workers = CFG.num_workers, 
    pin_memory = True, 
    drop_last = False
)

 
--------------------------------------------------
topics.shape: (76972, 2)
content.shape: (154047, 2)
correlations.shape: (61517, 2)
CPU times: user 4.23 s, sys: 359 ms, total: 4.59 s
Wall time: 4.59 s


In [36]:
%%time
# Create model to extract embeddings
model = uns_model(CFG.model)
model.to(device)
# Predict topics
topics_preds = get_embeddings(topics_loader, model, device)
content_preds = get_embeddings(content_loader, model, device)
# Transfer predictions to gpu
# topics_preds_gpu = cp.array(topics_preds)
# content_preds_gpu = cp.array(content_preds)
# Release memory
torch.cuda.empty_cache()

  0%|                                                                                                                                                                                                                                                            | 0/1203 [00:00<?, ?it/s]You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a XLMRobe

CPU times: user 1min 3s, sys: 5.85 s, total: 1min 9s
Wall time: 1min 9s


In [39]:
## build dict to store the title embeddings
def build_dict_embeddings():
    pass

76972

In [64]:
topics.iloc[76969]

id                                          t_78a57f8f898c
title    10.1.2 Test for conduction of electricity by: ...
Name: 76969, dtype: object

In [66]:
topics_old.iloc[76968]

id                                                t_c613a6f04479
title          10.1.2 Test for conduction of electricity by: ...
description    10.1.2 Test for conduction of electricity by: ...
channel                                                   03de11
category                                                 aligned
level                                                          4
language                                                      en
parent                                            t_c2d7c7183ac9
has_content                                                 True
length                                                       199
Name: 59322, dtype: object

In [67]:
topics_old.iloc[76969]

id                                                t_78a57f8f898c
title          10.1.2 Test for conduction of electricity by: ...
description    10.1.2 Test for conduction of electricity by: ...
channel                                                   c7ca13
category                                                 aligned
level                                                          5
language                                                      en
parent                                            t_3c372ee894bb
has_content                                                 True
length                                                       199
Name: 36129, dtype: object

In [70]:
import scann

2023-02-14 23:11:04.878231: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-14 23:11:04.949253: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-02-14 23:11:05.241265: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.8/lib64
2023-02-14 23:11:05.241311: W tensorflow/compiler/xla/stream_executor/p

In [73]:
# use scann.scann_ops.build() to instead create a TensorFlow-compatible searcher
searcher = scann.scann_ops.builder(topics_preds, 10, "dot_product").tree(
    num_leaves=2000, num_leaves_to_search=100, training_sample_size=250000).score_ah(
    2, anisotropic_quantization_threshold=0.2).reorder(100).build()

2023-02-14 23:12:57.931389: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-14 23:12:57.931626: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-14 23:12:57.931663: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-14 23:12:57.931883: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuil

In [76]:
%%time
neighbors, distances = searcher.search_batched(content_preds)


CPU times: user 16.6 s, sys: 234 ms, total: 16.8 s
Wall time: 16.8 s


In [77]:
neighbors

<tf.Tensor: shape=(154047, 10), dtype=int32, numpy=
array([[    0,     1,    46, ...,  7493,  8578,  9169],
       [    0,     1,    46, ...,  7493,  8578,  9169],
       [    0,     1,    46, ...,  7493,  8578,  9169],
       ...,
       [76285, 73751, 65996, ..., 76746, 16435, 53456],
       [67389, 52171, 47938, ..., 55521, 40598, 39373],
       [45717,  5777, 16521, ..., 23452, 23553, 22751]], dtype=int32)>

In [78]:
distances

<tf.Tensor: shape=(154047, 10), dtype=float32, numpy=
array([[9.0941305, 9.0941305, 4.5122385, ..., 4.009285 , 4.009285 ,
        4.009285 ],
       [9.0941305, 9.0941305, 4.5122385, ..., 4.009285 , 4.009285 ,
        4.009285 ],
       [9.0941305, 9.0941305, 4.5122385, ..., 4.009285 , 4.009285 ,
        4.009285 ],
       ...,
       [7.907192 , 7.4944816, 7.233297 , ..., 6.998645 , 6.9758167,
        6.9589   ],
       [4.64818  , 4.5930767, 4.5774746, ..., 4.433792 , 4.433338 ,
        4.4333377],
       [3.8597376, 3.6365285, 3.4780874, ..., 3.3612852, 3.361285 ,
        3.3612847]], dtype=float32)>