In [None]:
import os
# Switch the working directory to the project folder so the relative
# './data/...' paths used in later cells resolve.
# NOTE(review): absolute path, presumably a mounted Google Drive on Colab — confirm.
os.chdir('/content/drive/MyDrive/workspace/Learning_Equality')

In [None]:
# Print the GPU, driver and CUDA version visible to this runtime.
!nvidia-smi

Sat Jan 21 15:42:19 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P0    28W /  70W |   2810MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Install cudf / dask-cudf / cuml (CUDA 11 wheels) using NVIDIA's extra package index.
# NOTE(review): versions are not pinned, so a re-run may resolve to different releases.
!pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
!pip install cuml-cu11 --extra-index-url=https://pypi.ngc.nvidia.com

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/, https://pypi.ngc.nvidia.com
^C


In [None]:
# Install / upgrade the NLP dependencies imported below (transformers, fuzzywuzzy)
# plus sentencepiece.
# NOTE(review): versions are not pinned, so a re-run may resolve to different releases.
!pip install -U transformers
!pip install sentencepiece
!pip install -U fuzzywuzzy

KeyboardInterrupt: ignored

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModel
import gc
import tqdm
import cudf
from cuml.neighbors import NearestNeighbors
from cuml.datasets import make_blobs
from sklearn.model_selection import KFold
from fuzzywuzzy import fuzz, process
import time
import seaborn as sns
from pathlib import Path

In [None]:
# Load the topics, content, correlations and sample-submission CSVs as DataFrames.
topics_df = pd.read_csv('./data/row/topics.csv')
content_df = pd.read_csv('./data/row/content.csv')
correlations_df = pd.read_csv('./data/row/correlations.csv')
sample_submission = pd.read_csv('./data/row/sample_submission.csv')

In [None]:
class CFG:
  """Notebook-wide configuration constants."""
  # Hugging Face model identifier used for the tokenizer and for both encoders.
  MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
  # Directory path for the pretrained model; not referenced by the visible cells.
  PRETRAINED_DIR = Path('/content/drive/MyDrive/workspace/Learning_Equality/paraphrase-multilingual-mpnet-base-v2')
  # Always a torch.device. Previously this was a plain 'cpu' string when CUDA was
  # unavailable, so the attribute's type depended on the machine.
  DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  # Random seed used for the KFold split.
  SEED = 42
  TRAIN_STEP = True
  # Upper bound on the number of nearest contents retrieved per topic.
  N_NEIGHBORS = 1000

In [None]:
# Replace every missing value with an empty string in both tables
# (get_path_list later relies on a root topic's parent being "").
content_df = content_df.fillna('')
topics_df = topics_df.fillna('')

In [None]:
# Display the topics table.
topics_df

Unnamed: 0,id,title,description,channel,category,level,language,parent,has_content
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",000cf7,source,4,bg,t_16e29365b50d,True
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,,b3f329,aligned,2,en,t_aa32fb6252dc,False
2,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,8e286a,source,4,pt,t_d14b6c2a2b70,True
3,t_00069b63a70a,Transcripts,,6e3ba4,source,3,en,t_4054df11a74e,True
4,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,000cf7,source,4,bg,t_e2452e21d252,True
...,...,...,...,...,...,...,...,...,...
76967,t_fffb0bf2801d,4.3 Graph of functions,,e77b55,aligned,4,en,t_676e6a1a4dc7,False
76968,t_fffbe1d5d43c,Inscribed shapes problem solving,Use properties of inscribed angles to prove pr...,0c929f,source,4,sw,t_50145b9bab3f,True
76969,t_fffe14f1be1e,Lección 7,,6e90a7,aligned,6,es,t_d448c707984d,True
76970,t_fffe811a6da9,تحديد العلاقة بين الإحداثيّات القطبية والإحداث...,5b9e5ca86571f90499ea987f,9fd860,source,2,ar,t_5b4f3ba4eb7d,True


In [None]:
# topics_id2title = {k:v for k, v in zip(topics_df.id.to_list(), topics_df.title.to_list())}
# topics_id2description = {k:v for k, v in zip(topics_df.id.to_list(), topics_df.description.to_list())}
# topics_id2parents = {k:v for k, v in zip(topics_df.id.to_list(), topics_df.parent.to_list())}
# topics_id2children = {}
# for k, v in topics_id2parents.items():
#   if v not in topics_id2children.keys():
#       topics_id2children[v]= [k]
#   else:
#     res = topics_id2children[v]
#     res.append(k)
#     topics_id2children[v] = res

In [None]:
def get_path_list(df):
    """Return, for every topic in ``df``, the titles of its ancestors as one string.

    The ancestors are found by following the ``parent`` column upwards from each
    topic. Their titles are joined root-first with ``" | "``; the topic's own
    title is not included, so a root topic gets an empty string.

    Args:
        df: DataFrame with ``id``, ``title`` and ``parent`` columns. A root topic
            is expected to have ``parent == ""``.

    Returns:
        list[str]: one path string per row of ``df``, in row order.

    The walk also stops when a parent id is not present in ``df`` (previously a
    ``KeyError``) or when an id is visited twice (previously an endless loop on
    a parent cycle).
    """
    topic_ids = df.id.to_list()
    title_by_id = dict(zip(topic_ids, df.title.to_list()))
    parent_by_id = dict(zip(topic_ids, df.parent.to_list()))

    path_list = []
    for topic_id in topic_ids:
        titles = []      # collected leaf -> root
        visited = set()  # guards against cycles in the parent links
        current = topic_id
        while current in title_by_id and current not in visited:
            visited.add(current)
            titles.append(title_by_id[current])
            current = parent_by_id[current]
        # Reverse to root -> leaf order, then drop the last entry (the topic itself).
        path_list.append(" | ".join(titles[::-1][:-1]))
    return path_list

In [None]:
# Add each topic's ancestor-title path as a new 'path' column.
topics_df['path'] = get_path_list(topics_df)

In [None]:
# Keep only the topics flagged as having content and renumber the rows from 0.
topics_df = topics_df.loc[topics_df.has_content].reset_index(drop=True)
# Remove the listed columns from both tables.
topics_df = topics_df.drop(columns=['channel', 'category', 'level', 'has_content'])
content_df = content_df.drop(columns=['kind', 'text', 'copyright_holder', 'license'])

In [None]:
def get_train_test_data(train):
    """Assign a cross-validation fold number to every row of ``train``.

    Rows whose ``id`` appears in ``sample_submission.topic_id`` keep ``fold == -1``;
    all other rows are spread over folds 0-4 with a shuffled ``KFold`` seeded by
    ``CFG.SEED``.

    Args:
        train: DataFrame with an ``id`` column. A ``fold`` column is added to it
            in place.

    Returns:
        The same DataFrame object, with the ``fold`` column filled in.
    """
    # Fix: select from the frame that was passed in. The original read the
    # global `topics_df` here, so any other argument was silently ignored.
    train_idx = train[~train.id.isin(sample_submission.topic_id)].index
    train["fold"] = -1
    # Assign cross-validation fold numbers.
    kf = KFold(n_splits=5, shuffle=True, random_state=CFG.SEED)
    for n, (_, val_index) in enumerate(kf.split(train_idx)):
        # val_index holds positions within train_idx; map them back to index labels.
        train.loc[train_idx[val_index], "fold"] = n
    return train
# Add the 'fold' column to the topics table.
topics_df = get_train_test_data(topics_df)

In [None]:
# Load the fast tokenizer for the configured model and register the separator
# token that is placed between a topic's title and its path in 'input_text'.
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL_NAME, use_fast=True)
tokenizer.add_tokens(["<|=t_sep=|>"], special_tokens=True)

1

In [None]:
import pytorch_lightning as pl
class FeedbackModel(pl.LightningModule):
    """Lightning module holding two separately loaded encoders: one for topics, one for content."""

    def __init__(self, tokenizer,  model_name):
        """Load both encoders from ``model_name`` and resize their embeddings to the tokenizer size.

        Args:
            tokenizer: tokenizer whose length determines the embedding table size.
            model_name: model identifier passed to ``AutoConfig`` / ``AutoModel``.
        """
        super().__init__()
        self.save_hyperparameters()

        self.tokenizer = tokenizer
        self.model_name = model_name

        encoder_config = AutoConfig.from_pretrained(model_name)

        # Both encoders start from the same pretrained weights and configuration.
        self.topics_encoder = AutoModel.from_pretrained(model_name, config=encoder_config)
        self.content_encoder = AutoModel.from_pretrained(model_name, config=encoder_config)

        # The tokenizer may contain added tokens, so grow the embedding tables to match.
        vocab_size = len(self.tokenizer)
        self.topics_encoder.resize_token_embeddings(vocab_size)
        self.content_encoder.resize_token_embeddings(vocab_size)

    def encode_topics(self, ids, mask):
        """Run the topics encoder on ``ids`` / ``mask`` and return its raw output."""
        return self.topics_encoder(ids, mask)

    def encode_content(self, ids, mask):
        """Run the content encoder on ``ids`` / ``mask`` and return its raw output."""
        return self.content_encoder(ids, mask)

In [None]:
# Read the "state_dict" entry of the fine-tuned fold_0 checkpoint.
# map_location="cpu" lets the file be read on a runtime without the device it was
# saved from; the embedding functions move the model to CFG.DEVICE before inference.
state_dict = torch.load("/content/drive/MyDrive/workspace/Learning_Equality/paraphrase-multilingual-mpnet-base-v2-finetuned/fold_0/epoch=0-step=6960.ckpt", map_location="cpu")["state_dict"]

In [None]:
# Build the two-encoder model from the base weights (embeddings resized to the tokenizer).
model = FeedbackModel(tokenizer, CFG.MODEL_NAME)

In [None]:
# Overwrite the base weights with the fine-tuned checkpoint weights.
model.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
def _embed_texts(tokenizer, data, net, encode_fn, batch_size):
    """Tokenize ``data`` in batches and average the encoder's last hidden state.

    Shared implementation behind ``get_topic_embeddings`` and
    ``get_content_embeddings``, which used to be two copies of the same code.

    Args:
        tokenizer: tokenizer called on each batch of strings.
        data: pandas Series of texts; missing values are replaced by ''.
        net: the model object; moved to ``CFG.DEVICE`` and put in eval mode.
        encode_fn: callable ``(input_ids, attention_mask) -> output`` whose result
            exposes ``last_hidden_state``.
        batch_size: number of texts per tokenizer call and per forward pass.

    Returns:
        numpy array with one embedding row per element of ``data``.
    """
    texts = list(data.fillna(''))

    token_outs = []
    # uniform dynamic padding: each batch is padded to its own longest sequence
    for start in tqdm.tqdm(range(0, len(texts), batch_size), desc='tokenization'):
        batch_tokens = tokenizer(texts[start:start + batch_size], truncation=True, padding=True, return_tensors='pt')
        token_outs.append(batch_tokens)

    outs = []
    net.to(CFG.DEVICE)
    net.eval()

    with torch.no_grad():
        for batch_tokens in tqdm.tqdm(token_outs, total=len(token_outs), desc='model output'):
            inputs = batch_tokens['input_ids'].to(CFG.DEVICE)
            attention_mask = batch_tokens['attention_mask'].to(CFG.DEVICE)
            # NOTE(review): .mean(1) is not weighted by attention_mask, so padded
            # positions presumably contribute to the average — confirm this matches
            # the pooling used when the checkpoint was fine-tuned.
            out = encode_fn(inputs, attention_mask).last_hidden_state.mean(1)
            outs.append(out.cpu().numpy())
    return np.concatenate(outs)


def get_topic_embeddings(tokenizer, data:pd.Series, batch_size:int=1000):
    """Embed topic texts with the topics encoder of the notebook-level ``model``.

    Args:
        tokenizer: tokenizer applied to the texts.
        data: Series of topic texts.
        batch_size: texts per batch (default 1000, the previously hard-coded value).

    Returns:
        numpy array of embeddings, one row per text.
    """
    # `model` is looked up at call time, so it must still be the FeedbackModel.
    return _embed_texts(tokenizer, data, model, model.encode_topics, batch_size)


def get_content_embeddings(tokenizer, data:pd.Series, batch_size:int=1000):
    """Embed content texts with the content encoder of the notebook-level ``model``.

    Args:
        tokenizer: tokenizer applied to the texts.
        data: Series of content texts.
        batch_size: texts per batch (default 1000, the previously hard-coded value).

    Returns:
        numpy array of embeddings, one row per text.
    """
    # `model` is looked up at call time, so it must still be the FeedbackModel.
    return _embed_texts(tokenizer, data, model, model.encode_content, batch_size)

In [None]:
import glob
save_name = f"./data/processed/embedding_data/content_title_emb_{CFG.MODEL_NAME.split('/')[1]}_finetuned.npy"
# Reuse the cached content-title embeddings when the file exists, otherwise compute
# and cache them. An explicit existence check replaces the bare `except:`, which
# also swallowed KeyboardInterrupt and hid real load errors behind a silent recompute.
# NOTE(review): the cache file name does not identify the checkpoint that produced
# it; delete the file after changing the model weights or content_df.
if Path(save_name).exists():
  contents_embedding = np.load(save_name)
else:
  # Make sure the target folder exists before the expensive computation starts.
  Path(save_name).parent.mkdir(parents=True, exist_ok=True)
  contents_embedding = get_content_embeddings(tokenizer, content_df.title)
  np.save(save_name, contents_embedding)

In [None]:
# Text fed to the topics encoder: title, the separator token added to the tokenizer, then the ancestor path.
topics_df['input_text'] = topics_df['title'] +"<|=t_sep=|>"+ topics_df['path']

In [None]:
import glob
save_name = f"./data/processed/embedding_data/topics_title_emb_{CFG.MODEL_NAME.split('/')[1]}_finetuned.npy"
# Reuse the cached topic embeddings when the file exists, otherwise compute and
# cache them. An explicit existence check replaces the bare `except:`, which also
# swallowed KeyboardInterrupt and hid real load errors behind a silent recompute.
# NOTE(review): the cache file name does not identify the checkpoint or the
# input_text format that produced it; delete the file after changing either.
if Path(save_name).exists():
  topics_embedding = np.load(save_name)
else:
  # Make sure the target folder exists before the expensive computation starts.
  Path(save_name).parent.mkdir(parents=True, exist_ok=True)
  topics_embedding = get_topic_embeddings(tokenizer, topics_df.input_text)
  np.save(save_name, topics_embedding)

In [None]:
# Wrap the topic embeddings in a cuDF DataFrame; it is the query data of the KNN search below.
topics_embedding_cudf = cudf.DataFrame(topics_embedding)

In [None]:
# Number of nearest contents to retrieve per topic (same value as the default declared in CFG).
CFG.N_NEIGHBORS=1000

In [None]:
# Per-language nearest-neighbour search: for every topic, retrieve the closest
# contents that share its language.
all_topics_lang = topics_df.language.to_numpy()
all_content_lang  =  content_df.language.to_numpy()
lang_list = np.unique(all_topics_lang)
knn_prd_idx = []
knn_prd_dis = []  # never filled; kept because a later cell deletes this name
for lang in lang_list:
    content_mask = all_content_lang==lang
    prd_data = topics_embedding_cudf[all_topics_lang==lang]
    fit_data = contents_embedding[content_mask]
    print(f"言語:{lang}, コンテンツ数:{fit_data.shape[0]}, トピック数:{prd_data.shape[0]}")
    # Positions (within content_df) of the contents in this language.
    content_idx = np.where(content_mask)[0]
    # A language with fewer contents than N_NEIGHBORS can only return that many.
    n_neighbors = min(CFG.N_NEIGHBORS, int(fit_data.shape[0]))
    # Named `knn_model` so the notebook-level `model` (the FeedbackModel that the
    # embedding functions read as a global) is no longer overwritten here.
    knn_model = NearestNeighbors(n_neighbors=n_neighbors)
    knn_model.fit(fit_data)
    prd_dis, prd_idx = knn_model.kneighbors(prd_data)
    prd_idx.index=prd_data.index
    prd_idx = prd_idx.to_pandas()
    # Translate per-language neighbour positions into content_df positions with a
    # single vectorized lookup instead of an element-wise applymap.
    prd_idx = pd.DataFrame(content_idx[prd_idx.to_numpy()], index=prd_idx.index, columns=prd_idx.columns)
    knn_prd_idx.append(prd_idx)
    del prd_data, fit_data, content_idx, prd_idx
# Rows from languages with fewer than N_NEIGHBORS contents are padded with NaN by concat.
knn_prd_idx = pd.concat(knn_prd_idx, axis=0).sort_index()
gc.collect()

言語:ar, コンテンツ数:7418, トピック数:3173
言語:as, コンテンツ数:641, トピック数:126
言語:bg, コンテンツ数:6050, トピック数:2420
言語:bn, コンテンツ数:2513, トピック数:1731
言語:en, コンテンツ数:65939, トピック数:28053
言語:es, コンテンツ数:30844, トピック数:11769
言語:fil, コンテンツ数:516, トピック数:224
言語:fr, コンテンツ数:10682, トピック数:3034
言語:gu, コンテンツ数:3677, トピック数:1809
言語:hi, コンテンツ数:4042, トピック数:1373
言語:it, コンテンツ数:1300, トピック数:722
言語:km, コンテンツ数:505, トピック数:104
言語:kn, コンテンツ数:501, トピック数:88
言語:mr, コンテンツ数:999, トピック数:239
言語:my, コンテンツ数:206, トピック数:110
言語:or, コンテンツ数:326, トピック数:51
言語:pl, コンテンツ数:319, トピック数:28
言語:pnb, コンテンツ数:184, トピック数:40
言語:pt, コンテンツ数:10435, トピック数:3425
言語:ru, コンテンツ数:188, トピック数:21
言語:sw, コンテンツ数:1447, トピック数:2082
言語:swa, コンテンツ数:495, トピック数:33
言語:ta, コンテンツ数:216, トピック数:44
言語:te, コンテンツ数:285, トピック数:66
言語:tr, コンテンツ数:225, トピック数:26
言語:ur, コンテンツ数:245, トピック数:54
言語:zh, コンテンツ数:3849, トピック数:672


20954

In [None]:
# Show the KNN candidate rows of the topics listed in sample_submission.
# NOTE(review): index labels are passed to the positional .iloc; this selects the
# intended rows only while topics_df keeps its default 0..n-1 index.
knn_prd_idx.iloc[topics_df[topics_df.id.isin(sample_submission.topic_id)].index,]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,130346,149306,9538,107323,37511,150352,125255,123312,55283,108905,...,105544.0,135647.0,131198.0,14255.0,130285.0,124472.0,31223.0,103258.0,68361.0,45134.0
1,82975,103703,12525,124747,99968,80127,37155,21499,40879,123795,...,62064.0,143530.0,68029.0,146103.0,71071.0,2482.0,27096.0,24219.0,141977.0,3597.0
2,24365,72305,89981,149972,84432,20422,23988,67749,10590,66358,...,64355.0,90125.0,53814.0,7078.0,89767.0,89979.0,76130.0,32922.0,115612.0,13814.0
3,39569,27262,95914,136312,55717,136988,59076,129788,19568,116553,...,40706.0,48390.0,130346.0,120852.0,28648.0,42967.0,59891.0,73457.0,130637.0,31016.0
15278,80296,32922,63735,10590,130,90395,49945,77447,77982,42093,...,100099.0,111374.0,39067.0,7246.0,73145.0,61959.0,100861.0,225.0,37387.0,113006.0


In [None]:
# Column arrays used to translate positions into ids.
all_content_lang = content_df.language.to_numpy()
all_content_title = content_df.title.to_numpy()
all_content_ids =  content_df.id.to_numpy()
all_topics_lang = topics_df.language.to_numpy()
all_topics_title = topics_df.title.to_numpy()
all_topics_ids = topics_df.id.to_numpy()

# One record per topic: its id and the space-joined ids of its KNN candidates.
pred_records = []
n_topics = len(knn_prd_idx)
for t_idx in tqdm.tqdm(range(n_topics), total=n_topics):
  neighbor_row = knn_prd_idx.iloc[t_idx].to_numpy()
  # NaN cells are padding for topics that have fewer candidates than columns.
  content_idx = neighbor_row[~np.isnan(neighbor_row)].astype(int)
  content_ids = all_content_ids[content_idx]
  pred_records.append(dict(topic_id=all_topics_ids[t_idx], content_ids=' '.join(content_ids)))
preds = pd.DataFrame.from_records(pred_records)

100%|██████████| 61517/61517 [00:48<00:00, 1263.22it/s]


In [None]:
# Drop the KNN intermediates that are no longer needed and reclaim their memory.
del content_idx, knn_prd_idx,knn_prd_dis
gc.collect()

0

In [None]:
# Copy the fold numbers onto preds; the assignment aligns on index labels, so it
# relies on preds and topics_df sharing the same row index.
preds['fold'] = topics_df['fold']

In [None]:
# SVM用のデータセットを作成する
preds_stack = preds.content_ids.str.split(' ', expand=True).stack().to_frame().reset_index(level=1)

In [None]:
# Name the candidate column, drop the helper column left by stack(), and attach
# the per-topic columns of preds (minus the joined candidate string).
preds_stack = (
    preds_stack
    .rename(columns={0:'content_id'})
    .drop(columns=['level_1'])
    .join(preds)
    .drop(columns=['content_ids'])
)

In [None]:
# Keep only the correlations of topics present in topics_df AND put them in
# topics_df row order. Later cells pair correlations_df rows with preds rows by
# position, which the plain isin() filter did not guarantee. A left merge on the
# topic ids gives exactly one aligned row per topic.
correlations_df = (
    topics_df[['id']]
    .rename(columns={'id': 'topic_id'})
    .merge(correlations_df, on='topic_id', how='left')
    .reset_index(drop=True)
)
# A topic without a correlations row gets an empty string so .split() still works.
correlations_df['content_ids'] = correlations_df['content_ids'].fillna('')

In [None]:
# Per-topic inputs for labelling: fold number, array of KNN candidate ids, and
# array of true content ids (each split from its space-separated string).
all_fold = preds.fold
all_knn_prediction = preds.content_ids.apply(lambda x:np.array(x.split(' '))).to_list()
all_correlation_contents = correlations_df.content_ids.apply(lambda x:np.array(x.split(' '))).to_list()

In [None]:
# Label every (topic, candidate) pair: membership of the candidate in the topic's
# true contents for training folds, and -1 for topics with fold == -1.
labels = []
n_topics = len(all_knn_prediction)
for i in tqdm.tqdm(range(n_topics), total=n_topics):
    truth, candidates = all_correlation_contents[i], all_knn_prediction[i]
    if all_fold[i] == -1:
        row_labels = np.array([-1] * len(candidates))
    else:
        row_labels = np.isin(candidates, truth)
    labels.append(row_labels)
# Flatten to one label per preds_stack row (same topic-major order).
labels = np.concatenate(labels)
preds_stack['label'] = labels

100%|██████████| 61517/61517 [00:03<00:00, 15422.06it/s]


In [None]:
# Attach the true content ids to preds; the assignment aligns on index labels,
# so row i of correlations_df must describe the same topic as row i of preds.
preds['true_content_ids'] = correlations_df["content_ids"]

In [None]:
import seaborn as sns
def calc_cm(df, target_col, pred_col, n_contents=154047):
  """Sum confusion-matrix counts over all rows of ``df``.

  Each row holds two whitespace-separated id strings: the true ids and the
  predicted ids. Both are treated as sets.

  Args:
    df: DataFrame containing both columns.
    target_col: name of the column with the true ids.
    pred_col: name of the column with the predicted ids.
    n_contents: total number of candidate ids per row, used for the TN count.
      Defaults to the previously hard-coded 154047 (presumably the number of
      rows in content_df — confirm).

  Returns:
    dict with the summed 'TP', 'TN', 'FP' and 'FN' counts.
  """
  prd = df[pred_col].apply(lambda x:x.split()).to_list()
  target = df[target_col].apply(lambda x:x.split()).to_list()
  res = {'TP':0, 'TN':0, 'FP':0, 'FN':0}
  for p, t in tqdm.tqdm(zip(prd, target), total=len(prd)):
    pred_set, true_set = set(p), set(t)
    true_positives = len(true_set & pred_set)
    false_positives = len(pred_set - true_set)
    false_negatives = len(true_set - pred_set)
    # Fix: a true negative is in neither set. The old formula subtracted only
    # FP + FN, so every true positive was also counted as a true negative.
    true_negatives = n_contents - (true_positives + false_positives + false_negatives)
    res['TP'] += true_positives
    res['TN'] += true_negatives
    res['FP'] += false_positives
    res['FN'] += false_negatives
  return res
# Confusion-matrix totals of the KNN candidates against the true content ids.
res = calc_cm(preds, 'true_content_ids','content_ids')

100%|██████████| 61517/61517 [00:12<00:00, 4888.93it/s]


In [None]:
def calc_score(true_ids, pred_ids):
  """Return the fraction of ``true_ids`` that also appear in ``pred_ids`` (recall).

  Both arguments are treated as sets. An empty ``true_ids`` yields 0.0 instead
  of the ZeroDivisionError raised before.
  """
  true_set = set(true_ids)
  if not true_set:
    return 0.0
  # TP + FN equals the number of distinct true ids.
  return len(true_set & set(pred_ids)) / len(true_set)
def calc_score_mean(target_df, pred_df):
  """Compute the per-row ``calc_score`` and its mean.

  Args:
    target_df: DataFrame whose 'content_ids' column holds the true ids as
      whitespace-separated strings; rows are looked up by labels 0..n-1.
    pred_df: DataFrame with predicted ids in the same format and labels. A
      'score' column is added to it in place.

  Returns:
    tuple (mean score, pred_df).
  """
  shape = target_df.shape
  score = [calc_score(target_df.loc[i, 'content_ids'].split(), pred_df.loc[i, 'content_ids'].split()) for i in range(shape[0])]
  pred_df['score'] = score
  return pred_df['score'].mean(), pred_df
# Mean per-topic score of the KNN candidates; t_df is preds with a 'score' column added.
score, t_df = calc_score_mean(correlations_df, preds)

In [None]:
# Display the mean score (fraction of true contents found among the candidates).
score

0.904084565383088

In [None]:
# Two-way lookups between row index and id, for topics and for contents.
topics_index2id = topics_df.id.to_dict()
topics_id2index = {topic_id: row for row, topic_id in topics_index2id.items()}
contents_index2id = content_df.id.to_dict()
contents_id2index = {content_id: row for row, content_id in contents_index2id.items()}

In [None]:
# Wrap the content embeddings in a cuDF DataFrame (saved to disk further below).
contents_embedding_cudf = cudf.DataFrame(contents_embedding)

In [None]:
# Split the pair table: rows with a fold number are training data, fold == -1 is test.
train_df = preds_stack.loc[preds_stack.fold != -1].reset_index(drop=False)
test_df = preds_stack.loc[preds_stack.fold == -1].reset_index(drop=False)

# Translate topic / content ids into row indices via the lookup dicts.
train_to_idx = train_df.topic_id.apply(topics_id2index.__getitem__).to_numpy()
train_co_idx = train_df.content_id.apply(contents_id2index.__getitem__).to_numpy()
test_to_idx = test_df.topic_id.apply(topics_id2index.__getitem__).to_numpy()
test_co_idx = test_df.content_id.apply(contents_id2index.__getitem__).to_numpy()

# The full pair table is no longer needed once it has been split.
del preds_stack
gc.collect()

50

In [None]:
import datetime
# Output folder named after today's date (YYYY_MMDD).
now = datetime.datetime.now()
datasave_dir = Path(f"data/processed/{now.strftime('%Y_%m%d')}")
# parents=True also creates data/processed when it does not exist yet, instead
# of failing with FileNotFoundError.
datasave_dir.mkdir(parents=True, exist_ok=True)
# np.save(f"{datasave_dir}/train_df", train_df.to_numpy())
train_df.to_csv(f"{datasave_dir}/train_df.csv")

In [None]:
# Save the topic embeddings as a numpy array in the dated output folder.
np.save(datasave_dir/ "topics_embedding", topics_embedding_cudf.to_numpy())

In [None]:
# Save the content embeddings as a numpy array in the dated output folder.
np.save(datasave_dir/ "contents_embedding", contents_embedding_cudf.to_numpy())

In [None]:
# Bundle the topic / content row-index arrays of the training pairs for saving.
train_idx_dict = {'train_to_idx':train_to_idx,
 'train_co_idx':train_co_idx}

In [None]:
import pickle
# Write the training index arrays to disk. The `with` block closes the file
# handle, which the previous inline open(...) never did.
with open(datasave_dir / 'train_idx.pkl', mode="wb") as f:
    pickle.dump(train_idx_dict, f)