In [1]:

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sentencetransformers/model/sentence-transformers/paraphrase-multilingual-mpnet-base-v2/config.json
/kaggle/input/sentencetransformers/model/sentence-transformers/paraphrase-multilingual-mpnet-base-v2/pytorch_model.bin
/kaggle/input/sentencetransformers/tokenizer/sentence-transformers/paraphrase-multilingual-mpnet-base-v2/tokenizer.json
/kaggle/input/sentencetransformers/tokenizer/sentence-transformers/paraphrase-multilingual-mpnet-base-v2/tokenizer_config.json
/kaggle/input/sentencetransformers/tokenizer/sentence-transformers/paraphrase-multilingual-mpnet-base-v2/special_tokens_map.json
/kaggle/input/sentencetransformers/tokenizer/sentence-transformers/paraphrase-multilingual-mpnet-base-v2/sentencepiece.bpe.model
/kaggle/input/learning-equality-curriculum-recommendations/sample_submission.csv
/kaggle/input/learning-equality-curriculum-recommendations/topics.csv
/kaggle/input/learning-equality-curriculum-recommendations/correlations.csv
/kaggle/input/learning-equality-curr

In [2]:
import transformers
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from pathlib import Path
from fuzzywuzzy import fuzz, process
import math

In [3]:
class Config:
    """Static settings shared by the notebook (paths, device, retrieval size)."""

    # Directory holding the encoder weights (config.json / pytorch_model.bin).
    model = Path('/kaggle/input/sentencetransformers/model/sentence-transformers/paraphrase-multilingual-mpnet-base-v2/')
    # Directory holding the matching tokenizer files.
    tokenizer = Path('/kaggle/input/sentencetransformers/tokenizer/sentence-transformers/paraphrase-multilingual-mpnet-base-v2/')
    # Always a torch.device, so callers get the same type with or without a GPU
    # (previously the CPU fallback was the bare string 'cpu').
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # Number of nearest contents to fetch from the Annoy index per query.
    nearest_content_count = 400
    

In [4]:
# Load the competition tables.
# DataFrame.fillna returns a NEW frame; the original cell called it without
# keeping the result, so missing values were never actually replaced. Chain it
# onto the read so that missing titles/parents become '' as intended.
df_topic = pd.read_csv(r"/kaggle/input/learning-equality-curriculum-recommendations/topics.csv").fillna('')
df_correlations = pd.read_csv(r"/kaggle/input/learning-equality-curriculum-recommendations/correlations.csv")
df_content = pd.read_csv(r"/kaggle/input/learning-equality-curriculum-recommendations/content.csv").fillna('')
df_sample = pd.read_csv(r"/kaggle/input/learning-equality-curriculum-recommendations/sample_submission.csv")



In [5]:
# Key the topics table by topic id so rows can be addressed with .at[topic_id, column].
df_topic = df_topic.set_index(keys="id")

In [6]:
################
def tree(ids, lst, topics=None):
    """Append the titles on the path root -> ``ids`` to ``lst`` and return it.

    The function walks up the ``parent`` column recursively until it reaches
    an id that is not in the index (the parent of a root topic), then appends
    titles on the way back down, so the result is ordered root first.

    Side effect: for ``ids`` and each of its ancestors, the ``lst_parent``
    cell is set to that topic's own breadcrumb. A *copy* is stored: the
    original code stored ``lst`` itself and then kept appending to it, so
    every ancestor row ended up sharing (and showing) a descendant's list.

    Parameters
    ----------
    ids : hashable
        Topic id to resolve. Anything not present in the index ends the walk.
    lst : list
        Accumulator that titles are appended to (mutated in place).
    topics : pandas.DataFrame, optional
        Frame indexed by topic id with ``parent``, ``title`` and
        ``lst_parent`` columns. Defaults to the notebook-level ``df_topic``.

    Returns
    -------
    list
        The same ``lst`` object, extended with the titles root -> ``ids``.
    """
    if topics is None:
        topics = df_topic

    # Unknown id (e.g. the missing parent of a root topic): nothing to add.
    if ids not in topics.index:
        return lst

    # Resolve the ancestors first so that titles come out root-first.
    lst = tree(topics.at[ids, "parent"], lst, topics)
    lst.append(topics.at[ids, "title"])
    # Snapshot, so later appends for descendants do not leak into this row.
    topics.at[ids, "lst_parent"] = list(lst)
    return lst
    
    
#     (df_topic[index]==ids).any()==False
    

In [7]:
# Attach to every topic its breadcrumb: the list of titles from the root of
# its tree down to the topic itself.
df_topic["lst_parent"] = "NULL"  # tree() writes into this column, so it must exist first

# tree() also writes lists into ancestor rows while it recurses. Collecting
# every breadcrumb first and assigning the whole column afterwards guarantees
# that each row ends up holding exactly its own breadcrumb.
# tree() always appends at least the topic's own title for an id taken from
# the index, so the former `len(lst) == 0` branch (and its debug print) could
# never run and has been dropped.
breadcrumbs = [tree(topic_id, []) for topic_id in df_topic.index]
df_topic["lst_parent"] = pd.Series(breadcrumbs, index=df_topic.index, dtype=object)
        


In [8]:
# Load the tokenizer and the encoder from the local directories configured in Config.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    Config.tokenizer,
    use_fast=True,
)
model = transformers.AutoModel.from_pretrained(
    Config.model,
)

In [9]:
# Move the encoder to the configured device, then report its size and the
# names/shapes of its last few parameter tensors.
model.to(Config.device)

named_params = list(model.named_parameters())
print('models paramters:', sum(weights.numel() for _, weights in named_params))
params_count = len(named_params)
# Same selection as `i > params_count - 10`: the final nine entries.
for name, param in named_params[max(params_count - 9, 0):]:
    print(name, param.requires_grad, param.shape)

models paramters: 278043648
encoder.layer.11.attention.output.LayerNorm.bias True torch.Size([768])
encoder.layer.11.intermediate.dense.weight True torch.Size([3072, 768])
encoder.layer.11.intermediate.dense.bias True torch.Size([3072])
encoder.layer.11.output.dense.weight True torch.Size([768, 3072])
encoder.layer.11.output.dense.bias True torch.Size([768])
encoder.layer.11.output.LayerNorm.weight True torch.Size([768])
encoder.layer.11.output.LayerNorm.bias True torch.Size([768])
pooler.dense.weight True torch.Size([768, 768])
pooler.dense.bias True torch.Size([768])


In [10]:
def get_embeddings(tokenizer, data: pd.Series, batch_size: int = 1000,
                   encoder=None, device=None) -> np.ndarray:
    """Embed each text in ``data`` with attention-mask-weighted mean pooling.

    NOTE: a later cell defines ``get_embeddings`` again; when the notebook is
    run top to bottom, that later definition replaces this one.

    Parameters
    ----------
    tokenizer : callable
        Tokenizer called as ``tokenizer(list_of_str, truncation=True,
        padding=True, return_tensors='pt')``; must return ``input_ids`` and
        ``attention_mask`` tensors.
    data : pandas.Series
        Texts to embed; missing values are replaced by ``''``.
    batch_size : int, default 1000
        Number of texts tokenized and encoded together.
    encoder : optional
        Model to run; defaults to the notebook-level ``model``.
    device : optional
        Device to run on; defaults to ``Config.device``.

    Returns
    -------
    numpy.ndarray
        One row per input text, in input order.

    Raises
    ------
    ValueError
        If ``data`` is empty (``np.concatenate`` has nothing to join).
    """
    encoder = model if encoder is None else encoder
    device = Config.device if device is None else device
    texts = list(data.fillna(''))

    token_outs = []
    # Padding is dynamic: each batch is padded to its own longest sequence.
    for start in tqdm(range(0, len(texts), batch_size), desc='tokenization'):
        batch_tokens = tokenizer(texts[start:start + batch_size], truncation=True,
                                 padding=True, return_tensors='pt')
        token_outs.append(batch_tokens)

    outs = []
    encoder.to(device)
    encoder.eval()

    with torch.no_grad():
        for batch_tokens in tqdm(token_outs, total=len(token_outs), desc='model output'):
            input_ids = batch_tokens['input_ids'].to(device)
            attention_mask = batch_tokens['attention_mask'].to(device)
            hidden = encoder(input_ids, attention_mask=attention_mask).last_hidden_state
            # Average over real tokens only. A plain .mean(1) would also
            # average the hidden states of padding positions, so a text's
            # embedding would depend on how long its batch neighbours are.
            mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
            outs.append(pooled.cpu().numpy())
    return np.concatenate(outs)

In [11]:
from fuzzywuzzy import fuzz, process
from annoy import AnnoyIndex

# max_corpus_size=1100000

# Use the content id as the index of the content table.
df_content = df_content.set_index(keys="id")

In [12]:
# df_content['title'].duplicated().any()

In [13]:
# df_content[df_content.duplicated(['title'], keep=False)]

In [14]:
# Number of trees used for Annoy. More trees => better recall, worse run-time.
n_trees = 128
# Neighbour count; in the visible cells it is only mentioned in commented-out lines.
top_k_hits = 400

In [15]:
def get_embeddings(tokenizer, data: pd.Series, batch_size: int = 1000,
                   encoder=None, device=None) -> np.ndarray:
    """Embed each text in ``data`` with attention-mask-weighted mean pooling.

    Parameters
    ----------
    tokenizer : callable
        Tokenizer called as ``tokenizer(list_of_str, truncation=True,
        padding=True, return_tensors='pt')``; must return ``input_ids`` and
        ``attention_mask`` tensors.
    data : pandas.Series
        Texts to embed; missing values are replaced by ``''``.
    batch_size : int, default 1000
        Number of texts tokenized and encoded together.
    encoder : optional
        Model to run; defaults to the notebook-level ``model``.
    device : optional
        Device to run on; defaults to ``Config.device``.

    Returns
    -------
    numpy.ndarray
        One row per input text, in input order.

    Raises
    ------
    ValueError
        If ``data`` is empty (``np.concatenate`` has nothing to join).
    """
    encoder = model if encoder is None else encoder
    device = Config.device if device is None else device
    texts = list(data.fillna(''))

    encoder.to(device)
    encoder.eval()

    pooled_batches = []
    with torch.no_grad():
        # Tokenize and encode one batch at a time so that only a single
        # batch of token tensors is alive at once. Padding is dynamic: each
        # batch is padded to its own longest sequence.
        for start in range(0, len(texts), batch_size):
            batch_tokens = tokenizer(texts[start:start + batch_size], truncation=True,
                                     padding=True, return_tensors='pt')
            input_ids = batch_tokens['input_ids'].to(device)
            attention_mask = batch_tokens['attention_mask'].to(device)
            hidden = encoder(input_ids, attention_mask=attention_mask).last_hidden_state
            # Average over real tokens only. A plain .mean(1) would also
            # average the hidden states of padding positions, so a text's
            # embedding would depend on how long its batch neighbours are.
            mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
            pooled_batches.append(pooled.cpu().numpy())
    return np.concatenate(pooled_batches)


In [16]:
# One embedding row per content title, in the row order of df_content.
contents_embedding = get_embeddings(tokenizer, df_content["title"])


In [17]:
# for i in range(len(contents_embedding)):
#         annoy_index.add_item(i, contents_embedding[i])

# annoy_index.build(n_trees)
from annoy import AnnoyIndex
# Index every content embedding under its row number, then build the forest.
annoy_index = AnnoyIndex(contents_embedding.shape[1], metric='angular')
for row in tqdm(range(len(contents_embedding)), total=len(contents_embedding)):
    annoy_index.add_item(row, contents_embedding[row])
annoy_index.build(n_trees)

  0%|          | 0/154047 [00:00<?, ?it/s]

True

In [18]:
# Turn the 'id' index back into a regular column; rows are then addressed by
# their default integer label again.
df_content.reset_index(level="id", inplace=True)

In [19]:
# Spot check: show the content id stored at row label 32137.
df_content.loc[32137, "id"]

'c_35515f242f70'

In [20]:
# data1=[]

# nearest_content_count=400
# for i in df_sample.topic_id:
#     print(i)
#     lst1=df_topic.at[i,"lst_parent"]
#     j=(len(lst1)-1)
#     l=10
#     data=[]
#     while j>=0 and l>0:
#         emb1=pd.Series(lst1[j])
#         emb=get_embeddings(tokenizer,emb1)
#         res=[]
#         data2=[]

#         for i, t_e in (enumerate(emb)):
#             res =annoy_index.get_nns_by_vector(t_e,nearest_content_count)
#         #res=annoy_index.get_nns_by_vector(emb,top_k_hits,include_distances=False)
#         iter=res[:8]
#         for it in iter:
#             data2.append(df_content.at[it,"id"])
#         c=df_topic[df_topic.title==lst1[j]].index
#         for k in res:

#             if(df_topic.at[c[0],"language"]==df_content.at[k,"language"]):

#                 fuzzy_value = fuzz.token_set_ratio(df_topic.at[c[0],"title"],df_content.at[k,"title"])
#                 if fuzzy_value>70:
#                     data.append(df_content.at[k,"id"])
#                     l-=1
#             if(l==0):
#                 break
        
#         j-=1
#     if(len(data)==0):
#         data1.append(' '.join(data2))
#     else:
#         data1.append(' '.join(data))
    

In [21]:
# Tunables for the retrieval + fuzzy filtering stage.
nearest_content_count = Config.nearest_content_count  # Annoy neighbours fetched per title
MAX_MATCHES = 10       # stop once this many contents have been confirmed for a topic
FALLBACK_COUNT = 8     # neighbours submitted when nothing passes the fuzzy filter
FUZZY_THRESHOLD = 70   # fuzz.token_set_ratio must exceed this value


def _nearest_content_rows(title, count):
    """Return the df_content row labels of the ``count`` contents nearest to ``title``."""
    query = get_embeddings(tokenizer, pd.Series([title]))[0]
    return annoy_index.get_nns_by_vector(query, count)


# One space-separated string of content ids per topic in the sample submission.
data1 = []
for topic_id in df_sample.topic_id:
    breadcrumb = df_topic.at[topic_id, "lst_parent"]
    # Filter on the language of the topic being answered. The previous code
    # re-found a topic by title and used the first hit, which can be a
    # different topic (and language) that merely shares the title, and it
    # scanned the whole topics frame twice per level to do so.
    topic_language = df_topic.at[topic_id, "language"]

    matched = []     # fuzzy-confirmed content ids, in discovery order, no duplicates
    fallback = None  # nearest neighbours of the first title tried

    # Walk the breadcrumb from its last entry towards the root.
    for title in reversed(breadcrumb):
        if len(matched) >= MAX_MATCHES:
            break

        rows = _nearest_content_rows(title, nearest_content_count)

        # Keep the fallback from the first title only; previously it was
        # overwritten at every level and ended up holding the neighbours of
        # the last level processed.
        if fallback is None:
            fallback = [df_content.at[row, "id"] for row in rows[:FALLBACK_COUNT]]

        if not isinstance(title, str):
            continue  # missing title: nothing to compare against

        for row in rows:
            if df_content.at[row, "language"] != topic_language:
                continue
            content_title = df_content.at[row, "title"]
            if not isinstance(content_title, str):
                continue  # missing content title
            if fuzz.token_set_ratio(title, content_title) <= FUZZY_THRESHOLD:
                continue
            content_id = df_content.at[row, "id"]
            if content_id in matched:
                continue  # already found at a previous level
            matched.append(content_id)
            if len(matched) == MAX_MATCHES:
                break

    data1.append(' '.join(matched if matched else (fallback or [])))
    

In [22]:
# Pair every sample topic id with its space-separated content ids.
dataa = dict(topic_id=df_sample.topic_id, content_ids=data1)
df = pd.DataFrame(dataa)

In [23]:
# Write the submission file into the working directory, without the row index.
df.to_csv(path_or_buf="submission.csv", index=False)