In [4]:
import json
import random
import os

import torch

from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes.retriever import ElasticsearchRetriever, DensePassageRetriever

from tqdm import tqdm

import pandas as pd
import numpy as np

In [5]:
from typing import List, Dict
# 파이썬 내장 자료구조에 대해 타입을 명시하기 위해 사용 !

In [6]:
ROOT_PATH = './data/ALL/jsonl/'
train = pd.read_json(ROOT_PATH + 'train.jsonl', lines=True)
val = pd.read_json(ROOT_PATH + 'val.jsonl', lines=True)

In [7]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(42)

In [8]:
"""
    iterrows() 개념 : 데이터 전처리를 진행할 때  
    df 행에 반복적으로 접근하면서 값을 추출하거나 조작할 때 사용
    이 때 효율적으로 접근하기 위해 사용
"""

import math
MAX_SEQ_LEN= 512
TURN= 5

def transform_data(data: pd.DataFrame, query_nm= 'query', answer_nm= 'answer', max_length= MAX_SEQ_LEN, turn_cnt= TURN) -> List[Dict]:
    data_list= []

    for idx, i in data.iterrows():
        for j in i['specific_query_list']:
            rel_context= ''
            rel_feature= []
            for start, end in j['relevant_text_span']: # relevant text span은 [[1, 2], [12, 15]] 이런식으로 되어 있음
                rel_dialogue= i['meeting_transcripts'][int(start): int(end)+1]
                rel_context= '\n'.join(['{}:{}'.format(turn['speaker'], turn['content']) for turn in rel_dialogue]).strip().lower()

                for m in range(0, len(rel_dialogue), turn_cnt):
                    tmp_feature= '\n'.join(['{}:{}'.format(t['speaker'], t['content']) for t in rel_dialogue[m:m+turn_cnt]]).strip().lower()
                    rel_feature.append(tmp_feature)
            
            j['rel_context']= rel_context.strip()
            j['question']= j[query_nm].lower()
            j['answers']= [j[answer_nm].lower()]
            # print(j['answers'])
            j['rel_feature']= rel_feature

            data_list.append(j)
    print(f'data length: {len(data_list)}')
    return data_list



In [9]:
train_dataset = transform_data(train, query_nm='query', answer_nm='answer', max_length=MAX_SEQ_LEN, turn_cnt= TURN)
val_dataset = transform_data(val, query_nm='query', answer_nm='answer', max_length=MAX_SEQ_LEN, turn_cnt= TURN)

data length: 1095
data length: 237


In [10]:
train_dataset[0].keys()

dict_keys(['query', 'answer', 'relevant_text_span', 'rel_context', 'question', 'answers', 'rel_feature'])

In [11]:
"""
    이제 BM25를 통해서 answer와 유사한 relevant feature topk개를 뽑기
"""

'\n    이제 BM25를 통해서 answer와 유사한 relevant feature topk개를 뽑기\n'

In [12]:
# feature_document_store_es = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
# TOPK_FEATURE= 5 # 아래 보니까 TOPK개가 안나올수도 있는 것 같음

In [13]:
# topk_train_dataset= []
# for idx in tqdm(range(len(train_dataset))):
#     retrieve_dataset= []
#     for c in train_dataset[idx]['rel_feature']:
#         retrieve_dataset.append(dict(content= c, meta= {'question': train_dataset[idx]['question'], 'answers': train_dataset[idx]['answers']}))
#     # print(retrieve_dataset)
#     feature_document_store_es.delete_documents()
#     feature_document_store_es.write_documents(retrieve_dataset)
#     feature_retrieve_es= ElasticsearchRetriever(document_store= feature_document_store_es)
#     # print(train_dataset[idx]['answers'])
#     retrieved_feature = feature_retrieve_es.retrieve(query= train_dataset[idx]['answers'][0], top_k=TOPK_FEATURE, index='document')
#     if len(retrieved_feature) < TOPK_FEATURE:
#         print(len(retrieved_feature),train_dataset[idx]['question'])
#     for i in range(len(retrieved_feature)):
#         topk_train_dataset.append(dict(content= retrieved_feature[i].content, meta= {'question': train_dataset[idx]['question'], 'answers': train_dataset[idx]['answers']}))
#         # print(retrieved_feature[i].content)

# with open('./topk_dataset.json', 'w') as f:
#     json.dump(topk_train_dataset, f, indent= 4)

# # save rel topk feature, query, summary, [neg_sample -> 이건 다 쪼개서 해도 될 듯: answer와 가장 유사한 친구들..]


In [14]:
# dict(content= retrieved_feature[i].content, meta= {'question': train_dataset[idx]['question'], 'answers': train_dataset[idx]['answers']})

In [15]:
def load_data(path):
    with open(path, 'r') as f:
        data= json.load(f)
    
    return data

feature_train_dataset= load_data('./data/topk_dataset.json')
print(feature_train_dataset[0].keys())
print(len(feature_train_dataset))

document_store_es = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
document_store_es.delete_documents()
document_store_es.write_documents(feature_train_dataset)
retriever_es = ElasticsearchRetriever(document_store=document_store_es)



dict_keys(['content', 'meta'])
4176




In [16]:
# dict(content= retrieved_feature[i].content, meta= {'question': train_dataset[idx]['question'], 'answers': train_dataset[idx]['answers']})

pos_train_dataset= []

pos_ctxs_list= []
for i in tqdm(range(len(feature_train_dataset))):
    pos_ctxs_list.append(feature_train_dataset[i]['content'])
    if i == len(feature_train_dataset)-1 :
        pos_train_dataset.append(dict(content= feature_train_dataset[i]['meta']['answers'], meta= {'question': feature_train_dataset[i]['meta']['question'], 'pos_ctxs': pos_ctxs_list}))
    elif feature_train_dataset[i]['meta']['question'] != feature_train_dataset[i+1]['meta']['question']: 
        pos_train_dataset.append(dict(content= feature_train_dataset[i]['meta']['answers'], meta= {'question': feature_train_dataset[i]['meta']['question'], 'pos_ctxs': pos_ctxs_list}))
        pos_ctxs_list= []
    


100%|██████████| 4176/4176 [00:00<00:00, 608110.74it/s]


In [17]:
def create_dpr_training_dataset(data, retriever, num_hard_negative_ctxs= 15):
    passage_id= 0
    n_non_added_questions= 0 # neg sample이 추가되지 않은 개수
    n_questions= 0

    dpr_dataset= []

    for d in tqdm(data, unit= 'article'):
        question= d['meta']['question']
        answers= d['content']
        pos_ctxs_list= d['meta']['pos_ctxs']

        pos_ctxs= []
        for pos in pos_ctxs_list:
            pos_ctxs.append({'title': '', 'text': pos, 'passage_id': passage_id})

    
        retrieved_docs = retriever.retrieve(query=question, top_k=num_hard_negative_ctxs, index='document')

        hard_neg_ctxs= []
        for retrieved_doc in retrieved_docs:
            if retrieved_doc.meta['question'] == question:
                # print(retrieved_doc.meta['question'] == question)
                continue
            hard_neg_ctxs.append({'title': '', 'text': retrieved_doc.content, 'passage_id': passage_id})

        passage_id +=1

        if len(hard_neg_ctxs)==0:
            n_non_added_questions+=1
            continue

        dict_DPR = {
                    "question": question,
                    "answers": answers,
                    "positive_ctxs": pos_ctxs,
                    "negative_ctxs": [],
                    "hard_negative_ctxs": hard_neg_ctxs,
                    }   
        
        n_questions +=1
        dpr_dataset.append(dict_DPR)
    print(n_non_added_questions, n_questions)
    return dpr_dataset


In [18]:
num_hard_negative_ctxs = 10
DPR_dataset = create_dpr_training_dataset(pos_train_dataset, retriever_es, num_hard_negative_ctxs=num_hard_negative_ctxs)

100%|██████████| 1095/1095 [00:07<00:00, 142.91article/s]

0 1095





In [19]:
with open('./data/qmsum_dpr.train.json', 'w') as f:
    json.dump(DPR_dataset, f, indent= 4, ensure_ascii=False)

In [20]:
total_nb_questions = len(DPR_dataset)
total_nb_questions 

1095

In [21]:
retriever_es.retrieve(query=DPR_dataset[0]['question'], top_k=1)

[<Document: {'content': 'project manager:{vocalsound} and how did you feel about the whole the whole process though ?\nmarketing:oh , overall i mean i thought we did a good job like {disfmarker}\nuser interface:{vocalsound}\nmarketing:we got to choose {disfmarker} basically we had control over {disfmarker} minus it being just merely a t_v_ remote we got to choose what we wanted to do with it .\nproject manager:right , and we got say over what {disfmarker} how technologically advanced it should be and also how fashionable , which i kind of like {disfmarker}', 'content_type': 'text', 'score': 0.9234859004277541, 'meta': {'question': 'what aspects did the team like when evaluating the whole production process?', 'answers': ['marketing commented on the overall process as fairly satisfying, since they had control over most of the detailed design of the remote, how it should be advanced as well as fashionable. as for teamwork, industrial designer and user interface thought they worked well t

In [22]:
retrieve_dataset = []
for c in train_dataset:
    retrieve_dataset.append(dict(content=c['rel_context'], meta={'question': c['question'], 'answers': c['answers'][0]}))
    
for c in val_dataset:
    retrieve_dataset.append(dict(content=c['rel_context'], meta={'question': c['question'], 'answers': c['answers'][0]}))

In [23]:
document_store = FAISSDocumentStore(sql_url="sqlite://", similarity="dot_product", faiss_index_factory_str="Flat")
document_store.write_documents(retrieve_dataset)

Writing Documents:   0%|          | 0/1332 [00:00<?, ?it/s]

In [24]:
retriever = DensePassageRetriever(document_store=document_store,
                                  query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                  passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
                                  max_seq_len_query=256,
                                  max_seq_len_passage=MAX_SEQ_LEN)

INFO - haystack.modeling.utils -  Using devices: CUDA:0
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-question_encoder-single-nq-base locally.
INFO - haystack.modeling.model.language_model -  Looking on Transformers Model Hub (in local cache and online)...
INFO - haystack.modeling.model.language_model -  Loaded facebook/dpr-question_encoder-single-nq-base
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.
INFO - haystack.modeling.model.language_model -  LOADING MODEL
INFO - haystack.modeling.model.language_model -  Could not find facebook/dpr-ctx_encoder-single-nq-base locally.
IN

In [25]:
document_store.update_embeddings(retriever)

INFO - haystack.document_stores.faiss -  Updating embeddings for 1324 docs...


Updating Embedding:   0%|          | 0/1324 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/1328 [00:00<?, ? Docs/s]

In [26]:
retriever.retrieve(query=DPR_dataset[0]['question'], top_k=1)

[<Document: {'content': 'user interface:and {disfmarker} and one more feature is we we have a holder for this remote which is an oyster shape .', 'content_type': 'text', 'score': 0.6629450372573744, 'meta': {'question': 'what did industrial designer say about the appearance during the discussion about the design of the prototype?', 'answers': 'the prototype was attractive, bright blue and snail shaped with buttons in different colours such as yellow. it was compact so it could easily fit in the hand and buttons could be easily accessed. moreover, the material for the case would be plastic but the buttons would be made with soft rubber. for the light emitting diode of the led, it would be fluorescent green and it would be a bulb like an ordinary infrared. last but not least, there would be an oyster-shaped holder for the remote.', 'vector_id': '183'}, 'embedding': None, 'id': '2fb61196a6f1cb6bedfb58f05722cfad'}>]

In [28]:
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
retriever.train(
    data_dir='./data',
    train_filename='qmsum_dpr.train.json',
    n_epochs=3,
    batch_size=2,
    grad_acc_steps=32,
    save_dir='./train_qmsum_dpr',
    # evaluate_every=50,
    # embed_title=False,
    num_positives=3,
    num_hard_negatives=3,
    learning_rate=1e-5,
    # weight_decay=0.01,
)

INFO - haystack.modeling.data_handler.data_silo -  
Loading data into the data silo ... 
              ______
               |o  |   !
   __          |:`_|---'-.
  |__|______.-/ _ \-----.|       
 (o)(o)------'\ _ /     ( )      
 
INFO - haystack.modeling.data_handler.data_silo -  LOADING TRAIN DATA
INFO - haystack.modeling.data_handler.data_silo -  Loading train set from: data/qmsum_dpr.train.json 
INFO - haystack.modeling.data_handler.data_silo -  Got ya 7 parallel workers to convert 1095 dictionaries to pytorch datasets (chunksize = 32)...
INFO - haystack.modeling.data_handler.data_silo -   0    0    0    0    0    0    0 
INFO - haystack.modeling.data_handler.data_silo -  /|\  /w\  /w\  /w\  /w\  /w\  /w\
INFO - haystack.modeling.data_handler.data_silo -  /'\  /'\  /'\  /'\  /'\  / \  /'\
Preprocessing Dataset data/qmsum_dpr.train.json: 100%|██████████| 1095/1095 [00:02<00:00, 389.30 Dicts/s]
INFO - haystack.modeling.data_handler.data_silo -  
INFO - haystack.modeling.data_handler

RuntimeError: CUDA out of memory. Tried to allocate 36.00 MiB (GPU 0; 7.93 GiB total capacity; 7.14 GiB already allocated; 27.31 MiB free; 7.15 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
val_dataset

doc_match= 0

for val in tqdm(val_dataset):
    ret_result = retriever.retrieve(val['question'], top_k = 50)

    for ret in ret_result:
        if val['rel_context'] == ret.content:
            doc_match+=1
            break;
        
print(doc_match / len(val_dataset) * 100)

100%|██████████| 237/237 [00:11<00:00, 21.52it/s]

39.24050632911392





In [None]:
!nvidia-smi

Sat Mar 12 21:01:04 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA Quadro P...  Off  | 00000000:02:00.0 Off |                  N/A |
|100%   90C    P0    36W / 105W |   8062MiB /  8118MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces