In [1]:
import torch
import guidance
import pandas as pd
import random, csv

In [134]:
from glob import glob
import json
import os

def load_json_data(data_dir):
    """
    Parse every ``*.json`` file found directly inside ``data_dir``.

    Files are visited in sorted path order and decoded as ``utf-8-sig``
    (so a leading byte-order mark is tolerated).

    Args:
        data_dir: Directory path (string) that is searched with the
            pattern ``<data_dir>/*.json``.

    Returns:
        A list holding one parsed JSON value per file, in the same order
        as the sorted file paths. The list is empty when nothing matches.
    """
    json_paths = sorted(glob(data_dir + "/*.json"))

    parsed_docs = []
    for json_path in json_paths:
        with open(json_path, "r", encoding="utf-8-sig") as handle:
            # Each file contributes exactly one entry; contents are not flattened.
            parsed_docs.append(json.load(handle))
    return parsed_docs


# Parse every JSON file in the project data directory (one entry per file).
docs = load_json_data('/Project/json_data')

# Build one searchable string per record from its "title" and "contents" fields.
# Records that cannot be formatted are skipped, so positions in `document`
# do not necessarily match positions in `docs`.
document = []
skipped_docs = 0
for x in docs:
    try:
        document.append('제목: '+x["title"]+' 본문: '+x["contents"])
    except (KeyError, TypeError):
        # Only the expected data problems are tolerated: a missing
        # "title"/"contents" key, a non-string field value, or a record that
        # is not a mapping. Anything else (e.g. KeyboardInterrupt) propagates.
        skipped_docs += 1

# Make dropped records visible instead of discarding them silently.
if skipped_docs:
    print("Skipped", skipped_docs, "of", len(docs),
          "records without a usable title/contents pair")


In [135]:
from rank_bm25 import BM25Okapi
from konlpy.tag import Mecab
import pickle

In [136]:
from transformers import AutoTokenizer

In [137]:
import numpy as np

# Tokenizer loaded from the "klue/bert-base" checkpoint; it is reused later
# to tokenise queries the same way as the corpus.
tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")

# One list of token strings per passage: encode to ids without special
# tokens, then map the ids back to their token strings.
corpus = [
    tokenizer.convert_ids_to_tokens(
        tokenizer(passage, add_special_tokens=False)['input_ids']
    )
    for passage in document
]

# doc_id_ref[i] is the index into `document` of corpus row i.
doc_id_ref = list(range(len(document)))

# BM25 index over the tokenised corpus.
searcher = BM25Okapi(corpus)

Token indices sequence length is longer than the specified maximum sequence length for this model (1309 > 512). Running this sequence through the model will result in indexing errors


In [138]:
# Save the BM25 index to a file.
index_dir = os.path.join('/Project/index_save_dir/', 'sparse')

# Create the target directory when it does not exist yet; otherwise opening
# the file for writing fails with FileNotFoundError on a fresh environment.
os.makedirs(index_dir, exist_ok=True)

with open(os.path.join(index_dir, 'bm25_index_bert.pickle'), 'wb') as index_file:
    pickle.dump(searcher, index_file)

In [139]:
# Location of the pickled BM25 index to reload.
# (Alternative index file: '/Project/index_save_dir/sparse/bm25_index.pickle')
saved_index_path = '/Project/index_save_dir/sparse/bm25_index_bert.pickle'

# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# point this at index files produced by a trusted source.
index_file = open(saved_index_path, 'rb')
try:
    loaded_bm25 = pickle.load(index_file)
finally:
    index_file.close()

In [158]:
raw_query = '펩 과르디올라가 명장인 이유는?'

# Encode the query without special tokens, then turn the ids back into token
# strings so the query has the same form as the indexed corpus entries.
query_token_ids = tokenizer(raw_query, add_special_tokens=False)['input_ids']
query = tokenizer.convert_ids_to_tokens(query_token_ids)

# Score the query with the index that was reloaded from disk.
searcher = loaded_bm25
scores = searcher.get_scores(query)

In [159]:
# Number of passages to retrieve for the query.
top_k = 5

score_array = np.asarray(scores)

# Never ask for more hits than there are scored documents; np.argpartition
# raises when the requested position is out of bounds.
effective_k = min(top_k, len(score_array))

if effective_k > 0:
    # argpartition only guarantees that the k largest scores end up in the
    # last k slots, not their order. Sort those k candidates by descending
    # score so raw_reference_list[0] is the best hit, [1] the second, etc.
    candidates = np.argpartition(score_array, -effective_k)[-effective_k:]
    top_k_list = candidates[np.argsort(score_array[candidates])[::-1]]
else:
    top_k_list = np.array([], dtype=int)

# Map positions in the score vector back to positions in `document`.
top_k_indices = np.array(doc_id_ref, dtype=int)[top_k_list]

# Retrieved passage texts, ordered from highest to lowest BM25 score.
raw_reference_list = [document[idx] for idx in top_k_indices]

In [160]:
# Chat template for single-step extraction of relevant sentences.
# Layout: a system turn, one user turn that receives the {{query}} and
# {{passages}} variables, and an assistant turn whose generation is stored
# under the name "retrieved" (max_tokens=300, temperature=0.0, n=1).
retriever_prompt_ID_Str = '''

{{#system~}}

You are a human expert on football related knowledge.

{{~/system~}}

{{~#user}}

Select the "most relevant sentences" of the passage that could be useful in answering the query.
Simply return the original passage in a full sentence.
Selected sentences should contain enough information to answer the query.

Answer in Korean.

Query: {{query}}
Passage: {{passages}}

{{~/user}}

{{#assistant~}}

{{gen "retrieved" max_tokens=300 temperature=0.0 n=1}}

{{~/assistant}}

'''

In [161]:
from tqdm import tqdm 

# Read the OpenAI key from the environment instead of embedding it in the
# notebook. Secrets committed to a notebook leak through version control and
# shared copies; any key that was previously hardcoded here should be revoked.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this cell."
    )

# Chat model used by the guidance programs below; response caching disabled.
llm = guidance.llms.OpenAI("gpt-3.5-turbo", caching=False,
                            api_key = openai_api_key,
                            )
guidance.llm = llm

In [162]:
# Empty the LLM cache before running the program below.
guidance.llm.cache.clear()

# Only the passage at index 1 of the retrieved list is sent to the model.
selected_passage = raw_reference_list[1]

structure_program = guidance(
    retriever_prompt_ID_Str,
    query=raw_query,
    passages=selected_passage,
)

# Execute the program and keep its result.
res = structure_program()

In [163]:
# Two-round chat template.
# Round 1: the user turn supplies {{query}} and {{passages}} and asks for a
#          three-step extraction strategy; the assistant generation is stored
#          under "strategy" (list_append=True, temperature=0.0).
# Round 2: the user turn feeds {{query}} and {{strategy}} back (the passage is
#          not repeated) and the assistant generation is stored under
#          "retrieved" (max_tokens=500, temperature=0.0, n=1).
retriever_prompt_D_Str = '''

{{#system~}}

You are a human expert on football related knowledge.
All you answers should be in Korean

{{~/system~}}

{{~#user}}

You will be given a passage and a query.
Your job is to generate three step strategy that can be used to extract the most relevant fragment of the passage to the query.

query: {{query}}
passage: {{passages}}

{{~/user}}

{{#assistant~}}

{{gen "strategy" list_append=True temperature=0.0}}

{{~/assistant}}

{{~#user}}

Based on the given strategy, Answer the query. However, do not solely rely on it.
Bare in mind that you are given limited amount of knowledge. 
Therefore, do not rely on the strategy too much.
The strategy may or may not contain direct answer to the query.
 
Query: {{query}}

Strategy: {{strategy}}

All you answers should be in Korean

{{~/user}}

{{#assistant~}}

{{gen "retrieved" max_tokens=500 temperature=0.0 n=1}}

{{~/assistant}}
'''

In [164]:
from tqdm import tqdm 

# Read the OpenAI key from the environment instead of embedding it in the
# notebook. Secrets committed to a notebook leak through version control and
# shared copies; any key that was previously hardcoded here should be revoked.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this cell."
    )

# Chat model used by the guidance programs below; response caching enabled.
llm = guidance.llms.OpenAI("gpt-3.5-turbo", caching=True,
                            api_key = openai_api_key,
                            )
guidance.llm = llm

# Empty the cache right after (re)creating the model object.
guidance.llm.cache.clear()

In [165]:
# Only the passage at index 1 of the retrieved list is sent to the model.
selected_passage = raw_reference_list[1]

# Bind the two-round strategy template to the query and the chosen passage.
structure_program = guidance(
    retriever_prompt_D_Str,
    query=raw_query,
    passages=selected_passage,
)

# Execute the program and keep its result.
res = structure_program()

In [122]:
# Chat template for the final answer.
# The system turn carries the answering instructions. The user turn wraps the
# {{passages}} variable in [[[ ]]] and the {{query}} variable in ((( ))).
# The assistant generation is stored under 'answer'
# (max_tokens=200, temperature=1.0, n=1, list_append=True).
input_prompt ='''

{{#system~}}

Based on the references, please answer the query more succinctly and professionally in a full sentence. 
The provided references are likely to contain crucial information needed to answer the question. 
Each references may not be related to each other. Therefore, thoroughly check contents of each references to get useful information. 
The reference is delimited by triple brackets [[[]]]. The query is delimited by triple parentheses ((())). 
If you cannot find any useful relevant information in the references, please answer with your own knowledge. 
When asked with a question that possibly require an insight, carefully generate your answer based on the information given in the references. 
Please answer in full Korean sentence. 

{{~/system~}}


{{~#user}}

----
Reference: [[[{{passages}}]]], 
query: ((({{query}})))

{{~/user}}

{{#assistant~}}

{{gen 'answer' max_tokens=200 temperature=1.0 n=1 list_append=True}}

{{~/assistant}}

'''

In [213]:
# The 'retrieved' entry of the previous program result serves as the reference.
retrieved_reference = res['retrieved']

# Bind the answer template to the query and that reference text.
structure_program = guidance(
    input_prompt,
    query=raw_query,
    passages=retrieved_reference,
)

# Execute the program and keep its result.
res2 = structure_program()