In [1]:
# Show the installed llama_index version so the environment this notebook
# was executed against is recorded alongside its outputs.
import llama_index
llama_index.__version__

'0.9.12'

### Load Library

In [2]:
# Patch asyncio via nest_asyncio so an already-running event loop can be
# re-entered. Presumably required because the notebook kernel runs its own
# loop while later cells trigger async calls -- TODO confirm.
import nest_asyncio
nest_asyncio.apply()

# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies API credentials (e.g. for the OpenAI
# client used further down) -- verify which keys are expected.
from dotenv import load_dotenv
load_dotenv()

# Suppress all Python warnings for the rest of the session. Be aware this
# also hides deprecation notices from the libraries imported below.
import warnings
warnings.filterwarnings("ignore")

# Query Engine & Retrievers
from engine import local_llm_model
from engine import load_db
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.retrievers import QueryFusionRetriever
from llama_index.retrievers import AutoMergingRetriever
from llama_index import set_global_service_context

# Node Postprocessors
from llama_index.postprocessor import (
    SentenceTransformerRerank,
    LongContextReorder,
    # KeywordNodePostprocessor,
    LLMRerank
)
from postprocess import (
    DuplecatedNodePostprocessor,
    UnionNodePostprocessor,
    LimitRetrievedNodesLength, 
    UnionNodePostprocessorSortedScore
)

## Prompt
from llama_index.prompts import PromptTemplate
from llama_index.prompts.prompt_type import PromptType
from prompts import QUERY_GEN_PROMPT
from prompts import DEFAULT_TEXT_QA_PROMPT_TMPL

# Display
from utils import pprint_response_title

### Initialize

In [3]:
# --- LLM checkpoint ------------------------------------------------------
# Exactly one MODEL_NAME is active; the commented lines are alternatives
# that were tried previously.
MODEL_NAME = "google/gemma-7b-it"
# MODEL_NAME = "google/gemma-7b"
# MODEL_NAME = "TheBloke/SOLAR-10.7B-v1.0-GPTQ"
# MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
# MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
# MODEL_NAME = "Qwen/Qwen-14B-Chat-Int4"
# MODEL_NAME = "Qwen/Qwen1.5-7B-Chat-GPTQ-Int4"
# MODEL_NAME = "Qwen/Qwen1.5-14B-Chat-GPTQ-Int4"

# --- Embedding checkpoint ------------------------------------------------
# The trailing "256 : ..s, 512 : ..s" figures are the original author's
# notes (presumably chunk size : elapsed time -- TODO confirm).
embed_name = "BAAI/bge-small-en"
# embed_name = "BAAI/bge-small-en-v1.5" # 256 : 39s, 512 : 23.7s
# embed_name = "thenlper/gte-base" # 256 : 27.1s, 512 : 18.4s # a None error occurs.. not sure why
# embed_name = "jamesgpt1/sf_model_e5" # 256 : 41s, 512 : 18s # embeddings come out broken
# embed_name = "WhereIsAI/UAE-Large-V1"

# Build the service context, embedding model and LLM through the project
# helper, then register the service context as the llama_index global.
service_context, embed_model, llm = local_llm_model(
    MODEL_NAME,
    embed_name,
    type_="local",  # 'openai' is the other value that was tried here
    token_counter=False,
)
set_global_service_context(service_context)

# Load the vector index and its storage context for the chosen embedding.
index, storage_context = load_db(embed_name, embed_size="h2")  # h2

Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.25it/s]


### Engine

In [4]:
# Imports needed only by this cell, grouped up front.
from llama_index import ServiceContext
from llama_index.llms import OpenAI
from llama_index.postprocessor import MetadataReplacementPostProcessor

# --- Project-defined and metadata-based post-processors -------------------
w_post_processor = MetadataReplacementPostProcessor(target_metadata_key="window")
d_post_processor = DuplecatedNodePostprocessor()
u_post_processor = UnionNodePostprocessor()
u_post_processor2 = UnionNodePostprocessorSortedScore()

# --- Sentence-transformer reranker (keeps the 40 top-ranked nodes) --------
# NOTE(review): the recorded output of this cell warns that the classifier
# weights for this checkpoint were "newly initialized" and that the model
# should be trained before use. That suggests the checkpoint has no trained
# classification head, so the rerank scores may be unreliable -- confirm,
# and consider a checkpoint trained for reranking.
s_post_processor = SentenceTransformerRerank(
    model="sentence-transformers/all-mpnet-base-v2",  # avsolatorio/GIST-Embedding-v0
    top_n=40,
)

# --- Ordering and length limiting ----------------------------------------
l_post_processor = LongContextReorder()
t_post_processor = LimitRetrievedNodesLength(limit=4000)

# --- LLM-based reranker backed by an OpenAI chat model --------------------
# It uses its own ServiceContext rather than the global one set earlier.
_openai_rerank_context = ServiceContext.from_defaults(
    llm=OpenAI(temperature=0, model="gpt-3.5-turbo-16k"),
)
llm_post_processor = LLMRerank(
    # choice_select_prompt =
    service_context=_openai_rerank_context,
    top_n=14,
)

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Query sent to the retrieval pipeline in the cells below.
#
# Keep exactly ONE assignment active. Earlier revisions assigned `query`
# several times in a row, so only the last assignment ever took effect;
# the overwritten ones are now commented out with the other alternatives.
query = "Find me a paper that utilizes Phase Angle for athletes"

# --- Alternatives (uncomment one and comment out the line above) ----------
# query = "Does anybody have experiences with patients with epilepsy? We have a customer who's clients often use an NVS (nervus vagus stimulator) for their epilepsy. Do we know if they can be measured with InBody? Does the measurement affect the NVS?"
# query = "Effect of InBody measurement on patients with epilepsy using NVS"
# query = "Find me a text that shows overestimate muscle mass or fat free mass and underestimate fat mass when measured with InBody equipment compared to DXA or DEXA."
# query = "Find me a InBody paper that utilizes Phase Angle for athletes"
# query = "Can the catheter affect impedance when measuring BIA?"
# query = "Hi all, We have a question, just to be sure: What should we advise in case of the same impedance values in the trunk 1kHz and 5kHz? This isn't an inversion of impedance values, will the measurement thus be accurate? Thanks in advance!"
# query = "Has hypertonic dehydration resulted in an increased ECW Ratio?"
# query = "Hi all, do we have studies that compare body composition / fat percentage measured with Caliper/skinfold measurements with InBody in athletes?"
# query = "Find papers comparing skinfold, caliper measurements, and InBody"
# query = "What is inbody 770?"
# query = "find papers related to InBody BWA 2.0"
# query = "InBody BWA 2.0 research papers"
# query = "InBody BWA 2.0 technology review articles"
# query = "Find the athlete's body composition measurements with InBody 970"
# query = "Find me a paper on epilepsy?"
#
# The two options below need a `questions` mapping, which is not defined in
# the visible cells unless the commented-out JSON-loading cell is re-enabled.
# query = questions['Q7']
# query = questions['Q11']

In [10]:
# import json

# with open('../questions/InBody_questions.json', 'r') as f:
#    questions = json.load(f)
   
# # questions

In [11]:
# Prompt Template
# from prompts import DEFAULT_TREE_SUMMARIZE_TMPL
#
# Three templates are defined here and passed to `query_engine` by keyword.
# Each one is filled through the `{context_str}` and `{query_str}` fields.
#
# NOTE: DEFAULT_TEXT_QA_PROMPT_TMPL is also imported from `prompts` in the
# import cell; the assignment below replaces that imported value.

# Per-chunk question answering prompt.
DEFAULT_TEXT_QA_PROMPT_TMPL = """
"Answer queries without using prior knowledge based on a given Context"
Context :
{context_str}
Query : {query_str}
Answer : 
"""

# Selection prompt: the model is asked to return a Python list holding the
# numbers of the titles that answer the query. The cell that runs the query
# engine parses the returned text as a list, so the examples must show a
# bare list.
#
# Every sentence now ends with an explicit "\n". Previously, adjacent string
# literals were concatenated with no separator ("article.Given",
# "[].Example", "like below.------"), and the first worked example said
# "0 and 2" while showing [0, 1].
#
# NOTE(review): "include a ## separated title in your answer" appears to
# conflict with "answer in the format of python list"; the wording is kept,
# but confirm which behaviour is intended.
DEFAULT_TREE_SUMMARIZE_TMPL = (
    "The context below is the title of the article, and the answer to the query referencing that article.\n"
    "Given the information from multiple sources and not prior knowledge, answer the query.\n"
    "When answering, be sure to include a ## separated title in your answer.\n"

    "Please answer in the format of python list.\n"
    "Returns the number preceding every title that can answer the query.\n"
    "If no number is relevant, answer with an empty list [].\n"

    "Example format: \n"
    "----------------------\n"
    "## 0. Title : <Title 0>\n"
    "<Answer about query of <Title 0>>\n"
    "## 1. Title : <Title 1>\n"
    "<Answer about query of <Title 1>>\n"
    "...\n\n"
    "## n. Title : <Title n>\n"
    "<Answer about query of <Title n>>\n"

    "If you choose the numbers 0 and 1, then the answer is [0, 1], like below.\n"
    "----------------------\n"
    "Query : <query>\n"
    "Answer : \n"
    "[0, 1]"
    "\n\n"

    "If you choose just the number 0, then the answer is [0], like below.\n"
    "----------------------\n"
    "Query : <query>\n"
    "Answer : \n"
    "[0]"
    "\n\n"

    "Let's try this now: \n\n"
    "----------------------\n"
    "{context_str}\n"
    "----------------------\n"
    "Query : {query_str}\n"
    "Answer : \n"
)

# Final answer prompt: asks for a numbered, context-grounded answer.
FINAL_QA_PROMPT_TMPL = """
Base your answer on the context.
If NO context is given, Don't use your prior knowledge and answer with something like "no context was given".
Or, If the query is different from what you know or context, you can answer with "Different answer based on my knowledge and context Because..."
Please provide a numbered response to the papers you found, as shown below.

----------------------
Context : 
<context>
Query : 
<query>
Answer : 
the first answer is ...

the second answer is ...

the third answer is ...
----------------------

"Let's try this now:"
----------------------
Context :
{context_str}
Query : {query_str}
Answer : 
"""


In [12]:
# query engine
# Retrieve, post-process and answer `query`, then print the final answer
# followed by the source documents the model selected.
import ast
import textwrap

from engine import query_engine, retriever_engine
from utils import sim_sentence_extract

retriever = retriever_engine(top_k = 40, storage_context=storage_context, retrieve_mode='hierarchical', index=index)

# query_engine returns four values:
#   final_response   - answer text (printed below)
#   response         - text that parses to a list of keys into num_content_dict
#   num_content_dict - mapping from those keys to the retrieved content objects
#   nodes            - the retrieved nodes, displayed as the cell output
final_response, response, num_content_dict, nodes, = query_engine(
    retriever,
    query,
    node_postprocessors = [d_post_processor, u_post_processor2, s_post_processor],
    service_context = service_context,
    DEFAULT_TEXT_QA_PROMPT_TMPL = DEFAULT_TEXT_QA_PROMPT_TMPL,
    DEFAULT_TREE_SUMMARIZE_TMPL = DEFAULT_TREE_SUMMARIZE_TMPL,
    FINAL_QA_PROMPT_TMPL = FINAL_QA_PROMPT_TMPL
    )

print(textwrap.fill(final_response, width=70))
print()
print('Sources : ')

# `response` is model-generated text, so it must not be executed as code.
# ast.literal_eval only accepts Python literals (e.g. "[0, 1]") and raises
# ValueError/SyntaxError on anything else, where eval() would have run it.
# strip() tolerates surrounding whitespace/newlines in the model output.
selected_ids = ast.literal_eval(response.strip())
for idx, i in enumerate(selected_ids):
    print(f"{idx}. Title : {num_content_dict[i].metadata['file_name']}")
    # 350 is passed through to the project helper; presumably a length limit
    # for the extracted snippet -- TODO confirm against utils.sim_sentence_extract.
    pprint_res = sim_sentence_extract(query, num_content_dict[i], 350)
    print(textwrap.fill(f"Text : {pprint_res}", width=70))

nodes

Generated queries:
1. "Phase Angle measurement in athletes research paper"
2. "Application of Phase Angle in sports performance study"
The paper that utilizes Phase Angle for athletes is the second paper
titled "Cell integrity indicators assessed by bioelectrical impedance
A systematic review of studies involving athletes (720, 770,
2019).pdf". In this paper, the author reviewed studies involving
athletes and found that PA values can be reduced during and after the
competition, as well as after muscle injuries or body weight
reduction.

Sources : 
0. Title : 2020_S10_Whole body and regional phase angle as indicators of muscular performance in athletes.pdf
Text :   ## Title : 2020_S10_Whole body and regional phase angle as
indicators of muscular performance in athletes.pdf ##  Full Terms &
Conditions of access and use can be found at https://www.tandfonline.c
om/action/journalInformation?journalCode=tejs20 European Journal of
Sport Science ISSN: (Print) (Online) Journal homepage:
https:

[NodeWithScore(node=TextNode(id_='dd30d57f-ff07-4de3-86b5-c9c6f8c546fb', embedding=None, metadata={'file_name': '2022_720_An innovative approach to functional spiroergometric examination among power athletes.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='faad546911e86a6f525e5fe34d999d97179337b1cadd7661ac18ec24708b906f', text='\n ## Title : 2022_720_An innovative approach to functional spiroergometric examination among power athletes.pdf ##\n\ndoi: \n10.1519/JSC.0000000000000981. \nMarček T., Dzurenková D., Bohuš B., Gulán Ľ., Hájko vá M., Hostýn V., Meško D., & Novotná E. \n(2007). Telovýchovné lekárstvo . Univerzita Komenského, Bratislava. \nMcGuigan, M. (2017). Developing power.  Human Kinetics, Inc. \nMoroščák, J., Ružbarský, P., Balint, G., & Vodicka,  T. (2013). Anaerobic and aerobic fitness of ice ho ckey \nplayers throughout annual training cycle. Gymnasium, Scientific Journal of Education, Sports and Health, \n14 (2), 86–91. \nPas