<a href="https://colab.research.google.com/github/Nov05/Google-Colaboratory/blob/master/20231214_L3_Sentence_window_retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **\<top\>**  

* modified by nov05 on 2023-12-14  
* go to [the course](https://learn.deeplearning.ai/building-evaluating-advanced-rag/lesson/4/sentence-window-retrieval)
* watch [the video](https://dft3h5i221ap1.cloudfront.net/LlamaIndex-TrueEra/C1/video/LITE_L3.mp4)    

In [6]:
# Fetch the lesson files from the shared Google Drive folder, copy them into
# /content, then remove the downloaded folder (presumably gdown creates
# /content/l3_files, given the paths used by the next two commands — confirm).
# NOTE(review): --no-check-certificate appears to turn off TLS certificate
# verification for the download — confirm it is actually required.
!gdown --no-check-certificate --folder https://drive.google.com/drive/folders/10LpvZD_trQ7t0J3NeMoP6zFvU8ZmwvCv
!cp /content/l3_files/* /content
!rm -r /content/l3_files

In [None]:
!pip install openai==1.3.5
!pip install llama-index==0.9.8
!pip install python-dotenv
!pip install trulens-eval==0.18.1
!pip install pypdf
!pip uninstall transformers ## 4.35.2 pre-installed by colab
!pip install transformers==4.33.2
!pip install sentence-transformers==2.2.2
## restart the session

In [None]:
import os
from google.colab import userdata
# Read the key from the Colab secret store; never paste a literal key or token
# into the notebook, not even inside a comment.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
# os.environ["HUGGINGFACE_API_KEY"] =
# Show only whether a non-empty key was set, so the secret itself is never
# written into the saved cell output.
bool(os.environ["OPENAI_API_KEY"])

In [17]:
from google.colab import output
output.enable_custom_widget_manager()
# output.disable_custom_widget_manager()

# Lesson 3: Sentence Window Retrieval

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import utils
import os
import openai
openai.api_key = utils.get_openai_api_key()

In [4]:
from llama_index import SimpleDirectoryReader
documents = SimpleDirectoryReader(
    input_files=["./eBook-How-to-Build-a-Career-in-AI.pdf"]
).load_data()

In [5]:
print(type(documents), "\n")
print(len(documents), "\n")
print(type(documents[0]))
print(documents[0])

<class 'list'> 

41 

<class 'llama_index.schema.Document'>
Doc ID: e90592db-fa8b-4484-b5cc-d988f4692024
Text: PAGE 1Founder, DeepLearning.AICollected Insights from Andrew Ng
How to  Build Your Career in AIA Simple Guide


In [6]:
from llama_index import Document
document = Document(text="\n\n".join([doc.text for doc in documents]))

## Sentence-window retrieval setup

In [7]:
from llama_index.node_parser import SentenceWindowNodeParser
# create the sentence window node parser w/ default settings
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

[nltk_data] Downloading package punkt to /tmp/llama_index...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [8]:
text = "hello. how are you? I am fine!  "
nodes = node_parser.get_nodes_from_documents([Document(text=text)])

In [9]:
print([x.text for x in nodes])

['hello. ', 'how are you? ', 'I am fine!  ']


In [10]:
print(nodes[1].metadata["window"])

hello.  how are you?  I am fine!  


In [11]:
text = "hello. foo bar. cat dog. mouse"
nodes = node_parser.get_nodes_from_documents([Document(text=text)])

In [12]:
print([x.text for x in nodes])

['hello. ', 'foo bar. ', 'cat dog. ', 'mouse']


In [13]:
print(nodes[0].metadata["window"])

hello.  foo bar.  cat dog. 


### Building the index

In [14]:
from llama_index.llms import OpenAI
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

In [15]:
from llama_index import ServiceContext
sentence_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    # embed_model="local:BAAI/bge-large-en-v1.5"
    node_parser=node_parser,
)

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [16]:
from llama_index import VectorStoreIndex
sentence_index = VectorStoreIndex.from_documents(
    [document], service_context=sentence_context
)

In [18]:
sentence_index.storage_context.persist(persist_dir="./sentence_index")

In [19]:
# Optional: reuse the index persisted in ./sentence_index when that path
# exists; otherwise build it from `document` and persist it there.
# When the cells above have been run, ./sentence_index has already been
# written by the persist() call, so the load branch is the one taken.

import os
from llama_index import VectorStoreIndex, StorageContext, load_index_from_storage

if not os.path.exists("./sentence_index"):
    sentence_index = VectorStoreIndex.from_documents(
        [document], service_context=sentence_context
    )

    sentence_index.storage_context.persist(persist_dir="./sentence_index")
else:
    sentence_index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir="./sentence_index"),
        service_context=sentence_context
    )

### Building the postprocessor

In [20]:
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
postproc = MetadataReplacementPostProcessor(
    target_metadata_key="window"
)

In [21]:
from llama_index.schema import NodeWithScore
from copy import deepcopy
# Wrap the toy nodes parsed from "hello. foo bar. cat dog. mouse" with a
# placeholder score of 1.0; this list is what gets passed to postprocess_nodes.
scored_nodes = [NodeWithScore(node=x, score=1.0) for x in nodes]
# Keep independent copies so the pre-replacement text can still be inspected
# (presumably the postprocessor modifies the nodes it receives — confirm).
nodes_old = [deepcopy(n) for n in nodes]

In [22]:
nodes_old[1].text

'foo bar. '

In [23]:
replaced_nodes = postproc.postprocess_nodes(scored_nodes)

In [24]:
print(replaced_nodes[1].text)

hello.  foo bar.  cat dog.  mouse


### Adding a reranker

In [25]:
from llama_index.indices.postprocessor import SentenceTransformerRerank
# BAAI/bge-reranker-base
# link: https://huggingface.co/BAAI/bge-reranker-base
rerank = SentenceTransformerRerank(
    top_n=2, model="BAAI/bge-reranker-base"
)

config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

In [27]:
from llama_index import QueryBundle
from llama_index.schema import TextNode, NodeWithScore

# Query used to exercise the reranker on two hand-written candidates.
query = QueryBundle("I want a dog.")

# Rebinds `scored_nodes` (previously the postprocessor demo list).
# The "cat" node starts with the higher score (0.6 vs 0.4), so a change in
# ordering after reranking is easy to spot.
scored_nodes = [
    NodeWithScore(node=TextNode(text="This is a cat"), score=0.6),
    NodeWithScore(node=TextNode(text="This is a dog"), score=0.4),
]

In [28]:
reranked_nodes = rerank.postprocess_nodes(
    scored_nodes, query_bundle=query
)

In [29]:
print([(x.text, x.score) for x in reranked_nodes])

[('This is a dog', 0.91827357), ('This is a cat', 0.0014040739)]


### Running the query engine

In [30]:
sentence_window_engine = sentence_index.as_query_engine(
    similarity_top_k=6, node_postprocessors=[postproc, rerank]
)

In [31]:
window_response = sentence_window_engine.query(
    "What are the keys to building a career in AI?"
)

In [32]:
from llama_index.response.notebook_utils import display_response
display_response(window_response)

**`Final Response:`** The keys to building a career in AI are learning foundational technical skills, working on projects, and finding a job, all of which is supported by being part of a community.

## Putting it all Together

In [33]:
import os
from llama_index import ServiceContext, VectorStoreIndex, StorageContext
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index import load_index_from_storage


def build_sentence_window_index(
    documents,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    sentence_window_size=3,
    save_dir="sentence_index",
):
    """Return a sentence-window vector index, loading it from disk when possible.

    Args:
        documents: Passed to ``VectorStoreIndex.from_documents`` when a new
            index has to be built.
        llm: Forwarded to ``ServiceContext.from_defaults``.
        embed_model: Forwarded to ``ServiceContext.from_defaults``.
        sentence_window_size: ``window_size`` given to the
            ``SentenceWindowNodeParser``.
        save_dir: Path where the index is persisted / loaded from.

    Returns:
        The index that was either freshly built (and persisted to
        ``save_dir``) or loaded from ``save_dir``.

    Note:
        If ``save_dir`` already exists the stored index is loaded as-is:
        ``documents`` is not read and nothing checks that the stored index was
        built with the same ``sentence_window_size``. Use a distinct
        ``save_dir`` per configuration.
    """
    window_parser = SentenceWindowNodeParser.from_defaults(
        window_size=sentence_window_size,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    service_ctx = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
        node_parser=window_parser,
    )

    # Fast path: something is already stored at save_dir, so reuse it.
    if os.path.exists(save_dir):
        stored = StorageContext.from_defaults(persist_dir=save_dir)
        return load_index_from_storage(stored, service_context=service_ctx)

    # Nothing on disk yet: build the index and persist it for later runs.
    fresh_index = VectorStoreIndex.from_documents(
        documents, service_context=service_ctx
    )
    fresh_index.storage_context.persist(persist_dir=save_dir)
    return fresh_index


def get_sentence_window_query_engine(
    sentence_index,
    similarity_top_k=6,
    rerank_top_n=2,
    rerank_model="BAAI/bge-reranker-base",
):
    """Create a query engine with window replacement and reranking.

    Args:
        sentence_index: Index whose ``as_query_engine`` method is called.
        similarity_top_k: Passed through to ``as_query_engine``.
        rerank_top_n: ``top_n`` given to ``SentenceTransformerRerank``.
        rerank_model: Model name given to ``SentenceTransformerRerank``.
            Defaults to the previously hard-coded "BAAI/bge-reranker-base",
            so existing callers are unaffected.

    Returns:
        Whatever ``sentence_index.as_query_engine(...)`` returns.
    """
    # Swaps in the text stored under the "window" metadata key.
    postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
    rerank = SentenceTransformerRerank(top_n=rerank_top_n, model=rerank_model)

    # The replacement step is listed before the reranker (presumably
    # postprocessors are applied in list order — confirm against the
    # llama-index docs), so the reranker is given the window text.
    return sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=[postproc, rerank],
    )

In [34]:
from llama_index.llms import OpenAI
index = build_sentence_window_index(
    [document],
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    save_dir="./sentence_index",
)

In [35]:
query_engine = get_sentence_window_query_engine(index, similarity_top_k=6)

## TruLens Evaluation

In [36]:
# Load the evaluation questions, one per line. Surrounding whitespace
# (including the trailing newline) is stripped, and blank lines are skipped
# so that no empty question is later sent to the query engine.
with open('generated_questions.text', 'r') as file:
    eval_questions = [line.strip() for line in file if line.strip()]

In [37]:
from trulens_eval import Tru
def run_evals(eval_questions, tru_recorder, query_engine):
    """Send every evaluation question to ``query_engine`` under ``tru_recorder``.

    Args:
        eval_questions: Iterable of questions; each is passed to
            ``query_engine.query``.
        tru_recorder: Context manager that is entered and exited once per
            question, around the query call.
        query_engine: Object exposing ``query(question)``.

    Returns:
        None. The query results are not kept here; the call is made only for
        whatever ``tru_recorder`` does while its context is active.
    """
    for question in eval_questions:
        # A separate `with` per question: the recorder context wraps exactly
        # one query call at a time.
        with tru_recorder:
            query_engine.query(question)

In [38]:
from utils import get_prebuilt_trulens_recorder
from trulens_eval import Tru
Tru().reset_database()

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


### Sentence window size = 1

In [39]:
sentence_index_1 = build_sentence_window_index(
    documents,
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    embed_model="local:BAAI/bge-small-en-v1.5",
    sentence_window_size=1,
    save_dir="sentence_index_1",
)

In [40]:
sentence_window_engine_1 = get_sentence_window_query_engine(
    sentence_index_1
)

In [41]:
tru_recorder_1 = get_prebuilt_trulens_recorder(
    sentence_window_engine_1,
    app_id='sentence window engine 1'
)

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input response will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


In [42]:
run_evals(eval_questions, tru_recorder_1, sentence_window_engine_1)

In [43]:
Tru().run_dashboard()

Starting dashboard ...
npx: installed 22 in 3.468s

Go to this url and submit the ip given here. your url is: https://new-steaks-ring.loca.lt

  Submit this IP Address: 34.145.137.183



<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

<img src="https://github.com/Nov05/pictures/blob/master/DLAI/20231213_Building%20and%20Evaluating%20Advanced%20RAG/2023-12-14%2004_27_23-Leaderboard.jpg?raw=true" width=800>  

### Note about the dataset of questions
- Since this evaluation process takes a long time to run, the following file `generated_questions.text` contains one question (the one mentioned in the lecture video).
- If you would like to explore other possible questions, feel free to explore the file directory by clicking on the "Jupyter" logo at the top right of this notebook. You'll see the following `.text` files:

> - `generated_questions_01_05.text`
> - `generated_questions_06_10.text`
> - `generated_questions_11_15.text`
> - `generated_questions_16_20.text`
> - `generated_questions_21_24.text`

Note that running an evaluation on more than one question can take some time, so we recommend choosing one of these files (with 5 questions each) to run and explore the results.

- For evaluating a personal project, an eval set of 20 is reasonable.
- For evaluating business applications, you may need a set of 100+ in order to cover all the use cases thoroughly.
- Note that since API calls can sometimes fail, you may occasionally see null responses, and would want to re-run your evaluations.  So running your evaluations in smaller batches can also help you save time and cost by only re-running the evaluation on the batches with issues.

In [44]:
# Reload the evaluation questions, one per line (change the file name here to
# evaluate a different batch). Surrounding whitespace is stripped, and blank
# lines are skipped so that no empty question is sent to the query engine.
with open('generated_questions.text', 'r') as file:
    eval_questions = [line.strip() for line in file if line.strip()]

### Sentence window size = 3

In [45]:
sentence_index_3 = build_sentence_window_index(
    documents,
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    embed_model="local:BAAI/bge-small-en-v1.5",
    sentence_window_size=3,
    save_dir="sentence_index_3",
)
sentence_window_engine_3 = get_sentence_window_query_engine(
    sentence_index_3
)
tru_recorder_3 = get_prebuilt_trulens_recorder(
    sentence_window_engine_3,
    app_id='sentence window engine 3'
)

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input response will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


In [46]:
run_evals(eval_questions, tru_recorder_3, sentence_window_engine_3)

In [47]:
Tru().run_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.
Dashboard already running at path:   Submit this IP Address: 34.145.137.183



<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

<img src="https://github.com/Nov05/pictures/blob/master/DLAI/20231213_Building%20and%20Evaluating%20Advanced%20RAG/2023-12-14%2004_30_31-Leaderboard.jpg?raw=true" width=800>

# **\<bottom\>**