# Advanced RAG Pipeline

### Importing libraries and setting the OpenAI key

In [3]:
import utils

import os
import openai
# Give the OpenAI client the API key returned by the project's utils helper.
openai.api_key = utils.get_openai_api_key()

### Loading our PDF document

In [4]:
from llama_index import SimpleDirectoryReader

# Read the course PDF from the working directory. load_data() hands back a
# list of llama_index Document objects (the next cell prints their types).
documents = SimpleDirectoryReader(input_files=["./Data Engineering #2.pdf"]).load_data()

### Inspecting the loaded document

In [5]:
# Sanity-check the load: the container type, the number of entries, and
# then the type and printed summary of the first entry.
print(type(documents), "\n")
print(len(documents), "\n")
print(type(documents[0]))
print(documents[0])

<class 'list'> 

19 

<class 'llama_index.schema.Document'>
Doc ID: bc1a4ea3-addc-412f-9ab2-9507af9c45ae
Text: DATA ENGINEERING MASTERCLASS sprints.ai


## Basic RAG pipeline

In [6]:
from llama_index import Document

# Merge the text of every loaded Document into a single Document, with a
# blank line between the pieces, so the index is built over one text.
document = Document(
    text="\n\n".join(part.text for part in documents),
)

### Configuring the GPT LLM and its parameters

In [7]:
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.llms import OpenAI

# LLM handed to the service context; temperature is kept low (0.1).
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

# Bundle the LLM with the embedding model named by the "local:" spec.
service_context = ServiceContext.from_defaults(
    embed_model="local:BAAI/bge-small-en-v1.5",
    llm=llm,
)

# Build the vector index over the single merged document.
index = VectorStoreIndex.from_documents(
    [document],
    service_context=service_context,
)

In [8]:
# Expose the index as a query engine; no options are passed, so the
# library's defaults apply.
query_engine = index.as_query_engine()

### Testing the pipeline by asking questions about the uploaded PDF document

In [36]:
# Ask the basic pipeline a question about the PDF and print its answer.
# print() already applies str() to its argument, so no explicit cast is needed.
response = query_engine.query("What are steps to take when finding projects to build your experience?")
print(response)

Understand data storage and management systems, design and build data pipelines, master data modeling and schema design, learn Python for data preparation and analysis, use SQL for data analysis and reporting, master big data technologies, gain an overview of data quality and data governance policies, learn about data security and privacy, develop problem-solving skills, and familiarize yourself with modern cloud data stack and tools such as AWS.


In [37]:
# Same topic as the previous question, but asking for a numbered list.
response = query_engine.query("What are steps to build your experience and order them numeric?")
print(response)

1. Understand data storage and management systems
2. Design and build data pipelines
3. Master data modeling and schema design
4. Learn how to use Python for Data preparation and analysis
5. Learn how to use SQL for data analysis and reporting
6. Master big data technologies
7. Gain an overview of data quality and data governance policies and procedures
8. Learn about data security and privacy
9. Develop problem-solving skills
10. Learn about the modern cloud data stack and get familiarized with AWS tools


In [38]:
# Short factual question about the program described in the PDF.
response = query_engine.query("What is the duration of this program?")
print(response)

The duration of this program is 8 weeks.


In [39]:
# Ask about the course prerequisites.
# NOTE(review): "prerequests" looks like a typo for "prerequisites"; the
# string is the literal prompt sent to the model, so it is left as written.
response = query_engine.query("What are the prerequests to apply for this course?")
print(response)

Good command of English, basic fundamental knowledge of databases Structured Query Language (SQL), and basic knowledge of one programming language.


In [40]:
# Two-part question about a recommendation letter and the certificate.
# NOTE(review): the prompt contains typos ("recived", "acheive",
# "completiton"); it is the literal text sent to the model, so it is
# left as written.
response = query_engine.query("Is there a recommendation letter will be recived? And what should I do to acheive the certificate of completiton?")
print(response)

Upon successful completion of the masterclass, you may receive a certificate of completion. To achieve this certificate, you should ensure you meet all the requirements of the program, including dedicating the required hours, completing the tasks and projects, participating in interactive sessions, and demonstrating proficiency in the skills and topics covered throughout the course. Additionally, you may inquire about the possibility of obtaining a recommendation letter from the program organizers or instructors to further enhance your credentials.


## Evaluation setup using TruLens

In [9]:
# Load the evaluation questions, one per line, echoing each one as it is read.
# NOTE(review): the questions printed below are about AI careers, while the
# loaded PDF appears to be a data-engineering course brochure. Confirm that
# eval_questions.txt matches the indexed document.
eval_questions = []
# The encoding is explicit so the file reads the same on every platform.
with open('eval_questions.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # Strip the trailing newline and surrounding whitespace; the
        # question stays a string.
        item = line.strip()
        if not item:
            # Skip blank lines so an empty prompt is never sent to the
            # query engines during evaluation.
            continue
        print(item)
        eval_questions.append(item)

What are the keys to building a career in AI?
How can teamwork contribute to success in AI?
What is the importance of networking in AI?
What are some good habits to develop for a successful career?
How can altruism be beneficial in building a career?
What is imposter syndrome and how does it relate to AI?
Who are some accomplished individuals who have experienced imposter syndrome?
What is the first step to becoming good at AI?
What are some common challenges in AI?
Is it normal to find parts of AI challenging?


In [10]:
# You can try your own question: it is added to the end of the
# evaluation list loaded above.
new_question = "What is the right AI job for me?"
eval_questions.append(new_question)

In [11]:
# Show the final question list, including the question appended above.
print(eval_questions)

['What are the keys to building a career in AI?', 'How can teamwork contribute to success in AI?', 'What is the importance of networking in AI?', 'What are some good habits to develop for a successful career?', 'How can altruism be beneficial in building a career?', 'What is imposter syndrome and how does it relate to AI?', 'Who are some accomplished individuals who have experienced imposter syndrome?', 'What is the first step to becoming good at AI?', 'What are some common challenges in AI?', 'Is it normal to find parts of AI challenging?', 'What is the right AI job for me?']


In [12]:
from trulens_eval import Tru
# Create the TruLens handle; its startup message (shown below) reports the
# SQLite database URL in use.
tru = Tru()

# NOTE(review): presumably wipes records left over from earlier runs so the
# evaluation starts clean. Confirm against the TruLens documentation.
tru.reset_database()

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


Some of the helper functions used in this notebook live in a utils.py file.  
- You can view the utils.py file in the file directory.
- You can change the code wrapped inside these helper functions to customize your RAG pipeline.

In [13]:
from utils import get_prebuilt_trulens_recorder

# Wrap the basic query engine in a TruLens recorder built by the utils
# helper, registered under the app_id "Direct Query Engine".
tru_recorder = get_prebuilt_trulens_recorder(query_engine,
                                             app_id="Direct Query Engine")

In [14]:
# Run every evaluation question through the basic engine inside the
# recorder context (presumably so TruLens captures each call). The answers
# are not printed here; `response` is simply rebound on each iteration.
with tru_recorder as recording:
    for question in eval_questions:
        response = query_engine.query(question)

In [15]:
# Fetch what TruLens recorded.
# NOTE(review): app_ids=[] presumably means "all apps". Confirm.
records, feedback = tru.get_records_and_feedback(app_ids=[])

In [49]:
# Preview the start of the records (presumably a pandas DataFrame); as the
# cell's last expression, the result is displayed by the notebook.
records.head()

In [None]:
# Start the TruLens dashboard to browse the recorded results.
# launches on http://localhost:8501/
tru.run_dashboard()

## Advanced RAG pipeline

### 1. Sentence Window retrieval

In [None]:
from llama_index.llms import OpenAI

# Same model name and temperature as the LLM created for the basic
# pipeline; re-created here for the advanced retrieval sections.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

In [None]:
from utils import build_sentence_window_index

# Build the sentence-window index over the merged `document`, using the
# same embedding model spec as the basic pipeline.
# NOTE(review): save_dir is presumably where the helper persists or reloads
# the index. Check utils.py.
sentence_index = build_sentence_window_index(
    document,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="sentence_index"
)

In [None]:
from utils import get_sentence_window_query_engine

# Turn the sentence-window index into a query engine with the helper's
# default settings (only the index is passed).
sentence_window_engine = get_sentence_window_query_engine(sentence_index)

In [None]:
# Smoke-test the sentence-window engine with a single question.
# print() applies str() to the response, so no explicit cast is needed.
window_response = sentence_window_engine.query("how do I get started on a personal project in AI?")
print(window_response)

In [None]:
# NOTE(review): resetting here presumably discards the records of the
# "Direct Query Engine" run, so later leaderboards would not include it.
# Confirm this is intended.
tru.reset_database()

# Recorder for the sentence-window engine, registered under its own app_id.
tru_recorder_sentence_window = get_prebuilt_trulens_recorder(
    sentence_window_engine,
    app_id = "Sentence Window Query Engine"
)

In [None]:
# Evaluate the sentence-window engine: each question gets its own recorder
# context, and the question and its answer are echoed as they complete.
for question in eval_questions:
    with tru_recorder_sentence_window as recording:
        response = sentence_window_engine.query(question)
        print(question)
        # print() stringifies the response on its own.
        print(response)

In [None]:
# Display the TruLens leaderboard for the recorded apps.
# NOTE(review): app_ids=[] presumably means "all apps". Confirm.
tru.get_leaderboard(app_ids=[])

In [None]:
# Start the TruLens dashboard to browse the sentence-window results.
# launches on http://localhost:8501/
tru.run_dashboard()

### 2. Auto-merging retrieval

In [None]:
from utils import build_automerging_index

# Build the auto-merging index. Unlike the sentence-window cell, this
# helper is given the original `documents` list rather than the merged
# `document`.
# NOTE(review): save_dir is presumably where the helper persists or reloads
# the index. Check utils.py.
automerging_index = build_automerging_index(
    documents,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="merging_index"
)

In [None]:
from utils import get_automerging_query_engine

# Turn the auto-merging index into a query engine; only the index is
# passed, so the helper's defaults apply.
automerging_query_engine = get_automerging_query_engine(automerging_index)

In [None]:
# Smoke-test the auto-merging engine with a single question.
# print() applies str() to the response, so no explicit cast is needed.
auto_merging_response = automerging_query_engine.query("How do I build a portfolio of AI projects?")
print(auto_merging_response)

In [None]:
# NOTE(review): resetting here presumably discards the records of the
# previous engine's run. Confirm this is intended.
tru.reset_database()

# Recorder for the auto-merging engine, registered under its own app_id.
tru_recorder_automerging = get_prebuilt_trulens_recorder(
    automerging_query_engine,
    app_id="Automerging Query Engine",
)

In [None]:
# Evaluate the auto-merging engine: each question gets its own recorder
# context, and the question and its answer are echoed as they complete.
for question in eval_questions:
    with tru_recorder_automerging as recording:
        response = automerging_query_engine.query(question)
        print(question)
        print(response)

In [None]:
# Display the TruLens leaderboard for the recorded apps.
# NOTE(review): app_ids=[] presumably means "all apps". Confirm.
tru.get_leaderboard(app_ids=[])

In [None]:
# Start the TruLens dashboard to browse the auto-merging results.
# launches on http://localhost:8501/
tru.run_dashboard()