In [1]:
# Show which python3 executable the shell resolves on PATH.
# NOTE(review): this is the shell's interpreter, presumably (but not necessarily)
# the same one running this kernel — check sys.executable if it matters.
!which python3

/usr/bin/python3


In [4]:
#!rm -f minsearch.py
#!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [4]:
# Libraries
import requests
from openai import OpenAI
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

In [5]:
# Load the FAQ data
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# A timeout keeps the cell from hanging indefinitely if the host is unreachable.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into a single list of FAQ entries,
# tagging every entry with the name of the course it came from.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [6]:
# Elasticsearch client
es_client = Elasticsearch(hosts='http://localhost:9200')
# Check whether the Python client can reach an Elasticsearch instance
print(es_client.ping())

True


In [7]:
# Index definition: the course field is a keyword and the rest are text.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# indices.create raises if the index already exists, which breaks re-running
# the notebook. Drop any previous copy first: the indexing cell below sends
# every document again, so starting from an empty index also avoids duplicates.
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [8]:
# Send every FAQ entry to Elasticsearch, one index request per document;
# tqdm wraps the list to show progress.
for doc in tqdm(documents):
    es_client.index(
        document=doc,
        index=index_name,
    )

  0%|          | 0/948 [00:00<?, ?it/s]

In [24]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Search the FAQ index and return the matching documents.

    Args:
        query: Free-text question to search for.
        course: Exact value of the ``course`` keyword field that hits must
            have. Defaults to "data-engineering-zoomcamp", the value that
            was previously hard-coded.
        size: Maximum number of hits to request. Defaults to 5, the value
            that was previously hard-coded.

    Returns:
        A list holding the ``_source`` dict of every hit, in the order the
        search response lists them.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "^3" boosts matches in the question field
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [10]:
def build_prompt(query, search_results):
    """Build the LLM prompt for `query` from the retrieved FAQ entries.

    Each entry in `search_results` must provide the keys 'section',
    'question' and 'text'; they are rendered one after another under the
    CONTEXT heading. Leading and trailing whitespace is stripped from the
    final prompt.
    """
    # Same text as a stripped triple-quoted template, spelled piece by piece.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    rendered_entries = "".join(
        f"section: {entry['section']}\nquestion: {entry['question']}\nanswer: {entry['text']}\n\n"
        for entry in search_results
    )

    filled = template.format(question=query, context=rendered_entries)
    return filled.strip()

In [11]:
def llm(prompt, model='phi3'):
    """Send a single-message chat request and return the reply text.

    Args:
        prompt: Prompt text; sent as the only message, with role "user".
        model: Name of the model to request. Defaults to 'phi3', the value
            that was previously hard-coded.

    Returns:
        The ``content`` of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [12]:
def rag(query):
    """Answer `query` by chaining search, prompt building and the LLM call.

    The query is passed to elastic_search, the results go to build_prompt
    together with the query, and the value llm returns for that prompt is
    returned unchanged.
    """
    retrieved_docs = elastic_search(query)
    return llm(build_prompt(query, retrieved_docs))

In [13]:
from openai import OpenAI

# OpenAI-compatible client pointed at a local endpoint.
# NOTE(review): presumably a local Ollama server, with 'ollama' as a placeholder key — confirm.
client = OpenAI(
    api_key='ollama',
    base_url='http://localhost:11434/v1/',
)

In [27]:
# Example question used to exercise the RAG pipeline
query = "I just discovered the course. Can I still join it"

In [28]:
# Bind the answer to a name and print it, rather than reading it back through
# IPython's `_` (last output), which silently changes whenever another cell
# produces output in between.
answer = rag(query)
print(answer)

 Yes, you are still eligible to submit your projects even if you discover the course after its official start date as long as it's within a reasonable timeframe and you can complete all work on time.

Just keep in mind that there will be deadlines for turning in final project submissions, so plan ahead accordingly!


Un punto a tener en cuenta es que nuestra query está en inglés porque la documentación indexada está en inglés; si ponemos un texto en español, la búsqueda en Elasticsearch muy probablemente nos devolverá un resultado vacío y la respuesta del RAG no tendrá sentido. Así que la query debe hacerse en el idioma de la documentación adicional donde queramos buscar.