In [1]:
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain_ollama.llms import OllamaLLM

In [2]:
# 1. Cargar datos desde una página web usando BeautifulSoup
def extract_text_from_url(url):
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Error al acceder a la página: {response.status_code}")
    
    # Parseamos el HTML con BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Extraemos el contenido principal (en este caso, todo el texto dentro de <p>)
    paragraphs = soup.find_all('p')  
    text = "\n".join([para.get_text(strip=True) for para in paragraphs if para.get_text(strip=True)])
    
    if not text:
        raise Exception("No se encontró texto significativo en la página.")
    
    return text

In [3]:
# URL de la página web
url = "https://en.wikipedia.org/wiki/Doctor_Who"

# Extraer texto de la página
web_text = extract_text_from_url(url)
print(f"Texto extraído de la página: {web_text[:500]}...\n")  # Muestra un resumen del contenido extraído

Texto extraído de la página: Doctor Whois a Britishscience fiction television seriesbroadcast by theBBCsince 1963. The series, created bySydney Newman,C. E. WebberandDonald Wilson, depicts the adventures of anextraterrestrial beingcalledthe Doctor, part of ahumanoidspecies calledTime Lords. The Doctor travels in the universe and in time using atime travellingspaceshipcalled theTARDIS, which externally appears as a Britishpolice box. While travelling, the Doctor works to save lives and liberateoppressedpeoples by combatingfo...



In [4]:
# 2. Dividir el texto en fragmentos (chunks) con LangChain
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_text(web_text)
print(f"Se generaron {len(chunks)} fragmentos de texto.\n")


Se generaron 224 fragmentos de texto.



In [5]:
# 3. Crear embeddings usando HuggingFace
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Crear la base de datos vectorial con Chroma
vectorstore = Chroma.from_texts(
    texts=chunks,
    embedding=embeddings,
    collection_name="doctor_who_facts",
    persist_directory="chroma_db"  # Carpeta para guardar la base de datos
)

# Persistir la base de datos en disco
vectorstore.persist()
print(f"Base de datos vectorial creada y guardada con {len(chunks)} documentos.\n")

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


Base de datos vectorial creada y guardada con 224 documentos.



  vectorstore.persist()


In [14]:
# 4. Configurar el modelo LLM con Ollama
llm = OllamaLLM(model="llama3.2", format="json", server_url="http://localhost:11434") 


In [10]:
# 5. Crear el sistema RAG con Chroma
retriever = vectorstore.as_retriever()
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever
)

In [28]:
query = "Who is David Tennant?"

In [29]:
# 6. Hacer preguntas al sistema
respuesta = qa_chain.invoke({"query": query})
print(f"Respuesta a la consulta: {respuesta}")

Respuesta a la consulta: {'query': 'Who is David Tennant?', 'result': '{ "value": "David Tennant played two incarnations of the Doctor, specifically the Tenth and Fourteenth Doctors. He previously played the Tenth Doctor in 2005-2010, and then took over as the Fourteenth Doctor after Jodie Whittaker\'s final appearance in 2022." }'}
