In [33]:
#!pip install langchain langchain_ollama
#!pip install chromadb sentence-transformers langchain_huggingface langchain_chroma

## Librerias que necesitamos

In [34]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_ollama.chat_models import ChatOllama
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

## Cargar los datos desde una pagina WEB

In [35]:
import requests
import re

url = 'https://en.wikipedia.org/wiki/Football'
response = requests.get(url)

if response.status_code == 200:
    web_content = response.text
    
    # Eliminar las etiquetas HTML
    clean_text = re.sub(r'<[^>]*>', '', web_content)
    
    # Limpiar espacios en blanco adicionales
    clean_text = re.sub(r'\s+', ' ', clean_text)
    
    print("Texto de la página cargado:")
    print(clean_text[:1000])  # Muestra los primeros 1000 caracteres del texto limpio
else:
    raise Exception(f"Error al cargar la página: {response.status_code}")

Texto de la página cargado:
 Football - Wikipedia (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available";var cookie=document.cookie.match(/(?:^|; )enwikimwclientpreferences=([^;]+)/);if(cookie){cookie[1].split('%2C').forEach(function(pref){className=className.replace(new RegExp('(^| )'+pref.replace(/-clientpref-\w+$|[^\w-]+/g,'')+'-clientpref-\\w+( |$)'),'$1'+pref+'$2');});}document.documentElement.className=className;}());RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTr

### Seleccionamos el Modelo que vamos a utilizar

In [36]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

### Acedemos a la base de datos

In [37]:
vector_store = Chroma(
    collection_name="rag1",
    embedding_function=embeddings,
    persist_directory="./datasets",
)

In [38]:
# Se ve como funciona la base de datos
retriever = vector_store.as_retriever()

### RAG Chain

In [41]:
# Plantilla de conversación
conversation_template = """Please answer the following question based only on the given context:
{context}

Question: {question}
"""

# Crear el prompt a partir de la plantilla
prompt_template = ChatPromptTemplate.from_template(conversation_template)

# LLM local
local_llm = "tinyllama"
local_model = ChatOllama(model=local_llm)

# Cadena de procesamiento
qa_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt_template
    | local_model
    | StrOutputParser()
)

In [42]:
qa_chain.invoke("What is football?")

'Football is a sport that involves two teams of eleven players, who stand on a rectangular field with goalposts at each end. Each team tries to kick or carry a ball into the opposing team\'s goal (also known as "goals"). The game is played over two halves of 45 minutes each, separated by extra time. The winning team is determined by the number of goals scored in regulation time, plus any extra time (if necessary).'