# Playground RAG ChatBot

Load data
https://python.langchain.com/docs/integrations/providers/unstructured
https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.directory.DirectoryLoader.html

In [None]:
import chromadb
import os
import shutil
import ollama

from dotenv import load_dotenv
from openai import OpenAI

from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.messages import HumanMessage, SystemMessage


In [None]:
# Directory (relative path) that DirectoryLoader scans for input documents.
DATA_PATH = "../data/"

def load_docs():
    """Load the files under ``DATA_PATH`` that match the glob pattern ``"*"``.

    A progress indicator is requested from the loader via ``show_progress=True``.

    Returns:
        Whatever ``DirectoryLoader.load()`` produces for the matched files.
    """
    return DirectoryLoader(DATA_PATH, glob="*", show_progress=True).load()

# Load the documents once; the result is reused by the chunking step.
docs = load_docs()
# Bare expression: shows the loaded documents as the cell output.
docs

Split documents into chunks

https://python.langchain.com/docs/modules/data_connection/document_transformers/recursive_text_splitter

https://docs.mistral.ai/guides/rag/#split-document-into-chunks


In [None]:
# Configure the splitter: chunk_size=200 and chunk_overlap=100, with lengths
# measured by the built-in len.
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=True,
    length_function=len,
    chunk_overlap=100,
    chunk_size=200,
)

# Break the loaded documents into chunks for embedding.
chunks = text_splitter.split_documents(docs)

# Report how many documents went in and how many chunks came out.
print(f"Len docs: {len(docs)}", f"Len chunks: {len(chunks)}", sep="\n")

In [None]:
# Inspect the first chunk: its text and the metadata attached to it.
doc = chunks[0]
print(
    f"page_content: {doc.page_content}",
    f"metadata: {doc.metadata}",
    sep="\n",
)


Query with ChromaDB

https://www.trychroma.com/

https://api.python.langchain.com/en/latest/embeddings/langchain_community.embeddings.openai.OpenAIEmbeddings.html

https://docs.mistral.ai/capabilities/embeddings/

https://platform.openai.com/docs/guides/embeddings

Load the OpenAI API key

https://platform.openai.com/api-keys

In [None]:
# Load variables from a .env file (if any) and read the OpenAI API key
# from the environment.
load_dotenv()
key = os.getenv('OPENAI_API_KEY')
# os.getenv returns None when the variable is unset, so test truthiness
# instead of calling len() on it (which would raise TypeError on None).
if key:
    print("success")
else:
    print("OPENAI_API_KEY is not set; add it to the environment or a .env file")

In [None]:
# Directory where the Chroma index is persisted.
CHROMA_PATH = "../chroma"

# Remove any existing index directory before rebuilding it from the chunks.
if os.path.exists(CHROMA_PATH):
    shutil.rmtree(CHROMA_PATH)

# Embed every chunk with OpenAI embeddings and store the result in Chroma.
# The store is bound to `vector_store` rather than `chromadb` so that the
# `chromadb` module imported at the top of the file is not shadowed.
vector_store = Chroma.from_documents(
    chunks, OpenAIEmbeddings(openai_api_key=key), persist_directory=CHROMA_PATH
)
vector_store.persist()

# ChatBot Prototype

predict

https://api.python.langchain.com/en/v0.0.343/embeddings/langchain.embeddings.openai.OpenAIEmbeddings.html

https://docs.mistral.ai/guides/rag/#rag-with-langchain

https://platform.openai.com/docs/guides/text-generation/chat-completions-api


In [None]:
# Open the persisted Chroma index, using OpenAI embeddings (configured with
# the loaded API key) to embed incoming queries.
embedding_function = OpenAIEmbeddings(openai_api_key=key)
db = Chroma(
    embedding_function=embedding_function,
    persist_directory=CHROMA_PATH,
)

In [None]:
# Sample question standing in for live user input
# (German for "Tell me something about Tobias").
query_text = "Erzähle mir was über Tobias"

In [None]:
# Fetch up to 5 matches for the query; each result is a (document, score) pair.
results = db.similarity_search_with_score(query=query_text, k=5)
# NOTE(review): a score-threshold filter was sketched here but left disabled.
# Whether a higher or lower score means a better match is not visible in this
# file -- confirm before enabling it.
# if len(results) == 0 or results[0][1] < 0.7:
#     print(f"Unable to find matching results.")

In [None]:
# Concatenate the text of every retrieved chunk, separated by a "---" divider,
# to form the context handed to the language model.
context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
# Bare expression: shows the assembled context as the cell output.
context_text

### OpenAI

In [None]:
# Build the prompt: the retrieved context goes into the system message and the
# user's question into the user message.
system_prompt = f"Answer the question based only on the following context: {context_text}"
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"{query_text}"},
]

# Send the prompt to the OpenAI chat completions endpoint.
client = OpenAI(api_key=key)
completion = client.chat.completions.create(model="gpt-4", messages=messages)

# Print the text of the first returned choice.
print(completion.choices[0].message.content)


In [None]:
# Bare expression: shows the full completion object as the cell output.
completion

In [None]:
# Output: the model's answer together with the source of every retrieved chunk.
sources = [doc.metadata.get("source", None) for doc, _score in results]
# dict.fromkeys de-duplicates while keeping first-seen order; list(set(...))
# would list the unique sources in an arbitrary order that can change per run.
formatted_response = (
    f"Response: {completion.choices[0].message.content}\n"
    f"Sources: {sources}\n\n"
    f"Unique sources: {list(dict.fromkeys(sources))}"
)
print(formatted_response)

### Ollama - local LLM

https://ollama.com

https://github.com/ollama/ollama




ollama serve

ollama run llama3   ~4.7GB

In [None]:
# Ask the llama3 model through Ollama, using the same question and context.
# The system instruction is placed before the user message, matching the
# message order of the OpenAI request in this file, so the context-only
# instruction precedes the question it applies to.
response = ollama.chat(model='llama3', messages=[
    {'role': 'system', 'content': f'Answer the question based only on the following context: {context_text}'},
    {'role': 'user', 'content': f'{query_text}'},
])
print(response['message']['content'])

In [None]:
# Output: the Ollama answer followed by the source of each retrieved chunk.
sources = [hit[0].metadata.get("source") for hit in results]
formatted_response = (
    f"Response: {response['message']['content']}"
    f"\n\nSources: {sources}"
)
print(formatted_response)