In [None]:
# ! pip install langchain-openai langchain-community langchainhub gpt4all langchain-chroma chromadb langchain

In [None]:
# Clone Handbook
# ! git clone https://github.com/AgileFreaks/Handbook.git

In [2]:
# Move every markdown file from the handbook clone into one flat folder
import os
import shutil

source_dir = "handbook"
destination_dir = "docs/handbook_files"


def move_markdown_files(source_dir, destination_dir):
    """Move every ``.md`` file under ``source_dir`` into ``destination_dir``.

    The source tree is flattened: files from all sub-folders end up side by
    side in ``destination_dir``. When two files moved in the same run share a
    name (e.g. several ``README.md``), the later one is prefixed with its
    folder path relative to ``source_dir`` (separators replaced by ``_``)
    instead of overwriting the earlier one.

    Args:
        source_dir: Root folder that is searched recursively.
        destination_dir: Folder receiving the files; created when missing.

    Returns:
        A list of ``(source_path, destination_path)`` tuples, one per moved
        file. Empty when ``source_dir`` does not exist or holds no markdown.
    """
    if os.path.exists(destination_dir):
        print("Destination directory already exists.")
    else:
        os.makedirs(destination_dir)  # Recursive directory creation

    if not os.path.isdir(source_dir):
        # os.walk() silently yields nothing for a missing folder, which makes
        # a wrong path look like a successful (empty) run.
        print(f"Source directory '{source_dir}' not found; nothing to move.")
        return []

    moved = []
    taken = set()  # destination names already used during this run
    for root, dirs, files in os.walk(source_dir):
        dirs.sort()  # deterministic traversal, so collision renames are stable
        for file in sorted(files):
            if not file.endswith(".md"):
                continue

            target_name = file
            if os.path.normcase(target_name) in taken:
                # Same file name seen in another folder: keep both files.
                rel_dir = os.path.relpath(root, source_dir)
                target_name = rel_dir.replace(os.sep, "_") + "_" + file

            # Last resort if the prefixed name is taken as well.
            stem, ext = os.path.splitext(target_name)
            suffix = 1
            while os.path.normcase(target_name) in taken:
                target_name = f"{stem}_{suffix}{ext}"
                suffix += 1
            taken.add(os.path.normcase(target_name))

            # Construct the source and destination paths
            source_path = os.path.join(root, file)
            destination_path = os.path.join(destination_dir, target_name)

            # Move the file to the destination directory
            shutil.move(source_path, destination_path)
            print(f"Moved '{source_path}' to '{destination_path}'")
            moved.append((source_path, destination_path))
    return moved


moved_files = move_markdown_files(source_dir, destination_dir)

Destination directory already exists.


In [3]:
from langchain_openai import ChatOpenAI

In [None]:
# Load entire directory
from langchain_community.document_loaders import DirectoryLoader, UnstructuredMarkdownLoader

# Read from the folder the markdown files were moved into (destination_dir)
# rather than a machine-specific absolute path. The previous literal was a
# non-raw Windows path whose backslash sequences (\R, \L, \d, \h) are invalid
# string escapes.
loader = DirectoryLoader(
    destination_dir,
    glob="*.md",
    loader_cls=UnstructuredMarkdownLoader,
    use_multithreading=True,
)
pages = loader.load()

In [5]:
# Chat client aimed at the server listening on localhost:3008
# (the API key is a placeholder value)
client = ChatOpenAI(
    base_url="http://localhost:3008/v1",
    api_key="lm-studio",
)

In [6]:
# Concatenate the text of every loaded page into one space-separated string
txt = " ".join(page.page_content for page in pages)

In [7]:
# Report how many documents were loaded and how long the combined text is
page_count = len(pages)
text_length = len(txt)
print("Pages: ", page_count)
print("Text lenght: ", text_length)

Pages:  39
Text lenght:  78298


In [9]:
# Split Text into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, MarkdownHeaderTextSplitter

# Splitter tuning knobs, named so they are easy to spot and adjust
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 250

# add_start_index=True asks the splitter to record where each chunk starts
# in its source document.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    add_start_index=True,
)

In [10]:
# Run the configured splitter over every loaded page; the resulting chunks
# are what gets embedded and stored in the vector store further down.
chunks = text_splitter.split_documents(pages)

In [11]:
# Number of chunks produced by the splitter (shown as the cell output)
len(chunks)

117

## Vectorstore

In [None]:
# ! pip install chromadb

In [12]:
from langchain.vectorstores import Chroma
# Folder passed to Chroma as persist_directory for the vector store
persist_directory = 'docs/chroma/'

In [13]:
# Embedding with GPT4All
from langchain_chroma import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings

# Embedding model used both to build the store here and to query it later
embedding_function = GPT4AllEmbeddings(device='gpu')

# Embed every chunk and write the collection to persist_directory
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_function,
    persist_directory=persist_directory,
)

# len is a function: the previous `len.chunks` raised AttributeError
print(f"Saved {len(chunks)} chunks to {persist_directory}")

## Prepare DB

In [15]:
# Open the persisted collection with the same embedding function that built it
db = Chroma(persist_directory=persist_directory, embedding_function=embedding_function)

### Small test

In [16]:
# Sanity check: number of entries in the underlying collection
# (read through the private `_collection` attribute)
stored_chunk_count = db._collection.count()
print(stored_chunk_count)

117


In [47]:
# Sample question used by the retrieval test in the next cell
query_text = "What are the benefits of being a freak?"

In [None]:
# Minimum relevance (0 = unrelated, 1 = best match) the top hit must reach
MIN_RELEVANCE = 0.7

# k is number of results we want to return
# docs = db.similarity_search(query_text, k=3)
# similarity_search_with_score() returns Chroma distances (lower = closer),
# so rejecting hits "below 0.7" on it would discard the best matches.
# Relevance scores are normalised so that higher means more similar, which is
# what the threshold check below expects.
results = db.similarity_search_with_relevance_scores(query_text, k=3)

if not results or results[0][1] < MIN_RELEVANCE:
    print("Unable to find matching results.")
else:
    # Show the retrieved chunks, separated by a rule, plus where they came from
    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
    sources = [doc.metadata.get("source", None) for doc, _score in results]
    formatted_response = f"Response: {context_text}\nSources: {sources}"
    print(formatted_response)

## RAG

In [75]:
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationChain

# Placeholder key for the OpenAI client; the model is served locally
os.environ["OPENAI_API_KEY"] = "no_need"

# Connection details of the locally served model
LOCAL_API_BASE = "http://localhost:3008/v1"
LOCAL_MODEL_NAME = "lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF"

model = ChatOpenAI(openai_api_base=LOCAL_API_BASE, model_name=LOCAL_MODEL_NAME)

# NOTE: this memory object is configured here but is not the instance handed
# to the chain below, which creates its own default ConversationBufferMemory.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# Play with this
# retriever = vectordb.as_retriever()

# Conversation chain around the local model, with a fresh default memory
conversation = ConversationChain(
    llm=model,
    memory=ConversationBufferMemory(),
    verbose=False,
)

# Prompt skeleton for the RAG answer: {context} receives the retrieved chunks
# and {question} the user's query when the template is formatted.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
DO NOT give irelevant information that is not in the context.
Remember the correct word for the company is AgileFreaks.
---
Answer the question based on the above context: {question}
"""

In [None]:
# Update the question: keep exactly one assignment active; the commented
# lines are alternative questions to try.
# query_text = "What are the benefits of being a freak?"
# query_text = "how much money can i make as a freak?"
# query_text = "what are the skills and their name, what do they mean?"
query_text = "how can I get to the next skill level if now I am advanced?"

In [None]:
# Fetch the 3 best chunks with relevance scores (higher = more similar).
# similarity_search_with_score() returns distances (lower = closer), which
# would make the "< 0.7" rejection test in the next cell drop the best hits.
results = db.similarity_search_with_relevance_scores(query_text, k=3)

In [None]:
# Answer only when there is at least one hit and the top hit's score is not
# below 0.7; otherwise report that nothing usable was found.
if results and not results[0][1] < 0.7:
    retrieved_docs = [doc for doc, _score in results]

    # Join the retrieved chunks into one context block and fill the prompt
    context_text = "\n\n---\n\n".join(doc.page_content for doc in retrieved_docs)
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    # Ask the model through the conversation chain
    response_text = conversation.predict(input=prompt)

    # Report the answer together with the files the context came from
    sources = [doc.metadata.get("source") for doc in retrieved_docs]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)
else:
    print("Unable to find matching results.")