In [4]:
%pip install langchain langchain_community langchain_ollama langchain_text_splitters


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [22]:
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader

# Load all text files from the processed_data directory
folder_path = "/workspaces/RAG_BOT/processed_data"
document_loader = DirectoryLoader(folder_path, glob="**/*.txt", loader_cls=TextLoader)
raw_documents = document_loader.load()

print(f"Number of documents loaded: {len(raw_documents)}")
print("Documents loaded:")
print(raw_documents)

Number of documents loaded: 1
Documents loaded:
[Document(metadata={'source': '/workspaces/RAG_BOT/processed_data/PolicyMangement.txt'}, page_content='ENDPOINT: Delete authentication profile\nPATH: /AuthProfile/DeleteProfile\nMETHOD: POST\nTAGS: Authentication Profile\nMETADATA:\n  * x-idap-anon: False\n  * x-codegen-request-body-name: payload\nPARAMETERS:\n  * uuid (query, Required): Authentication UUID.\nREQUEST BODY: Required\n  Content Type: application/json\n  Schema Properties:\n    * uuid (string): The authentication profile uuid either passed in by method call or as part of the payload.\n    Required fields: uuid\n  Sample Request JSON:\n  ```json\n  {\n  "uuid": "string_value"\n}\n  ```\nRESPONSES:\n  Status Code: 200\n  Description: API-Result\n  Content Type: */*\n  Response Schema: AuthProfileDeleteProfile\n  Response Body Properties:\n    * Result (boolean): Success or failure of the delete\n    * Error (object): Error message text on failure, may be null\n  Sample Respons

In [23]:
import re
from langchain_core.documents import Document

# Split each raw document based on dashed line and create sub-documents
split_documents = []
for doc in raw_documents:
    # Use regex to split based on dashed lines like '-----...'
    parts = re.split(r'-{5,}', doc.page_content)
    for i, part in enumerate(parts):
        cleaned_part = part.strip()
        if cleaned_part:
            split_documents.append(
                Document(
                    page_content=cleaned_part,
                    metadata={"source": doc.metadata["source"], "part": i + 1}
                )
            )

print(f"Total split chunks: {len(split_documents)}")

Total split chunks: 9


In [29]:
from langchain_ollama import OllamaLLM

llm = OllamaLLM(model="llama3.2:latest")

# Example usage
response = llm.invoke("What is LangChain?")
print(response)

LangChain is an open-source project that aims to provide a robust, scalable, and flexible framework for building blockchain-based data pipelines. It allows developers to easily integrate blockchain technology with traditional data processing workflows, making it easier to build decentralized applications (dApps) and decentralized finance (DeFi) platforms.

The core idea behind LangChain is to create a standardized interface for interacting with blockchains, allowing developers to focus on building their application's logic without worrying about the underlying blockchain infrastructure. This makes it easier to develop complex data pipelines that can handle large amounts of data and perform tasks such as data processing, machine learning, and analytics.

LangChain provides a set of APIs and tools that enable developers to:

1. Interact with blockchains (e.g., Ethereum, Polkadot) using a standardized interface.
2. Build and manage decentralized data pipelines that can handle large amount

# Ollama Embedding

In [30]:
from langchain_ollama import OllamaEmbeddings

embedding_model = OllamaEmbeddings(
    model="llama3.2:latest",
    # model="nomic-embed-text:latest",
    base_url="http://localhost:11434"  # Ensure Ollama is running locally
)

In [None]:
from langchain_core.vectorstores import InMemoryVectorStore
from langchain.schema import Document

# Convert chunks to LangChain Documents
documents = [Document(page_content=chunk.page_content) for chunk in split_documents]

# Create an in-memory vector store
vector_store_ollama = InMemoryVectorStore.from_documents(documents, embedding_model)

print("Number of documents in vector store:", len(vector_store_ollama.store))

In [None]:
# Define a query
query = "how to get a policy?"

# Search for similar chunks and get scores
similar_docs_with_scores = vector_store_ollama.similarity_search_with_score(query, k=3)

print("Most similar documents:")
# Iterate through the results, which are tuples of (Document, score)
for i, (doc, score) in enumerate(similar_docs_with_scores, start=1):
    print(f"Result {i} (Score: {score}):")
    print(doc.page_content)  # Print first 200 characters
    print("-" * 50)

# Nomic Embedding

In [None]:
from langchain_ollama import OllamaEmbeddings

embedding_model_nomic = OllamaEmbeddings(
    # model="llama3.2:latest",
    model="nomic-embed-text:latest",
    base_url="http://localhost:11434"  # Ensure Ollama is running locally
)

In [None]:
from langchain_core.vectorstores import InMemoryVectorStore
from langchain.schema import Document

# Convert chunks to LangChain Documents
documents = [Document(page_content=chunk.page_content) for chunk in split_documents]

# Create an in-memory vector store
vector_store_nomic = InMemoryVectorStore.from_documents(documents, embedding_model_nomic)

print("Number of documents in vector store:", len(vector_store_nomic.store))

In [None]:
# Define a query
query = "how to get a policy?"

# Search for similar chunks and get scores
similar_docs_with_scores = vector_store_nomic.similarity_search_with_score(query, k=3)

print("Most similar documents:")
# Iterate through the results, which are tuples of (Document, score)
for i, (doc, score) in enumerate(similar_docs_with_scores, start=1):
    print(f"Result {i} (Score: {score}):")
    print(doc.page_content)  # Print first 200 characters
    print("-" * 50)

# Hugging Face Embedding

%pip install langchain_huggingface

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

embedding_model_huggin = HuggingFaceEmbeddings(
    model="all-MiniLM-L6-v2", # Ensure Ollama is running locally
)

In [None]:
from langchain_core.vectorstores import InMemoryVectorStore
from langchain.schema import Document

# Convert chunks to LangChain Documents
documents = [Document(page_content=chunk.page_content) for chunk in chunks]

# Create an in-memory vector store
vector_store_hugging = InMemoryVectorStore.from_documents(documents, embedding_model_huggin)

print("Number of documents in vector store:", len(vector_store_hugging.store))

In [None]:
# Define a query
query = "how to get a policy?"

# Search for similar chunks and get scores
similar_docs_with_scores = vector_store_hugging.similarity_search_with_score(query, k=3)

print("Most similar documents:")
# Iterate through the results, which are tuples of (Document, score)
for i, (doc, score) in enumerate(similar_docs_with_scores, start=1):
    print(f"Result {i} (Score: {score}):")
    print(doc.page_content)  # Print first 200 characters
    print("-" * 50)