In [None]:

import os
from dotenv import load_dotenv
from bs4 import SoupStrainer
from datasets import load_dataset
from langchain_openai import ChatOpenAI
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Read variables from a .env file before any client object is created.
load_dotenv()

# Chat model used as the generation step of the RAG chain.
chat_model = ChatOpenAI(model="gpt-4o-mini", temperature=0.7, max_retries=2)

# HuggingFace sentence-transformer used to embed chunks and queries.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L12-v2",
)

# Load the "title_answer" configuration of the StackExchange math dataset.
# NOTE(review): `dataset` is not referenced by the code visible in this
# notebook section -- confirm it is still needed before keeping the download.
dataset = load_dataset(
    "flax-sentence-embeddings/stackexchange_math_jsonl",
    "title_answer",
)

# Loader for the Couchbase "what is vector search" blog post.
# NOTE: SoupStrainer() is constructed without any filter arguments here.
web_loader = WebBaseLoader(
    web_paths=("https://www.couchbase.com/blog/what-is-vector-search/",),
    bs_kwargs={"parse_only": SoupStrainer()},
)

# Fetch the page and convert it into documents.
documents = web_loader.load()


# Splitter configured for chunks of size 1024 with an overlap of 100 between
# consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=100,
)

# Cut the loaded documents into chunks and dump them for inspection.
document_chunks = text_splitter.split_documents(documents)
print(document_chunks)

# Embed the chunks and store them in a Chroma collection persisted on disk.
# NOTE(review): re-running this cell presumably adds the same chunks to the
# persisted collection again -- confirm and clear the directory if so.
vector_store = Chroma.from_documents(
    documents=document_chunks,
    embedding=embeddings,
    persist_directory="./chroma_langchain_db",
)

# Set up a retriever that returns the 3 most relevant chunks per query.
# The result count has to be passed through `search_kwargs`; a bare `k=3`
# keyword on as_retriever() is not a search setting and never reaches the
# similarity search, so the store's default result count was used instead.
document_retriever = vector_store.as_retriever(search_kwargs={"k": 3})

# Prompt layout: one instruction line, the retrieved context, a blank line,
# then the user's question. Placeholders are {context} and {messages}.
template = (
    "Answer the question and explain the answer is coming from context or not and give me the answer you know:\n"
    "{context}\n"
    "\n"
    "Question: {messages}\n"
)

# Turn the raw template text into a chat prompt.
chat_prompt = ChatPromptTemplate.from_template(template)

# Helper function to format document content
def format_documents(docs):
    """Join the text of several documents into a single string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values in input order, separated by a blank
        line; an empty string when ``docs`` yields nothing.
    """
    separator = "\n\n"
    contents = [entry.page_content for entry in docs]
    return separator.join(contents)

# RAG (Retrieval-Augmented Generation) pipeline. The input is fanned out into
# the two prompt variables, then passed through the prompt, the chat model,
# and a parser that returns the model output as a string.
rag_chain = (
    {
        # Retrieved documents, flattened to text by format_documents.
        "context": document_retriever | format_documents,
        # The chain input itself, forwarded unchanged.
        "messages": RunnablePassthrough(),
    }
    | chat_prompt
    | chat_model
    | StrOutputParser()
)

# Interactive loop (currently disabled): uncomment to keep prompting the user
# for input and answer each question through the RAG chain.
# while True:
#     user_input = input(">>")
#     print(rag_chain.invoke(user_input))


In [None]:
# Ask the RAG chain what "Surote" writes on Medium.
# NOTE(review): presumably a probe for a topic outside the indexed page -- confirm.
rag_chain.invoke("what Surote write on Medium?")

In [None]:
# Ask the RAG chain about "Moo Deng".
# NOTE(review): presumably not covered by the indexed page, so this checks how
# the prompt handles missing context -- confirm.
rag_chain.invoke("what is Moo Deng?")

In [None]:
# TODO: load content from https://edition.cnn.com/2024/09/14/travel/pigmy-hippo-thailand-latest-online-sensation-intl-hnk/index.html

In [None]:
# Ask the RAG chain who "Surote" is.
# NOTE(review): presumably a probe for a topic outside the indexed page -- confirm.
rag_chain.invoke("who is Surote?")

In [None]:
# Ask the RAG chain a general-knowledge question.
# NOTE(review): presumably checks whether the model answers from its own
# knowledge when the retrieved context is unrelated -- confirm.
rag_chain.invoke("what is the Earth")