# This example works end-to-end with my personal OpenAI API key. Next, we need to port it to Azure OpenAI and try using our own embedding model.

https://python.langchain.com/docs/integrations/chat/azure_chat_openai/

- Change the embeddings to Hugging Face
- Add an option for a memory-only vector DB
- Change the chat model to Azure Chat

https://www.analyticsvidhya.com/blog/2023/07/guide-to-chroma-db-a-vector-store-for-your-generative-ai-llms/
https://www.reddit.com/r/LocalLLaMA/comments/18j39qt/what_embedding_models_are_you_using_for_rag/

In [2]:
import os
import bs4
from langchain import hub
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough

from configparser import ConfigParser

# Load the API keys from a local INI file that lives outside the project.
config_path = r"C:\workspace\APIKEY_personal.ini"
c = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail here with a clear message instead of an
# opaque KeyError: 'KEY' on the lookup below.
if not c.read(config_path):
    raise FileNotFoundError(f"Could not read API key file: {config_path}")

# NOTE(review): the later cells read this option as "OPENAI_PERSONAL" (with an
# underscore); confirm which spelling the INI file really uses.
os.environ["OPENAI_API_KEY"] = c["KEY"]["OPENAI PERSONAL"]

# Fetch the article that serves as the knowledge base for the RAG chain.
loader = WebBaseLoader(
    web_paths=(
        "https://devlog.tublian.com/tublian-open-source-internship-cohort2-a-path-to-software-development-mastery",
    ),
)
# SECURITY: verify=False turns off TLS certificate verification for the page
# download. NOTE(review): presumably a workaround for a proxy or self-signed
# certificate; prefer pointing `verify` at the right CA bundle instead.
loader.requests_kwargs = {"verify": False}
docs = loader.load()

print(docs)

# Split the page into overlapping chunks (size 1000, overlap 200) so each
# chunk can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Embed the chunks with OpenAI embeddings and store them in a Chroma database
# persisted under ./chroma_db.
vectorstore = Chroma.from_documents(
    documents=splits, embedding=OpenAIEmbeddings(), persist_directory="./chroma_db"
)
retriever = vectorstore.as_retriever()

print(retriever)

# Prompt template from the LangChain hub and the chat model that answers.
prompt = hub.pull("rlm/rag-prompt")
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)


def format_docs(docs):
    """Return the page_content of every document, separated by blank lines."""
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)


# Assemble the RAG pipeline: "context" is filled by running the retriever and
# flattening its hits with format_docs, while "question" carries the input
# through unchanged; the result feeds the prompt, the LLM and a string parser.
chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
answer_parser = StrOutputParser()
rag_chain = chain_inputs | prompt | llm | answer_parser

question = "How long is the Open Source internship?"
print("invoking...")
result = rag_chain.invoke(question)
print(result)
print("invoking...1")



[Document(metadata={'source': 'https://devlog.tublian.com/tublian-open-source-internship-cohort2-a-path-to-software-development-mastery', 'title': 'Tublian Open Source Internship: A Path to Software Development Mastery', 'description': 'This is a detailed article on Tublian Open Source Internship and all it is about.', 'language': 'en'}, page_content='Tublian Open Source Internship: A Path to Software Development MasteryFollowFollowPhoto by Nick Fewings on UnsplashTublian Open Source Internship [Cohort2]: A Path to Software Development MasteryElevate Your Skills as an Emerging Software Developer!Ngwube Precious·Dec 11, 2023·6 min readAre you an emerging software developer looking to take your skills to the next level?\nDo you want to contribute meaningfully to the world of Open Source while gaining invaluable real-world experience? If you answer yes, our 30-day OpenSource Internship program is just what you\'ve been searching for.\n"In the past, contributing to OpenSource was a nice ha



invoking...
The Open Source internship lasts for 30 days, starting on December 18th, 2023. Participants are expected to dedicate 6-8 hours a week to complete the tasks.
invoking...1


Change to the new embedding model (Hugging Face `BAAI/bge-small-en-v1.5`)

In [5]:
import os
import bs4
from langchain import hub
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_community.embeddings import HuggingFaceEmbeddings

from configparser import ConfigParser

# Load the API keys from a local INI file that lives outside the project.
config_path = r"C:\workspace\APIKEY_personal.ini"
c = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail here with a clear message instead of an
# opaque KeyError: 'KEY' on the lookups below.
if not c.read(config_path):
    raise FileNotFoundError(f"Could not read API key file: {config_path}")

os.environ["OPENAI_API_KEY"] = c["KEY"]["OPENAI_PERSONAL"]
os.environ["LANGCHAIN_API_KEY"] = c["KEY"]["LANGSMITH_PERSONAL"]

# Fetch the article that serves as the knowledge base for the RAG chain.
loader = WebBaseLoader(
    web_paths=(
        "https://devlog.tublian.com/tublian-open-source-internship-cohort2-a-path-to-software-development-mastery",
    ),
)
# SECURITY: verify=False turns off TLS certificate verification for the page
# download. NOTE(review): presumably a workaround for a proxy or self-signed
# certificate; prefer pointing `verify` at the right CA bundle instead.
loader.requests_kwargs = {"verify": False}
docs = loader.load()

print(docs)

# Split the page into overlapping chunks (size 1000, overlap 200) so each
# chunk can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Local Hugging Face embedding model, run on the CPU, replacing OpenAI embeddings.
model_id = "BAAI/bge-small-en-v1.5"
embedding_function = HuggingFaceEmbeddings(model_name=model_id, model_kwargs={'device': 'cpu'})

# The OpenAI-embedding cell persists into the same ./chroma_db directory using
# Chroma's default collection. Vectors produced by different embedding models
# are not comparable (and differ in dimensionality), so the bge-small vectors
# are kept in a collection of their own.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embedding_function,
    persist_directory="./chroma_db",
    collection_name="rag_bge_small_en_v1_5",
)
retriever = vectorstore.as_retriever()

print(retriever)

# Prompt template from the LangChain hub and the chat model that answers.
prompt = hub.pull("rlm/rag-prompt", api_key=c["KEY"]["LANGSMITH_PERSONAL"])
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)


def format_docs(docs):
    """Concatenate the documents' page_content, one blank line between each."""
    pieces = []
    for item in docs:
        pieces.append(item.page_content)
    return "\n\n".join(pieces)


# Retrieval step: look up chunks for the question and merge them into one
# context string.
retrieve_context = retriever | format_docs

# Full pipeline: context + untouched question -> prompt -> LLM -> plain string.
rag_chain = (
    {"context": retrieve_context, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

print("invoking...")
user_question = "How long is the Open Source internship?"
result = rag_chain.invoke(user_question)
print(result)
print("invoking...1")



[Document(metadata={'source': 'https://devlog.tublian.com/tublian-open-source-internship-cohort2-a-path-to-software-development-mastery', 'title': 'Tublian Open Source Internship: A Path to Software Development Mastery', 'description': 'This is a detailed article on Tublian Open Source Internship and all it is about.', 'language': 'en'}, page_content='Tublian Open Source Internship: A Path to Software Development MasteryFollowFollowPhoto by Nick Fewings on UnsplashTublian Open Source Internship [Cohort2]: A Path to Software Development MasteryElevate Your Skills as an Emerging Software Developer!Ngwube Precious·Dec 11, 2023·6 min readAre you an emerging software developer looking to take your skills to the next level?\nDo you want to contribute meaningfully to the world of Open Source while gaining invaluable real-world experience? If you answer yes, our 30-day OpenSource Internship program is just what you\'ve been searching for.\n"In the past, contributing to OpenSource was a nice ha



tags=['Chroma', 'HuggingFaceEmbeddings'] vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000002606B271850> search_kwargs={}
invoking...
The Open Source internship lasts for 30 days, starting on December 18th, 2023. Participants are expected to dedicate 6-8 hours a week to complete the tasks.
invoking...1


Let's see if we can use Azure Chat (work in progress: the cell below has not been run yet and still uses `ChatOpenAI`)

In [None]:
import os
import bs4
from langchain import hub
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_community.embeddings import HuggingFaceEmbeddings

from configparser import ConfigParser

# Load the API keys from a local INI file that lives outside the project.
config_path = r"C:\workspace\APIKEY_personal.ini"
c = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail here with a clear message instead of an
# opaque KeyError: 'KEY' on the lookups below.
if not c.read(config_path):
    raise FileNotFoundError(f"Could not read API key file: {config_path}")

os.environ["OPENAI_API_KEY"] = c["KEY"]["OPENAI_PERSONAL"]
os.environ["LANGCHAIN_API_KEY"] = c["KEY"]["LANGSMITH_PERSONAL"]

# Fetch the article that serves as the knowledge base for the RAG chain.
loader = WebBaseLoader(
    web_paths=(
        "https://devlog.tublian.com/tublian-open-source-internship-cohort2-a-path-to-software-development-mastery",
    ),
)
# SECURITY: verify=False turns off TLS certificate verification for the page
# download. NOTE(review): presumably a workaround for a proxy or self-signed
# certificate; prefer pointing `verify` at the right CA bundle instead.
loader.requests_kwargs = {"verify": False}
docs = loader.load()

print(docs)

# Split the page into overlapping chunks (size 1000, overlap 200) so each
# chunk can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Local Hugging Face embedding model, run on the CPU.
model_id = "BAAI/bge-small-en-v1.5"
embedding_function = HuggingFaceEmbeddings(model_name=model_id, model_kwargs={'device': 'cpu'})

# The OpenAI-embedding cell persists into the same ./chroma_db directory using
# Chroma's default collection. Vectors produced by different embedding models
# are not comparable (and differ in dimensionality), so the bge-small vectors
# are kept in a collection of their own.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embedding_function,
    persist_directory="./chroma_db",
    collection_name="rag_bge_small_en_v1_5",
)
retriever = vectorstore.as_retriever()

print(retriever)

prompt = hub.pull("rlm/rag-prompt", api_key=c["KEY"]["LANGSMITH_PERSONAL"])
# TODO: this cell is meant to try Azure Chat, but the model below is still the
# plain OpenAI chat model; swap it once the Azure endpoint, deployment and key
# are available in the INI file.
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)


def format_docs(docs):
    """Build one context string from the documents' page_content fields.

    Consecutive documents are separated by an empty line.
    """
    contents = (entry.page_content for entry in docs)
    return str.join("\n\n", contents)


# Build the chain in two steps: first the mapping that supplies the prompt's
# "context" (retrieved, formatted chunks) and "question" (input passed through),
# then the prompt -> LLM -> string-parser tail.
prompt_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
rag_chain = prompt_inputs | prompt | llm | StrOutputParser()

print("invoking...")
query = "How long is the Open Source internship?"
result = rag_chain.invoke(query)
print(result)
print("invoking...1")