In [None]:
! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain bs4

In [None]:
import os

# LangSmith tracing configuration.
# Keep a key that is already exported in the environment instead of
# overwriting it with a blank placeholder; export LANGCHAIN_API_KEY before
# starting Jupyter rather than pasting the secret into the notebook.
os.environ.setdefault('LANGCHAIN_API_KEY', "")
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
# Only switch tracing on when a key is actually available, so runs without
# LangSmith credentials do not try to upload traces.
os.environ['LANGCHAIN_TRACING_V2'] = 'true' if os.environ['LANGCHAIN_API_KEY'] else 'false'

In [None]:
# Credentials and endpoint for the OpenAI-compatible API.
# Do not clobber a key that is already exported in the environment; export
# OPENAI_API_KEY before starting Jupyter rather than pasting it here.
os.environ.setdefault('OPENAI_API_KEY', "")
os.environ['BASE_URL'] = "https://api.zhizengzeng.com/v1"
# langchain-openai looks up the endpoint under OPENAI_API_BASE, not BASE_URL,
# so mirror the value there; BASE_URL is kept for code that reads it directly.
os.environ['OPENAI_API_BASE'] = os.environ['BASE_URL']

In [1]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

#### INDEXING ####

# Load Documents
# The column names are supplied explicitly, so csv.DictReader treats EVERY
# line of the file as data -- including a header line, if the CSV has one.
TITANIC_COLUMNS = ["PassengerId","Survived","Pclass","Name","Sex","Age","SibSp","Parch","Ticket","Fare","Cabin","Embarked"]
loader = CSVLoader(
    file_path="./data/titanic_cleaned.csv",
    csv_args={
        'delimiter': ',',
        'quotechar': '"',
        'fieldnames': TITANIC_COLUMNS,
    },
)
# CSVLoader renders each row as "column: value" lines, so a header line would
# come back as a document whose every value repeats its column name. Drop
# that document so it is not embedded and retrieved as if it were a passenger.
# (No-op when the file has no header line.)
header_text = "\n".join(f"{column}: {column}" for column in TITANIC_COLUMNS)
docs = [doc for doc in loader.load() if doc.page_content != header_text]


In [2]:
# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Embed
# Take the key and endpoint from the environment variables configured in the
# setup cells instead of repeating literals here; the fallbacks reproduce the
# previous hard-coded values when a variable is not set.
embeddings = OpenAIEmbeddings(
    openai_api_key=os.environ.get('OPENAI_API_KEY', ""),
    base_url=os.environ.get('BASE_URL', "https://api.zhizengzeng.com/v1"),
)
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

retriever = vectorstore.as_retriever()



In [4]:
#### RETRIEVAL and GENERATION ####

# Prompt
prompt = hub.pull("rlm/rag-prompt")

# LLM
# Take the key and endpoint from the environment variables configured in the
# setup cells instead of repeating literals here; the fallbacks reproduce the
# previous hard-coded values when a variable is not set.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=os.environ.get('OPENAI_API_KEY', ""),
    base_url=os.environ.get('BASE_URL', "https://api.zhizengzeng.com/v1"),
)

# Post-processing
def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# Chain
# The incoming question is sent to the retriever (its hits are flattened to
# text by format_docs) and also passed through unchanged; the resulting mapping
# feeds the prompt, then the LLM, and the reply is parsed to a plain string.
chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

# Question
# NOTE(review): the model only sees the documents the retriever returns for
# this query -- presumably a handful of rows, not the whole column -- so an
# aggregate like a column mean should be verified against a direct computation
# on the CSV.
question = "Calculate the mean of all data in column Survived"
rag_chain.invoke(question)



'The mean of all data in the column "Survived" is 0.33.'