In [1]:
import os
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_openai import ChatOpenAI,OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# 使用文件載入器，讀取分機資料csv
loader = CSVLoader(file_path='./employees.csv')
data = loader.load()
data[:5]

[Document(metadata={'source': './employees.csv', 'row': 0}, page_content='部門: 資訊部\n職稱: 部門經理\n姓名: 王大明\n分機: 1001'),
 Document(metadata={'source': './employees.csv', 'row': 1}, page_content='部門: 資訊部\n職稱: 軟體工程師\n姓名: 李小華\n分機: 1002'),
 Document(metadata={'source': './employees.csv', 'row': 2}, page_content='部門: 資訊部\n職稱: 系統管理員\n姓名: 張三\n分機: 1003'),
 Document(metadata={'source': './employees.csv', 'row': 3}, page_content='部門: 行銷部\n職稱: 部門經理\n姓名: 陳美麗\n分機: 2001'),
 Document(metadata={'source': './employees.csv', 'row': 4}, page_content='部門: 行銷部\n職稱: 市場專員\n姓名: 黃小玲\n分機: 2002')]

In [4]:
# 使用Text splitter分割成更小的chunk
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=5)
all_splits = text_splitter.split_documents(data)
all_splits[:5]

[Document(metadata={'source': './employees.csv', 'row': 0}, page_content='部門: 資訊部\n職稱: 部門經理\n姓名: 王大明\n分機: 1001'),
 Document(metadata={'source': './employees.csv', 'row': 1}, page_content='部門: 資訊部\n職稱: 軟體工程師\n姓名: 李小華\n分機: 1002'),
 Document(metadata={'source': './employees.csv', 'row': 2}, page_content='部門: 資訊部\n職稱: 系統管理員\n姓名: 張三\n分機: 1003'),
 Document(metadata={'source': './employees.csv', 'row': 3}, page_content='部門: 行銷部\n職稱: 部門經理\n姓名: 陳美麗\n分機: 2001'),
 Document(metadata={'source': './employees.csv', 'row': 4}, page_content='部門: 行銷部\n職稱: 市場專員\n姓名: 黃小玲\n分機: 2002')]

In [5]:
# 把文件進行embedding成向量，並存入Chroma db
persist_directory = 'db'
vectordb = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings(), persist_directory=persist_directory)

In [6]:
# 測試Vector db查詢
query = "張三的分機"
docs = vectordb.similarity_search(query)
docs

[Document(metadata={'row': 2, 'source': './employees.csv'}, page_content='部門: 資訊部\n職稱: 系統管理員\n姓名: 張三\n分機: 1003'),
 Document(metadata={'row': 2, 'source': './employees.csv'}, page_content='部門: 資訊部\n職稱: 系統管理員\n姓名: 張三\n分機: 1003'),
 Document(metadata={'row': 2, 'source': './employees.csv'}, page_content='部門: 資訊部\n職稱: 系統管理員\n姓名: 張三\n分機: 1003'),
 Document(metadata={'row': 2, 'source': './employees.csv'}, page_content='部門: 資訊部\n職稱: 系統管理員\n姓名: 張三\n分機: 1003')]

In [None]:
# 載入圖片
#from langchain_community.document_loaders.image import UnstructuredImageLoader
#loader = UnstructuredImageLoader("./payment.png")
#data = loader.load()
#data

In [7]:
# 使用文件載入器，讀取經費報銷流程
loader = TextLoader(file_path='./payment.txt')
data = loader.load()
data

[Document(metadata={'source': './payment.txt'}, page_content='[經費報銷流程]金額100,000以上請購單位不得自辦，須備妥：1.請購單或簽呈 2.購案說明書 3.規格明細表 4. 指定廠商需付限制性招標申請書\n[經費報銷流程]金額20,000~99,999，須備妥：1.請購單 2.一家估價單以上 經院長核定後須備妥：1.已核准請購單 2.黏貼憑證 3.議價後之估價單\n[經費報銷流程]金額6,000~19,999，須備妥：1.請購單 2.黏貼憑證 3.一家估價單 3.發票\n[經費報銷流程]金額5,999以下，須備妥：1.請購單 2.黏貼憑證 3.發票')]

In [8]:
# 使用Text splitter分割成更小的chunk
text_splitter = RecursiveCharacterTextSplitter(chunk_size=80, chunk_overlap=5)
all_splits = text_splitter.split_documents(data)
all_splits

[Document(metadata={'source': './payment.txt'}, page_content='[經費報銷流程]金額100,000以上請購單位不得自辦，須備妥：1.請購單或簽呈 2.購案說明書 3.規格明細表 4. 指定廠商需付限制性招標申請書'),
 Document(metadata={'source': './payment.txt'}, page_content='[經費報銷流程]金額20,000~99,999，須備妥：1.請購單 2.一家估價單以上 經院長核定後須備妥：1.已核准請購單 2.黏貼憑證 3.議價後之估價單'),
 Document(metadata={'source': './payment.txt'}, page_content='[經費報銷流程]金額6,000~19,999，須備妥：1.請購單 2.黏貼憑證 3.一家估價單 3.發票'),
 Document(metadata={'source': './payment.txt'}, page_content='[經費報銷流程]金額5,999以下，須備妥：1.請購單 2.黏貼憑證 3.發票')]

In [9]:
# 把文件進行embedding成向量，並存入Chroma db
vectordb.add_documents(documents=all_splits, embedding=OpenAIEmbeddings())

['974dc9d4-d642-4701-9f96-d34f46bef2cf',
 '184e3d1a-7333-433f-8f6a-97a52ecfd1b9',
 '024502bf-d3ad-41ec-b1df-dda016a28a67',
 'be43078c-9085-4745-b9e4-58f2d947f890']

In [10]:
# 測試Vector db查詢
query = "發票如何請款？"
docs = vectordb.similarity_search(query)
docs

[Document(metadata={'source': './payment.txt'}, page_content='[經費報銷流程]金額5,999以下，須備妥：1.請購單 2.黏貼憑證 3.發票'),
 Document(metadata={'source': './payment.txt'}, page_content='[經費報銷流程]金額6,000~19,999，須備妥：1.請購單 2.黏貼憑證 3.一家估價單 3.發票'),
 Document(metadata={'source': './payment.txt'}, page_content='[經費報銷流程]金額20,000~99,999，須備妥：1.請購單 2.一家估價單以上 經院長核定後須備妥：1.已核准請購單 2.黏貼憑證 3.議價後之估價單'),
 Document(metadata={'source': './payment.txt'}, page_content='[經費報銷流程]金額100,000以上請購單位不得自辦，須備妥：1.請購單或簽呈 2.購案說明書 3.規格明細表 4. 指定廠商需付限制性招標申請書')]

In [11]:
# 啟動LLM服務
llm = ChatOpenAI(
    model="gpt-4o-mini-2024-07-18",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2
)

In [12]:
# 測試確認LLM有成功啟動
llm("What is Taiwan known for?")

  warn_deprecated(


AIMessage(content="Taiwan is known for a variety of cultural, historical, and natural attractions, as well as its vibrant economy. Here are some key aspects:\n\n1. **Technology and Industry**: Taiwan is a global leader in technology and manufacturing, particularly in semiconductor production. Companies like TSMC (Taiwan Semiconductor Manufacturing Company) are crucial to the global tech supply chain.\n\n2. **Night Markets**: Taiwan is famous for its bustling night markets, where visitors can enjoy a wide array of street food, snacks, and local delicacies. Popular dishes include stinky tofu, bubble tea, and oyster omelets.\n\n3. **Natural Beauty**: The island boasts stunning landscapes, including mountains, beaches, and national parks. Taroko Gorge and Alishan are popular destinations for hiking and nature lovers.\n\n4. **Cultural Heritage**: Taiwan has a rich cultural heritage influenced by indigenous peoples, Chinese immigrants, and Japanese colonial history. This is reflected in its 

In [13]:
# 設定prompt
prompt_template = """You are an internal support chatbot for a company, designed to answer various work-related questions from colleagues.
You need to provide accurate, concise, and helpful answers. If necessary, direct the user to relevant internal resources or documents. 
Please respond to all questions in a friendly and professional tone.
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
{context}
Question: {question}
Answer in Chineses:"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context","question"]
)

In [14]:
# 創建RAG，將Chroma & Prompt & LLM串在一起
chain_type_kwargs = {"prompt": PROMPT}
retriever = vectordb.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.5})
qa = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever, 
    chain_type_kwargs = chain_type_kwargs
)

In [15]:
# 測試RAG
query = "龐統的分機幾號？"
qa.invoke(query)

{'query': '龐統的分機幾號？', 'result': '龐統的分機是3009。'}

In [16]:
# 以Gradio作為GUI串接介面，讓使用者可以和RAG進行問答
def predict(message,history):
    return qa.invoke(message)['result']

gr.ChatInterface(predict).launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [None]:
#龐統的分機幾號？
#請問資訊部同仁的分機號碼？
#我想知道50000元的經費請款流程？
#請問請購多少金額需要估價單？
#今天晚餐吃什麼？