In [None]:
#############
############# 질의응답, 모델, 임베딩
#############
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.retrievers import ContextualCompressionRetriever
from langchain.llms import LlamaCpp
from langchain import PromptTemplate, LLMChain

# Prompt template for the plain LLM chain; {question} is substituted when the
# chain is run.
template = """Question: {question}
Answer: Let's work this out in a step by step way to be sure we have the right answer."""
prompt = PromptTemplate(template=template, input_variables=["question"])

llm = LlamaCpp(
    # model_path: location of the model file downloaded to the local machine.
    # Written as a raw string: "\l" is not a recognised escape sequence, so the
    # non-raw form only worked by accident and triggers an invalid-escape
    # warning on current Python versions. The resulting path is unchanged.
    model_path=r"C:\llama_test\llama-2-7b-chat.Q4_K_M.gguf",
    temperature=0.0,
    top_p=1,
    # NOTE(review): max_tokens (8192) is larger than n_ctx (4096) below, so the
    # requested generation length cannot fit in the context window -- confirm
    # the intended value.
    max_tokens=8192,
    verbose=True,
    # n_ctx: maximum context length the model can process at once.
    n_ctx=4096,
)
# Chain that feeds the template above into the local model.
# NOTE(review): `llm_chain` is not used anywhere else in the visible code.
llm_chain = LLMChain(prompt=prompt, llm=llm)

# Embedding model shared by the vector store and the retrieval filter.
embedding = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-large",
)

###### First-time save (build the store from already-split `texts`):
# vectordb = Chroma.from_documents(texts,embedding,persist_directory="C:/startcoding_vs/test_venv/gg")

# Open the Chroma store located in the persist directory, using the same
# embedding function for queries.
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory="C:/startcoding_vs/test_venv/gg",
)
# print("saved to chromadb")


##### 파파고 api
import requests
import json

### API key
# Placeholder credentials; replace with the ID/secret issued for the Papago API.
CLIENT_ID, CLIENT_SECRET = '발급받은 ID', '발급받은 Secret'

## Question (Korean) to be answered
text = '김종원 교수 수업 하나만 알려줘'

# NOTE(review): confirm this Papago endpoint is still served for these
# credentials before relying on it.
url = 'https://openapi.naver.com/v1/papago/n2mt'

## Headers: JSON body plus the Naver client credentials
headers = {
    'Content-Type': 'application/json',
    'X-Naver-Client-Id': CLIENT_ID,
    'X-Naver-Client-Secret': CLIENT_SECRET
}

## ko -> en
data = {'source': 'ko', 'target': 'en', 'text': text}

# A timeout keeps a stalled connection from hanging the cell forever.
res = requests.post(url, data=json.dumps(data), headers=headers, timeout=10)
# Surface HTTP failures (bad credentials, quota, ...) as requests.HTTPError
# instead of an opaque KeyError when the error payload lacks 'message'.
res.raise_for_status()

en_text = res.json()['message']['result']['translatedText']

# Show the original question followed by its English translation.
print('question : ' + text )
print('')
print('question : ' + en_text)

from langchain.chains import RetrievalQA

# Direct similarity search for the translated question; the result is only
# referenced by the commented-out debug print below.
docs = vectordb.similarity_search(en_text)
# print(docs)

# The English translation is the query passed to the QA chain.
# NOTE(review): this rebinds `prompt`, which earlier held the PromptTemplate.
prompt = en_text

# Embedding-based filter configured with a similarity threshold of 0.70.
embeddings_filter = EmbeddingsFilter(
    similarity_threshold=0.70,
    embeddings=embedding,
)

# Compression retriever: the vector store's retriever supplies candidate
# documents and the embeddings filter acts as the compressor.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=vectordb.as_retriever(),
    base_compressor=embeddings_filter,
)

# Build the question-answering object with the RetrievalQA.from_chain_type
# class method ("stuff" chain type) and run the translated question through it.
qa = RetrievalQA.from_chain_type(
    retriever=compression_retriever,
    chain_type="stuff",
    llm=llm,
)
response = qa.run(prompt)

### Papago API: translate the English answer back to Korean.
# The `headers` dict built for the ko -> en request is reused as-is; the
# previous re-definition here was byte-identical to it.

## en -> ko
data2 = {'source': 'en', 'target': 'ko', 'text': response}

# A timeout keeps a stalled connection from hanging the cell forever.
res = requests.post(url, data=json.dumps(data2), headers=headers, timeout=10)
# Surface HTTP failures as requests.HTTPError instead of an opaque KeyError
# when the error payload lacks 'message'.
res.raise_for_status()

result_text = res.json()['message']['result']['translatedText']

# Show the model's English answer followed by its Korean translation.
print('response : ' + response)
# print('____________after Papago translation_________________')
print('')
print('response : ' + result_text)



In [None]:
##################
################## PDF파일
##################
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

## Load the PDF
loader = PyPDFLoader(
    'C:/startcoding_vs/test_venv/hh/2010~2023학년도 교양교육과정 이수기준 안내.pdf'
)
documents = loader.load()
# Peek at the first 200 characters of the first loaded document. The value is
# neither assigned nor printed, but the lookup fails if nothing was loaded.
documents[0].page_content[:200]

## Split the text
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=100)
texts = text_splitter.split_documents(documents)

## Embedding model
embedding = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-large",
)

## Save: embed the chunks and store them in the Chroma persist directory.
# NOTE(review): depending on the installed chromadb version an explicit
# persist call may be needed to flush to disk -- confirm.
vectordb = Chroma.from_documents(
    texts,
    embedding,
    persist_directory="C:/startcoding_vs/test_venv/gg",
)

In [None]:
##################
################## CSV파일
##################
from langchain.document_loaders import CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

## Load the CSV
# NOTE(review): no encoding is passed to the loader -- confirm the file's
# encoding matches whatever default the loader uses on this machine.
loader = CSVLoader(
    'C:/startcoding_vs/test_venv/hh/의료ㆍIT학과 이수기준.csv'
)
documents = loader.load()
# Peek at the first 200 characters of the first loaded document. The value is
# neither assigned nor printed, but the lookup fails if nothing was loaded.
documents[0].page_content[:200]

## Split the text
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=100)
texts = text_splitter.split_documents(documents)

## Embedding model
embedding = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-large",
)

## Save: embed the chunks and store them in the Chroma persist directory.
# NOTE(review): this is the same persist directory the PDF cell writes to --
# confirm that sharing one store (and re-running either cell) is intended.
vectordb = Chroma.from_documents(
    texts,
    embedding,
    persist_directory="C:/startcoding_vs/test_venv/gg",
)