In [1]:
from pinecone import Pinecone, ServerlessSpec
import os
import json
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

  from tqdm.autonotebook import tqdm


In [2]:
# Relative location of the file that holds the API keys
_keys_dir, _keys_filename = 'data', 'api_keys.txt'
keys_file_path = os.path.join(_keys_dir, _keys_filename)

# Load the API keys stored in a file
def load_api_keys(file_path):
    """Read the UTF-8 text file at ``file_path`` and return its parsed JSON content.

    Args:
        file_path: Path of the file to read.

    Returns:
        The object obtained by decoding the file's content as JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as key_file:
        raw_text = key_file.read()
    return json.loads(raw_text)

# Load the key file once, then pull each credential out by its name
api_keys = load_api_keys(keys_file_path)
openAI_keys, pinecone_key1, pinecone_key2 = (
    api_keys[key_name]
    for key_name in ('openAI_keys', 'pinecone_key1', 'pinecone_key2')
)

In [3]:
# Connect to Pinecone and select the index to work with
pc = Pinecone(api_key=pinecone_key2)

# Key-to-index mapping: key1 => minilm / key2 => mpnet
index_name = 'mpnet'
index = pc.Index(index_name)

# Last expression of the cell: shows the index statistics as the cell output
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.41754,
 'namespaces': {'': {'vector_count': 41754}},
 'total_vector_count': 41754}

In [4]:
# Input file: <current working directory>/data/processed_data.txt
file_path = os.path.join(os.getcwd(), "data", "processed_data.txt")

# One entry per line of the file, with surrounding whitespace stripped
sentences = []
with open(file_path, "r", encoding="utf-8") as file:
    sentences.extend(line.strip() for line in file)

In [5]:
# Load the sentence-embedding model
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Encode the sentences one at a time (tqdm shows progress); the embeddings
# are kept in the same order as the input sentences
embedded_sentences = [model.encode(sentence) for sentence in tqdm(sentences)]

# Print the first embedding as a quick look at the result
print(embedded_sentences[0])

100%|████████████████████████████████████████████████████████████████████████████████| 186/186 [00:27<00:00,  6.67it/s]

[ 2.08201744e-02 -1.07037596e-01  2.14415211e-02 -2.68955510e-02
  1.31547293e-02 -1.39538571e-02 -5.96780330e-02 -1.33626806e-02
  5.38083836e-02  1.43554788e-02 -1.85819734e-02  3.86043340e-02
 -1.62289664e-02 -2.65340004e-02  2.53813956e-02  3.68113117e-03
  5.00071533e-02 -1.22267585e-02  5.08567207e-02  6.88743778e-04
 -6.02892153e-02 -2.90247258e-02  2.58078594e-02  1.80703532e-02
 -2.36614328e-02 -3.33530386e-03 -9.54449642e-03 -1.60108935e-02
 -4.21916135e-02 -5.46915457e-02 -2.66851559e-02  8.36803298e-03
 -2.46263295e-02  4.35893610e-02  1.58004116e-06  1.32559072e-02
  6.32733926e-02  2.15729256e-03 -2.68571861e-02 -6.21924326e-02
  9.08079892e-02  1.58581957e-02 -1.68577712e-02 -2.92072102e-04
  2.28485502e-02 -1.97100956e-02  3.42469960e-02  2.12272946e-02
  9.36810579e-03 -6.27371818e-02  1.00637553e-03 -3.88962217e-02
 -6.14600908e-03  4.12377268e-02 -5.44335023e-02  2.05665547e-02
 -2.20686439e-02 -7.78652588e-03 -1.24147777e-02  3.30783762e-02
 -6.54799398e-03 -1.57479




In [6]:
# Number of matches requested from the index for each sentence
TOP_K = 5

# One entry per input embedding: the retrieved assertions joined into one string
all_assertions = []

# Query the index with each sentence embedding. The position of the embedding
# is not needed, so iterate over the embeddings directly (no enumerate).
for encoded_sentence in tqdm(embedded_sentences):
    matches = index.query(
        vector=encoded_sentence.tolist(),
        top_k=TOP_K,
        include_values=False,
        include_metadata=True,
    )

    # Take the 'assertion' metadata text of every returned match
    assertions_list = [match['metadata']['assertion'].strip() for match in matches['matches']]

    # Store the matches for this sentence as a single space-separated string
    all_assertions.append(' '.join(assertions_list))

# Print the combined assertions for inspection
print(all_assertions)

100%|████████████████████████████████████████████████████████████████████████████████| 186/186 [00:42<00:00,  4.41it/s]

['Koreans use chopsticks and spoons to eat their meals. Korean chopsticks are typically made of metal, are flat, and have blunt or sharp ends. Korean food is unique and exciting, with a variety of flavors, textures, and aromas. Korean food is no stranger to many cultures and is known for its unique and delicious dishes. Koreans usually eat rice with a spoon, which is different from other Asian countries.', 'Turkish food is a diverse and delicious cuisine that is becoming more popular all over the world. Turkish cuisine uses a variety of spices to enhance the flavor of their dishes. Turkish tea is a big part of Turkish cuisine and is served in many different ways. Turkey is known for its spices, which are used in many different ways in Turkish cuisine. Turkish coffee is a delicious and unique coffee drink that is enjoyed by many.', 'The cuisine of Egypt has a long and rich history, with many dishes dating back to ancient times. The cuisine of Egypt is a very rich cuisine that has many u




In [7]:
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# Use one of the chat models that LangChain supports; here it is Ollama.
llm_model_name = "llama2"
llm = ChatOllama(model=llm_model_name)

In [8]:
from langchain.prompts import PromptTemplate

# Prompt layout: the {retrieval} text is placed before the {query} text
template = (
    "Information: {retrieval}\n"
    "Question: {query}"
)

prompt = PromptTemplate.from_template(template)

# Compose the chain with the LangChain Expression Language (LCEL) pipe syntax.
# More on LCEL: https://python.langchain.com/docs/expression_language/why
output_parser = StrOutputParser()
chain = prompt | llm | output_parser

In [9]:
query = 'Reconstruct the first as the main sentence, using the rest as supporting information, into 2-3 sentences.'

In [10]:
# Write one model response per (sentence, retrieved assertions) pair.
# UTF-8 is set explicitly, like the other open() calls in this notebook, so the
# output file does not depend on the platform's default encoding.
with open("result.txt", "w", encoding="utf-8") as file:
    # Walk both lists in lockstep; zip stops at the shorter of the two
    for sentence, assertions in zip(sentences, all_assertions):
        # Give the prompt plain text: the original sentence on the first line,
        # the retrieved assertions after it. Passing the zip tuple itself would
        # put its Python repr (parentheses, quotes, escapes) into the prompt.
        retrieval = f"{sentence}\n{assertions}"

        result = chain.invoke({"retrieval": retrieval, "query": query})

        # The chain ends in StrOutputParser, so result is already a string
        file.write(result + '\n')

KeyboardInterrupt: 