In [1]:
from pinecone import Pinecone, ServerlessSpec
import os
import json
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from itertools import islice

  from tqdm.autonotebook import tqdm


In [2]:
# 파일 위치 지정
keys_file_path = os.path.join('data', 'api_keys.txt')

# 파일에서 API 키를 로드하는 함수
def load_api_keys(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        keys = json.load(file)
    return keys

# API 키 사용
api_keys = load_api_keys(keys_file_path)
openAI_keys = api_keys['openAI_keys']
pinecone_key1 = api_keys['pinecone_key1']
pinecone_key2 = api_keys['pinecone_key2']

In [3]:
# Pinecone 연결 및 index 설정
pc = Pinecone(api_key=pinecone_key2)

# key1 => minilm / key2 => mpnet
index = pc.Index('mpnet')

index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.41754,
 'namespaces': {'': {'vector_count': 41754}},
 'total_vector_count': 41754}

In [4]:
file_path = 'augmented_sentences.json'

# 파일이 존재하는지 확인
if os.path.exists(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # 모든 문장을 하나의 1차원 배열에 저장합니다.
    sentences = []
    for sentence in data['food'].values():
        sentences.extend(sentence)  # 각 국가별 문장 리스트를 추가합니다.

    # 결과를 출력합니다.
    print(sentences)
else:
    print(f"Error: The file {file_path} does not exist.")

['The young man is stirring his pot of gzleme with a wooden spoon.', 'The young man is stirring his pot of baklava with a wooden spoon.', 'The young man is stirring his pot of piyaz with a wooden spoon.', 'The young man is stirring his pot of kebab with a wooden spoon.', 'The young man is stirring his pot of menemen with a wooden spoon.', 'A group of people preparing gzleme in a kitchen.', 'A group of people preparing baklava in a kitchen.', 'A group of people preparing piyaz in a kitchen.', 'A group of people preparing kebab in a kitchen.', 'A group of people preparing menemen in a kitchen.', 'A kitchen counter with cutting board, knife and gzleme.', 'A kitchen counter with cutting board, knife and baklava.', 'A kitchen counter with cutting board, knife and piyaz.', 'A kitchen counter with cutting board, knife and kebab.', 'A kitchen counter with cutting board, knife and menemen.', 'A married couple preparing gzleme in a house kitchen.', 'A married couple preparing baklava in a house 

In [5]:
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Assume sentences is a list of input sentences
embedded_sentences = []

# Encode each sentence and store the embeddings
for sentence in tqdm(sentences):
    embedded_sentences.append(model.encode(sentence))

print(embedded_sentences[0])

100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [00:44<00:00,  6.80it/s]

[-3.71109392e-03 -9.60004628e-02  3.05198934e-02 -1.31522464e-02
 -7.67655147e-04 -9.96269286e-03 -7.59852305e-02 -5.61543517e-02
  6.96647093e-02 -8.53400026e-03  9.83509701e-03  2.43866220e-02
 -1.97766293e-02 -1.85474996e-02  4.80835699e-02  2.30576266e-02
  5.74317053e-02 -1.77572630e-02  3.55541823e-03 -7.67420279e-03
 -1.85963772e-02 -2.08412614e-02  2.16055922e-02  2.54263077e-03
 -2.55253725e-02 -2.30705924e-02  1.26526598e-03 -2.05529220e-02
 -4.39247675e-02  9.66274121e-04 -5.75497188e-02  2.30528302e-02
 -1.56187676e-02  5.12747467e-02  1.53848998e-06 -3.19567829e-04
  6.01781122e-02  2.31144950e-02 -2.83398479e-02 -4.91200350e-02
  4.32884395e-02  3.62939015e-02 -9.67505574e-03  2.82748975e-02
 -1.70047078e-02  2.59789769e-02  7.84189627e-03  1.35087511e-02
  4.24999371e-03 -6.73742369e-02 -2.17284411e-02 -5.73800094e-02
  1.08138788e-02 -5.16779441e-03  4.20753565e-03 -7.63360222e-05
 -2.77996585e-02  2.86687892e-02 -3.23090679e-03  5.75970262e-02
 -2.59156432e-02  1.08762




In [6]:
# Initialize an empty list to collect all retrieved assertions
all_assertions = []

# Loop through each encoded sentence and query the index for matches
for encoded_sentence in tqdm(embedded_sentences):
    # Query the index using the current encoded sentence
    matches = index.query(
        vector=encoded_sentence.tolist(),
        top_k=3,  # Adjust the number of matches to retrieve as needed
        include_values=False,
        include_metadata=True,
    )
    
    # Extract the assertions metadata from each match
    assertions_list = [match['metadata']['assertion'].strip() for match in matches['matches']]

    # Collect all results in the overall list
    all_assertions.append(' '.join(assertions_list))

# If you want to store or process the combined assertions list
print(all_assertions)

100%|████████████████████████████████████████████████████████████████████████████████| 300/300 [01:42<00:00,  2.94it/s]

['In Wales, the tradition of carving love spoons in wood goes back many centuries. The Japanese tea ceremony traditionally uses a bamboo whisk to stir up the tea and infuse it. The thistle spoonrest is handmade in Scotland and is a Scottish symbol.', 'Baklava is a Middle Eastern dessert that is made of nuts, phyllo dough, and a sweet syrup. Baklava is a Turkish dessert typically served with tea or coffee. Baklava is a Turkish or Greek dish that is made with phylo dough, nuts, and honey.', 'Central Asians love their rice pilaf, and the Uzbek plov is a popular dish in the region. Filipinos regularly use spoons together with forks and knives. Peanut oil is used in Asian cooking for its high smoke point and flavor.', 'The Turkish kebab is a skewered meat dish that is cooked over an open fire. Kebabs are a popular dish in India that come in many different varieties. Kebabs are a wide variety of meat dishes originating from Iran and now popular all over the world.', 'Stir-frying is a quick a




In [7]:
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# LangChain이 지원하는 다른 채팅 모델을 사용합니다. 여기서는 Ollama를 사용합니다.
llm = ChatOllama(model="llama2")

In [8]:
from langchain.prompts import PromptTemplate

template = """Information: {retrieval}
Question: {query}"""

prompt = PromptTemplate.from_template(template=template)

# LangChain 표현식 언어 체인 구문을 사용합니다.
# LCEL에 대한 자세한 내용은 https://python.langchain.com/docs/expression_language/why 에서 확인할 수 있습니다.
chain = prompt | llm | StrOutputParser()

In [9]:
query = 'Reconstruct the first as the main sentence, using the rest as supporting information, into 2-3 sentences.'

In [10]:
with open("result.txt", "w") as file:
    # 두 리스트를 동시에 반복하며 인덱스와 값을 가져옴
    for retrieval in tqdm(islice(zip(sentences, all_assertions), 5)):
        # chain.invoke를 사용하여 result를 가져옴
        result = chain.invoke({"retrieval": retrieval, "query": query})
        
        # 필요한 경우 result의 특정 속성에 접근해서 작성
        # 예: result['output']로 필요한 데이터를 가져온다면 아래처럼 작성
        file.write(result + '\n')

5it [05:10, 62.17s/it]
