In [2]:
import os
import json
import re
from tqdm import tqdm

In [3]:
# Location of the file that stores the API keys (parsed as JSON by load_api_keys)
keys_file_path = os.path.join('data', 'api_keys.txt')

# Load the API keys from a file
def load_api_keys(file_path):
    """Parse the JSON document stored at ``file_path`` and return it.

    The file is opened as UTF-8 text. Whatever ``json.load`` produces is
    returned unchanged; errors from ``open`` or ``json.load`` propagate.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

# Load the keys file and pull out the two Pinecone API keys
api_keys = load_api_keys(keys_file_path)
pinecone_key1 = api_keys['pinecone_key1']
pinecone_key2 = api_keys['pinecone_key2']

In [28]:
from pinecone import Pinecone, ServerlessSpec

# Connect to Pinecone and select the index
# key1 => minilm / key2 => mpnet
pc = Pinecone(api_key=pinecone_key1)
index = pc.Index('minilm')

# Last expression of the cell: its return value is shown as the cell output
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.41754,
 'namespaces': {'': {'vector_count': 41754}},
 'total_vector_count': 41754}

## Keyword와 비슷한 문장 가져오는 RAG

In [37]:
file_path = 'filtered_national_keyword.json'

# Fail fast when the input file is missing. Printing a message and carrying on
# would leave `keywords` undefined (or stale from an earlier run of this cell),
# so the cells below would either crash later or silently use old data.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Error: The file {file_path} does not exist.")

with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Facets to read from the loaded document, in output order.
facet = ['food', 'drink', 'clothing']

# Flatten every group under each facet into one flat list of keywords.
# data[<facet>] is iterated via .values(); each value is expected to be a list
# of keyword strings (one list per country, per the original notes).
keywords = [
    keyword
    for _facet in facet
    for group in data[_facet].values()
    for keyword in group
]

# Show the collected keywords.
print(keywords)

['mercimek kofte', 'baklava', 'cig borek', 'ezogelin corba', 'gozleme', 'kofta', 'koshari', 'fatteh', 'falafel', 'mulukhiyah', 'rice cakes', 'pajeon', 'juk', 'japchae', 'hotteok', 'kao ka moo', 'khao soi', 'yam nua beef salad', 'pad kra pao', 'massaman curry', 'elote', 'chiles en nogada', 'gorditas', 'mole poblano', 'chilaquiles', 'gazpacho', 'patatas bravas', 'tortilla espaola', 'pollo a la plancha', 'gambas al ajillo', 'paella', 'raki', 'ayran', 'algam', 'Tamer Hindi', 'sugarcane juice', 'sobia', 'karkade', 'gamhongno', 'dongdongju', 'soju', 'maesilju', 'sikhye', 'nam anchan', 'grass jelly drink', 'oliang', 'nam matoom', 'cha yen', 'margarita', 'carajillo', 'el pajarete', 'mezcal', 'paloma', 'licor de hierbas', 'horchata', 'cava', 'rioja', 'tinto de verano', 'abaya', 'yelek', 'keffiyeh', 'shemagh', 'kaftan', 'gallibaya', 'hanbok', 'pha nung', 'boromphiman', 'ruean ton', 'mo hom', 'pha sin', 'tehuana', 'parachico', 'huipil', 'sombrero', 'sarape', 'mantilla', 'peineta', 'traje de chula

In [32]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used below to turn each keyword into a vector
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [38]:
# Embed each keyword on its own (one model.encode call per keyword);
# tqdm reports progress over the keyword list.
embedded_keywords = [model.encode(keyword) for keyword in tqdm(keywords)]

# Show the first embedding as a sanity check.
print(embedded_keywords[0])

100%|██████████| 80/80 [00:00<00:00, 110.32it/s]

[-2.32939124e-02  1.59413919e-01 -3.42669971e-02  2.93148011e-02
  8.23388025e-02  2.29797282e-04  1.00153491e-01  8.55913535e-02
  8.22149217e-02 -6.09760508e-02  1.02106540e-03 -1.07429363e-01
  1.66621059e-02 -3.96179073e-02 -2.28135493e-02 -5.13862744e-02
 -7.12071881e-02  8.30131173e-02 -3.43720615e-02 -7.26319486e-05
  6.54212385e-03 -5.47983088e-02  2.69031357e-02  7.00280890e-02
 -9.37716514e-02 -1.45024695e-02  4.19363417e-02  2.16361955e-02
 -2.55240966e-02 -7.14001805e-02  5.85819595e-02  8.62768665e-02
  1.50947494e-03  2.74538854e-03  4.94130142e-03  5.19874915e-02
 -3.25366370e-02 -7.31509924e-02 -5.03854686e-03 -2.79459264e-02
 -2.05747318e-02 -5.40256388e-02 -5.97806796e-02 -3.67651768e-02
 -1.07703386e-02  2.65982579e-02 -1.77338179e-02  4.02986184e-02
  1.70892179e-02  2.83131935e-02 -1.02311984e-01 -1.07256092e-01
 -4.26507890e-02 -1.22023402e-02 -2.05631424e-02  1.07999137e-02
 -2.10147519e-02  1.08033828e-02  1.05931878e-01 -4.61447313e-02
  3.01043894e-02 -1.04785




In [39]:
import threading
from tqdm import tqdm

# Guards appends to the shared results list from the worker threads.
lock = threading.Lock()

def query_and_append(i, embedded_keyword, all_assertions_with_index):
    """Query the global ``index`` for one embedding and record the result.

    Args:
        i: Position of the keyword; stored alongside the result so the caller
            can restore the original order.
        embedded_keyword: Embedding object exposing ``.tolist()``; the
            resulting list is sent as the query vector.
        all_assertions_with_index: Shared list that receives one
            ``(i, joined_assertions)`` tuple.

    The three top matches are requested with metadata only. Each match's
    ``metadata['assertion']`` is stripped and the pieces are joined with a
    single space. The append happens while holding the module-level ``lock``.
    """
    response = index.query(
        vector=embedded_keyword.tolist(),
        top_k=3,
        include_values=False,
        include_metadata=True,
    )
    joined = ' '.join(
        hit['metadata']['assertion'].strip() for hit in response['matches']
    )

    with lock:
        all_assertions_with_index.append((i, joined))

all_assertions_with_index = []
threads = []

# Start one worker thread per embedded keyword. The enumerate index travels
# with each query so the results can be put back into keyword order afterwards.
# Note: the tqdm bar tracks thread start-up, not query completion.
for i, embedded_keyword in enumerate(tqdm(embedded_keywords)):
    thread = threading.Thread(target=query_and_append, args=(i, embedded_keyword, all_assertions_with_index))
    threads.append(thread)
    thread.start()

# Wait for every query to finish.
for thread in threads:
    thread.join()

# A worker that raised never appends its tuple. Without this check the sorted
# list would simply be shorter, and once the indices are dropped below every
# later assertion would shift onto the wrong keyword.
missing = sorted(set(range(len(threads))) - {idx for idx, _ in all_assertions_with_index})
if missing:
    raise RuntimeError(f"Index query produced no result for keyword indices: {missing}")

# Restore keyword order via the stored index, then keep only the assertion
# strings (second element of each tuple).
all_assertions_with_index.sort(key=lambda x: x[0])
all_assertions = [assertion for _, assertion in all_assertions_with_index]

print(all_assertions)


100%|██████████| 80/80 [00:01<00:00, 67.42it/s] 


["Kriek is a Belgian beer that is made with cherries. Kofte is a Turkish dish made with ground beef and spices. The Netherlands celebrates the King's Birthday on Koningsdag.", 'Baklava is a Turkish dessert typically served with tea or coffee. Baklava is a Middle Eastern dessert that is made of nuts, phyllo dough, and a sweet syrup. Baklava is a popular dessert in Lebanon, Greece, and Turkey.', 'In Albania nettles are used in a dish called borek. Turkish coffee is brewed in a pot called a cezve. The African Civet is quite famous for its musky territorial marking scent.', 'Couscous is a pasta that is made of semolina, originating from North Africa. Couscous is a traditional Berber dish from Morocco made from fine grains of semolina. Coimbra is a place with a rich history and culture.', 'Goulash is a dish originating from medieval Hungary. Gorditas are a type of Mexican bread made of corn flour, usually filled with a variety of fillings. Hungarian goulash is a soup with beef and vegetable

In [40]:
# Save the keyword -> assertions mapping to a JSON file

# zip() silently stops at the shorter input, so a length mismatch would write a
# truncated (and possibly misaligned) mapping without any warning.
if len(keywords) != len(all_assertions):
    raise ValueError(
        f"keywords ({len(keywords)}) and all_assertions ({len(all_assertions)}) differ in length"
    )

# Pair each keyword with the assertions retrieved for it (same position in both lists).
keyword_assertions_dict = dict(zip(keywords, all_assertions))

# Write the mapping as UTF-8 JSON; ensure_ascii=False keeps non-ASCII text readable.
with open('keyword_assertions.json', 'w', encoding='utf-8') as jsonfile:
    json.dump(keyword_assertions_dict, jsonfile, ensure_ascii=False, indent=4)


## Llama 사용 부분

In [55]:
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import CommaSeparatedListOutputParser, StrOutputParser
from langchain.prompts import PromptTemplate

In [56]:
# Load the filtered, augmented sentences.
# The encoding is given explicitly: without it open() falls back to the
# platform default, which is not UTF-8 everywhere.
with open('filtered_augmented_sentences.json', 'r', encoding='utf-8') as f:
    augmented_sentence = json.load(f)
print(augmented_sentence)

# Load the assertions retrieved per keyword. This file is written as UTF-8 with
# ensure_ascii=False, so it must be read back as UTF-8 as well.
with open('keyword_assertions.json', 'r', encoding='utf-8') as f:
    keyword_assertions = json.load(f)
print(keyword_assertions)

{'food': {'Turkish': {'mercimek kofte': ['The young man is stirring his pot of mercimek kofte with a wooden spoon.', 'A group of people preparing mercimek kofte in a kitchen.', 'A kitchen counter with cutting board, knife and mercimek kofte.', 'A married couple preparing mercimek kofte in a house kitchen.', 'Several people are sitting around an outdoor table eating a mercimek kofte.', 'A man sitting at a table having a mercimek kofte.', 'A plate of mercimek kofte in containers is on a tray.', 'a close up of a few plates of mercimek kofte on a table', 'mercimek kofte is served on a plate near a vase.', 'A man sitting at a table with a large plate of mercimek kofte on it.'], 'baklava': ['The young man is stirring his pot of baklava with a wooden spoon.', 'A group of people preparing baklava in a kitchen.', 'A kitchen counter with cutting board, knife and baklava.', 'A married couple preparing baklava in a house kitchen.', 'Several people are sitting around an outdoor table eating a bakla

In [57]:
# Use one of the chat models supported by LangChain; here it is Ollama running "llama2".
llm = ChatOllama(model="llama2")

In [59]:
# Build the prompt template.
# The {sentence}, {information} and {query} placeholders are filled in when the chain is invoked.
template = """
<key_sentence> 
    {sentence}
</key_sentence> 
###
<information>
    {information}
</information>
###
<query>
    {query}
</query>
"""

prompt = PromptTemplate.from_template(template=template)

# Pipeline: prompt -> chat model -> string output parser
chain = prompt | llm | StrOutputParser()

In [62]:
# Instruction text passed to the chain as the "query" input
query = 'Based on the provided information, augment the key sentence with additional details. However, limit the modification to the addition of approximately 2-3 words to the current sentence. Please make the sentence sound natural.'

In [67]:
# Try the chain on one sample: the first augmented 'baklava' sentence together
# with the assertions retrieved for the 'baklava' keyword.
sentence = augmented_sentence['food']['Turkish']['baklava'][0]
assertions = keyword_assertions['baklava']

print(assertions)

# Fill the three template inputs and run prompt -> model -> parser.
results = chain.invoke(dict(sentence=sentence, information=assertions, query=query))

print(results)

Baklava is a Turkish dessert typically served with tea or coffee. Baklava is a Middle Eastern dessert that is made of nuts, phyllo dough, and a sweet syrup. Baklava is a popular dessert in Lebanon, Greece, and Turkey.
Sure! Here's an example of how you could modify the key sentence based on the provided information:

The young man is stirring his pot of baklava with a wooden spoon, enjoying the sweet aroma of the nutty dessert.

In this modified key sentence, I added approximately 2 words to the original sentence - "enjoying the sweet aroma" - to provide more context and detail about the young man's experience while stirring his baklava.
