In [3]:
import os
import json
import re
from tqdm import tqdm

In [2]:
# 파일 위치 지정
keys_file_path = os.path.join('data', 'api_keys.txt')

# 파일에서 API 키를 로드하는 함수
def load_api_keys(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        keys = json.load(file)
    return keys

# API 키 사용
api_keys = load_api_keys(keys_file_path)
pinecone_key1 = api_keys['pinecone_key1']
pinecone_key2 = api_keys['pinecone_key2']

In [3]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone 연결 및 index 설정
# key1 => minilm / key2 => mpnet
pc = Pinecone(api_key=pinecone_key1)
index = pc.Index('minilm')

index.describe_index_stats()

  from tqdm.autonotebook import tqdm


{'dimension': 384,
 'index_fullness': 0.41754,
 'namespaces': {'': {'vector_count': 41754}},
 'total_vector_count': 41754}

## Keyword와 비슷한 문장 가져오는 RAG

In [4]:
file_path = 'filtered_national_keyword.json'

# 파일이 존재하는지 확인
if os.path.exists(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    facet = ['food', 'drink', 'clothing']
    keywords = []

    for _facet in facet:
        # 모든 문장을 하나의 1차원 배열에 저장합니다.
        for sentence in data[_facet].values():
            keywords.extend(sentence)  # 각 국가별 문장 리스트를 추가합니다.

    # 결과를 출력합니다.
    print(keywords)
else:
    print(f"Error: The file {file_path} does not exist.")

['cig kofte', 'baklava', 'cig borek', 'dolma', 'manti', 'baba ghanoush', 'koshari', 'fatteh', 'shawarma', 'mulukhiyah', 'bibimbap', 'bulgogi', 'dongchimi', 'jjajangmyeon', 'kimchi', 'tom kha gai', 'khao soi', 'som tam', 'pad kra pao moo', 'massaman curry', 'carnitas', 'chiles en nogada', 'gorditas', 'mole poblano', 'chilaquiles', 'gazpacho', 'patatas bravas', 'tortilla espaola', 'pulpo a la gallega', 'gambas al ajillo', 'raki', 'ayran', 'algam', 'Tamer Hindi', 'sugarcane juice', 'sobia', 'karkade', 'makgeolli', 'dongdongju', 'soju', 'maesilju', 'sikhye', 'nam anchan', 'grass jelly drink', 'oliang', 'nam matoom', 'cha yen', 'margarita', 'carajillo', 'el pajarete', 'mezcal', 'paloma', 'licor de hierbas', 'horchata', 'cava', 'rioja', 'tinto de verano', 'abaya', 'yelek', 'entari', 'keffiyeh', 'shemagh', 'kaftan', 'gallibaya', 'hanbok', 'pha nung', 'boromphiman', 'ruean ton', 'mo hom', 'pha sin', 'tehuana', 'parachico', 'huipil', 'sombrero', 'sarape', 'mantilla', 'peineta', 'traje de chulap

In [5]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')



In [6]:
# Assume sentences is a list of input sentences
embedded_keywords = []

# Encode each sentence and store the embeddings
for keyword in tqdm(keywords):
    embedded_keywords.append(model.encode(keyword))

print(embedded_keywords[0])

100%|██████████| 80/80 [00:00<00:00, 87.90it/s]

[-7.14668334e-02  4.34861965e-02  1.89664755e-02  6.03785589e-02
 -1.74046531e-02  4.39097062e-02  9.87930074e-02  7.75352195e-02
  4.93674204e-02 -3.92985009e-02 -3.00128274e-02 -8.49083513e-02
  6.46709837e-03 -1.10503687e-02  1.08710695e-02 -7.13862106e-02
 -4.22240198e-02  3.56383659e-02 -7.70435110e-02 -1.13515519e-01
  7.71048851e-03 -3.40777524e-02 -2.38555484e-02  4.22517583e-02
 -6.71169758e-02 -1.22007225e-02 -3.38360555e-02  2.23608408e-02
  2.65608728e-02 -3.82317156e-02  3.81373540e-02  1.41649485e-01
 -7.43258372e-02  2.56522801e-02 -6.18350357e-02 -2.85928394e-03
  2.41350438e-02 -8.56428444e-02  1.72857977e-02  2.35820413e-02
 -4.45830589e-03  2.37919502e-02 -2.47925073e-02 -1.35728829e-02
  1.62764471e-02  3.49336397e-03 -7.68971592e-02  1.81083903e-02
 -1.72411941e-03  4.30794694e-02 -3.24682854e-02 -9.29977521e-02
  5.40664457e-02 -3.53374844e-03  2.55837031e-02  2.16011778e-02
 -4.61821770e-03  2.77885003e-03  1.14926897e-01 -1.25068808e-02
  2.91368309e-02 -2.66401




In [65]:
matches = index.query(
    vector=embedded_keywords[0].tolist(),
    top_k=3,
    include_values=False,
    include_metadata=True,
)
print(matches)

{'matches': [{'id': '4363',
              'metadata': {'assertion': 'Kofte is a Turkish dish made with '
                                        'ground beef and spices.\n'},
              'score': 0.547318935,
              'values': []},
             {'id': '36616',
              'metadata': {'assertion': 'Koftas are a type of Middle Eastern '
                                        'hamburger mixture.\n'},
              'score': 0.433353961,
              'values': []},
             {'id': '13223',
              'metadata': {'assertion': 'Kolaches are a Czech pastry that is '
                                        'popular in Texas.\n'},
              'score': 0.427233905,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}


In [69]:
import threading
from tqdm import tqdm
import time  # sleep을 사용하기 위해 time 모듈을 import합니다.

lock = threading.Lock()

def query_and_append(i, embedded_keyword, all_assertions_with_index):
    matches = index.query(
        vector=embedded_keyword.tolist(),
        top_k=3,
        include_values=False,
        include_metadata=True,
    )
    # 유사도가 0.5 이상인 값만 저장
    assertions_list = [match['metadata']['assertion'].strip() for match in matches['matches'] if match['score'] > 0.5]
    
    with lock:
        all_assertions_with_index.append((i, ' '.join(assertions_list)))
    
    time.sleep(0.1)  # 여기서 함수 실행 후 0.1초 동안 sleep합니다.

all_assertions_with_index = []
threads = []

for i, embedded_keyword in enumerate(tqdm(embedded_keywords)):
    thread = threading.Thread(target=query_and_append, args=(i, embedded_keyword, all_assertions_with_index))
    threads.append(thread)
    thread.start()

for thread in threads:
    thread.join()

all_assertions_with_index.sort(key=lambda x: x[0])
all_assertions = [assertion for _, assertion in all_assertions_with_index]

print(all_assertions)
len(all_assertions)


100%|██████████| 80/80 [00:00<00:00, 285.72it/s]


['Kofte is a Turkish dish made with ground beef and spices.', 'Baklava is a Turkish dessert typically served with tea or coffee. Baklava is a Middle Eastern dessert that is made of nuts, phyllo dough, and a sweet syrup. Baklava is a popular dessert in Lebanon, Greece, and Turkey.', '', 'Dolma is a Turkish dish made of grape leaves stuffed with rice and other ingredients. Dolma is a popular dish throughout the Middle East, Central Asia, Eastern Europe, and middle east. Dolma is a Turkish dish consisting of stuffed grape leaves.', 'Manti is a Turkish dish made of boiled ravioli-like pasta, and is popular for its home-cooked taste. The Turkish dish of manti, which are beef dumplings, remains popular and is often made at home.', '', 'Kasha is a type of grain that is popular in Eastern European and Russian cuisine.', '', 'Shawarma is a Middle Eastern dish that is made with sliced seasoned meat. Shawarma is a popular dish and fast-food staple across the Middle East and North Africa. Chicken 

80

In [70]:
# (sentence, assertion)을 json 형태로 파일에 저장

# 키워드와 주장을 매핑하여 딕셔너리 생성
keyword_assertions_dict = {keyword: assertion for keyword, assertion in zip(keywords, all_assertions)}

# 생성된 딕셔너리를 JSON 형태로 파일에 저장
with open('keyword_assertions.json', 'w', encoding='utf-8') as jsonfile:
    json.dump(keyword_assertions_dict, jsonfile, ensure_ascii=False, indent=4)


## Llama 사용 부분

In [4]:
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import CommaSeparatedListOutputParser, StrOutputParser
from langchain.prompts import PromptTemplate

In [19]:
# 필터링돼 증강된 문장 로드
with open('filtered_augmented_sentences copy.json', 'r') as f:
    augmented_sentence = json.load(f)
print(augmented_sentence)

# 키워드 별로 RAG된 assertions들 로드
with open('keyword_assertions.json', 'r') as f:
    keyword_assertions = json.load(f)
print(keyword_assertions)

{'food': {'Turkish': {'cig kofte': ['The young man is stirring his pot of cig kofte with a wooden spoon.'], 'baklava': ['The young man is stirring his pot of baklava with a wooden spoon.', 'A group of people preparing baklava in a kitchen.']}, 'Egyptian': {'baba ghanoush': ['The young man is stirring his pot of baba ghanoush with a wooden spoon.', 'A group of people preparing baba ghanoush in a kitchen.'], 'koshari': ['The young man is stirring his pot of koshari with a wooden spoon.', 'A group of people preparing koshari in a kitchen.']}, 'Korean': {'bibimbap': ['The young man is stirring his pot of bibimbap with a wooden spoon.']}}, 'drink': {'Turkish': {'raki': ['Man in cycling clothes raki beside his bicycle.', 'a plate of food on a table next to a raki'], 'ayran': ['Man in cycling clothes ayran beside his bicycle.']}}}
{'cig kofte': 'Kofte is a Turkish dish made with ground beef and spices.', 'baklava': 'Baklava is a Turkish dessert typically served with tea or coffee. Baklava is 

In [20]:
# LangChain이 지원하는 다른 채팅 모델을 사용합니다. 여기서는 Ollama를 사용합니다.
llm = ChatOllama(model="llama2:13b")

In [21]:
# 템플릿 생성
template = """
<key_sentence> 
    {sentence}
</key_sentence> 
###
<information>
    {information}
</information>
###
<query>
    {query}
</query>
"""
prompt = PromptTemplate.from_template(template=template)

chain = prompt | llm | StrOutputParser()

In [22]:
query = "Based on the provided information, please paraphrase the given key sentence using natural expressions, adding only about few words. The expressions in the modified sentence must all be natural, and slight modifications to the key sentence are allowed. If there is no additional information to add to the key sentence, output the key sentence. Your response should include only the modified sentence."

In [23]:
import pandas as pd
from tqdm.notebook import tqdm

total = sum(len(sentences) for national_list in augmented_sentence.values() for keywords in national_list.values() for sentences in keywords.values())

df_list = []

with tqdm(total=total, desc="Overall Progress") as pbar:
    # 여기서 augmented_sentence, keyword_assertions, chain, query는 이미 정의되어 있다고 가정합니다.
    for facet, national_list in augmented_sentence.items():
        for national, keywords in national_list.items():
            for keyword, aug_sentences in keywords.items():
                for aug_sentence in aug_sentences:
                    sentence = aug_sentence
                    assertions = keyword_assertions[keyword]

                    # 이 부분은 예시로 제공된 코드의 chain.invoke 함수 호출 방식에 따라 달라질 수 있습니다.
                    # chain.invoke 함수에 대한 구체적인 정보가 없으므로, 이 코드 블록은 가상의 예시로 작성되었습니다.
                    results = chain.invoke({"sentence": sentence, "information": assertions, "query": query})                
                    final_augmented = results.split('\n')[-1]

                    df_list.append({
                        'facet': facet, 
                        'national': national, 
                        'keyword': keyword, 
                        'final_augmented': final_augmented, 
                        'result': results
                    })

                    pbar.update(1)


df = pd.DataFrame(df_list)

df.to_excel('final_augmented_sentence.xlsx', index=False)

Overall Progress:   0%|          | 0/11 [00:00<?, ?it/s]