In [1]:
import os
import json
import re
from tqdm import tqdm

In [2]:
# Location of the API keys file (path relative to the working directory)
keys_file_path = os.path.join('data', 'api_keys.txt')

def load_api_keys(file_path):
    """Read the API keys stored as JSON in ``file_path``.

    Args:
        file_path: Path to a UTF-8 encoded JSON file.

    Returns:
        The object decoded from the file's JSON content.
    """
    with open(file_path, encoding='utf-8') as handle:
        return json.loads(handle.read())

# Load the API keys and pick out the entry stored under 'pinecone_key1'
api_keys = load_api_keys(keys_file_path)
pinecone_keys = api_keys['pinecone_key1']

In [3]:
from pinecone import Pinecone, ServerlessSpec

# Connect to Pinecone and select the index
pc = Pinecone(api_key=pinecone_keys)
index = pc.Index('minilm')

# Last expression of the cell: its return value is shown as the cell output
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.41754,
 'namespaces': {'': {'vector_count': 41754}},
 'total_vector_count': 41754}

In [4]:
file_path = os.path.join(os.getcwd(), "data", "processed_data.txt")

# Collect every line of the file, stripped of surrounding whitespace
sentences = []
with open(file_path, "r", encoding="utf-8") as file:
    sentences.extend(map(str.strip, file))

In [5]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model; its encode() output is what gets sent to index.query
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# 벡터 DB 업로드 부분

In [6]:
# `sentences` is the list of input sentences loaded above.
# Encode them in a single batched call instead of one encode() call per
# sentence; the model batches internally and shows its own progress bar.
# list(...) keeps `embedded_sentences` a list of per-sentence vectors, so
# later cells can still index it and call .tolist() on each entry.
embedded_sentences = list(model.encode(sentences, show_progress_bar=True))

print(embedded_sentences[0])  # embedding of the first sentence (all sentences are embedded)

100%|██████████| 186/186 [00:02<00:00, 81.53it/s]

[ 3.11310752e-03 -1.56439617e-02  5.08689322e-02  1.52268374e-04
 -1.67714953e-02 -2.43715756e-02  4.28456999e-02 -6.47027120e-02
  3.78207788e-02 -2.82120947e-02  1.98299158e-02 -5.90517521e-02
 -3.56922857e-02 -2.47333087e-02 -1.91530194e-02 -1.31397828e-01
 -1.82549655e-02 -1.22621441e-02 -1.36688203e-02 -9.34328660e-02
  1.46321645e-02  2.18940224e-03  3.63208801e-02 -2.29838751e-02
  1.04088530e-01  4.17623436e-03  1.18183769e-01  2.40357127e-02
  5.30165993e-02  5.28994761e-02  1.07629538e-01  7.27605773e-03
 -9.59043652e-02 -1.77922398e-02 -5.71395867e-02 -2.48129833e-02
 -1.59236938e-02  6.53056726e-02 -3.06698177e-02  1.14739798e-02
 -2.08299793e-02 -1.98709555e-02  1.04911640e-01 -4.63728141e-03
  4.30373959e-02  9.54070129e-03 -4.88798656e-02 -8.10768902e-02
  8.56835581e-03 -4.16736156e-02 -8.08678120e-02 -3.28446589e-02
  5.54848760e-02  2.29463130e-02  4.26597521e-02 -2.31313426e-02
  3.42331380e-02  8.57734680e-02 -1.16089091e-03  5.46230525e-02
 -1.00753181e-01  1.86795




In [7]:
# Combined assertion text retrieved for each sentence, in input order
all_assertions = []

# Query the index once per embedded sentence (the loop index was never used,
# so the sentences are iterated directly)
for encoded_sentence in tqdm(embedded_sentences):
    matches = index.query(
        vector=encoded_sentence.tolist(),
        top_k=5,  # number of matches to retrieve per sentence; adjust as needed
        include_values=False,
        include_metadata=True,
    )

    # Pull the stripped 'assertion' metadata text out of every match
    assertions_list = [match['metadata']['assertion'].strip() for match in matches['matches']]

    # Store one space-joined string per input sentence
    all_assertions.append(' '.join(assertions_list))

# Show the combined assertions for all sentences
print(all_assertions)

  5%|▌         | 10/186 [00:19<05:43,  1.95s/it]


KeyboardInterrupt: 

# 데이터 증강 부분

In [6]:
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import CommaSeparatedListOutputParser
from langchain.prompts import PromptTemplate

In [7]:
# Use another chat model supported by LangChain; here, Ollama.
llm = ChatOllama(model="llama2")

## CSV Parser
# Parser for comma-separated list output, plus the format instructions it provides
output_parser = CommaSeparatedListOutputParser()
format_instructions = output_parser.get_format_instructions()

In [8]:
# Build the prompt template.
# Placeholders: {information}, {query}, {format_instructions}, {system}.
template = """
Information: 
{information}
###
Question: 
{query}
###
Output format:
{format_instructions}
###
System:
{system}
"""

# format_instructions is bound up front as a partial variable
prompt = PromptTemplate.from_template(
    template=template,
    partial_variables={"format_instructions":  format_instructions}
    )

# Pipeline: prompt -> chat model -> comma-separated list parser
chain = prompt | llm | output_parser

In [9]:
# Target country and facet; both are interpolated into the retrieval query
# text and into the question sent to the LLM
national = "Korea"
facet = "food" # food, drink, clothing

In [11]:
# Embed the "<national> traditional <facet>" search phrase
sentence = model.encode(f"{national} traditional {facet}")

# Retrieve the 20 closest entries, metadata only (no vector values)
matches = index.query(
    vector=sentence.tolist(),
    top_k=20,
    include_values=False,
    include_metadata=True,
)
print(matches)

# Gather the stripped 'assertion' text of each match, then join with spaces
assertions_list = []
for match in matches['matches']:
    assertions_list.append(match['metadata']['assertion'].strip())
assertions = ' '.join(assertions_list)

print(assertions)

{'matches': [{'id': '10097',
              'metadata': {'assertion': 'Bulgogi is a Korean dish that is '
                                        'quite popular, both with Koreans and '
                                        'foreigners.\n'},
              'score': 0.922656238,
              'values': []},
             {'id': '10085',
              'metadata': {'assertion': 'Bulgogi is a popular Korean dish that '
                                        'is sweet and savory.\n'},
              'score': 0.921311736,
              'values': []},
             {'id': '10056',
              'metadata': {'assertion': 'Bulgogi is a Korean dish made of '
                                        'grilled, marinated meat, typically '
                                        'beef.\n'},
              'score': 0.867816865,
              'values': []},
             {'id': '10169',
              'metadata': {'assertion': 'Korean bulgogi is a dish made with '
                                        "be

In [12]:
# Instruction text passed to the chain under the "system" key
system_prompt = "Limit speaking to anything beyond what is asked. !!DO NOT SPEAK SURE!!"
# Question asking the model to extract proper nouns for the chosen country and facet
query = f"Find proper nouns of traditional {national} {facet} from 'information' and write them according to the 'output format', excluding names that are general {facet}."

In [16]:
# Run the chain and de-duplicate the items it returns
chain_inputs = {"system": system_prompt, "information": assertions, "query": query}
results = set(chain.invoke(chain_inputs))

# Show the retrieved text and the raw results, each followed by a blank line
print(assertions, results, sep="\n\n", end="\n\n")

# Normalised keywords collected from the raw results
proc_set = set()

for raw_item in results:
    # Drop special characters, lower-case, and trim surrounding whitespace
    normalized = re.sub(r'[^a-zA-Z0-9\n\s]', '', raw_item).lower().strip()

    # Store each line separately; split('\n') yields the whole string
    # when it contains no newline
    proc_set.update(piece.strip() for piece in normalized.split('\n'))

print(proc_set)

# Keep only non-empty keywords shorter than 20 characters
filtered_results = set(filter(lambda keyword: 0 < len(keyword) < 20, proc_set))

print(filtered_results)

Bulgogi is a Korean dish that is quite popular, both with Koreans and foreigners. Bulgogi is a popular Korean dish that is sweet and savory. Bulgogi is a Korean dish made of grilled, marinated meat, typically beef. Korean bulgogi is a dish made with beef, and is one of Korea's most popular dishes. Korean food is healthy and delicious. The word "bap" in Korean refers to rice or a meal, and is the staple food for most Koreans. Korean food is typically very flavorful, and tteokbokki is one of the most popular dishes. The food is a combination of Korean and Mexican flavors. Korean food is known for its health benefits. Korean food is delicious and affordable. Bulgur is a whole grain, widely used in Middle Eastern cuisine and rich in fiber. Kimchi is a fermented vegetable dish from Korea. Buuz is a traditional Mongolian dish that is similar to a steamed meatball wrapped in a noodle. The staple food of Korea is rice. Korean cuisine is characterized by its many side dishes. Bulgur is a key in

In [None]:
## Load the caption sentences to augment.
# Read explicitly as UTF-8, like the other file reads in this notebook, so the
# result does not depend on the platform's default encoding.
# NOTE: this rebinds `sentences` (earlier: the list of lines read from
# processed_data.txt) to the object decoded from the JSON file, which the
# next cell indexes by facet.
with open('data/sentences.json', 'r', encoding='utf-8') as f:
    sentences = json.load(f)

print(sentences)

{'food': ['The young man is stirring his pot of {national_keyword} with a wooden spoon.', 'A group of people preparing {national_keyword} in a kitchen.', 'A kitchen counter with cutting board, knife and {national_keyword}.', 'A married couple preparing {national_keyword} in a house kitchen.', 'Several people are sitting around an outdoor table eating a {national_keyword}.', 'A man sitting at a table having a {national_keyword}.', 'A plate of {national_keyword} in containers is on a tray.', 'a close up of a few plates of {national_keyword} on a table', '{national_keyword} is served on a plate near a vase.', 'A man sitting at a table with a large plate of {national_keyword} on it.'], 'drink': ['Man in cycling clothes {national_keyword} beside his bicycle.', 'a plate of food on a table next to a {national_keyword}', 'Friends having a {national_keyword} together in the kitchen', 'Women are preparing {national_keyword} for themselves in the kitchen.', 'many people at tables with {national_k

In [None]:
# Fill every caption template of the chosen facet with each extracted keyword.
# sorted() gives a stable keyword order: iterating the set directly can vary
# between kernel sessions because string hashing is randomized per process.
keywords = sorted(filtered_results)

# Keep the generated captions instead of only printing them
augmented_sentences = []
for sentence in sentences[facet]:
    for result in keywords:
        augmented_sentence = sentence.format(national_keyword=result)
        augmented_sentences.append(augmented_sentence)
        print(augmented_sentence)