In [1]:
import json
import re
import os
import tqdm

In [2]:
from langchain_community.utilities import GoogleSearchAPIWrapper
from langchain_core.tools import Tool
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import CommaSeparatedListOutputParser
from langchain.prompts import PromptTemplate

In [3]:
# 파일 위치 지정
keys_file_path = os.path.join('data', 'api_keys.txt')

# 파일에서 API 키를 로드하는 함수
def load_api_keys(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        keys = json.load(file)
    return keys

# API 키 사용
api_keys = load_api_keys(keys_file_path)

os.environ["GOOGLE_CSE_ID"] = api_keys['google_cse_id']
os.environ["GOOGLE_API_KEY"] = api_keys['google_api_key']
os.environ['HUGGINGFACEHUB_API_TOKEN'] = api_keys['huggingface_key']

KeyError: 'huggingface_key'

In [4]:
## Web search
search = GoogleSearchAPIWrapper()

tool = Tool(
    name="google_search",
    description="Search Google for recent results.",
    func=search.run,
)

## Ollama
llm = ChatOllama(model="llama2:13b")

## CSV Parser
output_parser = CommaSeparatedListOutputParser()
format_instructions = output_parser.get_format_instructions()

  warn_deprecated(


In [5]:
# 템플릿 생성
template = """
Information: 
{information}
###
Question: 
{query}
###
Output format:
{format_instructions}
###
System:
{system}
"""

prompt = PromptTemplate.from_template(
    template=template,
    partial_variables={"format_instructions":  format_instructions}
    )

chain = prompt | llm | output_parser

In [9]:
# national = "Korean"
# facet = "food" # food, drink, clothing
print(tool.run("Turkish traditional food"))

Apr 13, 2024 ... 10 Traditional Turkish foods you must try in Turkey · Baklava · Cig Borek · Guvec · Gozleme · Imam Bayaldi · Manti · Menemen · Pide. ADD +. Turkish foods: 23 delicious dishes · Piyaz · Ezogelin corba · Saksuka · Kisir · Mercimek kofte · Yaprak dolma · Inegol kofte · Iskender kebab. Typical Turkish food, beyond kebabs and baklavas · Baklava · Döner kebap · Lahmacun · Dolma · Meze · Börek · Menemen · Pilav; Manti; Köfte; Mercimek ... Doner Kebab Recipe At Home! · Turkish Easy Lamb Kabab With Vegetables In The Oven · Traditional Turkish Dinner Menu | 7 Recipes And Planning Guide · Turkish ... Feb 10, 2024 ... What Is Turkish Mantı Made of? Turkish Mantı is a traditional Turkish dish that consists of small, handmade dumplings filled with seasoned ... Feb 3, 2024 ... What are the Traditional Turkish foods? · Kebab: A Culinary Icon · Köfte: The Comforting Classic · Meze: The Spirit of Turkish Gastronomy · Meat ... Oct 22, 2022 ... What Turkish-style foods are considered "Tra

In [7]:
import re
import json

national_list = ["Turkish", "Egyptian", "Korean", "Thai", "Mexican", "Spanish"]
facet_list = ["food", "drink", "clothing"]

# 각 facet에 대한 일반적인 단어 리스트
general_terms = {
    "food": ["bread", "rice", "soup", "salad"],
    "drink": ["water", "coffee", "tea", "milk"],
    "clothing": ["shirt", "pants", "dress", "shoes"]
}

national_keyword_dict = {}

for facet in facet_list:
    national_keyword_dict[facet] = {}
    
    for national in national_list:
        system_prompt = "Limit speaking to anything beyond what is asked. !!DO NOT SPEAK SURE!!"
        search_result = tool.run(f"{national} traditional {facet}")
        
        # 일반적인 단어 리스트를 문자열로 변환
        general_terms_str = ', '.join(general_terms[facet])
        query = f"Extract proper nouns of traditional {national} {facet} from the provided information. Exclude common {facet} terms such as {general_terms_str}. Provide the names according to the output format and ensure they are specific to {national} culture."

        # 라마로 키워드 추출
        results = set(chain.invoke({"system": system_prompt, "information": search_result, "query": query}))

        # 출력된 내용에서 한번 더 후처리
        proc_set = set()
        for item in results:
            # 특수 문자 제거, 소문자 변환, 앞뒤 공백 제거
            cleaned_item = re.sub(r'[^a-zA-Z0-9\n\s/]', '', item).lower().strip()
            # '\n'이 있는 경우 분할하여 저장
            if '\n' in cleaned_item:
                cleaned_item_list = cleaned_item.split('\n')
                for c in cleaned_item_list:
                    proc_set.add(c.strip())
            else:
                proc_set.add(cleaned_item)
        # 키워드 길이 0~20까지만 유지
        filtered_results = [item for item in proc_set if 0 < len(item) < 20]

        national_keyword_dict[facet][national] = filtered_results

        print(national, ": ", filtered_results)

print(national_keyword_dict)

# augment_dict를 JSON 형태의 문자열로 변환합니다.
national_keyword_dict_str = json.dumps(national_keyword_dict, ensure_ascii=False, indent=4)

# 파일을 쓰기 모드로 엽니다. 'w'를 사용하여 파일이 이미 존재할 경우 내용을 덮어씁니다.
with open('national_keyword.json', 'w', encoding='utf-8') as file:
    # 변환된 JSON 문자열을 파일에 씁니다.
    file.write(national_keyword_dict_str)


OllamaEndpointNotFoundError: Ollama call failed with status code 404. Maybe your model is not found and you should pull the model with `ollama pull llama2:13b`.

# 여기서부터는 데이터 증강 코드

In [32]:
## 증강할 캡션 문장 로드
with open('data/sentences.json', 'r') as f:
    sentences = json.load(f)

print(sentences)

with open('filtered_national_keyword.json', 'r') as f:
    national_keyword_dict = json.load(f)

print(national_keyword_dict)

{'food': {'Turkish': ['cig kofte', 'baklava', 'cig borek', 'dolma', 'manti'], 'Egyptian': ['baba ghanoush', 'koshari', 'fatteh', 'shawarma', 'mulukhiyah'], 'Korean': ['bibimbap', 'bulgogi', 'dongchimi', 'jjajangmyeon', 'kimchi'], 'Thai': ['tom kha gai', 'khao soi', 'som tam', 'pad kra pao moo', 'massaman curry'], 'Mexican': ['carnitas', 'chiles en nogada', 'gorditas', 'mole poblano', 'chilaquiles'], 'Spanish': ['gazpacho', 'patatas bravas', 'tortilla espaola', 'pulpo a la gallega', 'gambas al ajillo']}, 'drink': {'Turkish': ['raki', 'ayran', 'algam'], 'Egyptian': ['Tamer Hindi', 'sugarcane juice', 'sobia', 'karkade'], 'Korean': ['makgeolli', 'dongdongju', 'soju', 'maesilju', 'sikhye'], 'Thai': ['nam anchan', 'grass jelly drink', 'oliang', 'nam matoom', 'cha yen'], 'Mexican': ['margarita', 'carajillo', 'el pajarete', 'mezcal', 'paloma'], 'Spanish': ['licor de hierbas', 'horchata', 'cava', 'rioja', 'tinto de verano']}, 'clothing': {'Turkish': ['abaya', 'yelek', 'entari'], 'Egyptian': ['k

In [33]:
facet = ["food", "drink", "clothing"]
augment_dict = {}

for _facet in facet:
    augment_dict[_facet] = {}
    for sentence in sentences[_facet]:
        for national, keywords in national_keyword_dict[_facet].items():
            if national not in augment_dict[_facet]:
                augment_dict[_facet][national] = {}
            for keyword in keywords:      
                if keyword not in augment_dict[_facet][national]:
                    augment_dict[_facet][national][keyword] = []

                augmented_sentence = sentence.format(national_keyword=keyword)
                augment_dict[_facet][national][keyword].append(augmented_sentence)

# augment_dict를 JSON 형태의 문자열로 변환합니다.
augment_dict_str = json.dumps(augment_dict, ensure_ascii=False, indent=4)

# 파일을 쓰기 모드로 엽니다. 'w'를 사용하여 파일이 이미 존재할 경우 내용을 덮어씁니다.
with open('filtered_augmented_sentences.json', 'w', encoding='utf-8') as file:
    # 변환된 JSON 문자열을 파일에 씁니다.
    file.write(augment_dict_str)

In [34]:
total_augmented_size = 0
for facet, national in augment_dict.items():
    for _national in national:
        for keyword in augment_dict[facet][_national].values():
            total_augmented_size += len(keyword)

print(total_augmented_size) # 증강된 문장 수

16000
