In [36]:
import json
import re
import os

In [37]:
from langchain_community.utilities import GoogleSearchAPIWrapper
from langchain_core.tools import Tool
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import CommaSeparatedListOutputParser
from langchain.prompts import PromptTemplate

In [38]:
# 파일 위치 지정
keys_file_path = os.path.join('data', 'api_keys.txt')

# 파일에서 API 키를 로드하는 함수
def load_api_keys(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        keys = json.load(file)
    return keys

# API 키 사용
api_keys = load_api_keys(keys_file_path)

os.environ["GOOGLE_CSE_ID"] = api_keys['google_cse_id']
os.environ["GOOGLE_API_KEY"] = api_keys['google_api_key']

In [39]:
## Web search
search = GoogleSearchAPIWrapper()

tool = Tool(
    name="google_search",
    description="Search Google for recent results.",
    func=search.run,
)

## Ollama
llm = ChatOllama(model="llama2")

## CSV Parser
output_parser = CommaSeparatedListOutputParser()
format_instructions = output_parser.get_format_instructions()

In [40]:
# 템플릿 생성
template = """
Information: 
{information}
###
Question: 
{query}
###
Output format:
{format_instructions}
###
System:
{system}
"""

prompt = PromptTemplate.from_template(
    template=template,
    partial_variables={"format_instructions":  format_instructions}
    )

chain = prompt | llm | output_parser

In [41]:
# national = "Korean"
# facet = "food" # food, drink, clothing

In [52]:
national_list = ["Turkish", "Egyptian", "Korean", "Thai", "Mexican", "Spanish"]
facet_list = ["food", "drink", "clothing"]

national_keyword_dict = {}

for facet in facet_list:
    national_keyword_dict[facet] = {}
    
    for national in national_list:
        system_prompt = "Limit speaking to anything beyond what is asked. !!DO NOT SPEAK SURE!!"
        search_result = tool.run(f"{national} traditional {facet}")
        query = f"Find proper nouns of traditional {national} {facet} from 'information' and write them according to the 'output format', excluding names that are general {facet}."

        # 라마로 키워드 추출
        results = set(chain.invoke({"system": system_prompt, "information": search_result, "query": query}))

        # 출력된 내용에서 한번 더 후처리
        proc_set = set()
        for item in results:
            # 특수 문자 제거, 소문자 변환, 앞뒤 공백 제거
            cleaned_item = re.sub(r'[^a-zA-Z0-9\n\s/]', '', item).lower().strip()
            # '\n'이 있는 경우 분할하여 저장
            if '\n' in cleaned_item:
                cleaned_item_list = cleaned_item.split('\n')
                for c in cleaned_item_list:
                    proc_set.add(c.strip())
            else:
                proc_set.add(cleaned_item)
        # 키워드 길이 0~20까지만 유지
        filtered_results = [item for item in proc_set if 0 < len(item) < 20]

        national_keyword_dict[facet][national] = filtered_results

        print(national, ": ", filtered_results)

print(national_keyword_dict)

# augment_dict를 JSON 형태의 문자열로 변환합니다.
national_keyword_dict_str = json.dumps(national_keyword_dict, ensure_ascii=False, indent=4)

# 파일을 쓰기 모드로 엽니다. 'w'를 사용하여 파일이 이미 존재할 경우 내용을 덮어씁니다.
with open('national_keyword.json', 'w', encoding='utf-8') as file:
    # 변환된 JSON 문자열을 파일에 씁니다.
    file.write(national_keyword_dict_str)

Turkish :  ['kfte', 'iskender kebab', 'meze', 'yaprak dolma', 'dner kebap', 'baklava', 'inegol kofte', 'mercimek kofte', 'menemen', 'brek', 'dolma', 'manti', 'lahmacun', 'pilav']
Egyptian :  ['koshari', 'ful medames', 'kofta', 'shawarma', 'baba ganoush', 'fatteh', 'mulukhiyah', 'falafel']
Korean :  ['doenjang jjigae', 'japchae', 'dongchimi', 'mandoo', 'bulgogi', 'bibimbap']
Thai :  ['pad kra pao', 'khao pad', 'pad thai', 'som tam', 'tom yum goong', 'khao soi', 'massaman']
Mexican :  ['pozole', 'aguachile', 'barbacoa', 'mole poblano', 'carnitas', 'birria', 'chiles en nogada']
Spanish :  ['patatas bravas', 'paella', 'tortilla espaola', 'pollo a la plancha', 'gazpacho', 'gambas al ajillo']
Turkish :  ['ayran', 'rak', 'algam']
Egyptian :  ['tea', 'mineral water', 'karkade', 'coffee', 'sugarcane juice']
Korean :  ['sujeonggwa', 'sikhye', 'makgeoli', 'gamhongno', 'soju']
Thai :  ['singha thai beer', 'coco yen iced cocoa']
Mexican :  ['carajillo', 'tepache', 'paloma', 'mezcal', 'raicilla', 'm

In [45]:
## 증강할 캡션 문장 로드
with open('data/sentences.json', 'r') as f:
    sentences = json.load(f)

print(sentences)

{'food': ['The young man is stirring his pot of {national_keyword} with a wooden spoon.', 'A group of people preparing {national_keyword} in a kitchen.', 'A kitchen counter with cutting board, knife and {national_keyword}.', 'A married couple preparing {national_keyword} in a house kitchen.', 'Several people are sitting around an outdoor table eating a {national_keyword}.', 'A man sitting at a table having a {national_keyword}.', 'A plate of {national_keyword} in containers is on a tray.', 'a close up of a few plates of {national_keyword} on a table', '{national_keyword} is served on a plate near a vase.', 'A man sitting at a table with a large plate of {national_keyword} on it.'], 'drink': ['Man in cycling clothes {national_keyword} beside his bicycle.', 'a plate of food on a table next to a {national_keyword}', 'Friends having a {national_keyword} together in the kitchen', 'Women are preparing {national_keyword} for themselves in the kitchen.', 'many people at tables with {national_k

In [48]:
facet = "food"
augment_dict = {facet: {}}

for sentence in sentences[facet]:
    for national, keywords in national_keyword_dict[facet].items():
        if national not in augment_dict[facet]:
            augment_dict[facet][national] = []
        for keyword in keywords[:3]:      
            augmented_sentence = sentence.format(national_keyword=keyword)
            print(augmented_sentence)
            augment_dict[facet][national].append(augmented_sentence)

# augment_dict를 JSON 형태의 문자열로 변환합니다.
augment_dict_str = json.dumps(augment_dict, ensure_ascii=False, indent=4)

# 파일을 쓰기 모드로 엽니다. 'w'를 사용하여 파일이 이미 존재할 경우 내용을 덮어씁니다.
with open('augmented_sentences.json', 'w', encoding='utf-8') as file:
    # 변환된 JSON 문자열을 파일에 씁니다.
    file.write(augment_dict_str)

The young man is stirring his pot of gzleme with a wooden spoon.
The young man is stirring his pot of baklava with a wooden spoon.
The young man is stirring his pot of piyaz with a wooden spoon.
The young man is stirring his pot of kebab with a wooden spoon.
The young man is stirring his pot of menemen with a wooden spoon.
The young man is stirring his pot of kofta with a wooden spoon.
The young man is stirring his pot of ful medames with a wooden spoon.
The young man is stirring his pot of shawarma with a wooden spoon.
The young man is stirring his pot of baba ganoush with a wooden spoon.
The young man is stirring his pot of fatteh with a wooden spoon.
The young man is stirring his pot of doenjang jjigae with a wooden spoon.
The young man is stirring his pot of japchae with a wooden spoon.
The young man is stirring his pot of rice cakes/tteok with a wooden spoon.
The young man is stirring his pot of juk with a wooden spoon.
The young man is stirring his pot of hotteok with a wooden sp