## Llama 사용 부분

In [8]:
import json
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import CommaSeparatedListOutputParser, StrOutputParser
from langchain.prompts import PromptTemplate

In [9]:
# Load the filtered, augmented sentences.
# JSON text is UTF-8, so decode it explicitly instead of depending on the
# platform's default locale encoding (which differs between machines).
with open('filtered_augmented_sentences.json', 'r', encoding='utf-8') as f:
    augmented_sentence = json.load(f)
print(augmented_sentence)

# Load the assertions retrieved (via RAG) for each keyword.
with open('keyword_assertions.json', 'r', encoding='utf-8') as f:
    keyword_assertions = json.load(f)
print(keyword_assertions)

{'cig kofte': 'Kofte is a Turkish dish made with ground beef and spices.', 'baklava': 'Baklava is a Turkish dessert typically served with tea or coffee. Baklava is a Middle Eastern dessert that is made of nuts, phyllo dough, and a sweet syrup. Baklava is a popular dessert in Lebanon, Greece, and Turkey.', 'cig borek': '', 'dolma': 'Dolma is a Turkish dish made of grape leaves stuffed with rice and other ingredients. Dolma is a popular dish throughout the Middle East, Central Asia, Eastern Europe, and middle east. Dolma is a Turkish dish consisting of stuffed grape leaves.', 'manti': 'Manti is a Turkish dish made of boiled ravioli-like pasta, and is popular for its home-cooked taste. The Turkish dish of manti, which are beef dumplings, remains popular and is often made at home.', 'baba ghanoush': '', 'koshari': 'Kasha is a type of grain that is popular in Eastern European and Russian cuisine.', 'fatteh': '', 'shawarma': 'Shawarma is a Middle Eastern dish that is made with sliced seasone

In [10]:
# Use one of the other chat models LangChain supports; here that is Ollama.
ollama_model_name = "llama2:13b"
llm = ChatOllama(model=ollama_model_name)

In [11]:
# Build the prompt template.
# The text is spelled out line by line so that every newline and the trailing
# space after the two key_sentence tags are visible in the source.
template = (
    "\n"
    "<key_sentence> \n"
    "    {sentence}\n"
    "</key_sentence> \n"
    "###\n"
    "<information>\n"
    "    {information}\n"
    "</information>\n"
    "###\n"
    "<query>\n"
    "    {query}\n"
    "</query>\n"
)
prompt = PromptTemplate.from_template(template=template)

# Pipeline: fill the prompt, send it to the chat model, return the reply as a plain string.
chain = prompt | llm | StrOutputParser()

In [12]:
# Instruction passed as the {query} slot of the prompt template.
query = (
    "Based on the provided information, please paraphrase the given key "
    "sentence using natural expressions, adding only about few words. "
    "The expressions in the modified sentence must all be natural, and "
    "slight modifications to the key sentence are allowed. "
    "If there is no additional information to add to the key sentence, "
    "output the key sentence. "
    "Your response should include only the modified sentence."
)

In [13]:
# ConnectionError: HTTPConnectionPool(host='localhost', port=11434): Max retries exceeded with url: /api/chat (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7c019666f040>: Failed to establish a new connection: [Errno 111] Connection refused'))
# If the error above appears, run `ollama serve` in the virtual environment and retry.

In [15]:
import pandas as pd
from tqdm import tqdm


def _last_line(text):
    """Return the last line of *text*, ignoring surrounding whitespace.

    The reply is stripped first so that a trailing newline does not make
    the "last line" an empty string.
    """
    return text.strip().rsplit('\n', 1)[-1].strip()


def _save_part(rows, part_number):
    """Write *rows* (a list of dicts) to the numbered Excel part file.

    Returns the DataFrame that was written.
    """
    frame = pd.DataFrame(rows)
    frame.to_excel(f'final_augmented_sentence_part{part_number}.xlsx', index=False)
    return frame


# Total number of sentences to process; augmented_sentence is iterated below
# as facet -> national -> keyword -> list of sentences.
total = sum(
    len(sentences)
    for national_list in augmented_sentence.values()
    for keywords in national_list.values()
    for sentences in keywords.values()
)

# Initial state.
df_list = []            # rows buffered since the last part file was written
file_count = 1          # number used in the next part file's name
save_threshold = 1000   # rows per part file

try:
    with tqdm(total=total, desc="Overall Progress") as pbar:
        for facet, national_list in augmented_sentence.items():
            for national, keywords in national_list.items():
                for keyword, aug_sentences in keywords.items():
                    # A keyword with no entry is treated like one with no
                    # retrieved information (an empty string) rather than
                    # aborting the whole run with a KeyError.
                    assertions = keyword_assertions.get(keyword, "")

                    for aug_sentence in aug_sentences:
                        results = chain.invoke({
                            "sentence": aug_sentence,
                            "information": assertions,
                            "query": query,
                        })

                        df_list.append({
                            'facet': facet,
                            'national': national,
                            'keyword': keyword,
                            # Only the last line of the reply is kept as the sentence.
                            'final_augmented': _last_line(results),
                            'result': results,
                        })

                        # Advance the progress bar.
                        pbar.update(1)

                        # Once enough rows are buffered, write them out as
                        # the next part file and start a fresh buffer.
                        if len(df_list) >= save_threshold:
                            df = _save_part(df_list, file_count)
                            df_list = []
                            file_count += 1
finally:
    # Write whatever is still buffered as the last part file. Doing this in
    # `finally` also preserves the rows generated so far if the run is
    # interrupted or a model call fails.
    if df_list:
        df = _save_part(df_list, file_count)


Overall Progress: 100%|██████████| 16000/16000 [4:07:32<00:00,  1.08it/s]  
