## Llama 사용 부분

In [1]:
import json
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import CommaSeparatedListOutputParser, StrOutputParser
from langchain.prompts import PromptTemplate

In [3]:
# 필터링돼 증강된 문장 로드
with open('filtered_augmented_sentences.json', 'r') as f:
    augmented_sentence = json.load(f)
print(augmented_sentence)

# 키워드 별로 RAG된 assertions들 로드
with open('keyword_assertions.json', 'r') as f:
    keyword_assertions = json.load(f)
print(keyword_assertions)

{'food': {'Turkish': {'cig kofte': ['An open cig kofte container box with four unknown cig kofte items.', 'The young man is stirring his pot of cig kofte with a wooden spoon.', 'A man enjoys cooking cig kofte in a pan', 'A sign outside of a restaurant for Italian cig kofte.', 'A group of people preparing cig kofte in a kitchen.', 'A man smiles as he stirs his cig kofte in the pot.', 'A kitchen counter with cutting board, knife and cig kofte.', 'A married couple preparing cig kofte in a house kitchen.', 'A couple of people that are in a kitchen cooking some cig kofte.', 'A kitchen table with vegetables and cig kofte processor.', 'A woman in a purple top pulling cig kofte out of a oven', 'A woman hovering over cig kofte on a wooden table.', 'an older Asian woman fixing cig kofte on a table', 'A man standing in a kitchen preparing cig kofte.', 'A lady is preparing a table with dishes of cig kofte.', 'A man in the kitchen preparing cig kofte for dinner', 'A woman standing over a table of c

In [4]:
# LangChain이 지원하는 다른 채팅 모델을 사용합니다. 여기서는 Ollama를 사용합니다.
llm = ChatOllama(model="llama2:13b")

In [5]:
# 템플릿 생성
template = """
<key_sentence> 
    {sentence}
</key_sentence> 
###
<information>
    {information}
</information>
###
<query>
    {query}
</query>
"""
prompt = PromptTemplate.from_template(template=template)

chain = prompt | llm | StrOutputParser()

In [6]:
query = "Based on the provided information, please paraphrase the given key sentence using natural expressions, adding only about few words. The expressions in the modified sentence must all be natural, and slight modifications to the key sentence are allowed. If there is no additional information to add to the key sentence, output the key sentence. Your response should include only the modified sentence."

In [17]:
# ConnectionError: HTTPConnectionPool(host='localhost', port=11434): Max retries exceeded with url: /api/chat (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7c019666f040>: Failed to establish a new connection: [Errno 111] Connection refused'))
# 위 에러 뜨면 가상환경에서 ollama serve 치고 재시도

In [6]:
import pandas as pd
from tqdm.notebook import tqdm

total = sum(len(sentences) for national_list in augmented_sentence.values() for keywords in national_list.values() for sentences in keywords.values())

import pandas as pd
from tqdm import tqdm

# 초기 설정
df_list = []
file_count = 1
save_threshold = 1000  # 파일당 저장할 항목 수

with tqdm(total=total, desc="Overall Progress") as pbar:
    for facet, national_list in augmented_sentence.items():
        for national, keywords in national_list.items():
            for keyword, aug_sentences in keywords.items():
                for aug_sentence in aug_sentences:
                    sentence = aug_sentence
                    assertions = keyword_assertions[keyword]

                    results = chain.invoke({"sentence": sentence, "information": assertions, "query": query})                
                    final_augmented = results.split('\n')[-1]

                    df_list.append({
                        'facet': facet, 
                        'national': national, 
                        'keyword': keyword, 
                        'final_augmented': final_augmented, 
                        'result': results
                    })

                    # 진행 상태 업데이트
                    pbar.update(1)

                    # df_list의 길이가 save_threshold에 도달했는지 확인
                    if len(df_list) >= save_threshold:
                        df = pd.DataFrame(df_list)
                        # 파일 이름에 file_count를 포함해 저장
                        df.to_excel(f'final_augmented_sentence_part{file_count}.xlsx', index=False)
                        # 저장 후 df_list 초기화 및 file_count 증가
                        df_list = []
                        file_count += 1

# 남은 데이터가 있다면 마지막 파일로 저장
if df_list:
    df = pd.DataFrame(df_list)
    df.to_excel(f'final_augmented_sentence_part{file_count}.xlsx', index=False)


Overall Progress:   2%|▏         | 316/16000 [05:18<4:23:42,  1.01s/it]


KeyboardInterrupt: 

In [7]:
import pandas as pd
from tqdm.notebook import tqdm

total = sum(len(sentences) for national_list in augmented_sentence.values() for keywords in national_list.values() for sentences in keywords.values())

import pandas as pd
from tqdm import tqdm

# 초기 설정
df_list = []

with tqdm(total=total, desc="Overall Progress") as pbar:
    for facet, national_list in augmented_sentence.items():
        if facet != 'drink': # drink만 재시도하기 위해 만들어둔 조건문
            continue
        for national, keywords in national_list.items():
            for keyword, aug_sentences in keywords.items():
                for aug_sentence in aug_sentences:
                    sentence = aug_sentence
                    assertions = keyword_assertions[keyword]

                    results = chain.invoke({"sentence": sentence, "information": assertions, "query": query})                
                    final_augmented = results.split('\n')[-1]

                    df_list.append({
                        'facet': facet, 
                        'national': national, 
                        'keyword': keyword, 
                        'final_augmented': final_augmented, 
                        'result': results
                    })

                    # 진행 상태 업데이트
                    pbar.update(1)


# 파일로 저장
df = pd.DataFrame(df_list)
df.to_excel('final_augmented_sentence_drink.xlsx', index=False)


Overall Progress:  34%|███▍      | 5400/16000 [1:24:22<2:45:37,  1.07it/s]


# 만들어진 파일 병합

In [69]:
# 빈 DataFrame 생성
df = pd.DataFrame()

# 16개의 파일을 반복문을 통해 읽어와서 하나의 DataFrame으로 합치기
for i in range(1, 17):
    file_name = f'final_augmentation_sentence/final_augmented_sentence_part{i}.xlsx'
    temp_df = pd.read_excel(file_name)
    df = pd.concat([df, temp_df], ignore_index=True)

try:
    for idx, row in df.iterrows():
        # \n 기준으로 문장 분할 후 빈 문자열 제거
        split_result = str(row['result']).split('\n')
        split_result = [i.strip() for i in split_result if i != '']

        # 기준에 맞춰 aug_sentence 설정
        if split_result[0][0] == '#':
            aug_sentence = "TBD" 
        elif len(split_result) == 1:
            aug_sentence = split_result[0]
        else:
            aug_sentence = split_result[1]
            if aug_sentence[:5] == 'Based': 
                if len(split_result) > 2:
                    aug_sentence = split_result[2]
                else:
                    aug_sentence = "TBD"
        # 문장 좌우측의 쌍따옴표 제거
        aug_sentence = aug_sentence.strip('"')
    
        row['final_augmented'] = aug_sentence
    
    print(df['final_augmented'])


    df.to_excel('final_augmentation_sentence.xlsx', index=False)
except:
    print('error idx:', idx)
    print('aug_sentence:', aug_sentence)
    print('split_result:\n', split_result)

0        An open box of Turkish-style beef and spice ko...
1        The young man stirs his beef and spice stew wi...
2        A man relishes cooking juicy beef patties in a...
3        Indulge in delicious Italian-style beef and sp...
4        A group of people cooking cig kofte in a kitchen.
                               ...                        
15995    Four women in vibrant flamenco attire sit toge...
15996    A man dressed in light flamenco attire stands ...
15997    A young boy wearing a green flamenco dress her...
15998                                                  TBD
15999    A multiple-output fire hydrant in front of a s...
Name: final_augmented, Length: 16000, dtype: object
