# 實驗

In [1]:
import configparser
import os

from dotenv import load_dotenv

# Load the environment variables used below (path_2_config, environment)
# from the local .env file.
load_dotenv(".env")

CONFIG = configparser.ConfigParser()
PATH_CONFIG = os.getenv('path_2_config')
if not PATH_CONFIG:
    # Without this guard CONFIG.read(None) fails with an unhelpful TypeError.
    raise RuntimeError("environment variable 'path_2_config' is not set (check .env)")

ENVIRONMENT = os.getenv('environment')
# UTF-8 is only forced when the environment is declared as "windows";
# otherwise ConfigParser falls back to the platform default encoding.
if ENVIRONMENT == "windows":
    _loaded_config_files = CONFIG.read(PATH_CONFIG, encoding='utf-8')
else:
    _loaded_config_files = CONFIG.read(PATH_CONFIG)
if not _loaded_config_files:
    # ConfigParser.read() silently skips files it cannot open and returns the
    # list of files it did parse; an empty list means nothing was loaded and
    # the lookup below would otherwise die with a bare KeyError.
    raise FileNotFoundError(f"config file could not be read: {PATH_CONFIG}")

DEBUGGER = CONFIG["DEBUGGER"]["DEBUGGER"]

## 超參數設定


### 停止條件

In [2]:
PATH_STOP_FILE = os.path.join(os.getcwd(), 'stop_true.txt') # location of the file whose existence triggers a manual early stop
NUM_NEW_PROMPT = 20   # must be a multiple of prompt_population
STOP_SCORE=100  # termination score for the evolution loop (the author set it above the score the baseline reaches after 100 questions; tune it to your data size)

### 路徑

In [3]:
from pathlib import Path

# Root folder for experiment records, taken from the [datapath] section of the config.
folder_record = Path(CONFIG["datapath"]["record_folder"])
name_experiment = "experiment_test" # this is the main line to change between experiments
# Folder that the test_* functions below write their result files into.
FOLDER_EXPERIMENT = folder_record / name_experiment

## 製作初始化資料

**只有要建立新資料的時候才需要跑**

In [4]:
from main import init_setting

# Model choices passed to init_setting; the commented lines are alternative
# embedding settings (None appears to mean "no embedding model" — see get_init).
type_llm = "Breeze"
# type_embedding = "multi-qa-mpnet-base-dot-v1"
type_embedding = "bgem3"
# type_embedding = None

# Source of the initial prompts. NOTE(review): this is a machine-specific
# absolute path; the commented config lookup is the portable alternative.
# path_prompt = CONFIG["datapath"]["init_os_prompt"]
# path_prompt = "D:\\實習\\evoprompt\\Ress\\dataset\\init_os_prompt_corpus_2024_0724_1357.json"
path_prompt = "D:\\實習\\evoprompt\\Ress\\dataset\\init_os_prompt_corpus.json"

path_data = CONFIG["datapath"]["Final_Quality"]
print(f"{path_data=}")

# Builds the models, the dataset and the scored initial prompt population.
# The recorded output of this cell shows roughly 35 minutes of progress bars,
# so only run it when a new initial record is really needed.
ttl_model, ttl_dataset, ttl_pair_os_prompt_scores = init_setting(type_llm, type_embedding, path_data, path_prompt)

  from .autonotebook import tqdm as notebook_tqdm


目前使用模型為:Breeze


100%|██████████| 30/30 [06:43<00:00, 13.44s/it]
100%|██████████| 7/7 [01:37<00:00, 13.89s/it]
100%|██████████| 30/30 [07:27<00:00, 14.90s/it]
100%|██████████| 7/7 [01:40<00:00, 14.35s/it]
100%|██████████| 30/30 [06:49<00:00, 13.65s/it]
100%|██████████| 7/7 [01:36<00:00, 13.77s/it]
100%|██████████| 30/30 [07:32<00:00, 15.07s/it]
100%|██████████| 7/7 [01:42<00:00, 14.64s/it]


In [5]:
import json
from utils.tools import get_file_name, time_now

# Persist the freshly built initial data as a timestamped JSON record.
t = time_now()
# NOTE(review): machine-specific absolute output folder.
path_folder = "D:\\實習\\evoprompt\\Ress\\init_prompt"
path_file = f"{path_folder}/{t}_20data.json"
data = {
    "corpus": get_file_name(path_data),
    "type_llm": type_llm,
    "type_embedding": type_embedding,
    # The key is spelled "prompt_popularion" on purpose here: the test_*
    # functions read the record back using exactly this spelling.
    "prompt_popularion": ttl_pair_os_prompt_scores,
    "data": ttl_dataset
}
with open(path_file, 'w') as file:
    json.dump(data, file, indent=4)

## 初始化

In [4]:
import json

from utils.call_model.embedding import Encoder
from utils.call_model.llm import LLM

def get_init(path_init=None):
    """Load an initialisation record and build the models it names.

    Args:
        path_init: Path of the JSON record to load. When None, a hard-coded
            default record is used (machine-specific absolute path).

    Returns:
        A tuple ``(ttl_model, record_init)`` where ``ttl_model`` is
        ``(llm, embedding_model)`` and ``record_init`` is the parsed JSON
        record. ``embedding_model`` is None when the record's
        "type_embedding" entry is null.
    """
    # Identity check instead of ``== None``: the PEP 8 way to test for None,
    # and immune to objects that override ``__eq__``.
    if path_init is None:
        path_init = "D:\\實習\\evoprompt\\Ress\\init_prompt\\2024_0726_1212_data_30-7.json"
        # path_init = "D:\\實習\\evoprompt\\Ress\\init_prompt\\test.json"
    with open(path_init, 'r') as file:
        record_init = json.load(file)

    # Select the LLM named in the record.
    llm = LLM(record_init["type_llm"])

    # Select the embedding model named in the record (optional).
    if record_init["type_embedding"] is None:
        embedding_model = None
    else:
        embedding_model = Encoder(record_init["type_embedding"])

    ttl_model = (llm, embedding_model)

    return ttl_model, record_init

## ReSS

In [5]:
import json
from utils.ReSS import ReSS
from utils.tools import time_now

def test_ReSS():
    """Run the ReSS search from the stored initial population and save the result.

    The population returned by every ReSS call is appended to an in-memory
    record; a single JSON file "<time>_ReSS.json" containing that record and
    the best entry is written to FOLDER_EXPERIMENT once the loop has ended.

    The loop stops as soon as the best train_score reaches STOP_SCORE, the
    round budget is used up, or the manual stop file PATH_STOP_FILE exists.

    NOTE: the dataset is currently cut down to the first element of each split
    and the round budget is 1, i.e. this is a smoke-test configuration.
    """
    # Create the output folder up front. Without this the final open() fails
    # with FileNotFoundError on a fresh experiment name, after the whole
    # search has already been paid for.
    FOLDER_EXPERIMENT.mkdir(parents=True, exist_ok=True)

    ttl_model, record_init = get_init()
    corpus = record_init["corpus"]
    type_llm = record_init["type_llm"]
    type_embedding = record_init["type_embedding"]
    ttl_pair_os_prompt_scores = record_init["prompt_popularion"]
    ttl_dataset = record_init["data"]

    # Keep only the first sample of each split for a quick run.
    ttl_dataset = {
        "train_split": ttl_dataset["train_split"][:1],
        "dev_split": ttl_dataset["dev_split"][:1]
    }

    # stop_run_num = NUM_NEW_PROMPT   # or set a number of rounds as the stop condition (originally this ran indefinitely, hence no for loop)
    stop_run_num = 1

    record_population = []

    # record_population.append(ttl_pair_os_prompt_scores)
    # Best entry (highest train_score) first.
    sorted_pair = sorted(ttl_pair_os_prompt_scores,  key=lambda x: x['train_score'],  reverse=True)
    while(
        sorted_pair[0]['train_score']<STOP_SCORE
        and stop_run_num>0
        and not os.path.exists(PATH_STOP_FILE)   # manual early stop
    ):
        new_population = ReSS(ttl_model, ttl_dataset, sorted_pair)

        record_population.append(new_population)
        sorted_pair = sorted(new_population,  key=lambda x: x['train_score'],  reverse=True)

        stop_run_num -= 1

    # Save the result.
    t = time_now()
    file_path = FOLDER_EXPERIMENT / f"{t}_ReSS.json"
    data = {
        "corpus": corpus,
        "type_llm": type_llm,
        "type_embedding": type_embedding,
        "best_promt": sorted_pair[0],
        "record": record_population
    }
    with open(file_path, 'w') as file:
        json.dump(data, file, indent=4)
    print(f"\n\n\nthe result is saved at:\n{file_path}")

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Run the ReSS experiment; increase the range to repeat it several times.
for i in range(1):
    test_ReSS()

100%|██████████| 1/1 [00:02<00:00,  2.09s/it]
100%|██████████| 1/1 [00:18<00:00, 18.57s/it]




the result is saved at:
D:\實習\evoprompt\Ress\record\experiment_test\2024_0726_1653_ReSS.json





## EvoDE

In [5]:
import json
from pathlib import Path
from utils.EvoPrompt import EvoDE
from utils.tools import time_now

def test_EvoDE():
    """Evolve the stored initial population with EvoDE, saving after every round.

    A folder "<time>_Evo" is created under FOLDER_EXPERIMENT. After each EvoDE
    call (one update of the whole population) the ranked population and its
    best entry are written to "<round>_EvoDE.json" inside that folder.

    The loop ends when the best train_score reaches STOP_SCORE, when
    int(NUM_NEW_PROMPT/4) rounds have been run, or when PATH_STOP_FILE exists.
    """
    run_dir = FOLDER_EXPERIMENT / f"{time_now()}_Evo"
    run_dir.mkdir(parents=True, exist_ok=True)

    models, init_record = get_init()
    # Fields copied unchanged into every result file.
    header = {
        "corpus": init_record["corpus"],
        "type_llm": init_record["type_llm"],
        "type_embedding": init_record["type_embedding"],
    }
    initial_population = init_record["prompt_popularion"]
    dataset = init_record["data"]

    # num_test = 1
    # dataset = {
    #     "train_split": dataset["train_split"][:num_test],
    #     "dev_split": dataset["dev_split"][:num_test]
    # }

    def ranked(population):
        # Highest train_score first; entries with equal scores keep their order.
        ordered = list(population)
        ordered.sort(key=lambda pair: pair['train_score'], reverse=True)
        return ordered

    # Round budget (originally the loop was left to run indefinitely, hence a
    # while loop with a counter rather than a for loop).
    total_rounds = int(NUM_NEW_PROMPT/4)
    rounds_left = total_rounds
    # rounds_left = 1

    # In-memory history of every population; it is not written to disk.
    history = [initial_population]
    leaderboard = ranked(initial_population)
    while (
        leaderboard[0]['train_score'] < STOP_SCORE
        and rounds_left > 0
        and not os.path.exists(PATH_STOP_FILE)   # manual early stop
    ):
        offspring = EvoDE(models, dataset, leaderboard)
        history.append(offspring)
        leaderboard = ranked(offspring)

        # Save this round's result; files are numbered from 00 upwards.
        out_path = run_dir / f"{total_rounds - rounds_left:02d}_EvoDE.json"
        payload = {**header, "best_promt": leaderboard[0], "record": leaderboard}
        with open(out_path, 'w') as out_file:
            json.dump(payload, out_file, indent=4)
        print(f"\n\n\nthe result is saved at:\n{out_path}")

        rounds_left -= 1
    

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# NOTE(review): pre_file_path is not referenced anywhere in the visible notebook.
pre_file_path = None
# Repeat the EvoDE experiment five times; each run creates its own timestamped folder.
for i in range(5):
    test_EvoDE()

response=<Response [200]>
j_result={'generated_text': " Identifying the different parts between Prompt 1 and Prompt 2:\nPrompt 1: The text content should be concise and follow a universal structure.\nTo ensure a correct answer, the process of creating a summary should focus on identifying the main points and key details of the text.\nIt should avoid including specific content or names from the original article and instead provide a general overview of the information.\nThe summary should follow a universal structure, presenting the main idea and supporting details in a clear and concise manner.\nBy following these guidelines, the summary can accurately reflect the content of the text and lead to a correct answer.\nPrompt 2: The text content should be concise and follow a universal structure.\nTo ensure a correct answer, the process of creating a summary should focus on identifying the main points and key details of the text.\nIt should avoid including specific content or names from the

  3%|▎         | 1/30 [00:02<01:08,  2.34s/it]

response=<Response [200]>
j_result={'generated_text': ' Answer__option3'}
response=<Response [200]>
j_result={'generated_text': ' Open access (OA) is a revolutionary approach to sharing research literature that removes price and permission barriers, making it free and accessible to readers worldwide. Conventional publishing, on the other hand, involves charging for access to research articles and often imposes copyright and licensing restrictions. The relationship between conventional publishing and open access is that they serve different purposes and cater to the needs of authors and readers in different ways. While conventional publishing may be better for authors who rely on sales for income, open access benefits readers by providing free and unrestricted access to research. As the access revolution continues, both conventional publishing and open access are adapting to the needs of authors and readers, with many people interacting with both and venues picking one or the other opti

## EvoGA

In [7]:
import json
from pathlib import Path
from utils.EvoPrompt import EvoGA
from utils.tools import time_now

def test_EvoGA():
    """Evolve the stored initial population with EvoGA, saving after every round.

    A folder "<time>_EvoGA" is created under FOLDER_EXPERIMENT. After each
    EvoGA call (one update of the whole population) the ranked population and
    its best entry are written to "<index>_EvoGA.json" inside that folder.

    The loop ends when the best train_score reaches STOP_SCORE, when the round
    budget is used up, or when the manual stop file PATH_STOP_FILE exists.

    NOTE: the dataset is currently cut down to the first element of each split
    and the round budget is 1, i.e. this is a smoke-test configuration.
    """
    t = time_now()
    path_folder = FOLDER_EXPERIMENT / f"{t}_EvoGA"
    path_folder.mkdir(parents=True, exist_ok=True)

    ttl_model, record_init = get_init()
    corpus = record_init["corpus"]
    type_llm = record_init["type_llm"]
    type_embedding = record_init["type_embedding"]
    ttl_pair_os_prompt_scores = record_init["prompt_popularion"]
    ttl_dataset = record_init["data"]

    # Keep only the first sample of each split for a quick run.
    ttl_dataset = {
        "train_split": ttl_dataset["train_split"][:1],
        "dev_split": ttl_dataset["dev_split"][:1]
    }

    # stop_run_num = int(NUM_NEW_PROMPT/4)   # or set a number of rounds as the stop condition (originally this ran indefinitely, hence no for loop)
    stop_run_num=1

    # In-memory history of every population; it is not written to disk.
    record_population = []

    record_population.append(ttl_pair_os_prompt_scores)
    # Best entry (highest train_score) first.
    sorted_pair = sorted(ttl_pair_os_prompt_scores,  key=lambda x: x['train_score'],  reverse=True)
    while(
        sorted_pair[0]['train_score']<STOP_SCORE
        and stop_run_num>0
        and not os.path.exists(PATH_STOP_FILE)   # manual early stop
    ):
        new_population = EvoGA(ttl_model, ttl_dataset, sorted_pair)

        record_population.append(new_population)
        sorted_pair = sorted(new_population,  key=lambda x: x['train_score'],  reverse=True)

        # Save this round's result.
        # The path is joined with the Path "/" operator instead of a literal
        # "\\": on Windows the resulting path is identical, while elsewhere the
        # file now lands inside path_folder rather than beside it under a name
        # that contains a backslash.
        # NOTE(review): the "+2" shifts the file index (a single round with
        # NUM_NEW_PROMPT=20 is written as 06); presumably this continues the
        # numbering of an earlier run — confirm before changing.
        file_path = path_folder / f"{int(NUM_NEW_PROMPT/4) - stop_run_num + 2:02d}_EvoGA.json"
        data = {
            "corpus": corpus,
            "type_llm": type_llm,
            "type_embedding": type_embedding,
            "best_promt": sorted_pair[0],
            "record": sorted_pair
        }
        with open(file_path, 'w') as file:
            json.dump(data, file, indent=4)
        print(f"\n\n\nthe result is saved at:\n{file_path}")

        stop_run_num -= 1
    

In [8]:
# NOTE(review): pre_file_path is not referenced anywhere in the visible notebook.
pre_file_path = None
# Run the EvoGA experiment once; increase the range to repeat it.
for i in range(1):
    test_EvoGA()

100%|██████████| 1/1 [00:02<00:00,  2.08s/it]
100%|██████████| 1/1 [00:27<00:00, 27.48s/it]
100%|██████████| 1/1 [00:02<00:00,  2.12s/it]
100%|██████████| 1/1 [00:26<00:00, 26.82s/it]
100%|██████████| 1/1 [00:02<00:00,  2.13s/it]
100%|██████████| 1/1 [00:25<00:00, 25.16s/it]
100%|██████████| 1/1 [00:02<00:00,  2.11s/it]
100%|██████████| 1/1 [00:23<00:00, 23.16s/it]




the result is saved at:
D:\實習\evoprompt\Ress\record\experiment_test\2024_0726_1715_EvoGA\06_EvoGA.json



