## Импорт необходимых библиотек и отключение предупреждений

In [132]:
import glob
import os
import pandas as pd
import random
import dspy
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
import warnings
warnings.filterwarnings('ignore')

## Инициализация моделей для генерации Zero-Shot CoT и bootstrapping

In [None]:
# Both models are reached through the same OpenAI-compatible proxy endpoint.
API_BASE = "https://api.proxyapi.ru/openai/v1"
# The key is read from the PROXYAPI_API_KEY environment variable so that it never
# has to be written into the notebook. The empty-string fallback matches the
# placeholder value that was previously hard-coded here.
API_KEY = os.environ.get("PROXYAPI_API_KEY", "")

# gpt-3.5-turbo is registered as the default LM for every DSPy call in this notebook.
turbo = dspy.LM('gpt-3.5-turbo', max_tokens = 250, model_type = 'chat', api_base = API_BASE, api_key = API_KEY)
dspy.configure(lm = turbo)
# gpt-4o is not configured globally; it is passed to the optimizer as the teacher LM.
gpt4 = dspy.LM('gpt-4o', max_tokens = 350, model_type = 'chat', api_base = API_BASE, api_key = API_KEY)

## Если установить `RUN_FROM_SCRATCH = True`, бутстрэппинг будет выполнен заново; иначе загружается ранее сохранённая программа

In [None]:
# When True, the few-shot program is re-compiled with the bootstrap optimizer;
# when False, a previously saved program is loaded from its JSON file instead.
RUN_FROM_SCRATCH = False

## Загрузка данных (перед запуском укажите путь к датасету на вашей машине)

In [135]:
def load_scone(root_dir = "C:\\Users\\julia\\OneDrive\\Desktop\\MFTI_code\\dspy\\scone_nli\\train"):
    """Load every CSV file in ``root_dir`` as a list of ``dspy.Example`` objects.

    Each ``<category>.csv`` file is read with its first column as the index and
    every row is tagged with the category taken from the file name. Rows of the
    ``one_scoped`` category use the ``sentence1`` / ``sentence2`` / ``gold_label``
    columns; rows of every other category use the ``*_edited`` variants.

    Args:
        root_dir: Directory that contains the ``*.csv`` files.

    Returns:
        A list of ``dspy.Example`` objects with the fields ``context``,
        ``question``, ``answer`` (``"Yes"`` for ``entailment``, ``"No"``
        otherwise) and ``category``; ``context`` and ``question`` are marked
        as inputs.

    Raises:
        ValueError: If ``root_dir`` contains no CSV files.
    """
    # os.path.join keeps the loader portable (the path separator is no longer
    # hard-coded), glob.escape protects directories whose names contain glob
    # metacharacters, and sorting makes the example order independent of the
    # order in which the OS lists files, so a seeded shuffle is reproducible.
    pattern = os.path.join(glob.escape(root_dir), "*.csv")
    csv_paths = sorted(glob.glob(pattern))
    if not csv_paths:
        # Same exception type pd.concat would raise on an empty list, but with
        # a message that points at the real cause.
        raise ValueError(f"No CSV files found in {root_dir!r}")

    dfs = []
    for path in csv_paths:
        df = pd.read_csv(path, index_col = 0)
        # The category is the file name without its extension.
        df['category'] = os.path.splitext(os.path.basename(path))[0]
        dfs.append(df)
    data_df = pd.concat(dfs)

    def as_example(row):
        """Turn one data row into a yes/no entailment question."""
        suffix = '' if row['category'] == 'one_scoped' else '_edited'
        hypothesis = row['sentence2' + suffix]
        # Lower-case the first letter and drop the trailing period so that the
        # hypothesis can be embedded in the middle of the question sentence.
        question = hypothesis[0].lower() + hypothesis[1:].rstrip(".")
        question = f"Can we logically conclude for sure that {question}?"
        label = "Yes" if row['gold_label' + suffix] == 'entailment' else "No"
        return dspy.Example({
            "context": row['sentence1' + suffix],
            "question": question,
            "answer": label,
            "category": row['category']
            }).with_inputs("context", "question")

    # Plain iteration avoids DataFrame.apply's result-type inference on the
    # objects returned by as_example.
    return [as_example(row) for _, row in data_df.iterrows()]


## Формирование обучающей (train) и валидационной (dev) выборок

In [136]:
# Load the training pool and shuffle it under a fixed seed so the split is repeatable.
all_train = load_scone()
random.seed(1)
random.shuffle(all_train)

# The first 200 shuffled examples form the training set, the next 50 the dev set.
train = all_train[:200]
dev = all_train[200:250]
(len(train), len(dev))

(200, 50)

## Тест

In [None]:
# Reseed the global RNG before building the held-out set.
random.seed(1)

# Only the "one_scoped" category of the test split is evaluated.
test = [
    ex
    for ex in load_scone(root_dir = "C:\\Users\\julia\\OneDrive\\Desktop\\MFTI_code\\dspy\\scone_nli\\test")
    if ex.category == "one_scoped"
]

# Label balance of the evaluation set.
pd.Series([ex.answer for ex in test]).value_counts()

[Example({'context': 'the man does not own a dog', 'question': 'Can we logically conclude for sure that the man does not own a mammal?', 'answer': 'No', 'category': 'one_scoped'}) (input_keys={'question', 'context'}), Example({'context': 'the man does not own a mammal', 'question': 'Can we logically conclude for sure that the man does not own a dog?', 'answer': 'Yes', 'category': 'one_scoped'}) (input_keys={'question', 'context'}), Example({'context': 'the man does not own a dog', 'question': 'Can we logically conclude for sure that the man does not own an animal?', 'answer': 'No', 'category': 'one_scoped'}) (input_keys={'question', 'context'}), Example({'context': 'the man does not own an animal', 'question': 'Can we logically conclude for sure that the man does not own a dog?', 'answer': 'Yes', 'category': 'one_scoped'}) (input_keys={'question', 'context'}), Example({'context': 'the man does not own a dog', 'question': 'Can we logically conclude for sure that the man does not own a p

## Подключение инструментов для подсчёта accuracy

In [138]:
# Metric: DSPy's built-in `answer_exact_match`.
scone_accuracy = dspy.evaluate.metrics.answer_exact_match

# Evaluator bound to the held-out test set; single-threaded, with a progress
# bar and the per-example results table switched off.
evaluator = Evaluate(
    devset = test,
    num_threads = 1,
    display_progress = True,
    display_table = 0,
)

## Zero-shot CoT  

In [139]:
class ScoNeSignature(dspy.Signature):
    """You are given some context (a premise) and a question (a hypothesis). You must indicate with Yes/No answer whether we can logically
    conclude the hypothesis from the premise."""
    # NOTE(review): the docstring above is runtime text, not only documentation --
    # DSPy presumably uses a signature's docstring as the task instructions, so it
    # must be edited with the same care as a prompt.

    # Inputs: the premise and the hypothesis phrased as a question.
    context = dspy.InputField()
    question = dspy.InputField()

    # Output: the model's verdict.
    answer = dspy.OutputField(desc = "Yes or No")

class ScoNeCoT(dspy.Module):
    """DSPy program that answers a ScoNe question with a chain-of-thought predictor."""

    def __init__(self):
        super().__init__()
        # Single predictor built from the ScoNe signature.
        self.generate_answer = dspy.ChainOfThought(ScoNeSignature)

    def forward(self, context, question):
        """Run the predictor on one (context, question) pair and return its prediction."""
        prediction = self.generate_answer(context = context, question = question)
        return prediction

## Запуск и оценка

In [140]:
# Build the un-optimized (zero-shot) chain-of-thought program ...
cot_zeroshot = ScoNeCoT()
# ... and score it on the evaluator's dataset with the ScoNe accuracy metric.
evaluator(cot_zeroshot, metric = scone_accuracy)

Average Metric: 109 / 200  (54.5): 100%|██████████| 200/200 [06:57<00:00,  2.09s/it]

2024/11/11 18:06:53 INFO dspy.evaluate.evaluate: Average Metric: 109 / 200 (54.5%)





54.5

## Оптимизируем полученные результаты с помощью автоматической генерации Few-Shot CoT и GPT-4o

In [143]:
# Few-shot optimizer: candidate demo sets are scored with the ScoNe accuracy
# metric, and gpt-4o is supplied as the teacher LM.
bootstrap_optimizer = BootstrapFewShotWithRandomSearch(
    metric = scone_accuracy,
    max_bootstrapped_demos = 8,
    max_labeled_demos = 8,
    num_candidate_programs = 10,
    num_threads = 8,
    teacher_settings = {"lm": gpt4},
)

Going to sample between 1 and 8 traces per predictor.
Will attempt to bootstrap 10 candidate sets.


In [144]:
# File that holds the optimized program's state between notebook runs.
FEWSHOT_PATH = "scone-cot_fewshot-turbo-gpt4-demos.json"

if RUN_FROM_SCRATCH:
    # Expensive path: bootstrap few-shot demos on `train` and select the best
    # candidate program on `dev`.
    cot_fewshot = bootstrap_optimizer.compile(cot_zeroshot, trainset = train, valset = dev)
    # Persist the result so that a later run with RUN_FROM_SCRATCH = False
    # loads this program rather than a stale or missing file.
    cot_fewshot.save(FEWSHOT_PATH)
else:
    # Cheap path: restore the previously optimized program from disk.
    cot_fewshot = ScoNeCoT()
    cot_fewshot.load(FEWSHOT_PATH)

Average Metric: 25 / 50  (50.0): 100%|██████████| 50/50 [00:11<00:00,  4.51it/s]
2024/11/12 17:57:11 INFO dspy.evaluate.evaluate: Average Metric: 25 / 50 (50.0%)


New best score: 50.0 for seed -3
Scores so far: [50.0]
Best score so far: 50.0


Average Metric: 35 / 50  (70.0): 100%|██████████| 50/50 [00:13<00:00,  3.80it/s]
2024/11/12 17:57:24 INFO dspy.evaluate.evaluate: Average Metric: 35 / 50 (70.0%)


New best score: 70.0 for seed -2
Scores so far: [50.0, 70.0]
Best score so far: 70.0


  5%|▌         | 10/200 [00:43<13:40,  4.32s/it]


Bootstrapped 8 full traces after 10 examples for up to 1 rounds, amounting to 10 attempts.


Average Metric: 33 / 50  (66.0): 100%|██████████| 50/50 [00:13<00:00,  3.77it/s]
2024/11/12 17:58:20 INFO dspy.evaluate.evaluate: Average Metric: 33 / 50 (66.0%)


Scores so far: [50.0, 70.0, 66.0]
Best score so far: 70.0


  4%|▍         | 8/200 [00:24<09:42,  3.03s/it]


Bootstrapped 7 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.


Average Metric: 36 / 50  (72.0): 100%|██████████| 50/50 [00:13<00:00,  3.63it/s]
2024/11/12 17:58:58 INFO dspy.evaluate.evaluate: Average Metric: 36 / 50 (72.0%)


New best score: 72.0 for seed 0
Scores so far: [50.0, 70.0, 66.0, 72.0]
Best score so far: 72.0


  3%|▎         | 6/200 [00:24<13:00,  4.03s/it]


Bootstrapped 3 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.


Average Metric: 30 / 50  (60.0): 100%|██████████| 50/50 [00:12<00:00,  4.03it/s]
2024/11/12 17:59:35 INFO dspy.evaluate.evaluate: Average Metric: 30 / 50 (60.0%)


Scores so far: [50.0, 70.0, 66.0, 72.0, 60.0]
Best score so far: 72.0


  0%|          | 1/200 [00:01<06:15,  1.89s/it]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.


Average Metric: 35 / 50  (70.0): 100%|██████████| 50/50 [00:12<00:00,  3.93it/s]
2024/11/12 17:59:49 INFO dspy.evaluate.evaluate: Average Metric: 35 / 50 (70.0%)


Scores so far: [50.0, 70.0, 66.0, 72.0, 60.0, 70.0]
Best score so far: 72.0


  2%|▏         | 4/200 [00:08<07:10,  2.19s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.


Average Metric: 33 / 50  (66.0): 100%|██████████| 50/50 [00:14<00:00,  3.49it/s]
2024/11/12 18:00:13 INFO dspy.evaluate.evaluate: Average Metric: 33 / 50 (66.0%)


Scores so far: [50.0, 70.0, 66.0, 72.0, 60.0, 70.0, 66.0]
Best score so far: 72.0


  2%|▏         | 4/200 [00:09<07:35,  2.33s/it]


Bootstrapped 4 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.


Average Metric: 41 / 50  (82.0): 100%|██████████| 50/50 [00:13<00:00,  3.78it/s] 
2024/11/12 18:00:35 INFO dspy.evaluate.evaluate: Average Metric: 41 / 50 (82.0%)


New best score: 82.0 for seed 4
Scores so far: [50.0, 70.0, 66.0, 72.0, 60.0, 70.0, 66.0, 82.0]
Best score so far: 82.0


  3%|▎         | 6/200 [00:18<10:13,  3.16s/it]


Bootstrapped 5 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.


Average Metric: 35 / 50  (70.0): 100%|██████████| 50/50 [00:13<00:00,  3.82it/s]
2024/11/12 18:01:07 INFO dspy.evaluate.evaluate: Average Metric: 35 / 50 (70.0%)


Scores so far: [50.0, 70.0, 66.0, 72.0, 60.0, 70.0, 66.0, 82.0, 70.0]
Best score so far: 82.0


  1%|          | 2/200 [00:04<08:04,  2.44s/it]


Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.


Average Metric: 36 / 50  (72.0): 100%|██████████| 50/50 [00:16<00:00,  2.98it/s]
2024/11/12 18:01:29 INFO dspy.evaluate.evaluate: Average Metric: 36 / 50 (72.0%)


Scores so far: [50.0, 70.0, 66.0, 72.0, 60.0, 70.0, 66.0, 82.0, 70.0, 72.0]
Best score so far: 82.0


  4%|▎         | 7/200 [00:20<09:26,  2.93s/it]


Bootstrapped 6 full traces after 7 examples for up to 1 rounds, amounting to 7 attempts.


Average Metric: 38 / 50  (76.0): 100%|██████████| 50/50 [00:13<00:00,  3.60it/s]
2024/11/12 18:02:03 INFO dspy.evaluate.evaluate: Average Metric: 38 / 50 (76.0%)


Scores so far: [50.0, 70.0, 66.0, 72.0, 60.0, 70.0, 66.0, 82.0, 70.0, 72.0, 76.0]
Best score so far: 82.0


  2%|▎         | 5/200 [00:13<08:31,  2.62s/it]


Bootstrapped 4 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.


Average Metric: 38 / 50  (76.0): 100%|██████████| 50/50 [00:13<00:00,  3.66it/s]
2024/11/12 18:02:30 INFO dspy.evaluate.evaluate: Average Metric: 38 / 50 (76.0%)


Scores so far: [50.0, 70.0, 66.0, 72.0, 60.0, 70.0, 66.0, 82.0, 70.0, 72.0, 76.0, 76.0]
Best score so far: 82.0


  5%|▌         | 10/200 [00:37<11:51,  3.75s/it]


Bootstrapped 8 full traces after 10 examples for up to 1 rounds, amounting to 10 attempts.


Average Metric: 37 / 50  (74.0): 100%|██████████| 50/50 [00:13<00:00,  3.75it/s]
2024/11/12 18:03:21 INFO dspy.evaluate.evaluate: Average Metric: 37 / 50 (74.0%)


Scores so far: [50.0, 70.0, 66.0, 72.0, 60.0, 70.0, 66.0, 82.0, 70.0, 72.0, 76.0, 76.0, 74.0]
Best score so far: 82.0
13 candidate programs found.
