In [4]:
import ollama 
import os
from tqdm import tqdm
import json
import signal
import argparse
import wandb
import pandas as pd

import sys
from collections import defaultdict
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,confusion_matrix
from pathlib import Path

import optuna
import re

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [6]:
sys.argv = [
    'notebook',  
    '--modelname', 'llama3.2-vision:90b',
    '--data', '/root/home/data',
    '--data_path_files','/mnt/Gbenga_Enemy/ramy/WACV-2025-Workshop-ViGIR',
    '--results_dir', '/mnt/Gbenga_Enemy/ramy/results',
    '--timeout', '20',
    '--model_unloading',
    '--valid_responses' ,'/mnt/Gbenga_Enemy/ramy/results/valid_llama3_90b.json',
    '--test_responses' ,'/mnt/Gbenga_Enemy/ramy/results/test_llama3_90b.json'
    
]

In [7]:
parser = argparse.ArgumentParser(description="A script to run V-LLMs on different image classification datasets")

In [8]:
parser.add_argument("--modelname", type=str, required=True, help="The name of the V-LLM model")
parser.add_argument("--data", type=str, required=True, help="Path to the data")
parser.add_argument("--data_path_files", type=str, required=True, help="Path to the image data dir")
parser.add_argument("--results_dir", type=str, required=True, help="Folder name to save results")
parser.add_argument("--timeout", type=int, default=40, help="time out duration to skip one sample")
parser.add_argument("--model_unloading", action="store_true", help="Enables unloading mode. Every 100 sampels it unloades the model from the GPU to avoid carshing.")
parser.add_argument("--valid_responses",type=str, required=True)
parser.add_argument("--test_responses",type=str, required=True)


args = parser.parse_args()

In [6]:
run = wandb.init(
    entity="ramytrm",
    project=f"WACV-2025-QuestionFiltering",
    name="run_test_" +args.modelname+"QuestionFiltering"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mramytrm[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
def display_image(image_path):
    img = mpimg.imread(image_path)
    plt.figure(figsize=(20, 20))
    plt.imshow(img)
    plt.axis('off')
    plt.show()

In [34]:
best_dict_path=os.path.join(args.results_dir,'Best_Q_Best_Dict_regularized.json')
print(best_dict_path)

/mnt/Gbenga_Enemy/ramy/results/Best_Q_Best_Dict_regularized.json


In [11]:
model_name = args.modelname #'llama3.2-vision:90b'
ollama.pull(model_name)

timeout_duration = args.timeout

options= {  # new
            "seed": 123,
            "temperature": 0,
            "num_ctx": 2048, # must be set, otherwise slightly random output
        }

model_labels = {}
count = 0

In [12]:
file_path = os.path.join(args.valid_responses)
with open(file_path, 'r') as file:
    valid_responses = json.load(file)
    
    
file_path = os.path.join(args.test_responses)
with open(file_path, 'r') as file:
    test_responses = json.load(file)

In [13]:
data_reformated = {}

for key, item in valid_responses.items():
    q_and_a = []
    for i in range(len(item)):
        tmp = item[i][1:]   
        q_and_a.append(tmp)

    data_reformated[key] = q_and_a

In [14]:
data_reformated_test = {}

for key, item in test_responses.items():
    q_and_a = []
    for i in range(len(item)):
        tmp = item[i][1:]   
        q_and_a.append(tmp)

    data_reformated_test[key] = q_and_a

In [15]:
len(data_reformated)+len(data_reformated_test)

621

In [16]:
binary_vars=[
            0,
            0,
            1,
            0,
            0,
            1,
            1,
            0,
            0,
            0,
            0,
            1,
            0,
            0,
            0
        ]
print(sum(binary_vars))

for index, (key, item) in enumerate(data_reformated.items()):

    formatted_qas = []
    for i, qa in enumerate(item):
        if binary_vars[i] == 0:
            continue
        formatted_qa = f"Q: {qa[0]}\nA: {qa[1]}"
        formatted_qas.append(formatted_qa)

    prompt = "\n".join(formatted_qas)

    question = "Based on the following questions and their answers, determine if there is evidence of an ephemeral gully in the observed area. Carefully analyze all the questions and the responses to assess.\n\n"
    question += prompt
    question += "\n\nAfter considering these responses, provide a clear conclusion: Is there evidence of an ephemeral gully? Answer with yes or no only."
    break

4


In [17]:
binary_vars

[0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0]

In [15]:
print(question)

Based on the following questions and their answers, determine if there is evidence of an ephemeral gully in the observed area. Carefully analyze all the questions and the responses to assess.

Q: Given these six images of the exact same area and collected over a period of 10 years, do you see a low point in the terrain? Answer with yes or no only!
A: No.
Q: Given these six images of the exact same area and collected over a period of 10 years, are there indications of nearby human activity, such as tillage or machinery tracks? Answer with yes or no only!
A: No.

After considering these responses, provide a clear conclusion: Is there evidence of an ephemeral gully? Answer with yes or no only.


In [18]:
def extract_first_yes_no(text):
    # Use regular expressions to find "Yes" or "No" (case-insensitive)
    match = re.search(r'\b(Yes|No)\b', text, re.IGNORECASE)
    if match:
        # Return the first match in its original case
        return match.group(0)
    return None

In [19]:
def evaluate_model(data_reform, responses, binary_vars, model_name, options,trial,VALID=0):
    ground_truth_test = []
    model_response_test = []

    for index, (key, item) in enumerate(data_reform.items()):

        formatted_qas = []
        for i, qa in enumerate(item):
            if binary_vars[i] == 0:
                continue
            formatted_qa = f"Q: {qa[0]}\nA: {qa[1]}"
            formatted_qas.append(formatted_qa)

        prompt = "\n".join(formatted_qas)

        question = "Based on the following questions and their answers, determine if there is evidence of an ephemeral gully in the observed area. Carefully analyze all the questions and the responses to assess.\n\n"
        question += prompt
        question += "\n\nAfter considering these responses, provide a clear conclusion: Is there evidence of an ephemeral gully? Answer with yes or no only."
#         question += "\n\nIs there evidence of an ephemeral gully? Answer with yes or no only."

        response = ollama.generate(model=model_name, prompt=question, options=options)
        text=extract_first_yes_no(response['response'])
        
        
        if text == 'Yes':
            model_response_test.append(1)
        else:
            model_response_test.append(0)
            
        if responses[key][0][0]['label'] == '4':
            ground_truth_test.append(1)
        else:
            ground_truth_test.append(0)
            
            
        if VALID==1 and index%50==0:
        
            macro_f1 = f1_score(ground_truth_test, model_response_test, average='macro')
            macro_f1-=sum(binary_vars)/600
            trial.report(macro_f1, step=index)
            
            if trial.should_prune():
                raise optuna.TrialPruned()

    return ground_truth_test, model_response_test

In [39]:
def log_best_dict(best_dict,file_path):

    wandb.log(best_dict)
    
    table = wandb.Table(data=[list(best_dict.values())], columns=list(best_dict.keys()))
    wandb.log({"best_dict_table": table})
    
    if os.path.isfile(file_path):
        artifact = wandb.Artifact("best_dict", type="dictionary")
        artifact.add_file(file_path)
        wandb.log_artifact(artifact)
    else:
        raise FileNotFoundError(f"The file {file_path} was not created successfully.")

In [40]:
def objective(trial):
    
    global best_dict, trail_no
    
    trail_no+=1
    
    print(trail_no,best_dict['best_macro'])
    
    binary_vars = [trial.suggest_int(f"x{i}", 0, 1) for i in range(15)]
    

    if sum(binary_vars)<2:
        return 0
    
    ground_truth, model_response = evaluate_model(data_reformated, 
                                                  valid_responses, 
                                                  binary_vars, 
                                                  model_name, 
                                                  options,trial, 
                                                  VALID=1)
        
    
    macro_f1 = f1_score(ground_truth, model_response, average='macro')
    
    macro_f1-=sum(binary_vars)/600
    
    
    if macro_f1>best_dict['best_macro']:
        
        
        best_dict['all_best']=[]
        best_dict['all_best'].append(binary_vars)
        best_dict['best_macro']=macro_f1
        best_dict['best_questions']=binary_vars
        
        
        ground_truth_test, model_response_test = evaluate_model(
        data_reformated_test, test_responses, binary_vars, model_name, options, trial
        )


        macro_f1_test = f1_score(ground_truth_test, model_response_test, average='macro')
        best_dict['best_test']=macro_f1_test
        
        print('Got new best', best_dict['best_macro'],best_dict['best_test'])
        
        with open(best_dict_path, 'w') as fp:
            json.dump(best_dict, fp, indent=4)
            
        log_best_dict(best_dict,best_dict_path)
        
    return macro_f1 

In [None]:
trail_no=0
best_dict={}

best_dict['best_macro']=0
best_dict['best_questions']=[]
best_dict['all_best']=[]



study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=1000)


print("Best objective value:", study.best_value)
print("Best parameters:", study.best_params)

[I 2024-11-17 20:01:49,653] A new study created in memory with name: no-name-bab9558c-98d3-40dd-94d6-fbd8e484151c


1 0
Got new best 0.6431138975966562 0.6325858066655613


[I 2024-11-17 20:34:06,476] Trial 0 finished with value: 0.6431138975966562 and parameters: {'x0': 1, 'x1': 1, 'x2': 1, 'x3': 0, 'x4': 1, 'x5': 1, 'x6': 1, 'x7': 1, 'x8': 1, 'x9': 0, 'x10': 0, 'x11': 1, 'x12': 1, 'x13': 1, 'x14': 1}. Best is trial 0 with value: 0.6431138975966562.


2 0.6431138975966562
Got new best 0.6751854642098544 0.6359181559181559


[I 2024-11-17 21:10:48,688] Trial 1 finished with value: 0.6751854642098544 and parameters: {'x0': 1, 'x1': 1, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 1, 'x6': 1, 'x7': 1, 'x8': 1, 'x9': 1, 'x10': 0, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 1 with value: 0.6751854642098544.


3 0.6751854642098544


[I 2024-11-17 21:12:40,551] Trial 2 finished with value: 0.32536098923015555 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 0, 'x7': 1, 'x8': 1, 'x9': 0, 'x10': 0, 'x11': 1, 'x12': 0, 'x13': 1, 'x14': 0}. Best is trial 1 with value: 0.6751854642098544.


4 0.6751854642098544


[I 2024-11-17 21:38:01,090] Trial 3 finished with value: 0.6751854642098544 and parameters: {'x0': 0, 'x1': 1, 'x2': 1, 'x3': 1, 'x4': 0, 'x5': 1, 'x6': 1, 'x7': 1, 'x8': 0, 'x9': 1, 'x10': 0, 'x11': 0, 'x12': 1, 'x13': 1, 'x14': 1}. Best is trial 1 with value: 0.6751854642098544.


5 0.6751854642098544


[I 2024-11-17 21:53:28,033] Trial 4 finished with value: 0.6663716608472876 and parameters: {'x0': 0, 'x1': 1, 'x2': 1, 'x3': 0, 'x4': 1, 'x5': 1, 'x6': 1, 'x7': 1, 'x8': 1, 'x9': 1, 'x10': 1, 'x11': 1, 'x12': 0, 'x13': 0, 'x14': 1}. Best is trial 1 with value: 0.6751854642098544.


6 0.6751854642098544


[I 2024-11-17 22:09:15,583] Trial 5 finished with value: 0.6751854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 1, 'x6': 1, 'x7': 1, 'x8': 1, 'x9': 1, 'x10': 0, 'x11': 0, 'x12': 1, 'x13': 1, 'x14': 1}. Best is trial 1 with value: 0.6751854642098544.


7 0.6751854642098544


[I 2024-11-17 22:09:16,382] Trial 6 pruned. 


8 0.6751854642098544


[I 2024-11-17 22:09:17,124] Trial 7 pruned. 


9 0.6751854642098544
Got new best 0.6785187975431878 0.6359181559181559


[I 2024-11-17 22:37:14,887] Trial 8 finished with value: 0.6785187975431878 and parameters: {'x0': 1, 'x1': 0, 'x2': 1, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 0, 'x11': 1, 'x12': 1, 'x13': 1, 'x14': 0}. Best is trial 8 with value: 0.6785187975431878.


10 0.6785187975431878


[I 2024-11-17 22:37:15,692] Trial 9 pruned. 


11 0.6785187975431878


[I 2024-11-17 22:37:16,383] Trial 10 pruned. 


12 0.6785187975431878
Got new best 0.6801854642098544 0.6359181559181559


[I 2024-11-17 23:01:30,525] Trial 11 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


13 0.6801854642098544


[I 2024-11-17 23:17:31,780] Trial 12 finished with value: 0.6785187975431878 and parameters: {'x0': 1, 'x1': 0, 'x2': 1, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 1, 'x12': 1, 'x13': 0, 'x14': 0}. Best is trial 11 with value: 0.6801854642098544.


14 0.6801854642098544


[I 2024-11-17 23:29:56,028] Trial 13 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


15 0.6801854642098544


[I 2024-11-17 23:42:20,064] Trial 14 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


16 0.6801854642098544


[I 2024-11-17 23:54:44,206] Trial 15 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


17 0.6801854642098544


[I 2024-11-18 00:07:08,315] Trial 16 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


18 0.6801854642098544


[I 2024-11-18 00:07:08,758] Trial 17 pruned. 


19 0.6801854642098544


[I 2024-11-18 00:19:33,198] Trial 18 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


20 0.6801854642098544


[I 2024-11-18 00:31:57,197] Trial 19 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


21 0.6801854642098544


[I 2024-11-18 00:31:57,746] Trial 20 pruned. 


22 0.6801854642098544


[I 2024-11-18 00:44:22,107] Trial 21 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


23 0.6801854642098544


[I 2024-11-18 00:56:46,034] Trial 22 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


24 0.6801854642098544


[I 2024-11-18 01:09:10,889] Trial 23 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


25 0.6801854642098544


[I 2024-11-18 01:21:34,996] Trial 24 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


26 0.6801854642098544


[I 2024-11-18 01:33:58,970] Trial 25 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


27 0.6801854642098544


[I 2024-11-18 01:46:22,810] Trial 26 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


28 0.6801854642098544


[I 2024-11-18 01:58:47,168] Trial 27 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


29 0.6801854642098544


[I 2024-11-18 02:11:11,348] Trial 28 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


30 0.6801854642098544


[I 2024-11-18 02:25:34,067] Trial 29 finished with value: 0.669604053883923 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 0, 'x4': 1, 'x5': 1, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 0, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


31 0.6801854642098544


[I 2024-11-18 02:37:58,328] Trial 30 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


32 0.6801854642098544


[I 2024-11-18 02:50:22,374] Trial 31 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


33 0.6801854642098544


[I 2024-11-18 03:02:46,271] Trial 32 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


34 0.6801854642098544


[I 2024-11-18 03:15:10,255] Trial 33 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


35 0.6801854642098544


[I 2024-11-18 03:15:14,424] Trial 34 pruned. 


36 0.6801854642098544


[I 2024-11-18 03:15:26,150] Trial 35 pruned. 


37 0.6801854642098544


[I 2024-11-18 03:15:32,569] Trial 36 pruned. 


38 0.6801854642098544


[I 2024-11-18 03:27:12,403] Trial 37 finished with value: 0.6801854642098544 and parameters: {'x0': 0, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 1, 'x9': 1, 'x10': 1, 'x11': 1, 'x12': 0, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


39 0.6801854642098544


[I 2024-11-18 03:27:18,420] Trial 38 pruned. 


40 0.6801854642098544


[I 2024-11-18 03:27:19,352] Trial 39 pruned. 


41 0.6801854642098544


[I 2024-11-18 03:27:25,674] Trial 40 pruned. 


42 0.6801854642098544


[I 2024-11-18 03:41:13,859] Trial 41 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


43 0.6801854642098544


[I 2024-11-18 03:55:04,736] Trial 42 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


44 0.6801854642098544


[I 2024-11-18 04:08:54,851] Trial 43 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


45 0.6801854642098544


[I 2024-11-18 04:22:45,015] Trial 44 finished with value: 0.6801854642098544 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 1, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 1}. Best is trial 11 with value: 0.6801854642098544.


46 0.6801854642098544
Got new best 0.6835187975431878 0.6359181559181559


[I 2024-11-18 04:26:35,588] Trial 45 finished with value: 0.6835187975431878 and parameters: {'x0': 1, 'x1': 0, 'x2': 0, 'x3': 1, 'x4': 0, 'x5': 0, 'x6': 1, 'x7': 0, 'x8': 0, 'x9': 1, 'x10': 0, 'x11': 0, 'x12': 1, 'x13': 0, 'x14': 0}. Best is trial 45 with value: 0.6835187975431878.


47 0.6835187975431878


In [30]:
tes=[1]*15

0.6945 -sum(tes)/300

0.6445

In [33]:
print(len(tes))
sum(binary_vars)

15


2

In [None]:
tes[1:12]=[0]*11

In [None]:
print(tes)

In [None]:
for index, (key, item) in enumerate(data_reformated.items()):
    formatted_qas = []
    for i, qa in enumerate(item):
        if tes[i]==0:
            continue
        formatted_qa = f"Q: {qa[0]}\nA: {qa[1]}"
        formatted_qas.append(formatted_qa)
        
    prompt = "\n".join(formatted_qas)
    print(prompt)
    print('________________________________')
    if index ==2:
        break

In [None]:
data_reformated[str(356)]

In [None]:
valid_responses

In [None]:
binary_vars

In [None]:
binary_vars=[1]*15

ground_truth, model_response = evaluate_model(data_reformated_test, 
                                              test_responses, 
                                              binary_vars, 
                                              model_name, 
                                              options, 12
                                            )


macro_f1 = f1_score(ground_truth, model_response, average='macro')

In [None]:
macro_f1

In [None]:
[0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0]

In [22]:
indices = [index for index, value in enumerate(binary_vars) if value == 1]

print(indices)

[2, 5, 6, 11]


In [21]:
ground_truth_test, model_response_test = evaluate_model(
        data_reformated_test, test_responses, binary_vars, model_name, options, 12
        )

In [23]:
y_true=ground_truth_test
y_pred=model_response_test

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='macro')
accuracy = accuracy_score(y_true, y_pred)
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

print("True Positives (TP):", tp)
print("False Positives (FP):", fp)
print("False Negatives (FN):", fn)
print("True Negatives (TN):", tn)


print("Precision:", precision)
print("Recall:", recall)


print("Accuracy:", accuracy)

f1_per_class = f1_score(y_true, y_pred, average=None)
print(f1_per_class)

print("F1 Score (Macro):", f1)

True Positives (TP): 110
False Positives (FP): 44
False Negatives (FN): 67
True Negatives (TN): 90
Precision: 0.7142857142857143
Recall: 0.6214689265536724
Accuracy: 0.6430868167202572
[0.6185567  0.66465257]
F1 Score (Macro): 0.6416046345033793


In [None]:
#     if macro_f1==best_dict['best_macro']:
#         best_dict['all_best'].append(binary_vars)
        
        
#         ground_truth_test, model_response_test = evaluate_model(
#             data_reformated_test, test_responses, binary_vars, model_name, options, trial
#         )


#         macro_f1_test = f1_score(ground_truth_test, model_response_test, average='macro')
#         best_dict['all_best_test'].append(macro_f1_test)
        
#         print('Got a new tie', best_dict['best_macro'],best_dict['best_test'])