In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from evaluate import load
import torch
from datasets import Dataset,load_dataset
import json  # In case you need to load the file manually



In [5]:
from datasets import load_dataset, Dataset

# Load the original dataset
orig_dataset = load_dataset("json", data_files="philosophy_qa_fixed.json")["train"]

# Define categories to include
categories_to_include = ["abduction", "abelard", "abhidharma"]

# Filter samples
filtered_samples = orig_dataset.filter(lambda x: x["category"] in categories_to_include)
dataset = Dataset.from_list(filtered_samples)

# Print length and first few samples for verification
print(f"Number of filtered samples: {len(dataset)}")
print(dataset[:3])


Number of filtered samples: 194
{'category': ['abduction', 'abduction', 'abduction'], 'question': ['What is the second sense in which the term abduction is used in the philosophical literature?', 'What is the modern sense of abduction concerned with?', 'What type of reasoning is the speaker engaging in when she concludes that Tim and Harry are friends again?'], 'answer': ['In the second sense, the term abduction refers to the place of explanatory reasoning in justifying hypotheses. In this sense, it is also often called “Inference to the Best Explanation.”', 'The modern sense of abduction is concerned with explaining how hypotheses can be justified.', 'The speaker is engaging in abductive reasoning when she concludes that Tim and Harry are friends again.']}


In [2]:
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto"
)


In [4]:
def generate_response(question, model, tokenizer, max_new_tokens=100):
    prompt = f"""<|im_start|>system
You are a philosophical AI assistant. Answer the question brief and concise in one or two sentences.<|im_end|>
<|im_start|>user
{question}<|im_end|>
<|im_start|>assistant
"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens,temperature=0.2,  # Lower temperature (0.1–0.5 is typical)
    do_sample=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Generate responses for all abduction questions
model_outputs = []
for sample in dataset:
    question = sample["question"]
    response = generate_response(question, base_model, tokenizer)
    model_outputs.append(response)


In [9]:
import gc
import torch

# Assuming your model is called 'model' and is on GPU
base_model.to('cpu')     # Move model to CPU
del base_model           # Delete the model object
gc.collect()        # Run garbage collection
torch.cuda.empty_cache()  # Empty PyTorch's CUDA cache


In [8]:
import re

def extract_assistant_answer(text):
    pattern = r'<\|im_start\|>assistant\n?(.*?)(<\|im_end\|>|$)'
    match = re.search(pattern, text, re.DOTALL)
    if match:
        return match.group(1).strip()
    else:
        return text

# Apply to all model outputs
cleaned_outputs = [extract_assistant_answer(output) for output in model_outputs]


In [9]:
print(cleaned_outputs[:5])  # Print first 5 reference answers for verification

['Response: In the philosophical literature, the term abduction refers to two senses: (a) as a method of proof in epistemology, where it is used to establish the truth of a claim by its evidential support, and (b) as a mode of reasoning in metaphysics, where it is used to investigate the nature of things by means of their exemplary properties, rather than by means of their causal relations with other things.</|im_end|>', 'The modern sense of abduction concerned with causal explanation of phenomenon by means other than direct sensory experience, which includes both psychophysical and psychological phenomena, and the question of how to formalize and research it of the type presented by David K. Clough.', 'The speaker is engaging in deductive reasoning when she concludes that Tim and Harry are friends again.', 'An example of abduction that is also referred to as “Inference to the Best Explanation” is the process of reasoning that investigates a fact or process that best explains another f

In [10]:
reference_answers = [sample["answer"] for sample in dataset]


In [13]:
bertscore = load("bertscore")
results = bertscore.compute(
    predictions=cleaned_outputs,
    references=reference_answers,
    lang="en"
)
print(results)  # Prints precision, recall, f1 for each sample


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'precision': [0.8265485763549805, 0.858153223991394, 0.9894604086875916, 0.8684148192405701, 0.8997387290000916, 0.8815631866455078, 0.9058991074562073, 0.804663896560669, 0.8412754535675049, 0.8688831329345703, 0.9284306764602661, 0.8150714635848999, 0.8252508640289307, 0.9229249954223633, 0.8381890654563904, 0.8519109487533569, 0.8778406381607056, 0.887916088104248, 0.8614758253097534, 0.8427024483680725, 0.8964536190032959, 0.8347492218017578, 0.840215802192688, 0.8805631995201111, 0.8875645995140076, 0.885792076587677, 0.8705921173095703, 0.8837901949882507, 0.880165696144104, 0.8559173345565796, 0.8879789113998413, 0.8875903487205505, 0.841971755027771, 0.8911296725273132, 0.8663631081581116, 0.0, 0.8329511880874634, 0.8633900284767151, 0.832176148891449, 0.8588483333587646, 0.8623301982879639, 0.9009976387023926, 0.8236684799194336, 0.8526215553283691, 0.8549883961677551, 0.8613893389701843, 0.8045573234558105, 0.873688817024231, 0.8490563035011292, 0.8903570175170898, 0.8649514



In [None]:
import pandas as pd

# Create a DataFrame
df = pd.DataFrame({
    "question": [sample["question"] for sample in dataset],
    "reference": reference_answers,
    "generated": cleaned_outputs,
    "bertscore_f1": results["f1"],
    "bertscore_precision": results["precision"],
    "bertscore_recall": results["recall"]
})

# Save to CSV
df.to_csv("base_merged_results.csv", index=False)


In [1]:
import os

from peft import PeftConfig, PeftModel
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
import random

peft_model_id = "./final_abduction_adapter"

config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
model.resize_token_embeddings(len(tokenizer))


Embedding(32000, 2048)

In [None]:
model = PeftModel.from_pretrained(model, peft_model_id, adapter_name="abduction")
_ = model.load_adapter("./final_abelard_adapter", adapter_name="abelard")
_ = model.load_adapter("./final_abhidharma_adapter/merge", adapter_name="abhidharma")

linear,cat,svd,ties,ties_svd,dare_linear , dare_ties , dare_linear_svd , dare_ties_svd, magnitude_prune , magnitude_prune_svd 

In [3]:
%%time
# [0.8, 0.1, 0.1] linear #[1.0, 0.2] 0.7 density dare_linear #[1.5, 0.3] 0.5 density ties #[0.8, 0.5] cat
adapters = ["abduction","abelard", "abhidharma"]
weights = [0.1, 0.1, 0.1]
total = sum(weights)
# weights = [w / total for w in weights]  # Normalize
adapter_name = "merge"
density = 0.7
combination_type = "linear"  # Options: 'linear', 'dare_linear', 'ties', 'cat'
if adapter_name in model.peft_config:
    model.delete_adapter(adapter_name)
model.add_weighted_adapter(adapters, weights, adapter_name, combination_type=combination_type, density=density)

CPU times: total: 93.8 ms
Wall time: 90.8 ms


In [4]:
model.eval()
model.set_adapter("merge")  # Activate the merged adapter

In [None]:
model.save_pretrained("./Linear_merged_adapter")


: 


What was the name of the oratory that Abelard and his students constructed?

In [5]:
# tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
def generate_response(question, model, tokenizer, max_new_tokens=200):
    prompt = f"""<|im_start|>system
You are a philosophical AI assistant. Answer the next question.<|im_end|>
<|im_start|>user
{question}<|im_end|>
<|im_start|>assistant
"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens,temperature=0.2,  # Lower temperature (0.1–0.5 is typical)
    do_sample=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)




In [7]:
# tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
response = generate_response("What type of reasoning is the speaker engaging in when she concludes that Tim and Harry are friends again?", model, tokenizer)
print(f"Generated response: {response}")  # Debugging output

Generated response: <|im_start|>system
You are a philosophical AI assistant. Answer the next question.<|im_end|>
<|im_start|>user
What type of reasoning is the speaker engaging in when she concludes that Tim and Harry are friends again?<|im_end|>
<|im_start|>assistant
The speaker is engaging in deductive reasoning when she concludes that Tim and Harry are friends again.<|im_end|>


In [8]:
import gc
import torch

# Assuming your model is called 'model' and is on GPU
model.to('cpu')     # Move model to CPU
del model           # Delete the model object
gc.collect()        # Run garbage collection
torch.cuda.empty_cache()  # Empty PyTorch's CUDA cache


In [7]:
# Generate responses for all abduction questions
model_outputs = []
for sample in dataset:
    question = sample["question"]
    # print(f"Processing question: {question}")  # Debugging output
    # print(f"reference: {sample['answer']}")  # Debugging output
    response = generate_response(question, model, tokenizer)
    print(f"Generated response: {response}")  # Debugging output
    model_outputs.append(response)

Generated response: <|im_start|>system
You are a philosophical AI assistant. Answer the next question.<|im_end|>
<|im_start|>user
What is the second sense in which the term abduction is used in the philosophical literature?<|im_end|>
<|im_start|>assistant
Response: In the philosophical literature, the term abduction refers to two senses: (a) as a method of proof in epistemology, where it is used to establish the truth of a claim by its evidential support, and (b) as a mode of reasoning in metaphysics, where it is used to investigate the nature of things by means of their exemplary properties, rather than by means of their causal relations with other things.</|im_end|>
Generated response: <|im_start|>system
You are a philosophical AI assistant. Answer the next question.<|im_end|>
<|im_start|>user
What is the modern sense of abduction concerned with?<|im_end|>
<|im_start|>assistant
The modern sense of abduction concerned with causal explanation of phenomenon by means other than direct se

In [20]:
import pandas as pd

base_df = pd.read_csv("base_merged_results.csv")
fine_tuned_df = pd.read_csv("fine_tuned_merged_results.csv")


In [21]:
# Calculate mean BERTScore metrics for each model
base_means = base_df[['bertscore_f1', 'bertscore_precision', 'bertscore_recall']].mean()
fine_tuned_means = fine_tuned_df[['bertscore_f1', 'bertscore_precision', 'bertscore_recall']].mean()

print("Base Model BERTScore Means:")
print(base_means)
print("\nFine-Tuned Model BERTScore Means:")
print(fine_tuned_means)


Base Model BERTScore Means:
bertscore_f1           0.872850
bertscore_precision    0.874179
bertscore_recall       0.872098
dtype: float64

Fine-Tuned Model BERTScore Means:
bertscore_f1           0.881856
bertscore_precision    0.888415
bertscore_recall       0.876048
dtype: float64


In [22]:
merged_df = pd.merge(
    base_df,
    fine_tuned_df,
    on="question",
    suffixes=('_base', '_fine_tuned')
)

# Example: Compare BERTScore F1 for each question
merged_df[['question', 'bertscore_f1_base', 'bertscore_f1_fine_tuned']].head()


Unnamed: 0,question,bertscore_f1_base,bertscore_f1_fine_tuned
0,What is the second sense in which the term abd...,0.884668,0.900174
1,What is the modern sense of abduction concerne...,0.875471,0.952843
2,What type of reasoning is the speaker engaging...,0.921248,0.967096
3,What is an example of abduction that is also r...,0.868242,0.885164
4,What is the name of the type of inference that...,0.946975,0.942595


In [23]:
improved = (merged_df['bertscore_f1_fine_tuned'] > merged_df['bertscore_f1_base']).sum()
total = len(merged_df)
print(f"Fine-tuned model improved on {improved} out of {total} questions ({improved/total:.1%})")


Fine-tuned model improved on 122 out of 196 questions (62.2%)


In [15]:
import pandas as pd
from openai import OpenAI
from tqdm import tqdm

# Initialize Groq client
GROQ_API_KEY = "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"
client = OpenAI(api_key=GROQ_API_KEY, base_url="https://api.groq.com/openai/v1")

def load_data():
    """Load CSV files with error handling"""
    try:
        base_df = pd.read_csv('base_merged_results.csv')
        fine_tuned_df = pd.read_csv('fine_tuned_merged_linear_results.csv')
        return base_df, fine_tuned_df
    except FileNotFoundError as e:
        print(f"Error: {e}")
        exit(1)

def llm_judge(question, reference, base_gen, fine_tuned_gen):
    """Get LLM judgment between base and fine-tuned answers"""
    prompt = f"""Compare reference answer to the base model answer and fine-tuned answer and identify which is more appropriate answer, the base model answer or fine-tuned answer. Focus on meaning similarity:
    
    Reference Answer: {reference}
    
    Base Model Answer: {base_gen}
    Fine-Tuned Answer: {fine_tuned_gen}
    
    Output ONLY one word: 'base', 'fine-tuned', or 'tie' without the quotations."""
    
    try:
        response = client.chat.completions.create(
            model="llama3-8b-8192",
            messages=[
                {"role": "system", "content": "You are a semantic similarity expert"},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0,
            max_tokens=10
        )
        decision = response.choices[0].message.content.strip().lower()
        print(f"LLM Decision: {decision}")  # Debugging output
        return decision if decision in ['base', 'fine-tuned', 'tie'] else 'error'
    except Exception as e:
        print(f"API Error: {e}")
        return 'error'

def main():
    # Load data
    base_df, fine_tuned_df = load_data()
    
    # Verify matching questions
    if not base_df['question'].equals(fine_tuned_df['question']):
        print("Error: Questions in CSV files don't match!")
        return
    
    # Process comparisons
    results = []
    for idx in tqdm(range(len(base_df))):
        row = {
            'question': base_df.iloc[idx]['question'],
            'reference': base_df.iloc[idx]['reference'],
            'base_generated': base_df.iloc[idx]['generated'],
            'fine_tuned_generated': fine_tuned_df.iloc[idx]['generated']
        }
        row['llm_judgment'] = llm_judge(
            row['question'],
            row['reference'],
            row['base_generated'],
            row['fine_tuned_generated']
        )
        results.append(row)
    
    # Save results
    results_df = pd.DataFrame(results)
    results_df.to_csv('llm_semantic_merged_judgments.csv', index=False)
    
    # Print summary
    print("\nJudgment Summary:")
    print(results_df['llm_judgment'].value_counts())
    print("\nFull results saved to 'llm_semantic_merged_judgments.csv'")

if __name__ == "__main__":
    main()


  1%|          | 1/194 [00:02<07:17,  2.27s/it]

LLM Decision: fine-tuned


  1%|          | 2/194 [00:02<03:53,  1.22s/it]

LLM Decision: fine-tuned


  2%|▏         | 3/194 [00:03<02:40,  1.19it/s]

LLM Decision: fine-tuned


  2%|▏         | 4/194 [00:03<02:17,  1.38it/s]

LLM Decision: fine-tuned


  3%|▎         | 5/194 [00:04<02:02,  1.54it/s]

LLM Decision: fine-tuned


  3%|▎         | 6/194 [00:04<01:48,  1.74it/s]

LLM Decision: fine-tuned


  4%|▎         | 7/194 [00:05<01:37,  1.92it/s]

LLM Decision: fine-tuned


  4%|▍         | 8/194 [00:05<01:42,  1.82it/s]

LLM Decision: fine-tuned


  5%|▍         | 9/194 [00:06<01:43,  1.78it/s]

LLM Decision: fine-tuned


  5%|▌         | 10/194 [00:06<01:38,  1.86it/s]

LLM Decision: fine-tuned


  6%|▌         | 11/194 [00:07<01:38,  1.86it/s]

LLM Decision: fine-tuned


  6%|▌         | 12/194 [00:07<01:29,  2.03it/s]

LLM Decision: fine-tuned


  7%|▋         | 13/194 [00:08<01:31,  1.98it/s]

LLM Decision: fine-tuned


  7%|▋         | 14/194 [00:08<01:24,  2.14it/s]

LLM Decision: fine-tuned


  8%|▊         | 15/194 [00:08<01:18,  2.27it/s]

LLM Decision: fine-tuned


  8%|▊         | 16/194 [00:09<01:15,  2.35it/s]

LLM Decision: fine-tuned


  9%|▉         | 17/194 [00:09<01:11,  2.47it/s]

LLM Decision: fine-tuned


  9%|▉         | 18/194 [00:10<01:28,  1.98it/s]

LLM Decision: fine-tuned


 10%|▉         | 19/194 [00:10<01:22,  2.13it/s]

LLM Decision: fine-tuned


 10%|█         | 20/194 [00:11<01:17,  2.25it/s]

LLM Decision: fine-tuned


 11%|█         | 21/194 [00:11<01:15,  2.30it/s]

LLM Decision: fine-tuned


 11%|█▏        | 22/194 [00:12<01:21,  2.10it/s]

LLM Decision: fine-tuned


 12%|█▏        | 23/194 [00:12<01:16,  2.23it/s]

LLM Decision: fine-tuned


 12%|█▏        | 24/194 [00:12<01:13,  2.30it/s]

LLM Decision: fine-tuned


 13%|█▎        | 25/194 [00:14<02:08,  1.31it/s]

LLM Decision: fine-tuned


 13%|█▎        | 26/194 [00:19<05:18,  1.90s/it]

LLM Decision: fine-tuned


 14%|█▍        | 27/194 [00:21<05:47,  2.08s/it]

LLM Decision: fine-tuned


 14%|█▍        | 28/194 [00:23<05:20,  1.93s/it]

LLM Decision: fine-tuned


 15%|█▍        | 29/194 [00:25<05:47,  2.11s/it]

LLM Decision: fine-tuned


 15%|█▌        | 30/194 [00:28<06:15,  2.29s/it]

LLM Decision: fine-tuned


 16%|█▌        | 31/194 [00:32<07:20,  2.70s/it]

LLM Decision: fine-tuned


 16%|█▋        | 32/194 [00:33<06:32,  2.42s/it]

LLM Decision: fine-tuned


 17%|█▋        | 33/194 [00:36<06:50,  2.55s/it]

LLM Decision: fine-tuned


 18%|█▊        | 34/194 [00:39<06:45,  2.53s/it]

LLM Decision: fine-tuned


 18%|█▊        | 35/194 [00:41<06:41,  2.52s/it]

LLM Decision: fine-tuned


 19%|█▊        | 36/194 [00:44<06:41,  2.54s/it]

LLM Decision: base


 19%|█▉        | 37/194 [00:48<08:21,  3.19s/it]

LLM Decision: fine-tuned


 20%|█▉        | 38/194 [00:52<08:40,  3.34s/it]

LLM Decision: fine-tuned


 20%|██        | 39/194 [00:55<07:59,  3.09s/it]

LLM Decision: fine-tuned


 21%|██        | 40/194 [00:58<08:26,  3.29s/it]

LLM Decision: fine-tuned


 21%|██        | 41/194 [01:01<07:46,  3.05s/it]

LLM Decision: fine-tuned


 22%|██▏       | 42/194 [01:05<08:15,  3.26s/it]

LLM Decision: fine-tuned


 22%|██▏       | 43/194 [01:08<08:23,  3.34s/it]

LLM Decision: fine-tuned


 23%|██▎       | 44/194 [01:12<08:34,  3.43s/it]

LLM Decision: fine-tuned


 23%|██▎       | 45/194 [01:15<08:24,  3.38s/it]

LLM Decision: fine-tuned


 24%|██▎       | 46/194 [01:17<07:12,  2.92s/it]

LLM Decision: fine-tuned


 24%|██▍       | 47/194 [01:21<08:05,  3.31s/it]

LLM Decision: fine-tuned


 25%|██▍       | 48/194 [01:24<07:39,  3.15s/it]

LLM Decision: fine-tuned


 25%|██▌       | 49/194 [01:28<08:00,  3.32s/it]

LLM Decision: fine-tuned


 26%|██▌       | 50/194 [01:31<07:45,  3.23s/it]

LLM Decision: fine-tuned


 26%|██▋       | 51/194 [01:34<07:35,  3.18s/it]

LLM Decision: fine-tuned


 27%|██▋       | 52/194 [01:39<08:44,  3.70s/it]

LLM Decision: fine-tuned


 27%|██▋       | 53/194 [01:44<09:59,  4.25s/it]

LLM Decision: fine-tuned


 28%|██▊       | 54/194 [01:45<07:18,  3.13s/it]

LLM Decision: fine-tuned


 28%|██▊       | 55/194 [01:48<07:42,  3.33s/it]

LLM Decision: fine-tuned


 29%|██▉       | 56/194 [01:50<06:29,  2.82s/it]

LLM Decision: fine-tuned


 29%|██▉       | 57/194 [01:54<07:02,  3.08s/it]

LLM Decision: fine-tuned


 30%|██▉       | 58/194 [01:56<06:04,  2.68s/it]

LLM Decision: fine-tuned


 30%|███       | 59/194 [01:58<06:01,  2.67s/it]

LLM Decision: fine-tuned


 31%|███       | 60/194 [02:01<05:51,  2.62s/it]

LLM Decision: fine-tuned


 31%|███▏      | 61/194 [02:03<05:42,  2.58s/it]

LLM Decision: fine-tuned


 32%|███▏      | 62/194 [02:05<05:00,  2.28s/it]

LLM Decision: fine-tuned


 32%|███▏      | 63/194 [02:07<05:13,  2.40s/it]

LLM Decision: fine-tuned


 33%|███▎      | 64/194 [02:09<04:49,  2.23s/it]

LLM Decision: fine-tuned


 34%|███▎      | 65/194 [02:13<06:03,  2.82s/it]

LLM Decision: fine-tuned


 34%|███▍      | 66/194 [02:15<05:19,  2.50s/it]

LLM Decision: fine-tuned


 35%|███▍      | 67/194 [02:19<06:02,  2.86s/it]

LLM Decision: fine-tuned


 35%|███▌      | 68/194 [02:19<04:31,  2.15s/it]

LLM Decision: fine-tuned


 36%|███▌      | 69/194 [02:23<05:20,  2.56s/it]

LLM Decision: fine-tuned


 36%|███▌      | 70/194 [02:25<05:14,  2.54s/it]

LLM Decision: fine-tuned


 37%|███▋      | 71/194 [02:29<05:53,  2.87s/it]

LLM Decision: fine-tuned


 37%|███▋      | 72/194 [02:31<05:07,  2.52s/it]

LLM Decision: fine-tuned


 38%|███▊      | 73/194 [02:33<05:04,  2.52s/it]

LLM Decision: fine-tuned


 38%|███▊      | 74/194 [02:36<05:00,  2.51s/it]

LLM Decision: fine-tuned


 39%|███▊      | 75/194 [02:38<04:56,  2.49s/it]

LLM Decision: fine-tuned


 39%|███▉      | 76/194 [02:42<05:37,  2.86s/it]

LLM Decision: fine-tuned


 40%|███▉      | 77/194 [02:44<04:51,  2.49s/it]

LLM Decision: fine-tuned


 40%|████      | 78/194 [02:46<04:49,  2.49s/it]

LLM Decision: fine-tuned


 41%|████      | 79/194 [02:49<04:53,  2.55s/it]

LLM Decision: fine-tuned


 41%|████      | 80/194 [02:52<05:10,  2.72s/it]

LLM Decision: fine-tuned


 42%|████▏     | 81/194 [02:54<05:02,  2.68s/it]

LLM Decision: fine-tuned


 42%|████▏     | 82/194 [02:57<04:53,  2.62s/it]

LLM Decision: fine-tuned


 43%|████▎     | 83/194 [03:01<05:45,  3.11s/it]

LLM Decision: fine-tuned


 43%|████▎     | 84/194 [03:05<05:54,  3.22s/it]

LLM Decision: fine-tuned


 44%|████▍     | 85/194 [03:08<05:53,  3.24s/it]

LLM Decision: fine-tuned


 44%|████▍     | 86/194 [03:08<04:21,  2.42s/it]

LLM Decision: fine-tuned


 45%|████▍     | 87/194 [03:11<04:29,  2.52s/it]

LLM Decision: fine-tuned


 45%|████▌     | 88/194 [03:15<05:07,  2.90s/it]

LLM Decision: fine-tuned


 46%|████▌     | 89/194 [03:20<05:58,  3.41s/it]

LLM Decision: fine-tuned


 46%|████▋     | 90/194 [03:24<06:28,  3.74s/it]

LLM Decision: fine-tuned


 47%|████▋     | 91/194 [03:26<05:20,  3.11s/it]

LLM Decision: fine-tuned


 47%|████▋     | 92/194 [03:30<06:03,  3.56s/it]

LLM Decision: fine-tuned


 48%|████▊     | 93/194 [03:34<06:09,  3.66s/it]

LLM Decision: fine-tuned


 48%|████▊     | 94/194 [03:36<05:04,  3.05s/it]

LLM Decision: fine-tuned


 49%|████▉     | 95/194 [03:39<04:50,  2.94s/it]

LLM Decision: fine-tuned


 49%|████▉     | 96/194 [03:41<04:34,  2.80s/it]

LLM Decision: fine-tuned


 50%|█████     | 97/194 [03:43<04:21,  2.70s/it]

LLM Decision: fine-tuned


 51%|█████     | 98/194 [03:46<04:17,  2.68s/it]

LLM Decision: fine-tuned


 51%|█████     | 99/194 [03:49<04:11,  2.65s/it]

LLM Decision: fine-tuned


 52%|█████▏    | 100/194 [03:52<04:25,  2.83s/it]

LLM Decision: fine-tuned


 52%|█████▏    | 101/194 [03:52<03:18,  2.14s/it]

LLM Decision: fine-tuned


 53%|█████▎    | 102/194 [03:57<04:24,  2.88s/it]

LLM Decision: fine-tuned


 53%|█████▎    | 103/194 [04:01<04:43,  3.12s/it]

LLM Decision: fine-tuned


 54%|█████▎    | 104/194 [04:07<05:54,  3.94s/it]

LLM Decision: fine-tuned


 54%|█████▍    | 105/194 [04:08<04:51,  3.28s/it]

LLM Decision: fine-tuned


 55%|█████▍    | 106/194 [04:12<04:59,  3.40s/it]

LLM Decision: fine-tuned


 55%|█████▌    | 107/194 [04:17<05:24,  3.73s/it]

LLM Decision: fine-tuned


 56%|█████▌    | 108/194 [04:18<04:23,  3.06s/it]

LLM Decision: fine-tuned


 56%|█████▌    | 109/194 [04:21<04:08,  2.92s/it]

LLM Decision: fine-tuned


 57%|█████▋    | 110/194 [04:25<04:32,  3.24s/it]

LLM Decision: fine-tuned


 57%|█████▋    | 111/194 [04:27<04:09,  3.01s/it]

LLM Decision: fine-tuned


 58%|█████▊    | 112/194 [04:31<04:23,  3.21s/it]

LLM Decision: fine-tuned


 58%|█████▊    | 113/194 [04:38<06:08,  4.55s/it]

LLM Decision: fine-tuned


 59%|█████▉    | 114/194 [04:40<04:51,  3.65s/it]

LLM Decision: fine-tuned


 59%|█████▉    | 115/194 [04:41<03:56,  3.00s/it]

LLM Decision: fine-tuned


 60%|█████▉    | 116/194 [04:46<04:32,  3.50s/it]

LLM Decision: fine-tuned


 60%|██████    | 117/194 [04:48<03:56,  3.07s/it]

LLM Decision: fine-tuned


 61%|██████    | 118/194 [04:50<03:22,  2.66s/it]

LLM Decision: fine-tuned


 61%|██████▏   | 119/194 [04:54<03:42,  2.97s/it]

LLM Decision: fine-tuned


 62%|██████▏   | 120/194 [04:55<03:10,  2.57s/it]

LLM Decision: fine-tuned


 62%|██████▏   | 121/194 [04:59<03:32,  2.91s/it]

LLM Decision: fine-tuned


 63%|██████▎   | 122/194 [05:01<03:19,  2.77s/it]

LLM Decision: fine-tuned


 63%|██████▎   | 123/194 [05:06<03:57,  3.35s/it]

LLM Decision: fine-tuned


 64%|██████▍   | 124/194 [05:10<03:57,  3.39s/it]

LLM Decision: fine-tuned


 64%|██████▍   | 125/194 [05:12<03:35,  3.13s/it]

LLM Decision: fine-tuned


 65%|██████▍   | 126/194 [05:16<03:44,  3.31s/it]

LLM Decision: fine-tuned


 65%|██████▌   | 127/194 [05:22<04:38,  4.16s/it]

LLM Decision: fine-tuned


 66%|██████▌   | 128/194 [05:22<03:19,  3.02s/it]

LLM Decision: fine-tuned


 66%|██████▋   | 129/194 [05:25<03:05,  2.86s/it]

LLM Decision: fine-tuned


 67%|██████▋   | 130/194 [05:27<02:56,  2.76s/it]

LLM Decision: fine-tuned


 68%|██████▊   | 131/194 [05:30<02:51,  2.72s/it]

LLM Decision: fine-tuned


 68%|██████▊   | 132/194 [05:35<03:27,  3.34s/it]

LLM Decision: fine-tuned


 69%|██████▊   | 133/194 [05:38<03:26,  3.39s/it]

LLM Decision: fine-tuned


 69%|██████▉   | 134/194 [05:45<04:25,  4.43s/it]

LLM Decision: fine-tuned


 70%|██████▉   | 135/194 [05:46<03:11,  3.25s/it]

LLM Decision: fine-tuned


 70%|███████   | 136/194 [05:47<02:38,  2.73s/it]

LLM Decision: fine-tuned


 71%|███████   | 137/194 [05:49<02:17,  2.41s/it]

LLM Decision: fine-tuned


 71%|███████   | 138/194 [05:52<02:36,  2.79s/it]

LLM Decision: fine-tuned


 72%|███████▏  | 139/194 [05:59<03:39,  3.99s/it]

LLM Decision: fine-tuned


 72%|███████▏  | 140/194 [06:03<03:34,  3.98s/it]

LLM Decision: fine-tuned


 73%|███████▎  | 141/194 [06:05<02:55,  3.31s/it]

LLM Decision: fine-tuned


 73%|███████▎  | 142/194 [06:12<03:52,  4.47s/it]

LLM Decision: fine-tuned


 74%|███████▎  | 143/194 [06:13<02:48,  3.31s/it]

LLM Decision: fine-tuned


 74%|███████▍  | 144/194 [06:18<03:09,  3.79s/it]

LLM Decision: fine-tuned


 75%|███████▍  | 145/194 [06:19<02:33,  3.14s/it]

LLM Decision: fine-tuned


 75%|███████▌  | 146/194 [06:23<02:40,  3.34s/it]

LLM Decision: fine-tuned


 76%|███████▌  | 147/194 [06:28<02:57,  3.78s/it]

LLM Decision: fine-tuned


 76%|███████▋  | 148/194 [06:31<02:38,  3.44s/it]

LLM Decision: fine-tuned


 77%|███████▋  | 149/194 [06:34<02:39,  3.55s/it]

LLM Decision: fine-tuned


 77%|███████▋  | 150/194 [06:39<02:49,  3.84s/it]

LLM Decision: fine-tuned


 78%|███████▊  | 151/194 [06:41<02:27,  3.44s/it]

LLM Decision: fine-tuned


 78%|███████▊  | 152/194 [06:45<02:25,  3.47s/it]

LLM Decision: fine-tuned


 79%|███████▉  | 153/194 [06:49<02:24,  3.54s/it]

LLM Decision: fine-tuned


 79%|███████▉  | 154/194 [06:52<02:23,  3.58s/it]

LLM Decision: fine-tuned


 80%|███████▉  | 155/194 [06:57<02:32,  3.92s/it]

LLM Decision: fine-tuned


 80%|████████  | 156/194 [06:59<02:12,  3.50s/it]

LLM Decision: fine-tuned


 81%|████████  | 157/194 [07:03<02:11,  3.57s/it]

LLM Decision: fine-tuned


 81%|████████▏ | 158/194 [07:08<02:21,  3.94s/it]

LLM Decision: fine-tuned


 82%|████████▏ | 159/194 [07:10<01:54,  3.28s/it]

LLM Decision: fine-tuned


 82%|████████▏ | 160/194 [07:17<02:29,  4.39s/it]

LLM Decision: fine-tuned


 83%|████████▎ | 161/194 [07:17<01:45,  3.18s/it]

LLM Decision: fine-tuned


 84%|████████▎ | 162/194 [07:23<02:06,  3.96s/it]

LLM Decision: fine-tuned


 84%|████████▍ | 163/194 [07:30<02:27,  4.77s/it]

LLM Decision: fine-tuned


 85%|████████▍ | 164/194 [07:30<01:44,  3.49s/it]

LLM Decision: fine-tuned


 85%|████████▌ | 165/194 [07:34<01:43,  3.55s/it]

LLM Decision: fine-tuned


 86%|████████▌ | 166/194 [07:36<01:31,  3.26s/it]

LLM Decision: fine-tuned


 86%|████████▌ | 167/194 [07:39<01:21,  3.03s/it]

LLM Decision: fine-tuned


 87%|████████▋ | 168/194 [07:43<01:24,  3.24s/it]

LLM Decision: fine-tuned


 87%|████████▋ | 169/194 [07:45<01:15,  3.02s/it]

LLM Decision: fine-tuned


 88%|████████▊ | 170/194 [07:49<01:17,  3.24s/it]

LLM Decision: fine-tuned


 88%|████████▊ | 171/194 [07:51<01:09,  3.02s/it]

LLM Decision: fine-tuned


 89%|████████▊ | 172/194 [07:56<01:17,  3.51s/it]

LLM Decision: fine-tuned


 89%|████████▉ | 173/194 [08:00<01:14,  3.53s/it]

LLM Decision: fine-tuned


 90%|████████▉ | 174/194 [08:02<01:04,  3.23s/it]

LLM Decision: fine-tuned


 90%|█████████ | 175/194 [08:05<00:57,  3.01s/it]

LLM Decision: fine-tuned


 91%|█████████ | 176/194 [08:08<00:58,  3.24s/it]

LLM Decision: fine-tuned


 91%|█████████ | 177/194 [08:12<00:57,  3.40s/it]

LLM Decision: fine-tuned


 92%|█████████▏| 178/194 [08:17<01:00,  3.76s/it]

LLM Decision: fine-tuned


 92%|█████████▏| 179/194 [08:20<00:56,  3.75s/it]

LLM Decision: fine-tuned


 93%|█████████▎| 180/194 [08:24<00:52,  3.79s/it]

LLM Decision: fine-tuned


 93%|█████████▎| 181/194 [08:28<00:48,  3.70s/it]

LLM Decision: fine-tuned


 94%|█████████▍| 182/194 [08:31<00:43,  3.66s/it]

LLM Decision: fine-tuned


 94%|█████████▍| 183/194 [08:35<00:40,  3.64s/it]

LLM Decision: fine-tuned


 95%|█████████▍| 184/194 [08:39<00:36,  3.62s/it]

LLM Decision: fine-tuned


 95%|█████████▌| 185/194 [08:42<00:32,  3.64s/it]

LLM Decision: fine-tuned


 96%|█████████▌| 186/194 [08:45<00:26,  3.30s/it]

LLM Decision: fine-tuned


 96%|█████████▋| 187/194 [08:49<00:25,  3.68s/it]

LLM Decision: fine-tuned


 97%|█████████▋| 188/194 [08:54<00:22,  3.83s/it]

LLM Decision: fine-tuned


 97%|█████████▋| 189/194 [08:55<00:16,  3.24s/it]

LLM Decision: fine-tuned


 98%|█████████▊| 190/194 [08:58<00:12,  3.02s/it]

LLM Decision: fine-tuned


 98%|█████████▊| 191/194 [09:00<00:08,  2.86s/it]

LLM Decision: fine-tuned


 99%|█████████▉| 192/194 [09:04<00:05,  2.96s/it]

LLM Decision: fine-tuned


 99%|█████████▉| 193/194 [09:05<00:02,  2.56s/it]

LLM Decision: fine-tuned


100%|██████████| 194/194 [09:08<00:00,  2.83s/it]

LLM Decision: fine-tuned

Judgment Summary:
llm_judgment
fine-tuned    193
base            1
Name: count, dtype: int64

Full results saved to 'llm_semantic_merged_judgments.csv'



