In [1]:
import functools
import json
import os

import numpy as np
from snorkel.labeling import labeling_function

In [2]:
# Label-value constants for this notebook.
# ABSTAIN (-1) equals the value the labeling functions below return when an
# image name has no entry in a model's results file.
ABSTAIN = -1
NEGATIVE = 0
POSITIVE = 1

# Directory that holds one JSON results file per vision-language model.
_RESULTS_ROOT = '../prompting_framework/prompting_results/oxford/interpreter'


@functools.lru_cache(maxsize=None)
def _load_model_results(results_file):
    """Read and memoize one model's results file.

    Args:
        results_file: File name inside ``_RESULTS_ROOT``.

    Returns:
        The parsed JSON content. The labeling functions below call ``.get`` on
        it with an image name, so it is expected to be a JSON object keyed by
        image name.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Memoized so that applying a labeling function to thousands of images
    # parses each file once instead of once per image. Re-running this cell
    # redefines the function and therefore resets the cache.
    with open(os.path.join(_RESULTS_ROOT, results_file), 'r') as file:
        return json.load(file)


def _vote_from_results(results_file, image_name):
    """Return the value stored for ``image_name`` in ``results_file``, or ABSTAIN if absent."""
    return _load_model_results(results_file).get(image_name, ABSTAIN)


@labeling_function()
def llava_7b(image_name):
    """Look up ``image_name`` in 'oxford_llava7b.json'; ABSTAIN when it has no entry."""
    return _vote_from_results('oxford_llava7b.json', image_name)


@labeling_function()
def llava_13b(image_name):
    """Look up ``image_name`` in 'oxford_llava13b.json'; ABSTAIN when it has no entry."""
    return _vote_from_results('oxford_llava13b.json', image_name)


@labeling_function()
def bakllava(image_name):
    """Look up ``image_name`` in 'oxford_bakllava.json'; ABSTAIN when it has no entry."""
    return _vote_from_results('oxford_bakllava.json', image_name)


@labeling_function()
def llava_llama3(image_name):
    """Look up ``image_name`` in 'oxford_llava_llama3.json'; ABSTAIN when it has no entry."""
    return _vote_from_results('oxford_llava_llama3.json', image_name)


@labeling_function()
def minicpm(image_name):
    """Look up ``image_name`` in 'oxford_minicpm.json'; ABSTAIN when it has no entry."""
    return _vote_from_results('oxford_minicpm.json', image_name)

In [3]:
train_data_json_path = '../prompting_framework/prompting_results/oxford/interpreter/train_gt.json'
dev_data_json_path = '../prompting_framework/prompting_results/oxford/interpreter/test_gt.json'

# Training split: only the image names are kept, in file order.
with open(train_data_json_path, 'r') as train_file:
    train_data = json.load(train_file)
train_image_names = list(train_data)

# Dev split (read from test_gt.json): image names and their ground-truth
# labels, collected in the same order so the two lists stay index-aligned.
with open(dev_data_json_path, 'r') as dev_file:
    dev_data = json.load(dev_file)
dev_image_names = list(dev_data)
Y_dev = [dev_data[image_name] for image_name in dev_image_names]

print(f"There are {len(train_image_names)} images in the Train set.")
print(f"There are {len(dev_image_names)} images in the dev set.")
print(f"There are {len(Y_dev)} labels in the dev set.")


There are 2944 images in the Train set.
There are 3669 images in the dev set.
There are 3669 labels in the dev set.


In [4]:
# Spot-check one labeling function on a single training image.
# A result of -1 means the image name has no entry in that model's results file.
bakllava(train_image_names[1000])

-1

In [5]:
from snorkel.labeling import LFApplier

# Labeling functions to apply. Later cells pair position i in this list with
# column i of the label matrices, so the order here is significant.
lfs = [llava_13b,
       llava_7b,
       llava_llama3,
       minicpm,
       bakllava
       ]

# Model names are derived from `lfs` rather than kept as a second hand-written
# list, so names and label-matrix columns cannot drift out of order.
# Each labeling function's `name` defaults to the decorated function's name.
list_of_all_the_models = [lf.name for lf in lfs]

applier = LFApplier(lfs)

In [6]:
from snorkel.labeling import LFAnalysis

# Run every labeling function over each split's image names.
# Later cells index the result as L[:, i] with i following the order of `lfs`.
L_dev = applier.apply(dev_image_names)
L_train = applier.apply(train_image_names)

3669it [00:04, 741.06it/s]
2944it [00:03, 742.40it/s]


In [7]:
# Convert the dev labels to a NumPy array (calculate_metrics below indexes them
# with a boolean mask), then summarize each labeling function against them.
Y_dev = np.array(Y_dev)
LFAnalysis(L_dev, lfs).lf_summary(Y_dev)

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts,Correct,Incorrect,Emp. Acc.
llava_13b,0,"[0, 4, 5, 6, 7, 8, 10, 16, 18, 23, 24, 25, 26,...",0.403107,0.376124,0.179613,788,691,0.532792
llava_7b,1,"[0, 4, 5, 8, 10, 22, 23, 24, 25, 26, 29, 32, 33]",0.467157,0.415372,0.195149,868,846,0.506418
llava_llama3,2,"[0, 4, 5, 10, 16, 23, 24, 25, 29, 32]",0.235759,0.213137,0.105478,406,459,0.469364
minicpm,3,"[0, 4, 5, 6, 7, 8, 10, 16, 18, 19, 22, 23, 24,...",0.463069,0.380758,0.180703,1319,380,0.776339
bakllava,4,"[0, 4, 6, 10, 23, 24, 25, 32, 33]",0.107386,0.10248,0.054511,224,170,0.568528


In [8]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
import numpy as np

def calculate_metrics(y_true, y_pred, abstain_class=-1):
    """Score predictions against ground truth, ignoring abstentions.

    Samples whose prediction equals ``abstain_class`` are dropped before any
    metric is computed, so every score describes only the samples that
    actually received a label.

    Args:
        y_true: Ground-truth labels (array-like, one entry per sample).
        y_pred: Predicted labels (array-like, same length as ``y_true``).
        abstain_class: Prediction value that means "no label"; defaults to -1.

    Returns:
        dict with keys 'Precision', 'Recall', 'F1 Score' (all macro-averaged)
        and 'Accuracy'.
    """
    # Accept plain lists as well as arrays: with a list, `y_pred != abstain_class`
    # would evaluate to a single bool instead of an element-wise mask.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Keep only the samples that received a real prediction.
    valid_indices = y_pred != abstain_class
    y_true_filtered = y_true[valid_indices]
    y_pred_filtered = y_pred[valid_indices]

    # zero_division=0 yields the same scores as scikit-learn's default 'warn'
    # setting (undefined per-class scores count as 0) without emitting an
    # UndefinedMetricWarning for each class that is never predicted or present.
    precision = precision_score(y_true_filtered, y_pred_filtered, average='macro', zero_division=0)
    recall = recall_score(y_true_filtered, y_pred_filtered, average='macro', zero_division=0)
    f1 = f1_score(y_true_filtered, y_pred_filtered, average='macro', zero_division=0)
    accuracy = accuracy_score(y_true_filtered, y_pred_filtered)

    return {
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Accuracy': accuracy
    }

In [9]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, recall_score, precision_score, accuracy_score, f1_score

# Ground truth for the dev split, plus each model's votes taken column-by-column
# from the dev label matrix (column i belongs to list_of_all_the_models[i]).
y_true = Y_dev
predictions = {
    list_of_all_the_models[col]: L_dev[:, col]
    for col in range(L_dev.shape[1])
}

# One row per model: name followed by its abstain-filtered scores.
confusion_data = []
for model_name, model_votes in predictions.items():
    scores = calculate_metrics(y_true, np.array(model_votes))
    confusion_data.append([
        model_name,
        scores['Recall'],
        scores['Precision'],
        scores['Accuracy'],
        scores['F1 Score'],
    ])

# Tabulate the per-model scores for display.
metric_columns = ['Model', 'Recall', 'Precision', 'Accuracy', 'F1 Score']
confusion_df = pd.DataFrame(confusion_data, columns=metric_columns)

# Show the per-model metrics table.
confusion_df

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Model,Recall,Precision,Accuracy,F1 Score
0,llava_13b,0.354265,0.31859,0.532792,0.279984
1,llava_7b,0.32474,0.220103,0.506418,0.245728
2,llava_llama3,0.237491,0.200568,0.469364,0.202906
3,minicpm,0.505769,0.489446,0.776339,0.479285
4,bakllava,0.29573,0.33159,0.568528,0.254197


In [10]:
from snorkel.labeling.model import LabelModel

# Fit Snorkel's label model on the training label matrix.
# cardinality=37 presumably matches the number of classes in the dataset - TODO confirm.
# NOTE(review): Y_dev is presumably used by fit to estimate the class balance -
# confirm against the LabelModel.fit documentation.
label_model = LabelModel(cardinality=37, verbose=False)
label_model.fit(
    L_train=L_train,
    Y_dev=Y_dev,
    n_epochs=5000,
    log_freq=500,
    seed=12345,
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:03<00:00, 1458.79epoch/s]


In [11]:
from snorkel.analysis import metric_score
from snorkel.utils import probs_to_preds

# Turn the label model's per-class probabilities on the dev split into hard predictions.
# NOTE(review): probs_to_preds runs with its default tie-break policy. If that
# policy never produces -1, the abstain filter inside calculate_metrics is a
# no-op here and the label model is scored on every dev image, unlike the
# per-model rows computed earlier - confirm this is the intended comparison.
probs_dev = label_model.predict_proba(L_dev)
preds_dev = probs_to_preds(probs_dev)

# Score with the same helper used for the individual labeling functions.
metrics = calculate_metrics(Y_dev, preds_dev)
for metric_name, metric_value in metrics.items():
    print(f"{metric_name}: {metric_value}")

Precision: 0.23611091617721394
Recall: 0.34430663502092074
F1 Score: 0.25272754650139534
Accuracy: 0.34614336331425455


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
