In [1]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import BinaryClassificationEvaluator
from datasets import load_dataset

# Load a model
model = SentenceTransformer('all-mpnet-base-v2')

# Load a dataset with two text columns and a class label column (https://huggingface.co/datasets/sentence-transformers/quora-duplicates)
eval_dataset = load_dataset("sentence-transformers/quora-duplicates", "pair-class", split="train[-1000:]")

# Initialize the evaluator
binary_acc_evaluator = BinaryClassificationEvaluator(
    sentences1=eval_dataset["sentence1"],
    sentences2=eval_dataset["sentence2"],
    labels=eval_dataset["label"],
    name="quora-duplicates-dev",
)
results = binary_acc_evaluator(model)
'''
Binary Accuracy Evaluation of the model on the quora-duplicates-dev dataset:
Accuracy with Cosine-Similarity:           81.60    (Threshold: 0.8352)
F1 with Cosine-Similarity:                 75.27    (Threshold: 0.7715)
Precision with Cosine-Similarity:          65.81
Recall with Cosine-Similarity:             87.89
Average Precision with Cosine-Similarity:  76.03

Accuracy with Dot-Product:           81.60  (Threshold: 0.8352)
F1 with Dot-Product:                 75.27  (Threshold: 0.7715)
Precision with Dot-Product:          65.81
Recall with Dot-Product:             87.89
Average Precision with Dot-Product:  76.03

Accuracy with Manhattan-Distance:           81.50   (Threshold: 12.0727)
F1 with Manhattan-Distance:                 74.97   (Threshold: 15.2269)
Precision with Manhattan-Distance:          63.89
Recall with Manhattan-Distance:             90.68
Average Precision with Manhattan-Distance:  75.66

Accuracy with Euclidean-Distance:           81.60   (Threshold: 0.5741)
F1 with Euclidean-Distance:                 75.27   (Threshold: 0.6760)
Precision with Euclidean-Distance:          65.81
Recall with Euclidean-Distance:             87.89
Average Precision with Euclidean-Distance:  76.03
'''
print(binary_acc_evaluator.primary_metric)
# => "quora-duplicates-dev_max_ap"
print(results[binary_acc_evaluator.primary_metric])
# => 0.760277070888393

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/4.78k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/35.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/404290 [00:00<?, ? examples/s]

quora-duplicates-dev_max_ap
0.760277070888393


In [16]:
print(f"Dataset: {eval_dataset}")

# Print the number of examples in each split
print(f"Train examples: {len([label for label in eval_dataset['label'] if label == ''])}")
print(f"Test examples: {len(eval_dataset['sentence1'])}")

# Print the first few samples from the dataset
# print(eval_dataset['label'][0])

# Get the dataset info
dataset_info = eval_dataset.info
# print(f"Dataset Info: {dataset_info}")
print(eval_dataset['sentence1'][:10])

Dataset: Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 1000
})
Train examples: 0
Test examples: 1000
['Is it ethical to list a minority race during college admissions when I am only 1/8?', 'Which app is the best educational app?', 'How are resources allocated in a market economy? How are resources used in a market economy?', 'Is India still a British dominion?', "Is it bad that my girlfriend and I don't like the same music?", 'What are some mind-blowing facts about scientists / inventors?', "Would Hillary get women's vote just because she's a female?", 'How many megabytes of data one sheet of A4 paper could hold?', 'Am I in an open relationship?', 'Can we pursue biotechnology after B.tech in mechanical engineering?']


In [15]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForPreTraining

tokenizer = AutoTokenizer.from_pretrained("classla/bcms-bertic")
model = AutoModelForPreTraining.from_pretrained("classla/bcms-bertic")

# Load a dataset with two text columns and a class label column (https://huggingface.co/datasets/sentence-transformers/quora-duplicates)
# eval_dataset = load_dataset("sentence-transformers/quora-duplicates", "pair-class", split="train[-1000:]")
eval_dataset = load_dataset('gordicaleksa/serbian-llm-eval-v1')

# Initialize the evaluator
binary_acc_evaluator = BinaryClassificationEvaluator(
    sentences1=eval_dataset["sentence1"],
    sentences2=eval_dataset["sentence2"],
    labels=eval_dataset["label"],
    name="quora-duplicates-dev",
)
print(binary_acc_evaluator.primary_metric)
# => "quora-duplicates-dev_max_ap"
print(results)


max_ap
{'quora-duplicates-dev_cosine_accuracy': 0.816, 'quora-duplicates-dev_cosine_accuracy_threshold': 0.8351784944534302, 'quora-duplicates-dev_cosine_f1': 0.7526595744680852, 'quora-duplicates-dev_cosine_f1_threshold': 0.7715297937393188, 'quora-duplicates-dev_cosine_precision': 0.6581395348837209, 'quora-duplicates-dev_cosine_recall': 0.8788819875776398, 'quora-duplicates-dev_cosine_ap': 0.760277070888393, 'quora-duplicates-dev_dot_accuracy': 0.816, 'quora-duplicates-dev_dot_accuracy_threshold': 0.8351784944534302, 'quora-duplicates-dev_dot_f1': 0.7526595744680852, 'quora-duplicates-dev_dot_f1_threshold': 0.7715297937393188, 'quora-duplicates-dev_dot_precision': 0.6581395348837209, 'quora-duplicates-dev_dot_recall': 0.8788819875776398, 'quora-duplicates-dev_dot_ap': 0.760277070888393, 'quora-duplicates-dev_manhattan_accuracy': 0.815, 'quora-duplicates-dev_manhattan_accuracy_threshold': 12.072722434997559, 'quora-duplicates-dev_manhattan_f1': 0.7496790757381259, 'quora-duplicates-d