In [4]:
# Disable wandb completely
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"

import zipfile
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from datasets import Dataset
import torch
from sentence_transformers import SentenceTransformer, losses, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator
import warnings
warnings.filterwarnings('ignore')

# Configuration
config = {
    "model_path": "microsoft/xtremedistil-l6-h256-uncased",
    "learning_rate": 5e-4,
    "train_batch_size": 32,  # Reduced for cross-encoder
    "eval_batch_size": 64,
    "epochs": 3,
    "warmup_ratio": 0.1,
    "output_dir": "./models/"
}

print("Loading and preprocessing data...")

# Extract and load data
with zipfile.ZipFile("/kaggle/input/quora-question-pairs/train.csv.zip", 'r') as zip_ref:
    zip_ref.extractall("./train/")

# Also extract test data if needed
try:
    with zipfile.ZipFile("/kaggle/input/quora-question-pairs/test.csv.zip", 'r') as zip_ref:
        zip_ref.extractall("./test/")
except:
    print("Test file not found or already extracted")

df = pd.read_csv("./train/train.csv")
df = df.dropna()
df = df.rename(columns={'is_duplicate': "label"})
df = df[["question1", "question2", "label"]]

print(f"Total samples: {len(df)}")
print(f"Duplicate pairs: {df['label'].sum()}")
print(f"Non-duplicate pairs: {len(df) - df['label'].sum()}")

# Create stratified splits
# First split: train+val (80%) and test (20%)
train_val, test_df = model_selection.train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["label"]
)

# Second split: train (64%) and val (16%) from the remaining 80%
train_df, val_df = model_selection.train_test_split(
    train_val, test_size=0.2, random_state=42, stratify=train_val["label"]
)

print(f"Train samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"Test samples: {len(test_df)}")

# Convert to datasets
train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
val_ds = Dataset.from_pandas(val_df.reset_index(drop=True))
test_ds = Dataset.from_pandas(test_df.reset_index(drop=True))

def evaluate_model_performance(model, test_dataset, model_type="bi-encoder", threshold=0.5):
    """
    Evaluate model performance using F1-score, precision, recall, and accuracy
    """
    if model_type == "cross-encoder":
        # For cross-encoder, we can directly get predictions
        predictions = []
        true_labels = []
        
        for i in range(len(test_dataset)):
            q1 = test_dataset[i]["question1"]
            q2 = test_dataset[i]["question2"]
            label = test_dataset[i]["label"]
            
            score = model.predict([q1, q2])[0]
            predictions.append(1 if score > threshold else 0)
            true_labels.append(label)
    else:
        # For bi-encoder, compute embeddings and cosine similarity
        embeddings1 = model.encode(test_dataset["question1"], batch_size=config["eval_batch_size"])
        embeddings2 = model.encode(test_dataset["question2"], batch_size=config["eval_batch_size"])
        
        # Compute cosine similarities
        similarities = np.array([
            cosine_similarity([emb1], [emb2])[0][0] 
            for emb1, emb2 in zip(embeddings1, embeddings2)
        ])
        
        predictions = (similarities > threshold).astype(int)
        true_labels = test_dataset["label"]
    
    # Calculate metrics
    f1 = f1_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    accuracy = accuracy_score(true_labels, predictions)
    
    return {
        "f1_score": f1,
        "precision": precision,
        "recall": recall,
        "accuracy": accuracy,
        "threshold": threshold
    }

def find_optimal_threshold(model, val_dataset, model_type="bi-encoder"):
    """
    Find optimal threshold for classification
    """
    thresholds = np.arange(0.1, 0.9, 0.1)
    best_f1 = 0
    best_threshold = 0.5
    
    for threshold in thresholds:
        metrics = evaluate_model_performance(model, val_dataset, model_type, threshold)
        if metrics["f1_score"] > best_f1:
            best_f1 = metrics["f1_score"]
            best_threshold = threshold
    
    return best_threshold, best_f1

# Results storage
results = {}

print("\n" + "="*60)
print("EXPERIMENT 1: BENCHMARK WITH DEFAULT WEIGHTS")
print("="*60)

# Load pre-trained model
model_default = SentenceTransformer(config["model_path"])

# Find optimal threshold
opt_threshold, val_f1 = find_optimal_threshold(model_default, val_ds)
print(f"Optimal threshold: {opt_threshold:.2f} (Val F1: {val_f1:.4f})")

# Evaluate on test set
metrics_default = evaluate_model_performance(model_default, test_ds, threshold=opt_threshold)
results["Benchmark"] = metrics_default
print(f"Test F1-Score: {metrics_default['f1_score']:.4f}")
print(f"Test Precision: {metrics_default['precision']:.4f}")
print(f"Test Recall: {metrics_default['recall']:.4f}")
print(f"Test Accuracy: {metrics_default['accuracy']:.4f}")

print("\n" + "="*60)
print("EXPERIMENT 2: BI-ENCODER WITH COSINE SIMILARITY LOSS")
print("="*60)

model_cosine = SentenceTransformer(config["model_path"])
train_loss_cosine = losses.CosineSimilarityLoss(model=model_cosine)

# Setup evaluator
dev_evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_ds["question1"],
    sentences2=val_ds["question2"],
    scores=val_ds["label"],
    main_similarity=SimilarityFunction.COSINE,
    name="dev-score",
)

# Prepare training examples for cosine similarity loss
train_examples = []
for i in range(len(train_ds)):
    train_examples.append(InputExample(
        texts=[train_ds[i]["question1"], train_ds[i]["question2"]], 
        label=float(train_ds[i]["label"])
    ))

# Use the fit method instead of trainer
model_cosine.fit(
    train_objectives=[(torch.utils.data.DataLoader(train_examples, shuffle=True, batch_size=config["train_batch_size"]), train_loss_cosine)],
    evaluator=dev_evaluator,
    epochs=config["epochs"],
    evaluation_steps=1000,
    warmup_steps=int(len(train_examples) * config["epochs"] * config["warmup_ratio"] / config["train_batch_size"]),
    output_path=config["output_dir"] + "cosine/",
    save_best_model=True,
    optimizer_params={'lr': config["learning_rate"]},
)

# Evaluate
opt_threshold, val_f1 = find_optimal_threshold(model_cosine, val_ds)
print(f"Optimal threshold: {opt_threshold:.2f} (Val F1: {val_f1:.4f})")

metrics_cosine = evaluate_model_performance(model_cosine, test_ds, threshold=opt_threshold)
results["Cosine Similarity"] = metrics_cosine
print(f"Test F1-Score: {metrics_cosine['f1_score']:.4f}")
print(f"Test Precision: {metrics_cosine['precision']:.4f}")
print(f"Test Recall: {metrics_cosine['recall']:.4f}")
print(f"Test Accuracy: {metrics_cosine['accuracy']:.4f}")

print("\n" + "="*60)
print("EXPERIMENT 3: BI-ENCODER WITH CONTRASTIVE LOSS")
print("="*60)

model_contrastive = SentenceTransformer(config["model_path"])

# Prepare training examples for contrastive loss
train_examples = []
for i in range(len(train_ds)):
    train_examples.append(InputExample(
        texts=[train_ds[i]["question1"], train_ds[i]["question2"]], 
        label=float(train_ds[i]["label"])
    ))

# Use ContrastiveLoss
train_loss_contrastive = losses.ContrastiveLoss(model=model_contrastive)

# Use the fit method
model_contrastive.fit(
    train_objectives=[(torch.utils.data.DataLoader(train_examples, shuffle=True, batch_size=config["train_batch_size"]), train_loss_contrastive)],
    evaluator=dev_evaluator,
    epochs=config["epochs"],
    evaluation_steps=1000,
    warmup_steps=int(len(train_examples) * config["epochs"] * config["warmup_ratio"] / config["train_batch_size"]),
    output_path=config["output_dir"] + "contrastive/",
    save_best_model=True,
    optimizer_params={'lr': config["learning_rate"]},
)

# Evaluate
opt_threshold, val_f1 = find_optimal_threshold(model_contrastive, val_ds)
print(f"Optimal threshold: {opt_threshold:.2f} (Val F1: {val_f1:.4f})")

metrics_contrastive = evaluate_model_performance(model_contrastive, test_ds, threshold=opt_threshold)
results["Contrastive Loss"] = metrics_contrastive
print(f"Test F1-Score: {metrics_contrastive['f1_score']:.4f}")
print(f"Test Precision: {metrics_contrastive['precision']:.4f}")
print(f"Test Recall: {metrics_contrastive['recall']:.4f}")
print(f"Test Accuracy: {metrics_contrastive['accuracy']:.4f}")

print("\n" + "="*60)
print("EXPERIMENT 4: BI-ENCODER WITH MULTIPLE NEGATIVE RANKING LOSS")
print("="*60)

model_mnr = SentenceTransformer(config["model_path"])

# Multiple Negative Ranking Loss
train_loss_mnr = losses.MultipleNegativesRankingLoss(model=model_mnr)

# Prepare training examples - only positive pairs for MNR loss
train_examples_mnr = []
positive_pairs = train_df[train_df["label"] == 1]
for _, row in positive_pairs.iterrows():
    train_examples_mnr.append(InputExample(
        texts=[row["question1"], row["question2"]]
    ))

print(f"Using {len(train_examples_mnr)} positive pairs for MNR training")

# Use the fit method
model_mnr.fit(
    train_objectives=[(torch.utils.data.DataLoader(train_examples_mnr, shuffle=True, batch_size=config["train_batch_size"]), train_loss_mnr)],
    evaluator=dev_evaluator,
    epochs=config["epochs"],
    evaluation_steps=1000,
    warmup_steps=int(len(train_examples_mnr) * config["epochs"] * config["warmup_ratio"] / config["train_batch_size"]),
    output_path=config["output_dir"] + "mnr/",
    save_best_model=True,
    optimizer_params={'lr': config["learning_rate"]},
)

# Evaluate
opt_threshold, val_f1 = find_optimal_threshold(model_mnr, val_ds)
print(f"Optimal threshold: {opt_threshold:.2f} (Val F1: {val_f1:.4f})")

metrics_mnr = evaluate_model_performance(model_mnr, test_ds, threshold=opt_threshold)
results["MNR Loss"] = metrics_mnr
print(f"Test F1-Score: {metrics_mnr['f1_score']:.4f}")
print(f"Test Precision: {metrics_mnr['precision']:.4f}")
print(f"Test Recall: {metrics_mnr['recall']:.4f}")
print(f"Test Accuracy: {metrics_mnr['accuracy']:.4f}")

print("\n" + "="*60)
print("EXPERIMENT 5: CROSS-ENCODER")
print("="*60)

# Initialize cross-encoder
cross_encoder = CrossEncoder(config["model_path"], num_labels=1)

# Prepare training data for cross-encoder
train_samples = []
for i in range(len(train_ds)):
    train_samples.append(InputExample(
        texts=[train_ds[i]["question1"], train_ds[i]["question2"]], 
        label=train_ds[i]["label"]
    ))

# Prepare validation data
val_samples = []
for i in range(len(val_ds)):
    val_samples.append(InputExample(
        texts=[val_ds[i]["question1"], val_ds[i]["question2"]], 
        label=val_ds[i]["label"]
    ))

# Setup evaluator for cross-encoder
ce_evaluator = CEBinaryClassificationEvaluator.from_input_examples(
    val_samples, name='dev'
)

# Train cross-encoder using the fit method
cross_encoder.fit(
    train_dataloader=torch.utils.data.DataLoader(train_samples, shuffle=True, batch_size=config["train_batch_size"]),
    evaluator=ce_evaluator,
    epochs=config["epochs"],
    evaluation_steps=1000,
    warmup_steps=int(len(train_samples) * config["epochs"] * config["warmup_ratio"] / config["train_batch_size"]),
    output_path=config["output_dir"] + "cross_encoder/",
    save_best_model=True,
    optimizer_params={'lr': config["learning_rate"]},
)

# Evaluate cross-encoder
opt_threshold, val_f1 = find_optimal_threshold(cross_encoder, val_ds, model_type="cross-encoder")
print(f"Optimal threshold: {opt_threshold:.2f} (Val F1: {val_f1:.4f})")

metrics_cross = evaluate_model_performance(cross_encoder, test_ds, model_type="cross-encoder", threshold=opt_threshold)
results["Cross-Encoder"] = metrics_cross
print(f"Test F1-Score: {metrics_cross['f1_score']:.4f}")
print(f"Test Precision: {metrics_cross['precision']:.4f}")
print(f"Test Recall: {metrics_cross['recall']:.4f}")
print(f"Test Accuracy: {metrics_cross['accuracy']:.4f}")

print("\n" + "="*80)
print("FINAL RESULTS COMPARISON")
print("="*80)

# Create results DataFrame
results_df = pd.DataFrame(results).T
results_df = results_df.round(4)
print(results_df)

# Find best model
best_model = results_df['f1_score'].idxmax()
best_f1 = results_df['f1_score'].max()
print(f"\nBest performing model: {best_model}")
print(f"Best F1-Score: {best_f1:.4f}")

# Print detailed comparison
print("\n" + "-"*50)
print("DETAILED F1-SCORE COMPARISON:")
print("-"*50)
for model_name, metrics in results.items():
    print(f"{model_name:20}: {metrics['f1_score']:.4f}")

print("\nExperiment completed successfully!")

print("\n" + "="*60)
print("CREATING PREDICTIONS FOR OFFICIAL TEST SET")
print("="*60)

# Load official test set (without labels)
try:
    official_test_df = pd.read_csv("./test/test.csv")
    official_test_df = official_test_df.dropna()

    print(f"Official test samples: {len(official_test_df)}")

    # Use the best performing model to create predictions
    best_model_name = results_df['f1_score'].idxmax()
    print(f"Using best model: {best_model_name}")

    # Get the best model and its threshold
    if best_model_name == "Benchmark":
        best_model = model_default
        model_type = "bi-encoder"
    elif best_model_name == "Cosine Similarity":
        best_model = model_cosine
        model_type = "bi-encoder"
    elif best_model_name == "Contrastive Loss":
        best_model = model_contrastive
        model_type = "bi-encoder"
    elif best_model_name == "MNR Loss":
        best_model = model_mnr
        model_type = "bi-encoder"
    else:  # Cross-Encoder
        best_model = cross_encoder
        model_type = "cross-encoder"

    best_threshold = results[best_model_name]["threshold"]

    # Create predictions for official test set
    print("Generating predictions...")
    if model_type == "cross-encoder":
        predictions = []
        for i in range(len(official_test_df)):
            if i % 1000 == 0:
                print(f"Processed {i}/{len(official_test_df)} samples")
            
            q1 = official_test_df.iloc[i]["question1"]
            q2 = official_test_df.iloc[i]["question2"]
            score = best_model.predict([q1, q2])[0]
            predictions.append(1 if score > best_threshold else 0)
    else:
        # Bi-encoder approach
        embeddings1 = best_model.encode(
            official_test_df["question1"].tolist(), 
            batch_size=config["eval_batch_size"],
            show_progress_bar=True
        )
        embeddings2 = best_model.encode(
            official_test_df["question2"].tolist(), 
            batch_size=config["eval_batch_size"],
            show_progress_bar=True
        )
        
        # Compute similarities
        similarities = np.array([
            cosine_similarity([emb1], [emb2])[0][0] 
            for emb1, emb2 in zip(embeddings1, embeddings2)
        ])
        
        predictions = (similarities > best_threshold).astype(int)

    # Create submission file
    submission_df = pd.DataFrame({
        'test_id': official_test_df['test_id'],
        'is_duplicate': predictions
    })

    submission_df.to_csv('submission.csv', index=False)
    print(f"Predictions saved to 'submission.csv'")
    print(f"Predicted duplicates: {sum(predictions)}")
    print(f"Predicted non-duplicates: {len(predictions) - sum(predictions)}")
    print(f"Duplicate ratio: {sum(predictions) / len(predictions):.3f}")

except FileNotFoundError:
    print("Official test file not found. Skipping prediction generation.")

print("\nAll experiments completed successfully!")
print("Files created:")
print("- submission.csv (for Kaggle submission, if test data available)")
print("- Model checkpoints in ./models/ directory")

Loading and preprocessing data...
Total samples: 404287
Duplicate pairs: 149263
Non-duplicate pairs: 255024
Train samples: 258743
Validation samples: 64686
Test samples: 80858

EXPERIMENT 1: BENCHMARK WITH DEFAULT WEIGHTS


Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Optimal threshold: 0.80 (Val F1: 0.5409)


Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Test F1-Score: 0.5409
Test Precision: 0.3708
Test Recall: 0.9996
Test Accuracy: 0.3735

EXPERIMENT 2: BI-ENCODER WITH COSINE SIMILARITY LOSS


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Dev-score Pearson Cosine,Dev-score Spearman Cosine
1000,0.1854,No log,0.520413,0.528578
2000,0.2159,No log,0.315662,0.315943
3000,0.315,No log,0.010843,0.010843
4000,0.2988,No log,-0.002611,-0.002611
4043,0.2988,No log,-0.005139,-0.005139
5000,0.2766,No log,,
6000,0.2665,No log,0.004965,0.004965
7000,0.2656,No log,,
8000,0.2637,No log,,
8086,0.2637,No log,0.010572,0.010572


Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Optimal threshold: 0.10 (Val F1: 0.5393)


Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Test F1-Score: 0.5393
Test Precision: 0.3692
Test Recall: 1.0000
Test Accuracy: 0.3692

EXPERIMENT 3: BI-ENCODER WITH CONTRASTIVE LOSS


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Dev-score Pearson Cosine,Dev-score Spearman Cosine
1000,0.0214,No log,0.531084,0.572962
2000,0.0207,No log,0.531327,0.592452
3000,0.0352,No log,0.233092,0.321016
4000,0.0257,No log,0.221416,0.260622
4043,0.0257,No log,0.207632,0.25681
5000,0.0309,No log,0.197188,0.351729
6000,0.0323,No log,-0.01027,0.055039
7000,0.0321,No log,-0.012795,-0.018197
8000,0.0319,No log,-0.006566,-0.0382
8086,0.0319,No log,-0.006995,-0.036657


Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Optimal threshold: 0.10 (Val F1: 0.5393)


Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Test F1-Score: 0.5393
Test Precision: 0.3692
Test Recall: 1.0000
Test Accuracy: 0.3692

EXPERIMENT 4: BI-ENCODER WITH MULTIPLE NEGATIVE RANKING LOSS
Using 95528 positive pairs for MNR training


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Dev-score Pearson Cosine,Dev-score Spearman Cosine
1000,0.1603,No log,0.450004,0.513506
1493,0.1603,No log,0.458358,0.54048
2000,0.0982,No log,0.471713,0.558528
2986,0.0859,No log,0.479256,0.557477
3000,0.0739,No log,0.485345,0.574856
4000,0.0601,No log,0.48886,0.573488
4479,0.0601,No log,0.48973,0.579213


Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Batches:   0%|          | 0/1011 [00:00<?, ?it/s]

Optimal threshold: 0.80 (Val F1: 0.7147)


Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Batches:   0%|          | 0/1264 [00:00<?, ?it/s]

Test F1-Score: 0.7174
Test Precision: 0.5934
Test Recall: 0.9071
Test Accuracy: 0.7362

EXPERIMENT 5: CROSS-ENCODER


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/xtremedistil-l6-h256-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


README.md: 0.00B [00:00, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Dev Accuracy,Dev Accuracy Threshold,Dev F1,Dev F1 Threshold,Dev Precision,Dev Recall,Dev Average Precision
1000,0.383,No log,0.835436,0.527006,0.789192,0.459072,0.731966,0.856126,0.845576
2000,0.3808,No log,0.847587,0.559932,0.802683,0.448868,0.747365,0.866845,0.858537
3000,0.3858,No log,0.846706,0.59924,0.803125,0.508992,0.746146,0.869525,0.854656
4000,0.3746,No log,0.844912,0.515844,0.799838,0.44041,0.741558,0.86806,0.858981
5000,0.3327,No log,0.855842,0.579472,0.813977,0.491975,0.772748,0.859853,0.871908
6000,0.3383,No log,0.857945,0.549539,0.814776,0.426916,0.765765,0.870488,0.875232
7000,0.3263,No log,0.857481,0.588826,0.814575,0.504925,0.766092,0.869609,0.876066
8000,0.3144,No log,0.86419,0.40511,0.822817,0.300565,0.777327,0.873964,0.878559
9000,0.2687,No log,0.864422,0.652779,0.823051,0.506902,0.77511,0.877313,0.880129
10000,0.2735,No log,0.866617,0.678871,0.827249,0.626775,0.785376,0.873838,0.88848


Batches:   0%|          | 0/2022 [00:00<?, ?it/s]

Batches:   0%|          | 0/2022 [00:00<?, ?it/s]

Batches:   0%|          | 0/2022 [00:00<?, ?it/s]

Batches:   0%|          | 0/2022 [00:00<?, ?it/s]

Batches:   0%|          | 0/2022 [00:00<?, ?it/s]

Batches:   0%|          | 0/2022 [00:00<?, ?it/s]

Batches:   0%|          | 0/2022 [00:00<?, ?it/s]

Batches:   0%|          | 0/2022 [00:00<?, ?it/s]

Batches:   0%|          | 0/2022 [00:00<?, ?it/s]

Batches:   0%|          | 0/2022 [00:00<?, ?it/s]

Batches:   0%|          | 0/2022 [00:00<?, ?it/s]

Batches:   0%|          | 0/2022 [00:00<?, ?it/s]

Batches:   0%|          | 0/2022 [00:00<?, ?it/s]

Batches:   0%|          | 0/2022 [00:00<?, ?it/s]

Batches:   0%|          | 0/2022 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

IndexError: invalid index to scalar variable.