In [1]:
# Cell 1: Install Libraries & Check Transformers Version
!pip install transformers[torch] datasets scikit-learn accelerate -U

import transformers
print(f"Transformers version: {transformers.__version__}")
# EXPECTED: A recent version, e.g., 4.30.0 or newer.
# If it's old, make sure to go to "Runtime" > "Restart runtime" after this cell completes.


[notice] A new release of pip is available: 25.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
  from .autonotebook import tqdm as notebook_tqdm


Transformers version: 4.51.3


In [2]:
# Cell 2: Imports and Basic Configuration
import torch
from datasets import load_dataset, DatasetDict, ClassLabel
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np
import pandas as pd # For inspecting label distributions

# --- Configuration ---
MODEL_NAME = "distilbert-base-uncased"
DATASET_NAME = "fever"
DATASET_CONFIG = "v1.0" # FEVER has different versions, v1.0 is common. v2.0 adds adversarial attacks.
                        # 'wiki_pages' config is just the evidence, not the claims.
                        # Let's start with v1.0 which contains claim-evidence pairs and labels.

# Tokenizer and model parameters
MAX_LENGTH = 512         # Max sequence length for DistilBERT (Claim + Evidence)
BATCH_SIZE = 16          # Adjust based on GPU memory
NUM_EPOCHS = 3           # Number of training epochs (2-4 is typical)

# Reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print("Configuration set for FEVER Fact Verification.")

Configuration set for FEVER Fact Verification.


In [3]:
# Cell 3: Load and Prepare Dataset (using pietrolesci/nli_fever - handling 'dev' split)

DATASET_NAME_PREPROCESSED = "pietrolesci/nli_fever"
print(f"Loading preprocessed FEVER-NLI dataset: {DATASET_NAME_PREPROCESSED}")

try:
    raw_dataset_dict = load_dataset(DATASET_NAME_PREPROCESSED)
    print(f"Successfully loaded dataset. Structure: {raw_dataset_dict}")
    available_splits_loaded = list(raw_dataset_dict.keys())
    print(f"Available splits in raw_dataset_dict: {available_splits_loaded}")

    if 'train' not in raw_dataset_dict:
        raise ValueError("Critical 'train' split not found in the loaded dataset.")

    print(f"Train features: {raw_dataset_dict['train'].features}")
    # print(f"First train example: {raw_dataset_dict['train'][0]}") # Optional
except Exception as e:
    print(f"Error loading dataset '{DATASET_NAME_PREPROCESSED}': {e}")
    raise

# --- Prepare Labels ---
num_labels = 3
id_to_label_map = {0: "SUPPORTS", 1: "NOT ENOUGH INFO", 2: "REFUTES"}

label_feature_info_train = raw_dataset_dict['train'].features.get('label')
if isinstance(label_feature_info_train, ClassLabel):
    print(f"\nTrain 'label' feature is ClassLabel. Names: {label_feature_info_train.names}")
    if label_feature_info_train.num_classes != num_labels:
        print(f"Warning: Expected {num_labels} classes, but ClassLabel has {label_feature_info_train.num_classes}. This might be an issue or require adjusting num_labels.")
        # num_labels = label_feature_info_train.num_classes # Uncomment if you want to dynamically set num_labels
    # Verifying if ClassLabel names align with our manual map.
    # Example: if ClassLabel names are ["entailment", "neutral", "contradiction"], it's consistent.
else:
    print("\nTrain 'label' feature is not ClassLabel or not found. Assuming integer labels 0, 1, 2 as per dataset description.")

print(f"Using {num_labels}-class classification.")
print(f"ID to Label map: {id_to_label_map}")


def prepare_labels_for_model(example):
    example['labels'] = example['label']
    return example

processed_dataset_dict = raw_dataset_dict.map(prepare_labels_for_model)


# --- Final Datasets (Dynamically Assign Splits, Prioritizing 'dev' for validation) ---
train_split = processed_dataset_dict['train']
validation_split = None
test_split_from_dataset = None # The dataset's 'test' split (unlabelled)

if 'dev' in processed_dataset_dict: # Check for 'dev' first as it's common for validation
    print("Found 'dev' split, using it as validation.")
    validation_split = processed_dataset_dict['dev']
elif 'validation' in processed_dataset_dict: # Fallback to 'validation'
    print("Found 'validation' split, using it as validation.")
    validation_split = processed_dataset_dict['validation']
else:
    print("Warning: No 'dev' or 'validation' split found. Splitting train set for validation.")
    if len(train_split) < 100:
        raise ValueError("Train split is too small to create a validation set.")
    temp_train_val_split = train_split.train_test_split(test_size=0.1, seed=SEED, stratify_by_column="labels")
    train_split = temp_train_val_split['train']
    validation_split = temp_train_val_split['test']
    print(f"Created validation split from train data. New train size: {len(train_split)}, Validation size: {len(validation_split)}")

if 'test' in processed_dataset_dict:
    test_split_from_dataset = processed_dataset_dict['test']
    print("Found 'test' split from dataset (expected to be unlabelled).")
else:
    print("Warning: No 'test' split found in the original dataset.")


final_datasets = DatasetDict({
    'train': train_split,
    'validation': validation_split
})
if test_split_from_dataset is not None:
    final_datasets['test'] = test_split_from_dataset


# Optional: Subsample for quicker iteration
apply_subsampling = False # Set to True to subsample
if apply_subsampling:
    print("Subsampling for quicker iteration...")
    N_TRAIN_SAMPLES = 10000 # Example size
    N_VAL_SAMPLES = 1000    # Example size
    # N_TEST_SAMPLES = 1000 # Not relevant for labelled evaluation

    if len(final_datasets['train']) > N_TRAIN_SAMPLES:
        final_datasets['train'] = final_datasets['train'].shuffle(seed=SEED).select(range(N_TRAIN_SAMPLES))
    if final_datasets.get('validation') and len(final_datasets['validation']) > N_VAL_SAMPLES:
        final_datasets['validation'] = final_datasets['validation'].shuffle(seed=SEED).select(range(N_VAL_SAMPLES))
    # Test split subsampling is less critical as it's unlabelled for metrics
    if final_datasets.get('test') and len(final_datasets['test']) > N_VAL_SAMPLES : # Using N_VAL_SAMPLES as a proxy size for test
         final_datasets['test'] = final_datasets['test'].shuffle(seed=SEED).select(range(N_VAL_SAMPLES))


print("\nFinal dataset splits prepared:")
print(final_datasets)
if final_datasets.get('train'): print(f"Train samples: {len(final_datasets['train'])}")
if final_datasets.get('validation'): print(f"Validation samples: {len(final_datasets['validation'])}")
if final_datasets.get('test'): print(f"Test samples (unlabelled for metrics): {len(final_datasets['test'])}")


# Check label distribution for train and validation
if final_datasets.get('train'):
    print("\nLabel distribution in training set:")
    train_label_ids = final_datasets['train']['labels']
    train_counts = pd.Series(train_label_ids).value_counts().sort_index()
    print("Counts:")
    for label_id_val, count in train_counts.items():
        print(f"  {id_to_label_map.get(label_id_val, f'ID {label_id_val}')}: {count}")

if final_datasets.get('validation'):
    print("\nLabel distribution in validation set:")
    val_label_ids = final_datasets['validation']['labels']
    val_counts = pd.Series(val_label_ids).value_counts().sort_index()
    print("Counts:")
    for label_id_val, count in val_counts.items():
        print(f"  {id_to_label_map.get(label_id_val, f'ID {label_id_val}')}: {count}")

Loading preprocessed FEVER-NLI dataset: pietrolesci/nli_fever


Repo card metadata block was not found. Setting CardData to empty.


Successfully loaded dataset. Structure: DatasetDict({
    train: Dataset({
        features: ['cid', 'fid', 'premise', 'hypothesis', 'verifiable', 'fever_gold_label', 'label'],
        num_rows: 208346
    })
    test: Dataset({
        features: ['cid', 'fid', 'premise', 'hypothesis', 'verifiable', 'fever_gold_label', 'label'],
        num_rows: 19998
    })
    dev: Dataset({
        features: ['cid', 'fid', 'premise', 'hypothesis', 'verifiable', 'fever_gold_label', 'label'],
        num_rows: 19998
    })
})
Available splits in raw_dataset_dict: ['train', 'test', 'dev']
Train features: {'cid': Value(dtype='int64', id=None), 'fid': Value(dtype='string', id=None), 'premise': Value(dtype='string', id=None), 'hypothesis': Value(dtype='string', id=None), 'verifiable': Value(dtype='int64', id=None), 'fever_gold_label': Value(dtype='string', id=None), 'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None)}

Train 'label' feature is ClassLabel. Names: ['entailment', 

In [4]:
# Cell 4: Tokenization

tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)
print(f"\nTokenizer loaded: {MODEL_NAME}")

# In pietrolesci/fever_nli_with_evidence:
# 'hypothesis' is the CLAIM
# 'premise' is the EVIDENCE
claim_col_name = 'hypothesis'
evidence_col_name = 'premise'
print(f"Using columns: '{claim_col_name}' as claim and '{evidence_col_name}' as evidence.")


def preprocess_function_fever(examples):
    claims = examples[claim_col_name]
    evidences = examples[evidence_col_name]

    # Ensure texts are strings
    processed_claims = [str(text) if text is not None else "" for text in claims]
    processed_evidences = [str(text) if text is not None else "" for text in evidences]

    # For text-pair classification, DistilBERT expects claim [SEP] evidence
    tokenized_inputs = tokenizer(
        processed_claims,
        processed_evidences,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH # From Cell 2
    )
    # 'labels' column already added in Cell 3's mapping
    # tokenized_inputs["labels"] = examples["labels"] # Not needed again if 'labels' is already correct int ID
    return tokenized_inputs

# Columns to remove after tokenization.
# Keep only model inputs: 'input_ids', 'attention_mask', 'labels'.
# 'idx' might be useful for tracking later but not for model.
# 'premise', 'hypothesis', 'label' are the original columns.
original_cols_to_remove = ['idx', 'premise', 'hypothesis', 'label']

print(f"\nApplying tokenization...")
tokenized_datasets = final_datasets.map(
    preprocess_function_fever,
    batched=True,
    remove_columns=[col for col in original_cols_to_remove if col in final_datasets['train'].column_names]
)

# 'labels' column should already exist with integer IDs from Cell 3.
# If it was named something else, e.g., 'label_id', rename it:
# if 'label_id' in tokenized_datasets['train'].column_names and 'labels' not in tokenized_datasets['train'].column_names:
#    tokenized_datasets = tokenized_datasets.rename_column('label_id', 'labels')


tokenized_datasets.set_format('torch')
print("\nTokenization complete. Sample from tokenized train dataset:")
print(tokenized_datasets['train'][0])
# Verify 'input_ids', 'attention_mask', and 'labels' keys are present and 'labels' is an int.


Tokenizer loaded: distilbert-base-uncased
Using columns: 'hypothesis' as claim and 'premise' as evidence.

Applying tokenization...


Map: 100%|██████████████████████████████████████████████████████████████| 19998/19998 [00:06<00:00, 3130.68 examples/s]


Tokenization complete. Sample from tokenized train dataset:
{'cid': tensor(75397), 'fid': 'f3680d64-bcdf-426f-a1ad-e8ae25ea9ba0', 'verifiable': tensor(1), 'fever_gold_label': 'SUPPORTS', 'labels': tensor(0), 'input_ids': tensor([  101,  1996,  4419,  5062,  2194,  1006,  2411, 12641,  2000,  4419,
         1998, 19551,  2004,  4419,  1007,  2003,  2019,  2137,  2394,  2653,
         3293,  3743,  2547,  2897,  2008,  2003,  3079,  2011,  1996,  4419,
         4024,  2177,  7506,  1997,  7398,  2301,  4419,  1012, 24794,  3501,
         3465,  2121,  1011, 24547,  2850,  2226,  1012,  2002,  2059,  2209,
         6317,  2198,  7598,  1999,  1996,  2460,  1011,  2973,  4419,  2547,
         2186,  2047,  7598,  1006,  2263,  1007,  1010,  2004,  2092,  2004,
         6037,  2004,  3581, 12694,  1999,  1996,  2268,  4419,  2547,  2143,
         7484,  3012,  1010,  2761,  3832,  2004,  1037,  4405,  1012,  2002,
         2150,  4235,  2124,  2000,  1037,  5041,  4378,  2005,  2010,  2783




In [5]:
# Cell 5: Load Model

model = DistilBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels # Should be 3 for FEVER
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"\nModel '{MODEL_NAME}' loaded for sequence classification with {num_labels} labels.")
print(f"Model is on device: {device}")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Model 'distilbert-base-uncased' loaded for sequence classification with 3 labels.
Model is on device: cuda


In [6]:
# Cell 6: Define Metrics Function

def compute_metrics_classification(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    acc = accuracy_score(labels, predictions)
    f1_weighted = f1_score(labels, predictions, average="weighted")
    f1_macro = f1_score(labels, predictions, average="macro")

    return {
        "accuracy": acc,
        "f1_weighted": f1_weighted,
        "f1_macro": f1_macro
    }
print("\nMetrics computation function defined.")


Metrics computation function defined.


In [7]:
# Cell 7: Training Arguments

OUTPUT_DIR = './results/fever_fact_verification_distilbert'
metric_to_optimize = "f1_macro" # Good for multi-class, especially if balance is a concern

# Dynamic calculation of logging/warmup steps (optional, good practice)
if len(tokenized_datasets['train']) > 0 and BATCH_SIZE > 0:
    steps_per_epoch = len(tokenized_datasets['train']) // BATCH_SIZE
    if steps_per_epoch == 0: steps_per_epoch = 1
    # Warmup for ~10% of total training steps
    total_training_steps = steps_per_epoch * NUM_EPOCHS
    WARMUP_STEPS = max(1, int(total_training_steps * 0.1))
    LOGGING_STEPS = max(1, steps_per_epoch // 5 if steps_per_epoch > 5 else steps_per_epoch) # Log ~5 times per epoch
else:
    WARMUP_STEPS = 500 # Fallback for very small datasets or if BATCH_SIZE is large
    LOGGING_STEPS = 100 # Fallback

print(f"Using NUM_EPOCHS: {NUM_EPOCHS}")
print(f"Calculated WARMUP_STEPS: {WARMUP_STEPS}, LOGGING_STEPS: {LOGGING_STEPS}")

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=2e-5, # Standard starting LR
    warmup_steps=WARMUP_STEPS,
    weight_decay=0.01,
    logging_dir=f'{OUTPUT_DIR}/logs',
    logging_steps=LOGGING_STEPS,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_to_optimize,
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="tensorboard",
    seed=SEED,
)
print("\nTrainingArguments defined.")

Using NUM_EPOCHS: 3
Calculated WARMUP_STEPS: 3906, LOGGING_STEPS: 2604

TrainingArguments defined.


In [8]:
# Cell 8: Initialize Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics_classification,
    tokenizer=tokenizer
)
print("\nTrainer initialized.")


Trainer initialized.


  trainer = Trainer(


In [9]:
# Cell 9: Train the Model

print("\nStarting training...")
try:
    train_results = trainer.train()
    print("Training finished.")
    trainer.log_metrics("train", train_results.metrics)
    trainer.save_metrics("train", train_results.metrics)
except Exception as e:
    print(f"An error occurred during training: {e}")
    raise


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro
1,0.3565,0.920031,0.726273,0.721335,0.721335
2,0.2897,0.930143,0.736474,0.733778,0.733778
3,0.2251,1.121698,0.736574,0.73431,0.73431


Training finished.
***** train metrics *****
  epoch                    =        3.0
  total_flos               = 77112237GF
  train_loss               =     0.3232
  train_runtime            = 2:39:54.58
  train_samples_per_second =     65.145
  train_steps_per_second   =      4.072


In [10]:
# Cell 10: Evaluate the Model

print("\nEvaluating on the validation set...")
eval_results_validation = trainer.evaluate(eval_dataset=tokenized_datasets['validation'])
print(f"Validation Metrics: {eval_results_validation}")
trainer.log_metrics("eval_validation", eval_results_validation)
trainer.save_metrics("eval_validation", eval_results_validation)

# --- Test set evaluation removed/commented as it's unlabelled for this dataset ---
# if 'test' in tokenized_datasets and len(tokenized_datasets['test']) > 0:
#     print("\nTest set for 'pietrolesci/nli_fever' is unlabelled. Generating predictions only if needed.")
#     # predictions_on_test = trainer.predict(tokenized_datasets['test'])
#     # print(f"Generated predictions for {len(predictions_on_test.predictions)} test samples.")
# else:
#     print("\nTest set not available or empty.")

# --- Detailed Classification Report and Misclassification Analysis (on VALIDATION set) ---
print("\nGenerating classification report for the VALIDATION set...") # Changed from 'validation/test set'
predictions_output_val = trainer.predict(tokenized_datasets['validation'])
y_true_val = predictions_output_val.label_ids
y_pred_val = np.argmax(predictions_output_val.predictions, axis=-1)

target_names_ordered = [id_to_label_map[i] for i in sorted(id_to_label_map.keys())]
print("Validation Set Classification Report:")
print(classification_report(y_true_val, y_pred_val, target_names=target_names_ordered, digits=4))

# Misclassification Analysis for Validation Set
misclassified_indices_val = np.where(y_true_val != y_pred_val)[0]
print("\n--- Examples of Misclassifications from Validation Set (first 10) ---")
for i in range(min(10, len(misclassified_indices_val))):
    idx = int(misclassified_indices_val[i])
    # Get original example from final_datasets BEFORE tokenization to see text
    original_example = final_datasets['validation'][idx]

    true_label_str = id_to_label_map[y_true_val[idx]]
    pred_label_str = id_to_label_map[y_pred_val[idx]]

    print(f"Example (Original Index: {idx}):")
    print(f"  Claim (Hypothesis): {str(original_example[claim_col_name])[:200]}...")
    print(f"  Evidence (Premise): {str(original_example[evidence_col_name])[:300]}...")
    print(f"  TRUE LABEL: {true_label_str} (ID: {y_true_val[idx]})")
    print(f"  PREDICTED LABEL: {pred_label_str} (ID: {y_pred_val[idx]})")
    print("-" * 30)


Evaluating on the validation set...


Validation Metrics: {'eval_loss': 1.1216976642608643, 'eval_accuracy': 0.7365736573657365, 'eval_f1_weighted': 0.7343098195853892, 'eval_f1_macro': 0.7343098195853893, 'eval_runtime': 85.7766, 'eval_samples_per_second': 233.141, 'eval_steps_per_second': 14.573, 'epoch': 3.0}
***** eval_validation metrics *****
  epoch                   =        3.0
  eval_accuracy           =     0.7366
  eval_f1_macro           =     0.7343
  eval_f1_weighted        =     0.7343
  eval_loss               =     1.1217
  eval_runtime            = 0:01:25.77
  eval_samples_per_second =    233.141
  eval_steps_per_second   =     14.573

Generating classification report for the VALIDATION set...
Validation Set Classification Report:
                 precision    recall  f1-score   support

       SUPPORTS     0.7451    0.8614    0.7991      6666
NOT ENOUGH INFO     0.6781    0.6370    0.6569      6666
        REFUTES     0.7864    0.7114    0.7470      6666

       accuracy                         0.7366  

In [11]:
# Cell 11: Save the Fine-Tuned Model and Tokenizer

best_model_path = f"{OUTPUT_DIR}/best_model_fever_fact_verifier"
trainer.save_model(best_model_path)
print(f"\nBest model and tokenizer saved to {best_model_path}")


Best model and tokenizer saved to ./results/fever_fact_verification_distilbert/best_model_fever_fact_verifier


In [12]:
# Cell 12: Inference Example

model.eval() # Ensure model is in evaluation mode

# Example claim-evidence pairs
sample_claim1 = "The Eiffel Tower is located in Berlin."
sample_evidence1 = "The Eiffel Tower, a wrought-iron lattice tower on the Champ de Mars in Paris, France, is one of the most recognizable structures in the world."

sample_claim2 = "Rainn Wilson plays the character Dwight Schrute."
sample_evidence2 = "Rainn Wilson is an American actor, comedian, and producer. He is best known for his role as Dwight Schrute on the NBC sitcom The Office."

sample_claim3 = "The currency used in Japan is the Euro."
sample_evidence3 = "The official currency of Japan is the Japanese Yen (JPY)."

sample_claim4 = "Humans have landed on Mars."
sample_evidence4 = "Currently, no human has ever set foot on Mars. However, numerous uncrewed missions have explored the planet."


test_pairs = [
    (sample_claim1, sample_evidence1), # Expected: REFUTES
    (sample_claim2, sample_evidence2), # Expected: SUPPORTS
    (sample_claim3, sample_evidence3), # Expected: REFUTES (or NOT ENOUGH INFO if evidence was about something else)
    (sample_claim4, sample_evidence4)  # Expected: REFUTES
]

print("\n--- Inference Examples ---")
for claim, evidence in test_pairs:
    inputs = tokenizer(
        claim,
        evidence,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    probabilities = torch.softmax(logits, dim=-1)
    predicted_class_id = torch.argmax(probabilities, dim=-1).item()

    predicted_label = id_to_label_map.get(predicted_class_id, f"Unknown ID {predicted_class_id}")
    confidence = probabilities[0][predicted_class_id].item() if predicted_class_id < probabilities.shape[1] else -1.0

    print(f"Claim: \"{claim}\"")
    print(f"Evidence: \"{evidence[:100]}...\"")
    print(f"Predicted Label: {predicted_label} (ID: {predicted_class_id}), Confidence: {confidence:.4f}")
    # print(f"Probabilities: { {id_to_label_map.get(i, f'ID {i}'): prob.item() for i, prob in enumerate(probabilities[0]) if i in id_to_label_map} }\n")
    print("-" * 20)


--- Inference Examples ---
Claim: "The Eiffel Tower is located in Berlin."
Evidence: "The Eiffel Tower, a wrought-iron lattice tower on the Champ de Mars in Paris, France, is one of the ..."
Predicted Label: NOT ENOUGH INFO (ID: 1), Confidence: 0.9530
--------------------
Claim: "Rainn Wilson plays the character Dwight Schrute."
Evidence: "Rainn Wilson is an American actor, comedian, and producer. He is best known for his role as Dwight S..."
Predicted Label: NOT ENOUGH INFO (ID: 1), Confidence: 0.5069
--------------------
Claim: "The currency used in Japan is the Euro."
Evidence: "The official currency of Japan is the Japanese Yen (JPY)...."
Predicted Label: NOT ENOUGH INFO (ID: 1), Confidence: 0.8819
--------------------
Claim: "Humans have landed on Mars."
Evidence: "Currently, no human has ever set foot on Mars. However, numerous uncrewed missions have explored the..."
Predicted Label: NOT ENOUGH INFO (ID: 1), Confidence: 0.9446
--------------------
