# Impostor Text Hunt - Hybrid Ensemble Pipeline

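This notebook implements a hybrid ensemble for detecting fake text: for each pair of texts it predicts which one is real. It combines classical ML models trained on hand-crafted text features, a fine-tuned Transformer classifier, and a sentence-embedding dissimilarity score, stacked with a logistic-regression meta-learner. The notebook handles everything from preprocessing to prediction and submission file generation, with GPU support enabled where available.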


In [None]:
# ⚙️ Install required packages (uncomment if running in a fresh Kaggle environment) (First Cell)
#!pip install peft accelerate transformers datasets catboost xgboost scikit-learn nltk kaggle

import os, re, string, nltk, random, torch, logging
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

# --- Hugging Face & Sentence Transformers ---
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sentence_transformers import SentenceTransformer

# --- Scikit-learn for Classical Models & Ensemble ---
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression # For Meta-Learner
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import scipy # For stats.uniform
from google.colab import drive, files

In [None]:
# 📥 NLTK DOWNLOADS
# Resources are fetched one by one, in this order.
# NOTE(review): 'punkt_tab' appears to be the tokenizer data that recent NLTK
# releases look up for word_tokenize; on older releases the download may only
# emit a warning — confirm against the installed NLTK version.
NLTK_RESOURCES = ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4', 'stopwords')
for resource_name in NLTK_RESOURCES:
    nltk.download(resource_name)

# Imported after the downloads, as in the original cell order.
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# --- Suppress CatBoost verbose output during RandomizedSearchCV ---
# Only messages at ERROR level or above pass through the 'catboost' logger.
catboost_logger = logging.getLogger('catboost')
catboost_logger.setLevel(logging.ERROR)

In [None]:
# --- Configuration ---
# This line will block execution until you manually upload a file.
# In a fully automated script, this needs to be removed or handled differently.
uploaded = files.upload() # This will prompt a file selection dialog

# Secure and move kaggle.json
!mkdir -p ~/.kaggle/
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
print("Kaggle API key configured.")

# Download the competition data
# Replace 'fake-or-real-the-impostor-hunt' with your competition slug if different
KAGGLE_COMPETITION_SLUG = 'fake-or-real-the-impostor-hunt'
!kaggle competitions download -c {KAGGLE_COMPETITION_SLUG}
print(f"Data for competition '{KAGGLE_COMPETITION_SLUG}' downloaded.")

# Unzip the downloaded data
# IMPORTANT: Confirm the exact name of the downloaded .zip file. Use !ls if unsure.
# The default output name is usually the competition slug + .zip
DOWNLOADED_ZIP_FILE = f'{KAGGLE_COMPETITION_SLUG}.zip'
EXTRACT_DIR = './data' # Directory to extract the data into

print(f"Unzipping {DOWNLOADED_ZIP_FILE} to {EXTRACT_DIR}...")
!mkdir -p {EXTRACT_DIR}
!unzip {DOWNLOADED_ZIP_FILE} -d {EXTRACT_DIR} # This will prompt for overwrite if data exists
print("Data unzipped.")

In [None]:
DATA_ROOT_DIR = "./data/data" # Assumes the archive unpacks into 'data/data' — verify after unzipping;
                              # if it extracts straight into EXTRACT_DIR this should be './data'.
SEED = 42
MODEL_NAME = 'distilbert-base-uncased' #Chosen smaller Transformer model
NUM_FOLDS = 5 # For K-Fold Cross-Validation
N_ITER_RANDOM_SEARCH = 10 # Number of iterations for RandomizedSearchCV

# Apply SEED to every global RNG the notebook can draw from. Without this the
# constant only reaches estimators that receive it explicitly, and anything
# sampled from the global generators changes from run to run.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible to torch; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

In [None]:
# 🧹 TEXT CLEANING FUNCTIONS
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Built once: the punctuation-deletion table and the digit pattern are the
# same for every call, so there is no reason to rebuild them per text.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_DIGITS_RE = re.compile(r'\d+')


def clean_text(text):
    """Normalize a text into a space-joined string of lemmatized content words.

    Steps: lower-case, delete digit runs, delete ASCII punctuation, tokenize
    with ``nltk.word_tokenize``, keep alphabetic tokens that are not English
    stop words, lemmatize each, and join with single spaces.

    Args:
        text: The text to clean. Non-string values are converted with ``str``.
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string; ``''`` for missing input.
    """
    # str(None) / str(nan) would otherwise be tokenized as the words
    # "none" / "nan" and leak into the cleaned output.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    lowered = str(text).lower()
    without_digits = _DIGITS_RE.sub('', lowered)
    without_punct = without_digits.translate(_PUNCT_TABLE)
    tokens = nltk.word_tokenize(without_punct)
    return ' '.join(
        lemmatizer.lemmatize(w) for w in tokens if w.isalpha() and w not in stop_words
    )

def extract_advanced_features(text_input):
    """Compute nine surface statistics for a single text.

    The returned list is ordered as:
    character count, whitespace-token count, mean token length, number of
    non-empty segments after splitting on ``.``, ``!`` or ``?``, type/token
    ratio (case-insensitive), share of upper-case characters, and the counts
    of ``,``, ``.`` and ``?``.

    NOTE(review): the callers in this notebook pass the output of
    ``clean_text``, which is lower-cased and has punctuation removed, so the
    sentence, upper-case and punctuation entries will be near-constant there.
    Confirm whether raw text was intended.

    Args:
        text_input: The text to measure. Non-string values are converted with
            ``str``. ``None`` and float NaN are treated as missing.

    Returns:
        A list of nine numbers; all ``0.0`` for missing, empty or
        whitespace-only input.
    """
    n_features = 9

    # Without this guard str(None) would be measured as the 4-character
    # string "None" (and NaN as "nan") instead of as missing data.
    if text_input is None or (isinstance(text_input, float) and text_input != text_input):
        return [0.0] * n_features

    text = str(text_input).strip()
    if not text:
        return [0.0] * n_features

    tokens = text.split()  # whitespace tokenization
    num_chars = len(text)
    num_words = len(tokens)

    # Segments that are empty or whitespace-only (e.g. after a trailing '.')
    # do not count as sentences.
    num_sentences = sum(1 for segment in re.split(r'[.!?]', text) if segment.strip())

    # The guards keep the ratios defined even if tokenization yields nothing.
    avg_word_len = sum(len(w) for w in tokens) / num_words if num_words else 0.0
    type_token_ratio = len({t.lower() for t in tokens}) / num_words if num_words else 0.0
    uppercase_ratio = sum(1 for c in text if c.isupper()) / num_chars if num_chars else 0.0

    punct_counts = [text.count(mark) for mark in (',', '.', '?')]

    return [
        num_chars,
        num_words,
        avg_word_len,
        num_sentences,
        type_token_ratio,
        uppercase_ratio,
    ] + punct_counts

# Column layout for a text pair: the nine per-text feature names are repeated
# twice, first with suffix 1 (first text) and then with suffix 2 (second text).
feature_names = ["len", "words", "avg_len", "sentences", "ttr", "upper_ratio", "comma", "period", "question"]
col_names = []
for text_slot in (1, 2):
    col_names.extend(f"{name}{text_slot}" for name in feature_names)
# Sanity output: the two counts below should read 9 and 18.
print(f"Expected number of features per text: {len(feature_names)}") # Should be 9
print(f"Total columns expected for combined features: {len(col_names)}") # Should be 18

In [None]:
# 📚 Load and preprocess training data
print("Loading and preparing training data...")
TRAIN_DIR = os.path.join(DATA_ROOT_DIR, "train")
train_df_path = os.path.join(DATA_ROOT_DIR, "train.csv")

# train.csv is required: any failure is reported and then re-raised so the
# notebook stops here rather than continuing without labels.
try:
    train_df = pd.read_csv(train_df_path)
except FileNotFoundError:
    print(f"Error: train.csv not found at {train_df_path}. Please double-check the path after unzipping.")
    raise
except Exception as e:
    print(f"An error occurred while loading train.csv: {e}")
    raise
else:
    print(f"Successfully loaded train.csv from: {train_df_path}")
    print("\nFirst 5 rows of train_df:")
    print(train_df.head())

# Accumulators for everything derived from each article pair. They are filled
# in lockstep (one entry per successfully read pair) and attached to train_df
# afterwards.
all_raw1_loaded, all_raw2_loaded = [], []
all_clean_texts_1_loaded, all_clean_texts_2_loaded = [], []
all_features_loaded, all_labels_loaded = [], []
loaded_ids = [] # IDs whose two files were both read successfully

print("Loading and processing training data (raw text, cleaned text, features)...")
for idx, real in tqdm(zip(train_df['id'], train_df['real_text_id']), total=len(train_df)):
    # Each article lives in its own zero-padded directory holding two files.
    article_path = os.path.join(TRAIN_DIR, f"article_{str(idx).zfill(4)}")
    file1_path = os.path.join(article_path, "file_1.txt")
    file2_path = os.path.join(article_path, "file_2.txt")

    # A pair is used only if both files can be read; otherwise it is skipped.
    try:
        with open(file1_path, 'r', encoding='utf-8') as f1, \
                open(file2_path, 'r', encoding='utf-8') as f2:
            raw1 = f1.read()
            raw2 = f2.read()
    except FileNotFoundError:
        print(f"File Not Found for article_{str(idx).zfill(4)} (files: {file1_path}, {file2_path}): Skipping.")
        continue
    except Exception as e: # Any other read problem (e.g., decoding errors)
        print(f"Error reading files for article_{str(idx).zfill(4)}: {e}. Skipping.")
        continue

    clean1, clean2 = clean_text(raw1), clean_text(raw2)

    # NOTE(review): features are computed on the *cleaned* texts. Depending on
    # what clean_text strips, case- and punctuation-based features may carry
    # little signal here — confirm this is intended. The test-time feature
    # extraction must stay consistent with whatever is done here.
    pair_features = extract_advanced_features(clean1) + extract_advanced_features(clean2)

    all_raw1_loaded.append(raw1)
    all_raw2_loaded.append(raw2)
    all_clean_texts_1_loaded.append(clean1)
    all_clean_texts_2_loaded.append(clean2)
    all_features_loaded.append(pair_features)

    # Binary target: 0 when real_text_id == 1, 1 for any other value.
    # Presumably real_text_id is 1 or 2 and names which file of the pair is
    # the real one, making the target "is file_2 the real text?" — confirm
    # against the competition's data description.
    all_labels_loaded.append(int(real != 1))
    loaded_ids.append(idx)

# Keep only the rows whose files were read, renumbering the index from 0 so
# the positional lists collected above line up with it.
loaded_mask = train_df['id'].isin(loaded_ids)
train_df = train_df.loc[loaded_mask].reset_index(drop=True)

# Attach raw and cleaned texts. The lists were appended in train_df row order,
# so plain assignment aligns them (assumes 'id' values are unique — TODO confirm).
for column_name, column_values in (
    ('raw_text_1', all_raw1_loaded),
    ('raw_text_2', all_raw2_loaded),
    ('clean_text_1', all_clean_texts_1_loaded),
    ('clean_text_2', all_clean_texts_2_loaded),
):
    train_df[column_name] = column_values

# Attach the classical feature columns under the names in col_names.
feature_frame = pd.DataFrame(all_features_loaded, index=train_df.index)
train_df[col_names] = feature_frame

# Targets, indexed identically to the filtered train_df.
y_train_labels = pd.Series(all_labels_loaded, index=train_df.index)

# Independent copy of just the feature columns for the classical models.
X_train_features_df = train_df[col_names].copy()

print(f"Prepared {len(X_train_features_df)} samples for classical model training.")
print("Sample X_train_features_df head:")
print(X_train_features_df.head())
print("Sample y_train_labels head:")
print(y_train_labels.head())

In [None]:
# 🤖 Train all models
# --- Transformer tokenizer (loaded once) ---
# Only the tokenizer is shared; the sequence-classification model itself is
# instantiated inside the fold loop.
print(f"Loading Transformer Model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("Transformer Tokenizer loaded.")

# --- K-Fold Cross-Validation Setup ---
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)

# Out-of-fold buffers, one slot per training sample. Each fold writes the
# predictions for its validation rows; together they feed the meta-learner.
n_train_samples = len(X_train_features_df)
oof_classical_probs, oof_transformer_probs, oof_embedding_probs = (
    np.zeros(n_train_samples, dtype=np.float32) for _ in range(3)
)
oof_true_labels = np.zeros(n_train_samples, dtype=np.int32) # integer class labels

# Fitted models kept for test-time prediction.
trained_voting_clfs, trained_sequence_models = [], []
trained_meta_learners = []    # filled after the fold loop
trained_embedding_models = [] # filled inside the fold loop
print(f"Starting {NUM_FOLDS}-Fold Cross-Validation...")
fold_splits = skf.split(X_train_features_df, y_train_labels)
for fold, (train_idx, val_idx) in enumerate(fold_splits):
    print(f"\n--- Fold {fold + 1}/{NUM_FOLDS} ---")

    # --- Classical features and targets for this fold ---
    X_train_fold = X_train_features_df.iloc[train_idx]
    X_val_fold = X_train_features_df.iloc[val_idx]
    y_train_fold = y_train_labels.iloc[train_idx]
    y_val_fold = y_train_labels.iloc[val_idx]

    # --- Raw text pairs for the transformer and embedding branches ---
    fold_train_rows = train_df.iloc[train_idx]
    fold_val_rows = train_df.iloc[val_idx]
    raw_text1_train_fold = fold_train_rows['raw_text_1'].tolist()
    raw_text2_train_fold = fold_train_rows['raw_text_2'].tolist()
    raw_text1_val_fold = fold_val_rows['raw_text_1'].tolist()
    raw_text2_val_fold = fold_val_rows['raw_text_2'].tolist()


    # ---------------------------------------------------------------------
    # --- 1. Classical Models (VotingClassifier) Training & Tuning ---
    # ---------------------------------------------------------------------
    print("Training and tuning VotingClassifier...")
    # Five base learners, each in its own pipeline behind a StandardScaler.
    estimators = [
        ('rf', Pipeline([("scaler", StandardScaler()), ("model", RandomForestClassifier(random_state=SEED, class_weight='balanced'))])),
        ('gb', Pipeline([("scaler", StandardScaler()), ("model", GradientBoostingClassifier(random_state=SEED))])),
        ('svm', Pipeline([("scaler", StandardScaler()), ("model", SVC(probability=True, class_weight='balanced', random_state=SEED))])),
        ('xgb', Pipeline([("scaler", StandardScaler()), ("model", XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False, random_state=SEED))])),
        ('cat', Pipeline([("scaler", StandardScaler()), ("model", CatBoostClassifier(verbose=0, random_state=SEED))]))
    ]

    # Candidate soft-voting weight vectors, each summing to 1 (Dirichlet draw).
    # They come from a generator seeded per fold so the search space is the
    # same on every run; the global np.random state is not relied upon.
    weight_rng = np.random.default_rng(SEED + fold)
    candidate_weights = weight_rng.dirichlet(np.ones(len(estimators)), size=N_ITER_RANDOM_SEARCH)

    # Search space: keys follow '<estimator>__model__<param>' to reach through
    # the VotingClassifier and the per-estimator pipeline.
    param_distributions = {
        'rf__model__n_estimators': scipy.stats.randint(50, 200),
        'rf__model__max_depth': scipy.stats.randint(5, 20),
        'gb__model__n_estimators': scipy.stats.randint(50, 200),
        'gb__model__learning_rate': scipy.stats.uniform(0.01, 0.2),
        'svm__model__C': scipy.stats.loguniform(0.1, 10),
        'svm__model__gamma': scipy.stats.loguniform(0.001, 0.1),
        'xgb__model__n_estimators': scipy.stats.randint(50, 200),
        'xgb__model__learning_rate': scipy.stats.uniform(0.01, 0.2),
        'cat__model__iterations': scipy.stats.randint(50, 200), # CatBoost uses 'iterations' not 'n_estimators'
        'cat__model__learning_rate': scipy.stats.uniform(0.01, 0.2),
        'weights': [list(w) for w in candidate_weights],
    }
    # Weights are left unset here because they are part of the search.
    voting_clf_base = VotingClassifier(estimators=estimators, voting='soft', verbose=False)

    # Randomized search with an inner 3-fold CV on this fold's training rows.
    random_search = RandomizedSearchCV(
        estimator=voting_clf_base,
        param_distributions=param_distributions,
        n_iter=N_ITER_RANDOM_SEARCH,
        scoring='f1', # Use f1 for tuning
        cv=3, # Inner CV; each class needs at least 3 samples in the training fold
        random_state=SEED,
        n_jobs=-1, # Use all available cores
        verbose=0 # Set to 0 to suppress output
    )
    random_search.fit(X_train_fold, y_train_fold)
    best_voting_clf = random_search.best_estimator_
    trained_voting_clfs.append(best_voting_clf) # Kept for test-time prediction

    # Out-of-fold probability of class 1 for this fold's validation rows.
    oof_classical_probs[val_idx] = best_voting_clf.predict_proba(X_val_fold)[:, 1]
    print(f"Classical VotingClassifier F1 (Fold {fold+1}): {f1_score(y_val_fold, (oof_classical_probs[val_idx] > 0.5).astype(int)):.4f}")


    # ---------------------------------------------------------------------
    # --- 2. Transformer Model Fine-tuning ---
    # ---------------------------------------------------------------------
    print("Fine-tuning Transformer model...")
    # Start every fold from the pretrained weights so no fold inherits
    # parameters that were tuned on another fold's data.
    sequence_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)

    class TextDataset(torch.utils.data.Dataset):
        """Pre-tokenized text pairs plus labels, served as per-item dicts."""

        def __init__(self, texts1, texts2, labels, tokenizer):
            # Both lists are tokenized in one call as (text1, text2) pairs,
            # truncated to 512 tokens and padded to a common length.
            self.encodings = tokenizer(texts1, texts2, truncation=True, padding=True, max_length=512, return_tensors='pt')
            # labels is expected to expose .tolist() (a pandas Series here).
            self.labels = torch.tensor(labels.tolist())

        def __getitem__(self, idx):
            item = {key: val[idx] for key, val in self.encodings.items()}
            item['labels'] = self.labels[idx]
            return item

        def __len__(self):
            return len(self.labels)

    train_dataset = TextDataset(raw_text1_train_fold, raw_text2_train_fold, y_train_fold, tokenizer)
    val_dataset = TextDataset(raw_text1_val_fold, raw_text2_val_fold, y_val_fold, tokenizer)

    # Training Arguments (can be tuned further, but keep 'report_to="none"')
    training_args = TrainingArguments(
        output_dir=f"./transformer_results_fold_{fold}",
        # All 10 epochs always run: load_best_model_at_end only restores the
        # best checkpoint afterwards, it does not stop training early.
        num_train_epochs=10,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True, # Restores the checkpoint with the best metric_for_best_model
        metric_for_best_model="f1",
        logging_steps=5,
        seed=SEED,
        save_total_limit=1, # Limit the number of checkpoints kept on disk
        learning_rate=2e-5,
        warmup_ratio=0.06,
        weight_decay=0.01,
        # Mixed precision needs a CUDA device; requesting it unconditionally
        # makes the notebook fail on CPU-only runtimes.
        fp16=torch.cuda.is_available(),
        gradient_checkpointing=True, # Saves memory, slightly slower
        report_to="none", # Disable W&B
    )

    def compute_metrics(p):
        """Return binary F1 and accuracy for the Trainer's evaluation output."""
        preds = np.argmax(p.predictions, axis=1)
        # No positives in the labels and none predicted: score as perfect
        # rather than letting F1 collapse to 0.
        if np.sum(p.label_ids) == 0 and np.sum(preds) == 0:
            f1 = 1.0
        else:
            f1 = f1_score(p.label_ids, preds, average='binary', zero_division=0)
        acc = accuracy_score(p.label_ids, preds)
        return {"f1": f1, "accuracy": acc}

    # NOTE(review): the checkpoint is selected on this fold's validation set,
    # and the out-of-fold predictions below come from that same set, so the
    # transformer's OOF scores are likely optimistic — confirm this is acceptable.
    trainer = Trainer(
        model=sequence_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Keep a standalone copy of the best checkpoint for test-time prediction.
    best_transformer_model_path = trainer.state.best_model_checkpoint
    if best_transformer_model_path:
        best_sequence_model_fold = AutoModelForSequenceClassification.from_pretrained(best_transformer_model_path).to(device)
    else:
        # No best checkpoint was recorded: fall back to the in-memory model.
        print(f"Warning: No best model checkpoint found for fold {fold+1}. Using the last trained model.")
        best_sequence_model_fold = sequence_model # Already on device
    trained_sequence_models.append(best_sequence_model_fold)

    # Out-of-fold probability of class 1 for this fold's validation rows.
    val_preds_output = trainer.predict(val_dataset)
    val_preds_logits = val_preds_output.predictions
    oof_transformer_probs[val_idx] = torch.softmax(torch.tensor(val_preds_logits), dim=1)[:, 1].cpu().numpy()
    print(f"Transformer F1 (Fold {fold+1}): {f1_score(y_val_fold, (oof_transformer_probs[val_idx] > 0.5).astype(int), zero_division=0):.4f}")

    # Release this fold's training objects before the next stage. Dropping the
    # sequence_model name is safe in both branches above: in the fallback case
    # the same object stays referenced from trained_sequence_models.
    del trainer
    del sequence_model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


    # ---------------------------------------------------------------------
    # --- 3. Embedding-based Dissimilarity Calculation ---
    # ---------------------------------------------------------------------
    print("Calculating Embedding Dissimilarity...")
    # The sentence-embedding model is not fine-tuned, so it is loaded on the
    # first fold only and reused afterwards via trained_embedding_models[0].
    if fold == 0:
        try:
            embedding_model = SentenceTransformer('all-MiniLM-L6-v2').to(device)
            trained_embedding_models.append(embedding_model)
            print("SentenceTransformer loaded.")
        except Exception as e:
            print(f"Error loading SentenceTransformer: {e}. Skipping embedding features for all folds.")
            embedding_model = None
            trained_embedding_models.append(None) # Marks the model as unavailable

    # None here means loading failed on the first fold.
    current_embedding_model_for_fold = trained_embedding_models[0]

    # Explicit None check: the truth value of a model object is not a
    # reliable "was it loaded" signal.
    if current_embedding_model_for_fold is not None:
        current_embedding_model_for_fold.eval()
        val_embedding_probs = []
        st_batch_size = 32 # Pairs encoded per batch
        for i in tqdm(range(0, len(raw_text1_val_fold), st_batch_size), leave=False):
            batch_raw1 = raw_text1_val_fold[i:i+st_batch_size]
            batch_raw2 = raw_text2_val_fold[i:i+st_batch_size]

            with torch.no_grad():
                emb1_batch = current_embedding_model_for_fold.encode(batch_raw1, convert_to_tensor=True, device=str(device))
                emb2_batch = current_embedding_model_for_fold.encode(batch_raw2, convert_to_tensor=True, device=str(device))

                # Row-wise cosine similarity between text1[k] and text2[k].
                # Only matching pairs are needed, so no full batch-by-batch
                # similarity matrix is built.
                cos_sim_batch = torch.nn.functional.cosine_similarity(emb1_batch, emb2_batch, dim=1)

            # Stored value is the cosine *distance* (1 - similarity).
            val_embedding_probs.extend((1 - cos_sim_batch).cpu().numpy())

        oof_embedding_probs[val_idx] = np.array(val_embedding_probs)
        # The distance is thresholded at 0.5 for this printout only; it is a
        # dissimilarity score, not a calibrated probability.
        print(f"Embedding F1 (Fold {fold+1}): {f1_score(y_val_fold, (oof_embedding_probs[val_idx] > 0.5).astype(int), zero_division=0):.4f}")

    else:
        # Without an embedding model, fall back to a constant 0.5.
        oof_embedding_probs[val_idx] = np.full(len(val_idx), 0.5)
        print(f"Embedding model not loaded. Defaulting to 0.5 probability for fold {fold+1}.")


    # True labels for this fold's validation rows, for meta-learner training.
    oof_true_labels[val_idx] = y_val_fold.values

# --- End of K-Fold Loop ---

In [None]:
# Stack the three out-of-fold score vectors into the meta-feature table
print("\n--- Training Meta-Learner ---")
meta_feature_columns = ['classical_prob', 'transformer_prob', 'embedding_prob']
meta_features = pd.DataFrame(
    np.column_stack([oof_classical_probs, oof_transformer_probs, oof_embedding_probs]),
    columns=meta_feature_columns,
)

# One logistic-regression meta-learner is fitted on all out-of-fold rows at
# once (standard stacking), not one per fold.
meta_learner = LogisticRegression(random_state=SEED, solver='liblinear', C=0.1) # C can be tuned
meta_learner.fit(meta_features, oof_true_labels)
trained_meta_learners.append(meta_learner) # Kept for test-time prediction
print("Meta-Learner trained on OOF predictions.")

# NOTE(review): the scores below are computed on the same rows the
# meta-learner was just fitted on, so they are in-sample for the meta-learner
# and likely optimistic.
oof_combined_preds = meta_learner.predict(meta_features)
oof_final_f1 = f1_score(oof_true_labels, oof_combined_preds, average='binary', zero_division=0)
oof_final_accuracy = accuracy_score(oof_true_labels, oof_combined_preds)
print(f"Overall OOF Meta-Learner F1: {oof_final_f1:.4f}")
print(f"Overall OOF Meta-Learner Accuracy: {oof_final_accuracy:.4f}")

In [None]:
# --- 📤 FINAL TEST PREDICTION (Using the ensemble of trained models from each fold) ---
print("\n--- Starting Final Test Prediction ---")
TEST_DIR = os.path.join(DATA_ROOT_DIR, "test")
# Every sub-directory of TEST_DIR is treated as one test pair, in sorted order.
test_ids = sorted(f for f in os.listdir(TEST_DIR) if os.path.isdir(os.path.join(TEST_DIR, f)))
submission_rows = []

# One array of meta-learner probabilities per fold, each covering the whole
# loaded test set in the order of test_data_loaded.
all_fold_meta_test_probs = []

# Read every test pair once up front so the per-fold loop never touches disk.
test_data_loaded = []
print("Loading raw test data...")
for idx_dir in tqdm(test_ids):
    pair_dir = os.path.join(TEST_DIR, idx_dir)
    try:
        # Context managers close the files promptly; a bare open(...).read()
        # leaves that to the garbage collector.
        with open(os.path.join(pair_dir, "file_1.txt"), 'r', encoding='utf-8') as f1:
            raw1_test = f1.read()
        with open(os.path.join(pair_dir, "file_2.txt"), 'r', encoding='utf-8') as f2:
            raw2_test = f2.read()
    except FileNotFoundError:
        # NOTE: a skipped pair gets no entry here, so it will also be absent
        # from anything built from test_data_loaded.
        print(f'File Not Found for {idx_dir} in test set. Skipping this entry.')
        continue
    except Exception as e:
        print(f"Error reading test files for {idx_dir}: {e}. Skipping.")
        continue
    test_data_loaded.append({'id_dir': idx_dir, 'raw1': raw1_test, 'raw2': raw2_test})
print(f"Loaded {len(test_data_loaded)} test file pairs.")


# --- Fold-independent inputs (computed once, reused by every fold) ---
# Classical features depend only on the texts, not on the fold's models.
# They are built exactly as at training time: cleaned text -> feature vector.
test_feature_rows = [
    extract_advanced_features(clean_text(entry['raw1']))
    + extract_advanced_features(clean_text(entry['raw2']))
    for entry in tqdm(test_data_loaded, leave=False)
]
test_features_df = pd.DataFrame(test_feature_rows, columns=col_names)

# The embedding model is shared across folds, so its cosine distance per pair
# is also fold-independent. None means it failed to load during training.
shared_embedding_model = trained_embedding_models[0]
if shared_embedding_model is not None:
    shared_embedding_model.eval()

test_embedding_probs = []
for entry in tqdm(test_data_loaded, leave=False):
    embedding_prob = 0.5 # Default when no embedding model is available
    if shared_embedding_model is not None:
        with torch.no_grad():
            emb1 = shared_embedding_model.encode(entry['raw1'], convert_to_tensor=True, device=str(device))
            emb2 = shared_embedding_model.encode(entry['raw2'], convert_to_tensor=True, device=str(device))
            epsilon = 1e-8 # Guards against division by zero for zero-norm vectors
            cos_sim_tensor = torch.dot(emb1, emb2) / \
                             (torch.linalg.norm(emb1) * torch.linalg.norm(emb2) + epsilon)
            embedding_prob = 1 - cos_sim_tensor.item()
    test_embedding_probs.append(embedding_prob)

# With no test pairs there is nothing to predict; skipping the fold loop
# leaves all_fold_meta_test_probs empty instead of failing on an empty frame.
if not test_data_loaded:
    print("Warning: no test pairs were loaded; skipping per-fold prediction.")
fold_indices = range(NUM_FOLDS) if test_data_loaded else range(0)

for fold_num in fold_indices:
    print(f"Making predictions with Fold {fold_num + 1} models...")
    current_voting_clf = trained_voting_clfs[fold_num]
    current_sequence_model = trained_sequence_models[fold_num]
    current_meta_learner = trained_meta_learners[0] # The single meta-learner trained on OOF

    # --- Classical prediction: one batched call over all test rows ---
    classical_probs = current_voting_clf.predict_proba(test_features_df)[:, 1]

    # --- Transformer prediction: one pair at a time, gradients disabled ---
    current_sequence_model.eval()
    transformer_probs = []
    for entry in tqdm(test_data_loaded, leave=False):
        inputs = tokenizer(entry['raw1'], entry['raw2'], return_tensors="pt", truncation=True, padding=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            logits = current_sequence_model(**inputs).logits
        # Probability of class 1 for the single pair in this batch.
        transformer_probs.append(torch.softmax(logits, dim=1)[0, 1].item())

    # Same column names and order as the meta-learner's training frame.
    fold_meta_df = pd.DataFrame({
        'classical_prob': classical_probs,
        'transformer_prob': transformer_probs,
        'embedding_prob': test_embedding_probs,
    })
    fold_meta_probs = current_meta_learner.predict_proba(fold_meta_df)[:, 1]
    all_fold_meta_test_probs.append(fold_meta_probs) # One array per fold

# Average the per-fold meta-learner probabilities for each test sample.
# Every array in all_fold_meta_test_probs follows the order of test_data_loaded.
if all_fold_meta_test_probs:
    final_test_meta_probs_avg = np.mean(all_fold_meta_test_probs, axis=0)
else:
    print("Warning: No test predictions generated by any fold.")
    final_test_meta_probs_avg = np.zeros(len(test_data_loaded)) # Fallback to zeros

# A length mismatch would silently pair predictions with the wrong ids.
if len(final_test_meta_probs_avg) != len(test_data_loaded):
    raise ValueError(
        f"Got {len(final_test_meta_probs_avg)} averaged predictions "
        f"for {len(test_data_loaded)} loaded test pairs."
    )

# Build the submission rows.
# The training target was encoded as `0 if real_text_id == 1 else 1`, so the
# predicted probability is for class 1, i.e. real_text_id == 2. The mapping
# below is the inverse of that encoding.
# NOTE(review): confirm against the competition's submission format.
for entry, final_prob in zip(test_data_loaded, final_test_meta_probs_avg):
    # The numeric id is the part of the directory name after the last '_'.
    idx_number = int(entry['id_dir'].split('_')[-1])
    real_text_id = 2 if final_prob > 0.5 else 1
    submission_rows.append({"id": idx_number, "real_text_id": real_text_id})

# Explicit columns keep the frame well-formed (and sortable) even when there
# are no rows at all.
submission_df = pd.DataFrame(submission_rows, columns=["id", "real_text_id"])
submission_df.sort_values("id", inplace=True)
submission_df.to_csv("submission.csv", index=False)
print("✅ submission.csv generated.")