In [None]:
!pip install tensorflow
!pip install -U imbalanced-learn
!pip uninstall scikit-learn imbalanced-learn sklearn-compat -y
pip install scikit-learn==1.4.2 imbalanced-learn==0.12.0

In [7]:
!pip install --upgrade transformers datasets accelerate

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=2.0.0->accelerate)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)


In [None]:
pip install scikit-learn==1.4.2 imbalanced-learn==0.12.0

In [5]:
# -*- coding: utf-8 -*-
"""
Diplomacy Message Classification - Training Script V3.1 (GRU Removed)
Models: TF-IDF+SMOTE+(ML Models+Ensemble) AND LSTM with GloVe.
Based on SENDER LABELS. Focus on minority F1 score.

**MODIFIED: Removed GRU model.**
**MODIFIED: Added SMOTE, Threshold Tuning, Custom Ensemble.**
**MODIFIED: Fine-tuning embeddings (trainable=True).**
"""

import json
import gc
import pandas as pd
import numpy as np
import re
import os
import time
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import sys # For checking executable path

# ML Imports
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier # Removed for brevity
from sklearn.ensemble import RandomForestClassifier, VotingClassifier # Removed GB for brevity
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score, precision_recall_curve
from sklearn.utils import resample, class_weight as sk_class_weight

# Imbalance Handling
from imblearn.over_sampling import SMOTE

# DL Imports
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer as KerasTokenizer # Alias to avoid confusion
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
# Removed GRU from layers import
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping as KerasEarlyStopping # Alias
# import matplotlib.pyplot as plt # Optional for plotting

print(f"TensorFlow Version: {tf.__version__}")
# Make sure imblearn is installed: pip install -U imbalanced-learn

# --- Configuration ---
# File Paths
# Single dataset root: when the data lives elsewhere, change only this line.
DATA_DIR = "/kaggle/input/nlpdata"  # <--- CHECK/REPLACE
TRAIN_FILE_PATH = f"{DATA_DIR}/train.jsonl"
VALIDATION_FILE_PATH = f"{DATA_DIR}/validation.jsonl"
TEST_FILE_PATH = f"{DATA_DIR}/test.jsonl"
# The vector width of this file must equal EMBEDDING_DIM below; rows of any other width are skipped on load.
GLOVE_EMBEDDING_PATH = f"{DATA_DIR}/glove.6B.100d.txt"
SAVE_DIR = "trained_models_v3.1_smote_lstm_ensemble"  # Updated save dir name

# General Parameters
RANDOM_STATE = 42  # Shared seed for resampling, splitting, SMOTE and the sklearn models
VALIDATION_SET_SIZE = 0.15  # Fraction of the balanced data held out for validation/threshold tuning
NUM_TEST_EXAMPLES_TO_SHOW = 5

# TF-IDF / ML Parameters
TFIDF_MAX_FEATURES = 5000
NGRAM_RANGE = (1, 2)  # Unigrams and bigrams

# Deep Learning Parameters
VOCAB_SIZE = 10000  # Tokenizer num_words and number of rows in the embedding matrix
MAX_LENGTH = 100  # Sequences are padded/truncated to this many tokens
EMBEDDING_DIM = 100
LSTM_UNITS = 64
DROPOUT_RATE = 0.4  # Used for both dropout and recurrent_dropout of the LSTM
SPATIAL_DROPOUT_RATE = 0.4
DL_EPOCHS = 6
DL_BATCH_SIZE = 64
DL_EARLY_STOPPING_PATIENCE = 3

# --- NLTK Data Check ---
# Probe each resource and download it only when the local lookup fails.
try:
    stopwords.words('english')
except LookupError:
    print("NLTK stopwords download.")
    nltk.download('stopwords', quiet=True)
try:
    word_tokenize("test sentence")
except LookupError:
    print("NLTK punkt download.")
    # Older NLTK releases load the tokenizer from 'punkt'; newer ones look for
    # 'punkt_tab' instead. Fetch both so the tokenizer works on either version.
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)

# --- Helper Functions ---
def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed values.

    If ``file_path`` does not exist, two fallback locations are tried before
    giving up: ``/kaggle/input/<file name>`` and
    ``/kaggle/input/<parent dir name>/<file name>``.

    Args:
        file_path: Path to the ``.jsonl`` file.

    Returns:
        list: One parsed JSON value per well-formed line, in file order.
        Blank lines are ignored silently; malformed lines are skipped with a
        printed warning that carries their 1-based line number.

    Raises:
        FileNotFoundError: If the file is found neither at ``file_path`` nor
            at either fallback location.
        RuntimeError: If opening or reading the file fails; the underlying
            exception is chained as ``__cause__``.
    """
    data = []
    if not os.path.exists(file_path):
        # Attempt common Kaggle path structure
        base_name = os.path.basename(file_path)
        dir_name = os.path.basename(os.path.dirname(file_path))
        kaggle_path_guess1 = f"/kaggle/input/{base_name}"
        kaggle_path_guess2 = f"/kaggle/input/{dir_name}/{base_name}"
        print(f"File not found at '{file_path}'. Trying Kaggle paths...")
        if os.path.exists(kaggle_path_guess1):
            print(f"Found at: '{kaggle_path_guess1}'")
            file_path = kaggle_path_guess1
        elif os.path.exists(kaggle_path_guess2):
            print(f"Found at: '{kaggle_path_guess2}'")
            file_path = kaggle_path_guess2
        else:
            raise FileNotFoundError(f"❌ Data file not found at '{file_path}' or likely Kaggle paths ('{kaggle_path_guess1}', '{kaggle_path_guess2}').")
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, start=1):
                # Blank/whitespace-only lines (e.g. a trailing newline) are not
                # records; skip them without emitting a decode warning.
                if not line.strip():
                    continue
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"Warn: Skip JSON line {line_num}: {e}\nLine: {line.strip()}")
        print(f"✅ Loaded {len(data)} entries from {file_path}.")
        return data
    except Exception as e:
        # Chain explicitly so the original traceback stays attached as the cause.
        raise RuntimeError(f"❌ Error loading {file_path}: {e}") from e

def data_to_dataframe(jsonl_data):
    """Flatten per-game records into one row per message, labelled by SENDER_LABELS.

    Args:
        jsonl_data: Iterable of per-game records. A usable record is a dict
            with ``"messages"`` and ``"sender_labels"`` lists of equal length.
            ``"game_id"`` is optional (defaults to ``UNKNOWN_<index>``), as is
            ``"relative_message_index"`` (defaults to ``0..n-1`` when missing,
            not a list, or of the wrong length).

    Returns:
        pandas.DataFrame: Columns ``game_id``, ``relative_index``,
        ``messages`` and ``labels``. A label is 1 when its string form equals
        ``"true"`` case-insensitively and 0 for anything else (so a numeric
        ``1`` maps to 0). An empty DataFrame is returned when no record is
        usable. Unusable records are skipped with a printed warning.
    """
    all_messages = []
    all_sender_labels = []
    game_ids = []
    relative_indices = []
    print("Processing data using 'sender_labels' as ground truth...")
    for i, game_data in enumerate(jsonl_data):
        # Validate the record type BEFORE touching it: calling .get() on a
        # non-dict would raise AttributeError instead of skipping the record.
        if not isinstance(game_data, dict) or "messages" not in game_data or "sender_labels" not in game_data:
            print(f"Warn: Skip game (idx {i}): Lacks 'messages' or 'sender_labels'. Keys: {list(game_data.keys()) if isinstance(game_data, dict) else 'Not a dict'}")
            continue
        game_id = game_data.get("game_id", f"UNKNOWN_{i}")
        messages = game_data["messages"]
        sender_labels = game_data["sender_labels"]
        if not isinstance(messages, list) or not isinstance(sender_labels, list):
            print(f"Warn: 'messages'/'sender_labels' not lists in game {game_id}. Skip.")
            continue
        num_messages = len(messages)
        num_labels = len(sender_labels)
        if num_messages != num_labels:
            print(f"Warn: Mismatch msg({num_messages})/sender_label({num_labels}) count for game {game_id}. Skip.")
            continue
        # Use provided relative index or generate default
        rel_idx = game_data.get("relative_message_index", list(range(num_messages)))
        if not isinstance(rel_idx, list) or len(rel_idx) != num_messages:
            print(f"Warn: Mismatch/Invalid relative index for game {game_id}. Using default range.")
            rel_idx = list(range(num_messages))

        all_messages.extend(messages)
        all_sender_labels.extend(sender_labels)
        game_ids.extend([game_id] * num_messages)
        relative_indices.extend(rel_idx)

    if not all_messages:
        print("Warn: No valid messages found in data.")
        return pd.DataFrame()

    df = pd.DataFrame({"game_id": game_ids, "relative_index": relative_indices, "messages": all_messages, "labels": all_sender_labels})
    # Ensure 'labels' are consistently 0 or 1
    df['labels'] = df['labels'].apply(lambda x: 1 if str(x).lower() == 'true' else 0)
    print(f"DataFrame created with {len(df)} messages (using sender_labels).")
    if not df.empty:
        print(f"Label distribution (sender):\n{df['labels'].value_counts(normalize=True)}")
    return df

def preprocess_text(text):
    """Normalize a message: lowercase it, then strip URLs, HTML tags and non-letters.

    Non-string input is converted with ``str()`` first. The substitutions run
    in a fixed order (URLs, tags, non-alphabetic characters, whitespace runs)
    and the result is trimmed of leading/trailing whitespace.
    """
    cleaned = (text if isinstance(text, str) else str(text)).lower()
    # (pattern, replacement, flags) applied one after another, in this order.
    substitutions = (
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),  # URLs
        (r'<.*?>', '', 0),                               # HTML tags
        (r'[^a-zA-Z\s]', '', 0),                         # anything but letters/whitespace
        (r'\s+', ' ', 0),                                # collapse whitespace runs
    )
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()

def load_glove_embeddings(embedding_path, embedding_dim=None):
    """Load GloVe-style word vectors from a text file into a dictionary.

    Each usable line is a token followed by ``embedding_dim`` whitespace
    separated floats. Lines with any other number of fields (including blank
    lines) and lines whose values cannot be parsed as floats are skipped.

    Args:
        embedding_path: Path to the embeddings text file.
        embedding_dim: Expected vector width. Defaults to the module-level
            ``EMBEDDING_DIM`` when omitted.

    Returns:
        dict | None: Mapping of token to ``float32`` NumPy vector, or ``None``
        when the file is missing or cannot be read (an error is printed).
    """
    if embedding_dim is None:
        embedding_dim = EMBEDDING_DIM
    print(f"Loading GloVe embeddings from: {embedding_path}")
    embeddings_index = {}
    try:
        with open(embedding_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                # Token + vector must have exactly embedding_dim + 1 fields.
                # This also rejects blank lines, which would otherwise raise
                # IndexError below and abort the whole load.
                if len(values) != embedding_dim + 1:
                    continue
                try:
                    embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
                except ValueError:
                    # Not a valid word vector (non-numeric fields); ignore the line.
                    continue
        print(f"✅ Found {len(embeddings_index)} word vectors with dim {embedding_dim}.")
        if not embeddings_index:
            print("Warning: No embeddings loaded. Check path and dimensions.")
        return embeddings_index
    except FileNotFoundError:
        print(f"❌ Error: GloVe file not found: {embedding_path}. Update GLOVE_EMBEDDING_PATH.")
        return None
    except Exception as e:
        print(f"❌ Error loading GloVe embeddings: {e}")
        return None

# --- DL Model Definition Functions ---
def build_lstm_model(vocab_size, embedding_dim, max_length, embedding_matrix=None):
    """Build and compile a bidirectional LSTM binary classifier.

    Architecture: Embedding -> SpatialDropout1D -> Bidirectional(LSTM) ->
    Dense(1, sigmoid), compiled with binary cross-entropy and Adam (lr=0.001).

    Args:
        vocab_size: ``input_dim`` of the embedding layer.
        embedding_dim: ``output_dim`` of the embedding layer.
        max_length: ``input_length`` of the embedding layer.
        embedding_matrix: Optional initial embedding weights; when given they
            are loaded into the embedding layer and remain trainable. When
            ``None`` the embeddings are trained from scratch.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    print("Building LSTM Model...")
    model = Sequential(name="LSTM_Model")
    if embedding_matrix is not None:
        print("Using pre-trained GloVe embeddings (fine-tuning enabled).")
        model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=True))
    else:
        print("Training embeddings from scratch.")
        model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length, trainable=True))
    model.add(SpatialDropout1D(SPATIAL_DROPOUT_RATE))
    model.add(Bidirectional(LSTM(LSTM_UNITS, dropout=DROPOUT_RATE, recurrent_dropout=DROPOUT_RATE)))
    model.add(Dense(1, activation='sigmoid'))
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    # NOTE(review): the model has a single sigmoid output, so class_id=0 selects
    # that only output column. These metrics therefore appear to track the
    # positive label (1), not label 0, despite the '_truthful' names. The names
    # are kept unchanged so existing history/monitor keys keep working - confirm
    # against the Keras metrics documentation before relying on them.
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy',
                                                                            tf.keras.metrics.Recall(class_id=0, name='recall_truthful'),
                                                                            tf.keras.metrics.Precision(class_id=0, name='precision_truthful')])
    # summary() prints the table itself and returns None, so do not wrap it in print().
    model.summary()
    return model

# --- build_gru_model function REMOVED ---

# --- Evaluation Helper ---
def print_evaluation_metrics(y_true, y_pred, model_name, target_names, threshold=0.5):
    """Print accuracy, a classification report and a confusion matrix for binary predictions.

    Args:
        y_true: Ground-truth labels (0/1), any array-like.
        y_pred: Predicted hard labels (0/1) with the same number of elements.
        model_name: Name shown in the printed header and in error messages.
        target_names: Display names; index 0 names label 0, index 1 names label 1.
        threshold: Only echoed in the header. Predictions are NOT re-thresholded here.

    Returns:
        dict: Always contains ``accuracy``, ``report``, ``confusion_matrix``,
        ``f1_truthful``, ``f1_deceptive`` and ``f1_macro``. On success
        ``report`` is the classification-report dict and ``confusion_matrix``
        the 2x2 matrix for labels ``[0, 1]``. On any failure (empty input,
        length mismatch, report error) ``report`` is ``"Error"``,
        ``confusion_matrix`` is ``None`` and the F1 values are 0.0.
    """
    def _failure_result(acc):
        # Single source for the failure payload so every exit path has the same keys.
        return {"accuracy": acc, "report": "Error", "confusion_matrix": None,
                "f1_truthful": 0.0, "f1_deceptive": 0.0, "f1_macro": 0.0}

    print(f"\n--- Evaluation: {model_name} (Threshold: {threshold:.3f}) ---") # Increased precision for threshold
    # Flatten to 1-D so column-shaped (n, 1) arrays can be compared with (n,) labels.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    if len(y_true) == 0 or len(y_pred) == 0:
        print("Error: Empty true labels or predictions array.")
        return _failure_result(0.0)
    if len(y_true) != len(y_pred):
        print(f"Error: Label ({len(y_true)}) and prediction ({len(y_pred)}) length mismatch.")
        return _failure_result(0.0)

    accuracy = accuracy_score(y_true, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    try:
        # Check unique labels present in BOTH y_true and y_pred for report generation
        unique_labels = np.unique(np.concatenate((y_true, y_pred)))
        report_labels = [0, 1] # Assume standard binary case for report

        if len(unique_labels) < 2:
            print(f"Warn: Only one class ({unique_labels}) present in true/pred. Report may be partial.")
            # Adjust target names if only one class is predicted/present
            report_labels = unique_labels.tolist()
            current_target_names = [target_names[int(l)] for l in report_labels if l < len(target_names)]
            if not current_target_names:
                current_target_names = [f"Class {l}" for l in report_labels]
        else:
            current_target_names = target_names # Use standard names for binary case

        report_text = classification_report(y_true, y_pred, labels=report_labels, target_names=current_target_names, zero_division=0, output_dict=False)
        report_dict = classification_report(y_true, y_pred, labels=report_labels, target_names=current_target_names, zero_division=0, output_dict=True)
        print(report_text)

        # Safely extract F1 scores, defaulting to 0.0 if class 0 or 1 not in report_dict
        truth_f1 = report_dict.get(target_names[0], {}).get('f1-score', 0.0)
        lie_f1 = report_dict.get(target_names[1], {}).get('f1-score', 0.0)
        macro_f1 = report_dict.get('macro avg', {}).get('f1-score', 0.0)
        weighted_f1 = report_dict.get('weighted avg', {}).get('f1-score', 0.0)

        print("\nKey F1-Scores:")
        print(f"  F1 Score [{target_names[0]}]: {truth_f1:.4f}")
        print(f"  F1 Score [{target_names[1]}]: {lie_f1:.4f}")
        print(f"  F1 Score [Macro Avg]:     {macro_f1:.4f}")
        print(f"  F1 Score [Weighted Avg]:  {weighted_f1:.4f}")

        print("\nConfusion Matrix:")
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1]) # Force labels for standard TN/FP layout
        tn, fp, fn, tp = cm.ravel()
        print(f"            Predicted:")
        print(f"            {target_names[0]:<12} {target_names[1]:<12}")
        print(f"Actual {target_names[0]:<10} [{tn:<12} {fp:<12}] (TN, FP)")
        print(f"Actual {target_names[1]:<10} [{fn:<12} {tp:<12}] (FN, TP)")
        return {"accuracy": accuracy, "report": report_dict, "confusion_matrix": cm, "f1_truthful": truth_f1, "f1_deceptive": lie_f1, "f1_macro": macro_f1}

    except Exception as e:
        print(f"❌ Error generating classification report/CM for {model_name}: {e}")
        # Accuracy was already computed successfully above; reuse it rather than
        # recomputing behind a bare except.
        return _failure_result(accuracy)

def find_best_threshold(y_true, y_proba, target_class_index=0, metric='f1', class_names=None):
    """Pick the decision threshold that maximizes the F1 score of one class.

    Predictions are formed as ``y_proba >= threshold`` (label 1 when true), so
    ``y_proba`` is treated as the score for label 1. Candidates are 99 evenly
    spaced values in [0.01, 0.99] plus exactly 0.5; ties keep the earliest
    candidate.

    Args:
        y_true: Ground-truth binary labels.
        y_proba: Scores for label 1, same length as ``y_true``.
        target_class_index: Label whose F1 score is maximized (default 0).
        metric: Currently unused - F1 is always optimized. Kept so existing
            callers that pass it keep working.
        class_names: Optional display names indexed by label, used only in the
            progress message. When omitted, a module-level ``target_names`` is
            used if one exists; otherwise a generic name is printed.

    Returns:
        tuple: ``(best_threshold, best_score)``. ``(0.5, -1)`` is returned when
        tuning is not possible: length mismatch, non-finite scores, a single
        class in ``y_true``, or a precision-recall curve that cannot be computed.
    """
    y_true = np.asarray(y_true)
    y_proba = np.asarray(y_proba)

    if len(y_true) != len(y_proba):
        print(f"Warn: Label ({len(y_true)}) and probability ({len(y_proba)}) length mismatch in find_best_threshold. Using default 0.5.")
        return 0.5, -1
    if not np.all(np.isfinite(y_proba)):
        print("Warn: Non-finite values (NaN/inf) found in probabilities. Using default 0.5 threshold.")
        return 0.5, -1
    if len(np.unique(y_true)) < 2:
        print(f"Warn: Only one class found in y_true for threshold tuning. Using default 0.5 threshold.")
        return 0.5, -1 # Cannot calculate PR curve with only one class

    try:
        # The curve itself is not used; the call is kept as an input sanity
        # check so inputs it rejects still yield the default threshold.
        precision_recall_curve(y_true, y_proba)
    except ValueError as prc_e:
        print(f"Warn: Could not calculate precision-recall curve: {prc_e}. Using default 0.5 threshold.")
        return 0.5, -1

    # Resolve a display name without requiring a global to exist: the original
    # global lookup raised NameError whenever `target_names` was not defined.
    if class_names is None:
        class_names = globals().get("target_names")
    try:
        class_label = class_names[target_class_index]
    except (TypeError, IndexError, KeyError):
        class_label = f"class {target_class_index}"
    print(f"Finding best threshold for class {target_class_index} ({class_label}) using F1 score...")

    best_threshold = 0.5 # Default
    best_score = -1
    # The grid may miss 0.5 by a rounding error, so the exact default is
    # appended and evaluated last; it only wins if strictly better.
    candidates = list(np.linspace(0.01, 0.99, 99)) + [0.5]
    for threshold_candidate in candidates:
        y_pred_tuned = (y_proba >= threshold_candidate).astype(int)
        current_f1 = f1_score(y_true, y_pred_tuned, pos_label=target_class_index, average='binary', zero_division=0)
        if current_f1 > best_score:
            best_score = current_f1
            best_threshold = threshold_candidate

    print(f"Best threshold found: {best_threshold:.3f} (Maximizes F1 for class {target_class_index} at {best_score:.4f})")
    return best_threshold, best_score

# --- Main Execution ---
if __name__ == "__main__":

    # Ensure target names are defined early and consistently
    target_names = ["Truthful (0)", "Deceptive (1)"]

    # --- 1. Load Data ---
    print("--- Loading Data ---")
    try:
        train_data = load_jsonl(TRAIN_FILE_PATH)
        val_data = load_jsonl(VALIDATION_FILE_PATH)
        train_val_data = train_data + val_data
        test_data = load_jsonl(TEST_FILE_PATH)
        del train_data, val_data
        gc.collect()
    except (FileNotFoundError, RuntimeError) as e:
        print(e)
        exit(1) # Exit if essential data is missing
    if not train_val_data:
        print("❌ Error: No train/validation data loaded. Exiting.")
        exit(1)
    if not test_data:
        print("Warning: No test data loaded. Evaluation on test set will be skipped.")

    # --- 2. Prepare DataFrames ---
    print("\n--- Preparing DataFrames (using sender_labels) ---")
    train_val_df = data_to_dataframe(train_val_data)
    test_df = data_to_dataframe(test_data) if test_data else pd.DataFrame()
    del train_val_data, test_data # Free memory
    gc.collect()
    if train_val_df.empty:
        print("❌ Error: Train+Validation DataFrame is empty after processing. Exiting.")
        exit(1)

    # --- 3. Initial Balancing & Train/Validation Split ---
    print("\n--- Initial Balancing & Train/Validation Split ---")
    label_counts = train_val_df['labels'].value_counts()
    train_df_balanced = pd.DataFrame() # Initialize

    if len(label_counts) < 2:
        print("Warning: Only one class found in the combined train/val data. Skipping initial balancing.")
        train_df_balanced = train_val_df.copy()
    else:
        majority_class_label = label_counts.idxmax()
        minority_class_label = label_counts.idxmin()
        majority_count = label_counts.max()
        minority_count = label_counts.min()

        if minority_count <= 0:
             print(f"Error: Minority class ({minority_class_label}) has zero samples. Cannot balance.")
             train_df_balanced = train_val_df.copy()
        elif majority_count == minority_count:
            print("Data is already balanced. Skipping initial upsampling.")
            train_df_balanced = train_val_df.copy()
        else:
            print(f"Initial Upsampling minority class ({minority_class_label}) from {minority_count} to {majority_count}.")
            df_minority = train_val_df[train_val_df.labels == minority_class_label]
            df_majority = train_val_df[train_val_df.labels == majority_class_label]
            df_minority_upsampled = resample(df_minority, replace=True, n_samples=majority_count, random_state=RANDOM_STATE)
            train_df_balanced = pd.concat([df_majority, df_minority_upsampled])
            del df_minority, df_majority, df_minority_upsampled
            gc.collect()

    train_df_balanced = train_df_balanced.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
    print(f"Initially balanced dataset size: {len(train_df_balanced)}")
    del train_val_df
    gc.collect()

    df_train, df_val = train_test_split(
        train_df_balanced,
        test_size=VALIDATION_SET_SIZE,
        random_state=RANDOM_STATE,
        stratify=train_df_balanced['labels']
    )
    print(f"Final Train Set Size: {len(df_train)}, Validation Set Size: {len(df_val)}")
    print(f"Train label distribution:\n{df_train['labels'].value_counts(normalize=True)}")
    print(f"Validation label distribution:\n{df_val['labels'].value_counts(normalize=True)}")
    del train_df_balanced
    gc.collect()

    train_texts_raw = df_train['messages'].tolist()
    y_train = np.array(df_train['labels'].tolist())
    val_texts_raw = df_val['messages'].tolist()
    y_val = np.array(df_val['labels'].tolist())
    test_texts_raw = []
    y_test = np.array([])
    if not test_df.empty:
        test_texts_raw = test_df['messages'].tolist()
        y_test = np.array(test_df['labels'].tolist())
    else:
        print("Warning: Test DataFrame is empty or was not loaded.")
    print(f"Data shapes: Train Text={len(train_texts_raw)}, Train Labels={y_train.shape}")
    print(f"             Val Text={len(val_texts_raw)}, Val Labels={y_val.shape}")
    print(f"             Test Text={len(test_texts_raw)}, Test Labels={y_test.shape}")

    # --- 4. Text Preprocessing ---
    print("\n--- Text Preprocessing ---")
    start_preprocess = time.time()
    train_texts_processed = [preprocess_text(text) for text in train_texts_raw]
    val_texts_processed = [preprocess_text(text) for text in val_texts_raw]
    test_texts_processed = [preprocess_text(text) for text in test_texts_raw] if test_texts_raw else []
    print(f"Text preprocessing took {time.time() - start_preprocess:.2f} seconds.")
    del train_texts_raw, val_texts_raw, test_texts_raw
    gc.collect()

    # --- Initialize Result Storage ---
    ml_models = {}
    ml_results = {}
    ml_probas_val = {}
    ml_probas_test = {}
    vectorizer = None
    dl_models = {} # Will store LSTM model
    dl_results = {}
    dl_probas_val = {}
    dl_probas_test = {}
    keras_tokenizer = None
    embedding_matrix = None
    best_thresholds = {}
    custom_ensemble_results = {}
    avg_proba_test = None
    avg_threshold = 0.5

    # =====================================================
    # == Section A: TF-IDF + ML Models Pipeline          ==
    # =====================================================
    print("\n" + "="*50 + "\n== Section A: TF-IDF + ML Models Pipeline\n" + "="*50)
    X_train_ml, y_train_ml = None, None
    X_val_ml, y_val_ml = None, None
    X_test_ml = None

    try:
        # --- A.1 TF-IDF ---
        print("\n--- A.1 Feature Extraction (TF-IDF) ---")
        start_time = time.time()
        vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES, ngram_range=NGRAM_RANGE, stop_words='english', min_df=5)
        print("Fitting TF-IDF on training data...")
        X_train_tfidf = vectorizer.fit_transform(train_texts_processed)
        print("Transforming validation and test data...")
        X_val_ml = vectorizer.transform(val_texts_processed)
        y_val_ml = y_val
        X_test_ml = vectorizer.transform(test_texts_processed) if test_texts_processed else None
        print(f"TF-IDF transformation took {time.time() - start_time:.2f}s.")
        print(f"TF-IDF Shapes: Train={X_train_tfidf.shape}, Val={X_val_ml.shape}", end="")
        if X_test_ml is not None: print(f", Test={X_test_ml.shape}")
        else: print(", No test data.")

        # --- A.2 Apply SMOTE ---
        print("\n--- A.2 Applying SMOTE to TF-IDF Training Data ---")
        start_time = time.time()
        unique_train_labels, counts_train_labels = np.unique(y_train, return_counts=True)
        min_class_count_train = counts_train_labels.min() if len(counts_train_labels)>0 else 0
        if len(unique_train_labels) < 2 or min_class_count_train < 6:
             print(f"Warn: Skipping SMOTE (only one class or minority count < 6).")
             X_train_ml, y_train_ml = X_train_tfidf, y_train
        else:
             print("Applying SMOTE...")
             smote = SMOTE(random_state=RANDOM_STATE, n_jobs=-1)
             try:
                 X_train_ml, y_train_ml = smote.fit_resample(X_train_tfidf, y_train)
                 print(f"SMOTE took {time.time() - start_time:.2f}s. Shape: {X_train_ml.shape}")
                 print(f"Label distribution after SMOTE:\n{pd.Series(y_train_ml).value_counts(normalize=True)}")
             except Exception as smote_err:
                 print(f"❌ Error during SMOTE: {smote_err}. Using original training data for ML.")
                 X_train_ml, y_train_ml = X_train_tfidf, y_train
        del X_train_tfidf
        gc.collect()

        # --- A.3 Train/Evaluate ML Models ---
        print("\n--- A.3 Training and Evaluating ML Models ---")
        classifiers = {
            "Logistic Regression": LogisticRegression(random_state=RANDOM_STATE, max_iter=1000, solver='liblinear'),
            "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=100, n_jobs=-1),
            "Multinomial NB": MultinomialNB(),
        }
        for name, model in classifiers.items():
            print(f"\nTraining {name}...")
            start_time = time.time()
            try:
                model.fit(X_train_ml, y_train_ml)
                print(f"✅ Training completed in {time.time() - start_time:.2f} seconds.")
                ml_models[name] = model
                if X_test_ml is not None and len(y_test) > 0:
                    preds_test = model.predict(X_test_ml)
                    results = print_evaluation_metrics(y_test, preds_test, f"{name} @0.5 (Test)", target_names, threshold=0.5)
                    ml_results[name] = results
                else:
                    print(f"Skipping TEST set evaluation for {name}.")
                    ml_results[name] = None
                if hasattr(model, "predict_proba"):
                    try:
                        ml_probas_val[name] = model.predict_proba(X_val_ml)[:, 1]
                        if X_test_ml is not None: ml_probas_test[name] = model.predict_proba(X_test_ml)[:, 1]
                        else: ml_probas_test[name] = None
                    except Exception as pp_e:
                        print(f"Warn: Could not get predict_proba for {name}: {pp_e}")
                        ml_probas_val[name], ml_probas_test[name] = None, None
                else:
                    print(f"Warn: {name} does not support predict_proba.")
                    ml_probas_val[name], ml_probas_test[name] = None, None
            except Exception as e:
                print(f"❌ Error training/evaluating {name}: {e}")
                ml_models[name], ml_results[name], ml_probas_val[name], ml_probas_test[name] = None, None, None, None
                gc.collect()

        # --- A.4 Voting Ensemble ---
        print("\n--- A.4 Setting up and Evaluating ML Voting Ensemble ---")
        estimators = []
        for name, model in ml_models.items():
            if model is not None and ml_probas_val.get(name) is not None: # Check probas exist for soft voting
                estimators.append((name, model))
            else: print(f"Warn: Excluding '{name}' from soft voting ensemble.")
        if len(estimators) >= 2:
            print(f"Creating Soft Voting Ensemble with: {', '.join([n for n,_ in estimators])}")
            ensemble_soft = VotingClassifier(estimators=estimators, voting='soft', n_jobs=-1)
            print("Fitting Soft Voting Ensemble...")
            start_time = time.time()
            try:
                ensemble_soft.fit(X_train_ml, y_train_ml)
                print(f"✅ Ensemble fitting completed in {time.time() - start_time:.2f} seconds.")
                ml_models["Ensemble (Soft)"] = ensemble_soft
                if X_test_ml is not None and len(y_test) > 0:
                    ensemble_preds_test = ensemble_soft.predict(X_test_ml)
                    results = print_evaluation_metrics(y_test, ensemble_preds_test, "Ensemble (Soft) @0.5 (Test)", target_names, threshold=0.5)
                    ml_results["Ensemble (Soft)"] = results
                else:
                    print("Skipping TEST evaluation for ML Ensemble.")
                    ml_results["Ensemble (Soft)"] = None
                try:
                    ml_probas_val["Ensemble (Soft)"] = ensemble_soft.predict_proba(X_val_ml)[:, 1]
                    if X_test_ml is not None: ml_probas_test["Ensemble (Soft)"] = ensemble_soft.predict_proba(X_test_ml)[:, 1]
                    else: ml_probas_test["Ensemble (Soft)"] = None
                except Exception as pp_e:
                    print(f"Warn: Could not get predict_proba for ML Ensemble: {pp_e}")
                    ml_probas_val["Ensemble (Soft)"], ml_probas_test["Ensemble (Soft)"] = None, None
            except Exception as e:
                print(f"❌ Error fitting/evaluating ML Ensemble: {e}")
                ml_models["Ensemble (Soft)"], ml_results["Ensemble (Soft)"], ml_probas_val["Ensemble (Soft)"], ml_probas_test["Ensemble (Soft)"] = None, None, None, None
                gc.collect()
        else:
            print("Skipping ML Voting Ensemble (need >= 2 valid base models).")
            ml_models["Ensemble (Soft)"], ml_results["Ensemble (Soft)"], ml_probas_val["Ensemble (Soft)"], ml_probas_test["Ensemble (Soft)"] = None, None, None, None

    except Exception as pipeline_e:
        print(f"\n❌❌❌ ERROR in TF-IDF/ML Pipeline: {pipeline_e} ❌❌❌")
    finally:
        del X_train_ml, y_train_ml, X_val_ml, y_val_ml, X_test_ml
        gc.collect()

    # =====================================================
    # == Section B: Deep Learning Model Pipeline (LSTM Only) ==
    # =====================================================
    # Tokenises and pads the preprocessed texts, builds an embedding matrix from
    # GloVe vectors (when they load), trains the LSTM with class weights and
    # early stopping, then collects its probabilities for threshold tuning
    # (Section C) and the custom ensemble (Section D). Any failure is reported
    # and leaves dl_models["LSTM"] as None so later sections can skip the model.
    print("\n" + "="*50 + "\n== Section B: Deep Learning Model Pipeline (LSTM Only)\n" + "="*50)
    X_train_dl, X_val_dl, X_test_dl = None, None, None  # pre-bound so the `del` in `finally` cannot fail

    try:
        # --- B.1 Tokenization & Padding ---
        print("\n--- B.1 Tokenization & Padding for LSTM ---")
        start_time = time.time()
        keras_tokenizer = KerasTokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
        # The vocabulary is fitted on the training texts only; val/test reuse it.
        keras_tokenizer.fit_on_texts(train_texts_processed)
        word_index = keras_tokenizer.word_index
        print(f"Found {len(word_index)} unique tokens in Keras tokenizer.")
        train_sequences = keras_tokenizer.texts_to_sequences(train_texts_processed)
        val_sequences = keras_tokenizer.texts_to_sequences(val_texts_processed)
        test_sequences = keras_tokenizer.texts_to_sequences(test_texts_processed) if test_texts_processed else []
        X_train_dl = pad_sequences(train_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')
        X_val_dl = pad_sequences(val_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')
        X_test_dl = pad_sequences(test_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post') if test_sequences else None
        print(f"Keras Tokenization/Padding took {time.time() - start_time:.2f}s.")
        print(f"Padded Shapes: Train={X_train_dl.shape}, Val={X_val_dl.shape}", end="")
        if X_test_dl is not None: print(f", Test={X_test_dl.shape}")
        else: print(", No test data.")
        del train_sequences, val_sequences, test_sequences
        gc.collect()

        # --- B.2 Load Embeddings & Create Matrix ---
        print("\n--- B.2 Preparing GloVe Embedding Matrix for LSTM ---")
        start_time = time.time()
        embeddings_index = load_glove_embeddings(GLOVE_EMBEDDING_PATH)
        if embeddings_index:
            # Rows without a GloVe vector stay all-zero.
            embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
            hits, misses = 0, 0
            for word, i in word_index.items():
                # The matrix only has VOCAB_SIZE rows; higher indices are skipped.
                if i >= VOCAB_SIZE: continue
                embedding_vector = embeddings_index.get(word)
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
                    hits += 1
                else:
                    misses += 1
            print(f"Embedding matrix created in {time.time() - start_time:.2f}s. {hits} hits, {misses} misses.")
            del embeddings_index; gc.collect()
        else:
            print("GloVe embeddings failed. LSTM trains embeddings from scratch.")
            embedding_matrix = None

        # --- B.3 Define Class Weights & Early Stopping ---
        print("\n--- B.3 Calculating Class Weights & Setting Early Stopping ---")
        unique_classes, class_counts = np.unique(y_train, return_counts=True)
        dl_class_weight_dict = None
        if len(unique_classes) == 2:
            dl_class_weights_computed = sk_class_weight.compute_class_weight('balanced', classes=unique_classes, y=y_train)
            dl_class_weight_dict = dict(zip(unique_classes, dl_class_weights_computed))
            print(f"Calculated DL Class Weights: {dl_class_weight_dict}")
        else: print("Warn: Not using class weights (y_train not binary).")
        dl_early_stopping = KerasEarlyStopping(monitor='val_loss', patience=DL_EARLY_STOPPING_PATIENCE, restore_best_weights=True, verbose=1)

        # --- B.4 Train & Evaluate LSTM ---
        print("\n--- B.4 Training LSTM Model ---")
        lstm_model = build_lstm_model(VOCAB_SIZE, EMBEDDING_DIM, MAX_LENGTH, embedding_matrix)
        start_time = time.time()
        history_lstm = lstm_model.fit(X_train_dl, y_train, epochs=DL_EPOCHS, batch_size=DL_BATCH_SIZE,
                                      validation_data=(X_val_dl, y_val),
                                      callbacks=[dl_early_stopping], class_weight=dl_class_weight_dict, verbose=1)
        train_time = time.time() - start_time
        stopped_epoch_lstm = dl_early_stopping.stopped_epoch
        print(f"✅ LSTM Training completed in {train_time:.2f} seconds (stopped epoch: {stopped_epoch_lstm if stopped_epoch_lstm > 0 else 'Not stopped early'}).")
        dl_models["LSTM"] = lstm_model # Store trained model

        # Evaluate LSTM on the test set at a fixed 0.5 cut-off.
        if X_test_dl is not None and len(y_test) > 0:
            print("\nEvaluating LSTM on Test Set...")
            start_time = time.time()
            dl_probas_test["LSTM"] = lstm_model.predict(X_test_dl, verbose=0).flatten()
            test_preds_lstm_05 = (dl_probas_test["LSTM"] > 0.5).astype(int)
            print(f"LSTM Prediction took {time.time() - start_time:.2f}s.")
            results = print_evaluation_metrics(y_test, test_preds_lstm_05, "LSTM @0.5 (Test)", target_names, threshold=0.5)
            dl_results["LSTM"] = results
        else:
            print("Skipping LSTM evaluation on test set.")
            dl_results["LSTM"], dl_probas_test["LSTM"] = None, None

        # Validation probabilities drive threshold tuning in Section C and only
        # need the validation split, so they are collected whether or not test
        # data exists (previously they were dropped when the test set was empty).
        if X_val_dl is not None and len(X_val_dl) > 0:
            print("Getting LSTM probabilities on Validation Set...")
            dl_probas_val["LSTM"] = lstm_model.predict(X_val_dl, verbose=0).flatten()
        else:
            dl_probas_val["LSTM"] = None

    except Exception as pipeline_e:
        print(f"\n❌❌❌ ERROR in DL (LSTM) Pipeline: {pipeline_e} ❌❌❌")
        if "LSTM" not in dl_models: dl_models["LSTM"] = None # Ensure marked as None if error before storage
    finally:
        # Release the padded arrays and the embedding matrix.
        del X_train_dl, X_val_dl, X_test_dl
        if 'embedding_matrix' in locals(): del embedding_matrix
        gc.collect()

    # =====================================================
    # == Section C: Threshold Tuning (using Validation Set) ==
    # =====================================================
    # For every model that produced validation probabilities, ask
    # find_best_threshold for a decision threshold and record it in
    # best_thresholds. Models without usable probabilities fall back to 0.5.
    print("\n" + "="*50 + "\n== Section C: Threshold Tuning (using Validation Set)\n" + "="*50)
    # Combine probabilities from ALL models that produced validation probabilities.
    # On a key clash the DL entry wins because it is unpacked last.
    all_val_probas = {**ml_probas_val, **dl_probas_val}

    if not y_val.size:
        print("Validation set (y_val) is empty. Skipping threshold tuning.")
    elif not all_val_probas:
        print("No model probabilities available for validation set. Skipping threshold tuning.")
    else:
        print(f"Finding best thresholds for models: {list(all_val_probas.keys())}")
        for name, y_proba_val in all_val_probas.items():
            # Missing, non-array or empty probabilities: keep the default cut-off.
            if y_proba_val is None or not isinstance(y_proba_val, np.ndarray) or not len(y_proba_val):
                print(f"Skipping threshold tuning for '{name}' (no valid probabilities).")
                best_thresholds[name] = 0.5
                continue
            # Probabilities must line up one-to-one with the validation labels.
            if len(y_proba_val) != len(y_val):
                print(f"Warn: Skipping threshold tuning for '{name}'. Proba length mismatch.")
                best_thresholds[name] = 0.5
                continue

            print(f"\nTuning threshold for {name}...")
            # find_best_threshold is defined elsewhere in this file; presumably it
            # returns (threshold, score) for the requested class/metric — TODO confirm.
            # best_f1 is not used again in this section.
            best_thresh, best_f1 = find_best_threshold(y_val, y_proba_val, target_class_index=0, metric='f1')
            best_thresholds[name] = best_thresh

    # Printed even when tuning was skipped; in that case it shows whatever
    # best_thresholds already held.
    print("\nBest Thresholds found (optimizing F1 for Truthful class on Validation Set):")
    for name, thresh in best_thresholds.items(): print(f"  {name:<25}: {thresh:.3f}")


    # =====================================================
    # == Section D: Custom Ensemble (ML+LSTM Average Prob) == # <--- Modified Title
    # =====================================================
    print("\n" + "="*50 + "\n== Section D: Custom Ensemble (ML+LSTM Average Prob)\n" + "="*50) # <--- Modified Title

    # Combine TEST probabilities from the models that take part in the custom
    # ensemble: the soft-voting ML ensemble and the LSTM.
    test_probas_to_average = []
    valid_model_names_for_avg = []
    # Bound up front so Section F can always read them, whichever branch runs below.
    avg_proba_test = None
    avg_threshold = 0.5

    # A member only counts when it produced a non-empty probability array.
    for member_name, member_probas in (("Ensemble (Soft)", ml_probas_test.get("Ensemble (Soft)")),
                                       ("LSTM", dl_probas_test.get("LSTM"))):
        if isinstance(member_probas, np.ndarray) and len(member_probas) > 0:
            test_probas_to_average.append(member_probas)
            valid_model_names_for_avg.append(member_name)

    if len(test_probas_to_average) >= 2 and len(y_test) > 0:
        # Every member must score exactly the test rows, otherwise the
        # element-wise mean would pair up different messages.
        lengths = [len(probas) for probas in test_probas_to_average]
        if all(l == len(y_test) for l in lengths):
            avg_proba_test = np.mean(np.array(test_probas_to_average), axis=0)
            # The ensemble cut-off is the mean of the members' tuned thresholds
            # (0.5 for a member that was never tuned).
            avg_threshold = float(np.mean([best_thresholds.get(member_name, 0.5)
                                           for member_name in valid_model_names_for_avg]))
            print(f"\nCustom ensemble averages {valid_model_names_for_avg} (threshold: {avg_threshold:.3f}).")
        else:
            print(f"\nSkipping custom ensemble (probability lengths {lengths} do not match {len(y_test)} test labels).")
    else:
        print("\nSkipping custom ensemble (need >= 2 models with valid test probabilities).")
    


    # =====================================================
    # == Section E: Saving Artifacts                       ==
    # =====================================================
    # Persists everything the inference script loads back: the TF-IDF
    # vectorizer, each ML model, the Keras tokenizer, the LSTM and the tuned
    # thresholds. Each artifact is saved in its own try/except so one failure
    # does not stop the others from being written.
    print("\n" + "="*50 + "\n== Section E: Saving Artifacts\n" + "="*50)
    # Despite the name, this flag only records that SAVE_DIR is usable.
    models_saved = False
    try:
        os.makedirs(SAVE_DIR, exist_ok=True)
        print(f"Using save directory: {SAVE_DIR}")
        models_saved = True
    except OSError as e:
        print(f"❌ Error creating/accessing directory {SAVE_DIR}: {e}.")

    if models_saved:
        # Save TF-IDF Vectorizer
        # NOTE(review): `vectorizer` is not assigned in the visible part of the
        # script; this assumes an earlier section always binds it — confirm.
        if vectorizer:
            try: joblib.dump(vectorizer, os.path.join(SAVE_DIR, "tfidf_vectorizer.joblib")); print(f"✅ Saved vectorizer")
            except Exception as e: print(f"❌ Error saving vectorizer: {e}")
        else: print("ℹ️ Vectorizer not available, skipping save.")

        # Save ML Models
        print("\nSaving ML Models...")
        for name, model in ml_models.items():
            if model is not None:
                # File name: spaces become underscores, parentheses are dropped.
                sanitized_name = name.replace(' ', '_').replace('(', '').replace(')', '')
                try: joblib.dump(model, os.path.join(SAVE_DIR, f"ml_{sanitized_name}.joblib")); print(f"  ✅ Saved ML model '{name}'")
                except Exception as e: print(f"  ❌ Error saving ML model '{name}': {e}")
            else: print(f"  ℹ️ ML Model '{name}' not available, skipping save.")

        # Save DL Components
        print("\nSaving DL (LSTM) Components...")
        # NOTE(review): keras_tokenizer is first assigned inside Section B's try;
        # if that section failed before the assignment this line raises NameError
        # unless the name is initialised earlier in the script — confirm.
        if keras_tokenizer:
            try: joblib.dump(keras_tokenizer, os.path.join(SAVE_DIR, "keras_tokenizer.joblib")); print(f"  ✅ Saved Keras tokenizer")
            except Exception as e: print(f"  ❌ Error saving Keras tokenizer: {e}")
        else: print(f"  ℹ️ Keras Tokenizer not available, skipping save.")

        # Section B stores None under "LSTM" when training failed, so .get() covers
        # both a missing key and a failed model.
        if dl_models.get("LSTM"):
            try: dl_models["LSTM"].save(os.path.join(SAVE_DIR, "lstm_model.keras")); print(f"  ✅ Saved LSTM model")
            except Exception as e: print(f"  ❌ Error saving LSTM model: {e}")
        else: print(f"  ℹ️ LSTM Model not available, skipping save.")

        # GRU Saving block removed

        # Save Best Thresholds
        print("\nSaving Best Thresholds...")
        # An empty dict is treated as "nothing to save".
        if best_thresholds:
            try: joblib.dump(best_thresholds, os.path.join(SAVE_DIR, "best_thresholds.joblib")); print(f"  ✅ Saved best thresholds")
            except Exception as e: print(f"  ❌ Error saving thresholds: {e}")
        else: print(f"  ℹ️ Best thresholds dictionary empty, skipping save.")

    else: print("\nArtifact saving skipped.")


    # =====================================================
    # == Section F: Example Predictions                  ==
    # =====================================================
    # Prints, for the first NUM_TEST_EXAMPLES_TO_SHOW test messages, the true
    # label next to each model's prediction at its tuned threshold.
    print("\n" + "="*50 + "\n== Section F: Example Predictions on Test Set\n" + "="*50)

    # Examples are only shown when the test frame has at least
    # NUM_TEST_EXAMPLES_TO_SHOW rows; a smaller test set shows none.
    if not test_df.empty and len(y_test) > 0 and len(test_df) >= NUM_TEST_EXAMPLES_TO_SHOW:
        print(f"\nShowing predictions for first {NUM_TEST_EXAMPLES_TO_SHOW} test messages using BEST thresholds:")
        example_df = test_df.head(NUM_TEST_EXAMPLES_TO_SHOW)
        pred_to_label = {0: target_names[0], 1: target_names[1]}
        # Positional offsets of the head() rows, used to index the probability arrays.
        example_orig_locs = list(range(NUM_TEST_EXAMPLES_TO_SHOW))

        # Pre-fetch probabilities for the examples if available.
        # A model is included only when it has probabilities covering every example row.
        # NOTE(review): max(example_orig_locs) raises ValueError if
        # NUM_TEST_EXAMPLES_TO_SHOW is 0 — confirm that value is never configured.
        example_ml_probas = {name: ml_probas_test.get(name, np.array([]))[example_orig_locs]
                             for name in ml_models if ml_probas_test.get(name) is not None and len(ml_probas_test.get(name, [])) > max(example_orig_locs)}
        example_dl_probas = {name: dl_probas_test.get(name, np.array([]))[example_orig_locs]
                             for name in dl_models if dl_probas_test.get(name) is not None and len(dl_probas_test.get(name, [])) > max(example_orig_locs)}
        # avg_proba_test must already be bound (to an array or None) by Section D.
        example_avg_probas = avg_proba_test[example_orig_locs] if avg_proba_test is not None and len(avg_proba_test) > max(example_orig_locs) else None

        for i, index in enumerate(example_df.index):
            current_loc = example_orig_locs[i]
            row = example_df.loc[index]
            message = row['messages']
            true_label_int = row['labels']
            true_label_str = pred_to_label[true_label_int]

            print(f"\n--- Test Example (Index: {index}, Loc: {current_loc}) ---")
            print(f"  True Label (Sender): {true_label_str}")
            # Only the first 250 characters are shown; "..." is appended regardless of length.
            print(f"  Message: '{message[:250]}...'")
            print("  Model Predictions (using best thresholds):")

            # ML Predictions
            for name in ml_models:
                if name in example_ml_probas and i < len(example_ml_probas[name]):
                    try:
                        proba_class1 = example_ml_probas[name][i]
                        # Models that were never tuned fall back to 0.5.
                        threshold = best_thresholds.get(name, 0.5)
                        pred_int = (proba_class1 >= threshold).astype(int)
                        pred_label = pred_to_label.get(pred_int, "ERR")
                        print(f"    {name:<20}: {pred_label} (Prob: {proba_class1:.3f}, Thr: {threshold:.3f})")
                    except Exception as e: print(f"    {name:<20}: Error getting prediction - {e}")
                # Model exists but has no usable test probabilities.
                elif ml_models.get(name) is not None: print(f"    {name:<20}: Probas N/A")

            # LSTM Prediction
            if dl_models.get("LSTM") and "LSTM" in example_dl_probas and i < len(example_dl_probas["LSTM"]):
                 try:
                     proba_class1 = example_dl_probas["LSTM"][i]
                     threshold = best_thresholds.get("LSTM", 0.5)
                     pred_int = (proba_class1 >= threshold).astype(int)
                     pred_label = pred_to_label.get(pred_int, "ERR")
                     print(f"    {'LSTM':<20}: {pred_label} (Prob: {proba_class1:.3f}, Thr: {threshold:.3f})")
                 except Exception as e: print(f"    {'LSTM':<20}: Error predicting - {e}")
            elif dl_models.get("LSTM"): print(f"    {'LSTM':<20}: Probas N/A")

            # GRU Prediction block removed

            # Custom Ensemble Prediction
            if example_avg_probas is not None and i < len(example_avg_probas):
                try:
                    proba_class1 = example_avg_probas[i]
                    # avg_threshold is expected to come from Section D; if it is not
                    # bound, the except below reports the error for each example.
                    pred_int = (proba_class1 >= avg_threshold).astype(int)
                    pred_label = pred_to_label.get(pred_int, "ERR")
                    print(f"    {'CUSTOM ENSEMBLE':<20}: {pred_label} (AvgProb: {proba_class1:.3f}, Thr: {avg_threshold:.3f})")
                except Exception as e: print(f"    {'CUSTOM ENSEMBLE':<20}: Error predicting - {e}")
            else: print(f"    {'CUSTOM ENSEMBLE':<20}: Avg Probas N/A")

    elif not y_test.size: print("\nTest data is empty, cannot show examples.")
    else: print(f"\nTest DataFrame has < {NUM_TEST_EXAMPLES_TO_SHOW} samples or other issue.")

    # =====================================================
    # == Final Cleanup                                   ==
    # =====================================================
    # Drops the script's references to the large objects and runs the garbage
    # collector once at the end.
    print("\n--- Cleaning up main script memory ---")
    # These names are deleted unconditionally, so they must all still be bound here.
    del y_train, y_val, y_test
    del train_texts_processed, val_texts_processed, test_texts_processed
    # The remaining names may not exist on every path (example_df, for instance,
    # is only created when Section F shows examples), hence the guards.
    if 'df_train' in locals(): del df_train
    if 'df_val' in locals(): del df_val
    if 'test_df' in locals(): del test_df
    if 'example_df' in locals(): del example_df
    if 'vectorizer' in locals(): del vectorizer
    if 'ml_models' in locals(): del ml_models
    if 'dl_models' in locals(): del dl_models # Removes LSTM ref
    if 'keras_tokenizer' in locals(): del keras_tokenizer
    # GRU model already removed
    gc.collect()
    print("\n--- Full Training Script V3.1 (GRU Removed) Finished ---")

TensorFlow Version: 2.18.0
--- Loading Data ---
✅ Loaded 189 entries from /kaggle/input/nlpdata/train.jsonl.
✅ Loaded 21 entries from /kaggle/input/nlpdata/validation.jsonl.
✅ Loaded 42 entries from /kaggle/input/nlpdata/test.jsonl.

--- Preparing DataFrames (using sender_labels) ---
Processing data using 'sender_labels' as ground truth...
DataFrame created with 14548 messages (using sender_labels).
Label distribution (sender):
labels
1    0.955527
0    0.044473
Name: proportion, dtype: float64
Processing data using 'sender_labels' as ground truth...
DataFrame created with 2741 messages (using sender_labels).
Label distribution (sender):
labels
1    0.912441
0    0.087559
Name: proportion, dtype: float64

--- Initial Balancing & Train/Validation Split ---
Initial Upsampling minority class (0) from 647 to 13901.
Initially balanced dataset size: 27802
Final Train Set Size: 23631, Validation Set Size: 4171
Train label distribution:
labels
1    0.500021
0    0.499979
Name: proportion, dtyp



SMOTE took 4.76s. Shape: (23632, 5000)
Label distribution after SMOTE:
0    0.5
1    0.5
Name: proportion, dtype: float64

--- A.3 Training and Evaluating ML Models ---

Training Logistic Regression...
✅ Training completed in 0.09 seconds.

--- Evaluation: Logistic Regression @0.5 (Test) (Threshold: 0.500) ---
Accuracy: 0.8282

Classification Report:
               precision    recall  f1-score   support

 Truthful (0)       0.14      0.19      0.16       240
Deceptive (1)       0.92      0.89      0.90      2501

     accuracy                           0.83      2741
    macro avg       0.53      0.54      0.53      2741
 weighted avg       0.85      0.83      0.84      2741


Key F1-Scores:
  F1 Score [Truthful (0)]: 0.1604
  F1 Score [Deceptive (1)]: 0.9043
  F1 Score [Macro Avg]:     0.5324
  F1 Score [Weighted Avg]:  0.8392

Confusion Matrix:
            Predicted:
            Truthful (0) Deceptive (1)
Actual Truthful (0) [45           195         ] (TN, FP)
Actual Deceptive (1) 



None
Epoch 1/6
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 229ms/step - accuracy: 0.5668 - loss: 0.6777 - precision_truthful: 0.5692 - recall_truthful: 0.5850 - val_accuracy: 0.6310 - val_loss: 0.6343 - val_precision_truthful: 0.6529 - val_recall_truthful: 0.5592
Epoch 2/6
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 226ms/step - accuracy: 0.6471 - loss: 0.6243 - precision_truthful: 0.6483 - recall_truthful: 0.6343 - val_accuracy: 0.7387 - val_loss: 0.5141 - val_precision_truthful: 0.7516 - val_recall_truthful: 0.7127
Epoch 3/6
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 225ms/step - accuracy: 0.7438 - loss: 0.5170 - precision_truthful: 0.7514 - recall_truthful: 0.7285 - val_accuracy: 0.8655 - val_loss: 0.3293 - val_precision_truthful: 0.9015 - val_recall_truthful: 0.8206
Epoch 4/6
[1m370/370[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 224ms/step - accuracy: 0.8299 - loss: 0.3794 - precision_truthful: 0.841

In [8]:
# -*- coding: utf-8 -*-
"""
Diplomacy Message Classification - Inference Script for V3.1 Models
Loads artifacts from V3.1 training (TF-IDF/ML, LSTM) and predicts on the test set.
Calculates overall evaluation metrics and shows examples based on TRUE labels.

**MODIFIED: Example selection now based on TRUE labels, not predicted labels.**
"""

import json
import gc
import pandas as pd
import numpy as np
import re
import os
import time
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import sys
import warnings

# Suppress warnings
# Only UserWarning raised from the joblib module is silenced; other warnings still show.
warnings.filterwarnings("ignore", category=UserWarning, module='joblib')
# Set ahead of the `import tensorflow` further down.
# NOTE(review): level '2' is documented to hide WARNING as well as INFO logs — confirm that is intended.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' # Suppress TensorFlow INFO messages

# ML Imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
from sklearn.exceptions import NotFittedError

# DL Imports
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer as KerasTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model as load_keras_model

# --- Configuration ---
# !!! MUST MATCH the V3.1 training script outputs !!!
# Directory the artifacts are loaded from (relative to the working directory).
SAVE_DIR = "trained_models_v3.1_smote_lstm_ensemble" # <--- CHECK/SET THIS PATH !!!
# JSONL file evaluated by this script; load_jsonl also tries Kaggle fallbacks if it is missing.
TEST_FILE_PATH = "/kaggle/input/nlpdata/test.jsonl" # <--- CHECK/SET THIS PATH (Test data to evaluate)

# Parameters needed for loading/padding (should match V3.1 training)
# Passed as `maxlen` when padding test sequences for the LSTM.
DL_MAX_LENGTH = 100

# Target names for output labels, indexed by class id (0, 1).
# NOTE(review): data_to_dataframe maps sender_labels 'true' -> 1 and everything
# else -> 0. If 'true' means the sender was truthful, class 1 is the truthful
# class and these two display names are swapped — confirm against the dataset.
TARGET_NAMES = ["Truthful (0)", "Deceptive (1)"]
# Number of example messages to print — presumably used by the example section
# further down (not visible here).
NUM_EXAMPLES_TO_SHOW = 5

# --- Preprocessing Function (identical to training V3.1) ---
# Load the English stopword list, downloading the NLTK corpus on first use.
# NOTE(review): preprocess_text below does not reference STOPWORDS; check
# whether anything else in this cell still needs it.
try:
    STOPWORDS = set(stopwords.words('english'))
except LookupError:
    print("NLTK stopwords not found. Downloading...")
    nltk.download('stopwords', quiet=True)
    STOPWORDS = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise one message to lowercase ASCII letters separated by single spaces.

    Non-string input is converted with ``str()`` first. The clean-up steps run
    in a fixed order: strip URLs, strip ``<...>`` tags, drop every character
    that is not a letter or whitespace, then collapse whitespace runs and trim.
    """
    cleaned = (text if isinstance(text, str) else str(text)).lower()
    # (pattern, replacement, flags) applied in order; later steps rely on earlier ones.
    substitutions = (
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),  # URLs
        (r'<.*?>', '', 0),                               # HTML-like tags
        (r'[^a-zA-Z\s]', '', 0),                         # digits, punctuation, symbols
        (r'\s+', ' ', 0),                                # whitespace runs -> one space
    )
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()

# --- Helper Functions (Copied from Training Script) ---
def load_jsonl(file_path):
    """Load a JSON Lines file into a list of decoded objects.

    If ``file_path`` does not exist, two Kaggle fallbacks are tried in order:
    ``/kaggle/input/<basename>`` and ``/kaggle/input/<parent dir>/<basename>``.
    Blank lines are ignored. Lines that are not valid JSON are skipped with a
    warning naming their 1-based line number.

    Args:
        file_path: Path to the ``.jsonl`` file.

    Returns:
        list: One decoded object per valid line, in file order.

    Raises:
        FileNotFoundError: Neither ``file_path`` nor a Kaggle fallback exists.
        RuntimeError: Any other failure while opening or reading the file
            (the original exception is chained as ``__cause__``).
    """
    data = []
    if not os.path.exists(file_path):
        base_name = os.path.basename(file_path)
        dir_name = os.path.basename(os.path.dirname(file_path))
        kg1 = f"/kaggle/input/{base_name}"
        kg2 = f"/kaggle/input/{dir_name}/{base_name}"
        print(f"File not found: '{file_path}'. Trying Kaggle paths...")
        if os.path.exists(kg1):
            print(f"Found: '{kg1}'")
            file_path = kg1
        elif os.path.exists(kg2):
            print(f"Found: '{kg2}'")
            file_path = kg2
        else:
            raise FileNotFoundError(f"❌ Data file not found: '{file_path}' or Kaggle paths.")
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f, start=1):
                # A blank or trailing empty line is not a malformed record.
                if not line.strip():
                    continue
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"Warn: Skip JSON line {line_no}: {e}")
        print(f"✅ Loaded {len(data)} entries from {file_path}.")
        return data
    except Exception as e:
        # Callers catch RuntimeError; chain the cause so the traceback keeps it.
        raise RuntimeError(f"❌ Error loading {file_path}: {e}") from e

def data_to_dataframe(jsonl_data):
    """Flatten per-game JSONL records into one row per message, labelled by sender.

    A record is skipped with a warning when it is not a dict, lacks
    ``messages`` or ``sender_labels``, holds non-list values for them, or the
    two lists differ in length. A missing or malformed
    ``relative_message_index`` is replaced by ``0..n-1``.

    Args:
        jsonl_data: Iterable of decoded game records.

    Returns:
        pandas.DataFrame: Columns ``game_id``, ``relative_index``, ``messages``
        and ``labels``, where ``labels`` is 1 when the sender label reads
        ``true`` (case-insensitive, via ``str()``) and 0 otherwise. An empty
        DataFrame is returned when no record is usable.
    """
    msgs, labels, gids, ridxs = [], [], [], []
    print("Processing data using 'sender_labels'...")
    for i, gdata in enumerate(jsonl_data):
        # Validate the record type before calling any dict method on it, so a
        # stray non-dict entry is skipped instead of raising AttributeError.
        if not isinstance(gdata, dict) or "messages" not in gdata or "sender_labels" not in gdata:
            print(f"Warn: Skip game {i}: Invalid data.")
            continue
        gid = gdata.get("game_id", f"UNK_{i}")
        m, sl = gdata["messages"], gdata["sender_labels"]
        if not isinstance(m, list) or not isinstance(sl, list):
            print(f"Warn: Game {gid} msgs/labels not lists.")
            continue
        n_m, n_l = len(m), len(sl)
        if n_m != n_l:
            print(f"Warn: Game {gid} mismatch ({n_m}/{n_l}).")
            continue
        ri = gdata.get("relative_message_index", list(range(n_m)))
        if not isinstance(ri, list) or len(ri) != n_m:
            print(f"Warn: Game {gid} bad rel_idx.")
            ri = list(range(n_m))
        msgs.extend(m)
        labels.extend(sl)
        gids.extend([gid] * n_m)
        ridxs.extend(ri)
    if not msgs:
        print("Warn: No valid messages found.")
        return pd.DataFrame()
    df = pd.DataFrame({"game_id": gids, "relative_index": ridxs, "messages": msgs, "labels": labels})
    # Labels may arrive as booleans or strings; compare their text form.
    df['labels'] = df['labels'].apply(lambda x: 1 if str(x).lower()=='true' else 0)
    print(f"DataFrame created with {len(df)} messages.")
    if not df.empty: print(f"Label distribution:\n{df['labels'].value_counts(normalize=True)}")
    return df

def print_evaluation_metrics(y_true, y_pred, model_name, target_names, threshold=0.5):
    """Print accuracy, classification report, key F1 scores and confusion matrix.

    Args:
        y_true: True class ids (0/1), array-like.
        y_pred: Predicted class ids (0/1), array-like of the same length.
        model_name: Label used in the printed header.
        target_names: Two display names, indexed by class id.
        threshold: Threshold that produced ``y_pred``; printed only, it is not
            applied here.

    Returns:
        dict: Always contains ``accuracy``, ``report``, ``f1_truthful``,
        ``f1_deceptive`` and ``f1_macro``. On invalid input, or when the
        report cannot be computed, ``report`` is ``"Error"`` and the F1 values
        are 0.0. The invalid-input result also keeps the legacy short keys
        (``acc``, ``f1_T``, ``f1_D``, ``f1_M``) it used to return.
    """
    print(f"\n--- Evaluation: {model_name} (Threshold Used: {threshold:.3f}) ---")
    y_t, y_p = np.asarray(y_true), np.asarray(y_pred)
    if len(y_t) != len(y_p) or len(y_t) == 0:
        print("Error: Invalid labels/preds.")
        # Same keys as the success path so callers can read them without
        # special-casing failures; legacy short keys kept for compatibility.
        return {"accuracy": 0.0, "report": "Error",
                "f1_truthful": 0.0, "f1_deceptive": 0.0, "f1_macro": 0.0,
                "acc": 0.0, "f1_T": 0.0, "f1_D": 0.0, "f1_M": 0.0}
    acc = accuracy_score(y_t, y_p)
    print(f"Accuracy: {acc:.4f}")
    print("\nClassification Report:")
    try:
        # labels=[0,1] keeps both classes in the report even if one is absent.
        cr_txt = classification_report(y_t, y_p, labels=[0,1], target_names=target_names, zero_division=0)
        print(cr_txt)
        cr_dict = classification_report(y_t, y_p, labels=[0,1], target_names=target_names, zero_division=0, output_dict=True)
        f1_t = cr_dict.get(target_names[0],{}).get('f1-score',0.0)
        f1_d = cr_dict.get(target_names[1],{}).get('f1-score',0.0)
        f1_m = cr_dict.get('macro avg',{}).get('f1-score',0.0)
        f1_w = cr_dict.get('weighted avg',{}).get('f1-score',0.0)
        print("\nKey F1-Scores:")
        print(f" F1[{target_names[0]}]: {f1_t:.4f}")
        print(f" F1[{target_names[1]}]: {f1_d:.4f}")
        print(f" F1[Macro]: {f1_m:.4f}")
        print(f" F1[Weighted]: {f1_w:.4f}")
        print("\nConfusion Matrix:")
        cm = confusion_matrix(y_t, y_p, labels=[0,1])
        # Rows are actual classes and columns predicted ones, in [0, 1] order.
        tn, fp, fn, tp = cm.ravel()
        print(f"      Pred:{target_names[0]:<12}{target_names[1]:<12}")
        print(f"Act {target_names[0]:<5} [{tn:<12}{fp:<12}]")
        print(f"Act {target_names[1]:<5} [{fn:<12}{tp:<12}]")
        return {"accuracy":acc, "report":cr_dict, "f1_truthful":f1_t, "f1_deceptive":f1_d, "f1_macro":f1_m}
    except Exception as e:
        print(f"❌ Metrics Error: {e}")
        return {"accuracy": acc, "report": "Error",
                "f1_truthful": 0.0, "f1_deceptive": 0.0, "f1_macro": 0.0}


# --- Artifact Loading Function (V3.1 specific) ---
def load_artifacts_v3_1(save_dir):
    """Load the V3.1 training artifacts found in ``save_dir``.

    The TF-IDF vectorizer, the Keras tokenizer and the tuned thresholds are
    treated as critical: if any of them fails to load, the returned flag is
    False. ML models and the LSTM are optional and only produce a message
    when they cannot be loaded.

    Args:
        save_dir: Directory written by the training script.

    Returns:
        tuple: ``(artifacts, critical_load_ok)`` where ``artifacts`` has the
        keys ``tfidf_vectorizer``, ``ml_models`` (dict keyed by model name),
        ``keras_tokenizer``, ``lstm_model`` and ``best_thresholds`` (an empty
        dict when the thresholds file could not be loaded).
    """
    print(f"\n--- Loading Artifacts (V3.1) from: {save_dir} ---")
    artifacts = {
        "tfidf_vectorizer": None,
        "ml_models": {},
        "keras_tokenizer": None,
        "lstm_model": None,
        "best_thresholds": None,
    }
    critical_load_ok = True
    # Same file-name rule as the save step: spaces -> '_', parentheses removed.
    filename_table = str.maketrans({' ': '_', '(': None, ')': None})

    # NOTE: joblib files are pickles; only load artifacts from a trusted directory.

    # 1. TF-IDF Vectorizer (critical)
    try:
        artifacts["tfidf_vectorizer"] = joblib.load(os.path.join(save_dir, "tfidf_vectorizer.joblib"))
        print(f"  (+) TF-IDF Vectorizer")
    except Exception as load_err:
        print(f"  (-) Error loading TF-IDF Vectorizer: {load_err}")
        critical_load_ok = False

    # 2. ML Models (optional; a missing file is only informational)
    for model_name in ("Logistic Regression", "Random Forest", "Multinomial NB", "Ensemble (Soft)"):
        file_stem = model_name.translate(filename_table)
        try:
            artifacts["ml_models"][model_name] = joblib.load(os.path.join(save_dir, f"ml_{file_stem}.joblib"))
            print(f"  (+) ML Model: {model_name}")
        except FileNotFoundError:
            print(f"  (-) Info: ML Model '{model_name}' not found.")
        except Exception as load_err:
            print(f"  (-) Warn: Loading ML Model '{model_name}' failed: {load_err}")

    # 3. Keras Tokenizer (critical)
    try:
        artifacts["keras_tokenizer"] = joblib.load(os.path.join(save_dir, "keras_tokenizer.joblib"))
        print(f"  (+) Keras Tokenizer")
    except Exception as load_err:
        print(f"  (-) Error loading Keras Tokenizer: {load_err}")
        critical_load_ok = False

    # 4. LSTM Model (optional)
    try:
        artifacts["lstm_model"] = load_keras_model(os.path.join(save_dir, "lstm_model.keras"))
        print(f"  (+) LSTM Model")
    except Exception as load_err:
        print(f"  (-) Warn: Loading LSTM Model failed: {load_err}")

    # 5. Best Thresholds (critical; an empty dict is stored when loading fails)
    try:
        artifacts["best_thresholds"] = joblib.load(os.path.join(save_dir, "best_thresholds.joblib"))
        print(f"  (+) Best Thresholds: {artifacts['best_thresholds']}")
    except Exception as load_err:
        print(f"  (-) Error loading Best Thresholds: {load_err}. Using default 0.5.")
        critical_load_ok = False
        artifacts["best_thresholds"] = {}

    print("--- Artifact Loading Complete ---")
    if not critical_load_ok:
        print("\n❌ Error: One or more critical artifacts failed to load.")
    return artifacts, critical_load_ok

# === Main Inference Execution ===
if __name__ == "__main__":

    print(f"Starting V3.1 Inference Script...")
    print(f"Python Executable: {sys.executable}")

    # --- Load Artifacts ---
    artifacts, artifacts_ok = load_artifacts_v3_1(SAVE_DIR)
    if not artifacts_ok:
        print("\nExiting due to critical artifact loading failure.")
        sys.exit(1)

    # --- Load Test Data ---
    print("\n--- Loading Test Data ---")
    try:
        test_data = load_jsonl(TEST_FILE_PATH)
        test_df = data_to_dataframe(test_data)
        if test_df.empty: raise ValueError("Test DataFrame is empty after loading.")
        test_texts_raw = test_df['messages'].tolist()
        y_test = np.array(test_df['labels'].tolist()) # True labels
        print(f"Loaded {len(test_texts_raw)} test messages.")
    except (FileNotFoundError, RuntimeError, ValueError) as e:
        print(f"❌ Error loading or processing test data from {TEST_FILE_PATH}: {e}")
        print("Cannot perform evaluation. Exiting.")
        sys.exit(1)
    del test_data # Free memory
    gc.collect()

    # --- Preprocess Test Data ---
    print("\n--- Preprocessing Test Data ---")
    start_time = time.time()
    test_texts_processed = [preprocess_text(text) for text in test_texts_raw]
    print(f"Preprocessing took {time.time() - start_time:.2f}s.")

    # --- Feature Extraction for Test Data ---
    print("\n--- Extracting Features for Test Data ---")
    X_test_tfidf = None
    if artifacts["tfidf_vectorizer"]:
        try:
            X_test_tfidf = artifacts["tfidf_vectorizer"].transform(test_texts_processed)
            print(f"  TF-IDF features generated. Shape: {X_test_tfidf.shape}")
        except Exception as e: print(f"  ❌ Error generating TF-IDF features: {e}")
    else: print("  Skipping TF-IDF (vectorizer not loaded).")

    X_test_keras_padded = None
    if artifacts["keras_tokenizer"]:
        try:
            test_sequences = artifacts["keras_tokenizer"].texts_to_sequences(test_texts_processed)
            X_test_keras_padded = pad_sequences(test_sequences, maxlen=DL_MAX_LENGTH, padding='post', truncating='post')
            print(f"  Keras padded sequences generated. Shape: {X_test_keras_padded.shape}")
        except Exception as e: print(f"  ❌ Error generating Keras sequences: {e}")
    else: print("  Skipping Keras padding (tokenizer not loaded).")


    # --- Generate Predictions on Full Test Set ---
    print("\n" + "="*50 + "\n== Generating Predictions for Full Test Set ==\n" + "="*50)
    all_test_preds = {}
    all_test_probas = {} # Store class 1 probabilities
    model_thresholds = artifacts.get("best_thresholds", {})

    # ML Model Predictions
    if X_test_tfidf is not None:
        for name, model in artifacts["ml_models"].items():
            if model:
                try:
                    print(f"Predicting with {name}...")
                    if hasattr(model, "predict_proba"):
                        probas = model.predict_proba(X_test_tfidf)[:, 1]
                        thresh = model_thresholds.get(name, 0.5) # Use tuned threshold or default
                        preds = (probas >= thresh).astype(int)
                        all_test_probas[name] = probas
                        all_test_preds[name] = preds
                    else:
                        preds = model.predict(X_test_tfidf)
                        all_test_preds[name] = preds
                        all_test_probas[name] = np.array([np.nan]*len(preds)) # Indicate no probability
                        print(f"  (Used predict(), no probabilities for soft ensemble from {name})")
                except Exception as e: print(f"  ❌ Error predicting with {name}: {e}")
            else: print(f"  Skipping {name} (not loaded).")

    # LSTM Model Prediction
    if X_test_keras_padded is not None and artifacts["lstm_model"]:
        try:
            print(f"Predicting with LSTM...")
            probas = artifacts["lstm_model"].predict(X_test_keras_padded, verbose=0).flatten()
            thresh = model_thresholds.get("LSTM", 0.5)
            preds = (probas >= thresh).astype(int)
            all_test_probas["LSTM"] = probas
            all_test_preds["LSTM"] = preds
        except Exception as e: print(f"  ❌ Error predicting with LSTM: {e}")

    # Calculate Ensemble Prediction
    # The "ENSEMBLE" score is the unweighted mean of the class-1 probabilities
    # of the selected models; it is thresholded at the mean of those models'
    # saved thresholds (0.5 for any model without one).
    # NOTE(review): the averaged threshold is a heuristic -- nothing visible
    # here tunes a threshold on the averaged probabilities themselves.
    print("\nCalculating Ensemble Prediction...")

    def _has_valid_probas(model_name):
        """Return True if `model_name` has a stored probability array without NaNs."""
        stored = all_test_probas.get(model_name)
        return stored is not None and not np.isnan(stored).any()

    valid_probas_list = []
    valid_model_names_for_avg = []
    # Define models eligible for probability averaging.
    # Prefer the saved "Ensemble (Soft)" model when it produced probabilities.
    if _has_valid_probas("Ensemble (Soft)"):
        valid_probas_list.append(all_test_probas["Ensemble (Soft)"])
        valid_model_names_for_avg.append("Ensemble (Soft)")
    else:
        # Fall back to the individual base models whenever "Ensemble (Soft)"
        # cannot contribute. Previously the fallback only ran when that model
        # was not loaded at all, so a loaded model whose prediction failed (or
        # returned NaNs) silently removed every ML model from the ensemble.
        if artifacts["ml_models"].get("Ensemble (Soft)"):
            print("  (!) 'Ensemble (Soft)' is loaded but has no valid probabilities; falling back to base ML models.")
        for name in ["Logistic Regression", "Random Forest", "Multinomial NB"]:
            if _has_valid_probas(name) and artifacts["ml_models"].get(name):
                valid_probas_list.append(all_test_probas[name])
                valid_model_names_for_avg.append(name)

    # Add LSTM if its probabilities are valid
    if _has_valid_probas("LSTM"):
        valid_probas_list.append(all_test_probas["LSTM"])
        valid_model_names_for_avg.append("LSTM")

    final_avg_threshold = 0.5 # Default; also what the evaluation section reports if no ensemble is built
    if len(valid_probas_list) >= 2:
        print(f"  Ensemble based on: {', '.join(valid_model_names_for_avg)}")
        # Ensure all arrays in list have the same length before stacking
        first_len = len(valid_probas_list[0])
        if all(len(arr) == first_len for arr in valid_probas_list):
            # Stack to (n_models, n_samples) and average over the model axis.
            avg_proba_test_full = np.mean(np.stack(valid_probas_list, axis=0), axis=0)
            relevant_thresholds = [model_thresholds.get(name, 0.5) for name in valid_model_names_for_avg]
            final_avg_threshold = np.mean(relevant_thresholds)
            ensemble_pred_full = (avg_proba_test_full >= final_avg_threshold).astype(int)
            all_test_probas["ENSEMBLE"] = avg_proba_test_full
            all_test_preds["ENSEMBLE"] = ensemble_pred_full
            print(f"  Ensemble calculated using average threshold: {final_avg_threshold:.3f}")
        else:
            print("  ❌ Error: Probability arrays have inconsistent lengths. Cannot calculate ensemble.")
            all_test_probas["ENSEMBLE"] = None
            all_test_preds["ENSEMBLE"] = None
    elif len(valid_probas_list) == 1:
        # A single usable model: reuse its probabilities and its own threshold.
        print(f"  Only one model ({valid_model_names_for_avg[0]}) produced valid probabilities. Using its prediction as 'ensemble'.")
        single_model_name = valid_model_names_for_avg[0]
        all_test_probas["ENSEMBLE"] = all_test_probas[single_model_name]
        final_avg_threshold = model_thresholds.get(single_model_name, 0.5)
        all_test_preds["ENSEMBLE"] = (all_test_probas["ENSEMBLE"] >= final_avg_threshold).astype(int)
    else:
        # No model produced usable probabilities; downstream sections check for None.
        print("  Skipping Ensemble calculation (fewer than 2 valid model probabilities).")
        all_test_probas["ENSEMBLE"] = None
        all_test_preds["ENSEMBLE"] = None


    # --- Overall Evaluation Metrics ---
    # Reports metrics for every model that produced test predictions and keeps
    # whatever print_evaluation_metrics returns in `evaluation_results`.
    print("\n\n" + "="*50 + "\n== Overall Test Set Evaluation Metrics ==\n" + "="*50)
    evaluation_results = {}
    # Report order: the base ML models, the saved soft ensemble, the LSTM, then the averaged ensemble.
    eval_order = ["Logistic Regression", "Random Forest", "Multinomial NB", "Ensemble (Soft)", "LSTM", "ENSEMBLE"]

    for model_label in eval_order:
        model_preds = all_test_preds.get(model_label)  # None for missing keys and for a failed ensemble
        if model_preds is None:
            print(f"\n--- Evaluation: {model_label} (Test Set) ---")
            print("  Skipped (predictions not available).")
            continue

        # The averaged ensemble reports its computed threshold; every other
        # model reports its saved threshold, or 0.5 when none was saved.
        if model_label == "ENSEMBLE":
            threshold_used = final_avg_threshold
        else:
            threshold_used = model_thresholds.get(model_label, 0.5)

        evaluation_results[model_label] = print_evaluation_metrics(
            y_test,
            model_preds,
            model_name=f"{model_label} (Test Set)",
            target_names=TARGET_NAMES,
            threshold=threshold_used,
        )


    # --- Display Example Predictions based on TRUE Label --- ## MODIFIED SECTION ##
    # For each true class, prints the first NUM_EXAMPLES_TO_SHOW test messages
    # together with the averaged ensemble's prediction for them.
    print("\n\n" + "="*60 + "\n== Example Predictions by ENSEMBLE (Selected by True Label) ==\n" + "="*60)

    if all_test_preds.get("ENSEMBLE") is not None:
        # One row per test message: raw text, true class, ensemble-predicted class.
        results_df = pd.DataFrame({
            'Original_Text': test_texts_raw,
            'True_Label_Int': y_test,
            'Ensemble_Pred_Int': all_test_preds["ENSEMBLE"]
        })
        # Add string labels for easier reading
        class_names = {0: TARGET_NAMES[0], 1: TARGET_NAMES[1]}
        results_df['True_Label'] = results_df['True_Label_Int'].map(class_names)
        results_df['Ensemble_Pred'] = results_df['Ensemble_Pred_Int'].map(class_names)

        snippet_limit = 300  # maximum number of characters of each message to print

        # NOTE(review): the section headings hard-code class 0 as TRUTHFUL and
        # class 1 as DECEPTIVE, while the per-row labels come from TARGET_NAMES.
        # The run log recorded with this notebook shows class 1 as ~91% of the
        # test messages; if deceptive messages are the minority class, this
        # wording is inverted -- confirm against the label encoding.
        for class_id, heading_name, readable_name in ((0, "TRUTHFUL", "Truthful"), (1, "DECEPTIVE", "Deceptive")):
            print(f"\n--- {NUM_EXAMPLES_TO_SHOW} Examples with TRUE Label = {heading_name} ---")
            class_examples = results_df[results_df['True_Label_Int'] == class_id].head(NUM_EXAMPLES_TO_SHOW)
            if class_examples.empty:
                print(f"No examples found with true label {readable_name} in the test set.")
                continue
            for i, row in class_examples.iterrows(): # i is the DataFrame index label of the row
                # str() keeps a non-string entry (e.g. NaN) from crashing the
                # slice; the ellipsis is shown only when text was actually cut.
                full_text = str(row['Original_Text'])
                suffix = "..." if len(full_text) > snippet_limit else ""
                print(f"\nExample (Index {i}):")
                print(f"  True Label:      {row['True_Label']}")
                print(f"  Ensemble Pred:   {row['Ensemble_Pred']}") # Show what the ensemble predicted
                print(f"  Original Text:   '{full_text[:snippet_limit]}{suffix}'")

    else:
        print("\nCannot show examples because Ensemble predictions were not generated.")


    # --- Final Cleanup ---
    # Drops the references to the large inference objects and then forces a
    # garbage-collection pass so their memory can be reclaimed.
    print("\n--- Cleaning up inference memory ---")
    del artifacts # Remove loaded artifacts (unconditional: it was used above, so it must exist)
    # The remaining names are deleted only if they are bound in the current
    # scope. `results_df` in particular exists only when the example-display
    # section ran with an available ensemble prediction.
    # NOTE(review): locals() reflects the enclosing scope, which is not visible
    # from this section -- confirm `test_df` is defined at this level.
    if 'test_df' in locals(): del test_df
    if 'results_df' in locals(): del results_df
    if 'X_test_tfidf' in locals(): del X_test_tfidf
    if 'X_test_keras_padded' in locals(): del X_test_keras_padded
    gc.collect()
    print("--- V3.1 Inference Script Finished ---")

Starting V3.1 Inference Script...
Python Executable: /usr/bin/python3

--- Loading Artifacts (V3.1) from: trained_models_v3.1_smote_lstm_ensemble ---
  (+) TF-IDF Vectorizer
  (+) ML Model: Logistic Regression
  (+) ML Model: Random Forest
  (+) ML Model: Multinomial NB
  (-) Info: ML Model 'Ensemble (Soft)' not found.
  (+) Keras Tokenizer
  (+) LSTM Model
  (+) Best Thresholds: {'Logistic Regression': 0.46, 'Random Forest': 0.49, 'Multinomial NB': 0.5800000000000001, 'Ensemble (Soft)': 0.5, 'LSTM': 0.23}
--- Artifact Loading Complete ---

--- Loading Test Data ---
✅ Loaded 42 entries from /kaggle/input/nlpdata/test.jsonl.
Processing data using 'sender_labels'...
DataFrame created with 2741 messages.
Label distribution:
labels
1    0.912441
0    0.087559
Name: proportion, dtype: float64
Loaded 2741 test messages.

--- Preprocessing Test Data ---
Preprocessing took 0.04s.

--- Extracting Features for Test Data ---
  TF-IDF features generated. Shape: (2741, 5000)
  Keras padded sequence