In [1]:
# Cell 1: Install Required Packages


import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TRANSFORMERS_VERBOSITY'] = 'error'

!pip install -q transformers>=4.41.0
!pip install -q accelerate>=0.25.0
!pip install -q bitsandbytes>=0.41.0
!pip install -q sentence-transformers>=2.7.0
!pip install -q faiss-cpu
!pip install -q scikit-learn>=1.3.0

print("All packages installed successfully!")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m91.2 MB/s[0m eta [36m0:00:00[0m
[?25hAll packages installed successfully!


In [2]:
# Cell 2: Import All Required Libraries


import os
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
import gc
import torch

# Suppress warnings
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TRANSFORMERS_VERBOSITY'] = 'error'

# ML Libraries
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import faiss

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Utils
from tqdm import tqdm

print("All libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")


E0000 00:00:1768109787.484406      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768109787.531781      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768109787.927437      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768109787.927470      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768109787.927473      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768109787.927475      24 computation_placer.cc:177] computation placer already registered. Please check linka

All libraries imported successfully!
PyTorch version: 2.8.0+cu126
CUDA available: True
GPU: Tesla P100-PCIE-16GB


In [3]:
# Cell 3: Load Dataset and Create Validation Split

# Dataset path - narrative-consistency-dataset uploaded to Kaggle
DATA_DIR = Path('/kaggle/input/narrative-consistency-dataset/Dataset')
BOOKS_DIR = DATA_DIR / 'Books'

print("LOADING DATA")
print("-"*80)

# Load datasets
train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')

print(f"\nTrain: {len(train)} examples")
print(f"Test: {len(test)} examples")

# Analyze training data
print("\n--- Training Data Analysis ---")
print(f"Label distribution:\n{train['label'].value_counts()}")
print(f"\nBooks: {train['book_name'].unique()}")
print(f"Unique characters: {train['char'].nunique()}")

# Create validation split (80/20 stratified)
print("\n--- Creating Validation Split ---")
train_data, val_data = train_test_split(
    train, 
    test_size=0.2, 
    stratify=train['label'], 
    random_state=42
)

print(f"Train: {len(train_data)} examples")
print(f"Validation: {len(val_data)} examples")

# Display sample
print("\n--- Sample Training Example ---")
print(train_data.iloc[0])


LOADING DATA
--------------------------------------------------------------------------------

Train: 80 examples
Test: 60 examples

--- Training Data Analysis ---
Label distribution:
label
consistent    51
contradict    29
Name: count, dtype: int64

Books: ['In Search of the Castaways' 'The Count of Monte Cristo']
Unique characters: 6

--- Creating Validation Split ---
Train: 64 examples
Validation: 16 examples

--- Sample Training Example ---
id                                                          13
book_name                           In Search of the Castaways
char                                           Jacques Paganel
caption                                                    NaN
content      His deference to Lady Glenarvan echoed the tan...
label                                               consistent
Name: 19, dtype: object


In [4]:
# Cell 4: Load Novels and Create Semantic Chunks

print("NOVEL PROCESSING & CHUNKING")
print("-"*80)

# Load novels
print("\n--- Loading Novels ---")
books = {}
book_paths = {
    'The Count of Monte Cristo': BOOKS_DIR / 'The Count of Monte Cristo.txt',
    'In Search of the Castaways': BOOKS_DIR / 'In search of the castaways.txt'
}

for book_name, path in book_paths.items():
    with open(path, 'r', encoding='utf-8') as f:
        books[book_name] = f.read()
    print(f"{book_name}: {len(books[book_name]):,} characters")

# Chunking function
def semantic_chunk(text, chunk_size=1000, overlap=150):
    """Chunk text with overlap, preserving paragraph boundaries"""
    from transformers import AutoTokenizer
    import warnings
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-base-en-v1.5")
    
    # Split into paragraphs
    paragraphs = [p for p in text.split('\n\n') if p.strip()]
    
    chunks = []
    current_chunk = []
    current_length = 0
    
    for para in paragraphs:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            para_tokens = len(tokenizer.encode(para, add_special_tokens=False))
        
        if current_length + para_tokens > chunk_size and current_chunk:
            # Save current chunk
            chunk_text = '\n\n'.join(current_chunk)
            chunks.append(chunk_text)
            
            # Start new chunk with overlap
            overlap_paras = current_chunk[-2:] if len(current_chunk) >= 2 else current_chunk
            current_chunk = overlap_paras + [para]
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                current_length = sum(len(tokenizer.encode(p, add_special_tokens=False)) for p in current_chunk)
        else:
            current_chunk.append(para)
            current_length += para_tokens
    
    # Add last chunk
    if current_chunk:
        chunks.append('\n\n'.join(current_chunk))
    
    return chunks

# Chunk both novels
print("\n--- Chunking Novels ---")
book_chunks = {}
for book_name, text in books.items():
    print(f"Chunking {book_name}...")
    book_chunks[book_name] = semantic_chunk(text, chunk_size=1000, overlap=150)
    print(f"Created {len(book_chunks[book_name])} chunks")

print(f"\nTotal chunks: {sum(len(chunks) for chunks in book_chunks.values())}")


NOVEL PROCESSING & CHUNKING
--------------------------------------------------------------------------------

--- Loading Novels ---
The Count of Monte Cristo: 2,646,614 characters
In Search of the Castaways: 826,131 characters

--- Chunking Novels ---
Chunking The Count of Monte Cristo...


tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Created 759 chunks
Chunking In Search of the Castaways...
Created 225 chunks

Total chunks: 984


In [5]:
# Cell 5: Generate Embeddings and Build FAISS Indices

print("EMBEDDING & INDEXING")
print("-"*80)

# Load embedding model
print("\n--- Loading Embedding Model ---")
embedder = SentenceTransformer("BAAI/bge-base-en-v1.5")
print("BGE-base-en-v1.5 loaded")

# Create FAISS index for each book
print("\n--- Creating FAISS Indices ---")
book_indices = {}
book_chunk_lists = {}

for book_name, chunks in book_chunks.items():
    print(f"\nEmbedding {book_name}...")
    
    # Embed chunks
    embeddings = embedder.encode(
        chunks, 
        show_progress_bar=True,
        batch_size=32,
        convert_to_numpy=True
    )
    
    # Create FAISS index
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)  # Inner product (cosine similarity)
    
    # Normalize embeddings for cosine similarity
    faiss.normalize_L2(embeddings)
    index.add(embeddings)
    
    book_indices[book_name] = index
    book_chunk_lists[book_name] = chunks
    
    print(f"Indexed {len(chunks)} chunks ({dimension} dimensions)")

print("\nAll books indexed successfully!")


EMBEDDING & INDEXING
--------------------------------------------------------------------------------

--- Loading Embedding Model ---


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

BGE-base-en-v1.5 loaded

--- Creating FAISS Indices ---

Embedding The Count of Monte Cristo...


Batches:   0%|          | 0/24 [00:00<?, ?it/s]

Indexed 759 chunks (768 dimensions)

Embedding In Search of the Castaways...


Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Indexed 225 chunks (768 dimensions)

All books indexed successfully!


In [6]:
# Cell 6: Define Retrieval Functions

print("RETRIEVAL SYSTEM")
print("-"*80)

def retrieve_evidence(claim, book_name, char_name, top_k=10):
    """
    Retrieve relevant chunks using dual-query strategy
    
    Args:
        claim: Backstory claim text
        book_name: Name of the novel
        char_name: Character name
        top_k: Number of chunks to retrieve
    
    Returns:
        List of evidence dictionaries
    """
    # Get book index and chunks
    index = book_indices[book_name]
    chunks = book_chunk_lists[book_name]
    
    # Dual-query strategy
    query1 = claim
    query2 = f"{char_name}: {claim}"
    
    # Embed queries
    q1_emb = embedder.encode([query1])
    q2_emb = embedder.encode([query2])
    
    # Normalize
    faiss.normalize_L2(q1_emb)
    faiss.normalize_L2(q2_emb)
    
    # Search
    k = min(top_k, len(chunks))
    scores1, indices1 = index.search(q1_emb, k)
    scores2, indices2 = index.search(q2_emb, k)
    
    # Merge and deduplicate
    all_indices = list(indices1[0]) + list(indices2[0])
    all_scores = list(scores1[0]) + list(scores2[0])
    
    # Sort by score and deduplicate
    seen = set()
    evidence = []
    for idx, score in sorted(zip(all_indices, all_scores), key=lambda x: -x[1]):
        if idx not in seen:
            evidence.append({
                'text': chunks[idx],
                'score': float(score),
                'index': int(idx)
            })
            seen.add(idx)
        if len(evidence) >= top_k:
            break
    
    return evidence

print("Retrieval system ready")

# Test retrieval
print("\n--- Testing Retrieval ---")
sample = train_data.iloc[0]
sample_evidence = retrieve_evidence(
    sample['content'], 
    sample['book_name'], 
    sample['char'],
    top_k=5
)
print(f"Retrieved {len(sample_evidence)} chunks")
print(f"  Top score: {sample_evidence[0]['score']:.4f}")
print(f"  Preview: {sample_evidence[0]['text'][:200]}...")


RETRIEVAL SYSTEM
--------------------------------------------------------------------------------
Retrieval system ready

--- Testing Retrieval ---
Retrieved 5 chunks
  Top score: 0.7025
  Preview: "Of Thalcave. He thought it might be useful to us, and gave it to me
before going back to Thaouka."

"Brave and generous Indian!" cried Glenarvan.

"Yes," added Tom Austin, "if all the Patagonians are...


In [7]:
# Cell 7: Load Mistral-7B with 4-bit Quantization

print("LOADING LLM (MISTRAL-7B)")
print("-"*80)

# Load Mistral-7B with 4-bit quantization
print("\n--- Loading Mistral-7B-Instruct (4-bit) ---")
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)

print("Mistral-7B loaded successfully")
print(f"  Model device: {model.device}")
print(f"  Memory allocated: {torch.cuda.memory_allocated()/1e9:.2f} GB")

def check_consistency_llm(claim, evidence_list):
    """Use LLM to check consistency with Chain-of-Thought"""
    
    # Format evidence (top 5 only)
    evidence_text = "\n\n".join([
        f"[{i+1}] {ev['text'][:500]}..." 
        for i, ev in enumerate(evidence_list[:5])
    ])
    
    # Chain-of-Thought prompt
    prompt = f"""Claim: {claim}

Relevant excerpts from the novel:
{evidence_text}

Think step-by-step:
1. What does the claim assert?
2. What do the excerpts say?
3. Are they compatible or contradictory?

Reasoning: [Your brief analysis]
Answer: [SUPPORTED or CONTRADICTED or NOT_MENTIONED]"""

    # Format for Mistral
    messages = [{"role": "user", "content": prompt}]
    formatted = tokenizer.apply_chat_template(
        messages, 
        tokenize=False, 
        add_generation_prompt=True
    )
    
    # Generate
    inputs = tokenizer(formatted, return_tensors="pt").to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            temperature=0.0,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )
    
    response = tokenizer.decode(
        outputs[0][inputs['input_ids'].shape[1]:], 
        skip_special_tokens=True
    )
    
    # Parse verdict
    response_lower = response.lower()
    if 'contradicted' in response_lower or 'contradict' in response_lower:
        verdict = 'CONTRADICTED'
    elif 'supported' in response_lower or 'support' in response_lower:
        verdict = 'SUPPORTED'
    else:
        verdict = 'NOT_MENTIONED'
    
    return verdict, response

print("\nLLM reasoning engine ready")

# Test LLM
print("\n--- Testing LLM ---")
sample_verdict, sample_response = check_consistency_llm(
    sample['content'], 
    sample_evidence
)
print(f"LLM verdict: {sample_verdict}")
print(f"  Ground truth: {sample['label']}")
print(f"  Response preview: {sample_response[:200]}...")


LOADING LLM (MISTRAL-7B)
--------------------------------------------------------------------------------

--- Loading Mistral-7B-Instruct (4-bit) ---


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Mistral-7B loaded successfully
  Model device: cuda:0
  Memory allocated: 4.92 GB

LLM reasoning engine ready

--- Testing LLM ---
LLM verdict: CONTRADICTED
  Ground truth: consistent
  Response preview: The claim asserts that Paganel's deference to Lady Glenarvan and his role as the expedition's "living map" are connected to his past feelings towards his mother and his failure to protect her.

The ex...


In [8]:
# Cell 8: Feature Extraction Function

print("FEATURE EXTRACTION")
print("-"*80)

def extract_features(row, evidence_list, llm_verdict):
    """
    Extract 12 features for classifier
    
    Features:
    - Retrieval: max/mean/min similarity, num_high_sim
    - LLM: verdict flags (3)
    - Character: total/max mentions
    - Claim: length
    - Book: is_monte_cristo
    - Evidence: num_evidence
    """
    claim = row['content']
    char_name = row['char']
    
    # Similarity scores
    similarities = [ev['score'] for ev in evidence_list]
    
    # Character mentions
    char_mentions = [
        ev['text'].lower().count(char_name.lower()) 
        for ev in evidence_list
    ]
    
    features = {
        # Retrieval features (4)
        'max_similarity': max(similarities) if similarities else 0,
        'mean_similarity': np.mean(similarities) if similarities else 0,
        'min_similarity': min(similarities) if similarities else 0,
        'num_high_sim': sum(1 for s in similarities if s > 0.7),
        
        # LLM features (3)
        'llm_verdict_supported': 1 if llm_verdict == 'SUPPORTED' else 0,
        'llm_verdict_contradicted': 1 if llm_verdict == 'CONTRADICTED' else 0,
        'llm_verdict_not_mentioned': 1 if llm_verdict == 'NOT_MENTIONED' else 0,
        
        # Character features (2)
        'total_char_mentions': sum(char_mentions),
        'max_char_mentions': max(char_mentions) if char_mentions else 0,
        
        # Claim features (1)
        'claim_length': len(claim.split()),
        
        # Book features (1)
        'is_monte_cristo': 1 if row['book_name'] == 'The Count of Monte Cristo' else 0,
        
        # Evidence quality (1)
        'num_evidence': len(evidence_list),
    }
    
    return features

print("Feature extraction function ready")
print("\n--- Feature List (12 total) ---")
sample_features = extract_features(sample, sample_evidence, sample_verdict)
for i, (feat, val) in enumerate(sample_features.items(), 1):
    print(f"{i:2d}. {feat:30s} = {val}")


FEATURE EXTRACTION
--------------------------------------------------------------------------------
Feature extraction function ready

--- Feature List (12 total) ---
 1. max_similarity                 = 0.7025200128555298
 2. mean_similarity                = 0.6935618758201599
 3. min_similarity                 = 0.6844485402107239
 4. num_high_sim                   = 1
 5. llm_verdict_supported          = 0
 6. llm_verdict_contradicted       = 1
 7. llm_verdict_not_mentioned      = 0
 8. total_char_mentions            = 4
 9. max_char_mentions              = 4
10. claim_length                   = 31
11. is_monte_cristo                = 0
12. num_evidence                   = 5


In [9]:
# Cell 9: Extract Features from Training Data

print("PROCESSING TRAINING DATA")
print("-"*80)
print("\nEstimated time: 30-60 minutes")

train_features = []
train_labels = []

for idx, row in tqdm(train_data.iterrows(), total=len(train_data), desc="Training"):
    try:
        # Retrieve evidence
        evidence = retrieve_evidence(
            row['content'], 
            row['book_name'], 
            row['char'],
            top_k=10
        )
        
        # Get LLM verdict
        llm_verdict, _ = check_consistency_llm(row['content'], evidence)
        
        # Extract features
        features = extract_features(row, evidence, llm_verdict)
        train_features.append(features)
        
        # Label (convert to binary)
        label = 1 if row['label'] == 'consistent' else 0
        train_labels.append(label)
        
        # Clear GPU cache periodically
        if idx % 10 == 0:
            torch.cuda.empty_cache()
            gc.collect()
            
    except Exception as e:
        print(f"\nError processing row {idx}: {e}")
        # Use default features on error
        train_features.append({k: 0 for k in ['max_similarity', 'mean_similarity', 'min_similarity', 
                                                'num_high_sim', 'llm_verdict_supported', 
                                                'llm_verdict_contradicted', 'llm_verdict_not_mentioned',
                                                'total_char_mentions', 'max_char_mentions', 
                                                'claim_length', 'is_monte_cristo', 'num_evidence']})
        train_labels.append(0)

# Convert to DataFrame
train_features_df = pd.DataFrame(train_features)
train_labels = np.array(train_labels)

print(f"\nTraining features extracted: {train_features_df.shape}")
print(f"Labels: {len(train_labels)}")
print(f"\nLabel distribution: {pd.Series(train_labels).value_counts().to_dict()}")

# Save intermediate results
train_features_df.to_csv('train_features.csv', index=False)
np.save('train_labels.npy', train_labels)
print("\nSaved intermediate results (train_features.csv, train_labels.npy)")


PROCESSING TRAINING DATA
--------------------------------------------------------------------------------

Estimated time: 30-60 minutes


Training: 100%|██████████| 64/64 [09:44<00:00,  9.14s/it]


Training features extracted: (64, 12)
Labels: 64

Label distribution: {1: 41, 0: 23}

Saved intermediate results (train_features.csv, train_labels.npy)





In [10]:
# Cell 10: Train Random Forest Classifier

print("TRAINING CLASSIFIER")
print("-"*80)

print("\n--- Training Random Forest ---")
clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

clf.fit(train_features_df, train_labels)
print("\nClassifier trained successfully")

# Feature importance
feature_importance = pd.DataFrame({
    'feature': train_features_df.columns,
    'importance': clf.feature_importances_
}).sort_values('importance', ascending=False)

print("\n--- Top 10 Most Important Features ---")
print(feature_importance.head(10).to_string(index=False))


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.1s


TRAINING CLASSIFIER
--------------------------------------------------------------------------------

--- Training Random Forest ---

Classifier trained successfully

--- Top 10 Most Important Features ---
                  feature  importance
           min_similarity    0.178536
           max_similarity    0.160233
          mean_similarity    0.158458
             claim_length    0.127301
      total_char_mentions    0.099860
        max_char_mentions    0.085744
llm_verdict_not_mentioned    0.061402
          is_monte_cristo    0.048232
 llm_verdict_contradicted    0.042703
    llm_verdict_supported    0.025599


[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    0.3s finished


In [11]:
# Cell 11: Process Validation Data and Evaluates

print("VALIDATION")
print("-"*80)

print("\n--- Processing Validation Data ---")
val_features = []
val_labels = []

for idx, row in tqdm(val_data.iterrows(), total=len(val_data), desc="Validation"):
    try:
        evidence = retrieve_evidence(row['content'], row['book_name'], row['char'])
        llm_verdict, _ = check_consistency_llm(row['content'], evidence)
        features = extract_features(row, evidence, llm_verdict)
        val_features.append(features)
        val_labels.append(1 if row['label'] == 'consistent' else 0)
        
        if idx % 10 == 0:
            torch.cuda.empty_cache()
            gc.collect()
            
    except Exception as e:
        print(f"\nError processing validation row {idx}: {e}")
        val_features.append({k: 0 for k in train_features_df.columns})
        val_labels.append(0)

val_features_df = pd.DataFrame(val_features)
val_labels = np.array(val_labels)

print(f"\nValidation features extracted: {val_features_df.shape}")

# Predict
print("\n--- Making Predictions ---")
val_preds = clf.predict(val_features_df)

# Evaluate
print("\n" + "="*80)
print("VALIDATION RESULTS")
print("="*80)
accuracy = accuracy_score(val_labels, val_preds)
print(f"\nAccuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

if accuracy >= 0.85:
    print("EXCELLENT! Target accuracy achieved (>=85%)")
elif accuracy >= 0.75:
    print("GOOD! Above baseline (>=75%)")
else:
    print("Below target. Consider tuning parameters.")

print("\n--- Classification Report ---")
print(classification_report(val_labels, val_preds, target_names=['contradict', 'consistent']))

print("\n--- Confusion Matrix ---")
cm = confusion_matrix(val_labels, val_preds)
print(cm)
print(f"\nTrue Negatives:  {cm[0,0]}")
print(f"False Positives: {cm[0,1]}")
print(f"False Negatives: {cm[1,0]}")
print(f"True Positives:  {cm[1,1]}")


VALIDATION
--------------------------------------------------------------------------------

--- Processing Validation Data ---


Validation: 100%|██████████| 16/16 [02:33<00:00,  9.57s/it]


Validation features extracted: (16, 12)

--- Making Predictions ---

VALIDATION RESULTS

Accuracy: 0.6875 (68.75%)
Below target. Consider tuning parameters.

--- Classification Report ---
              precision    recall  f1-score   support

  contradict       0.67      0.33      0.44         6
  consistent       0.69      0.90      0.78        10

    accuracy                           0.69        16
   macro avg       0.68      0.62      0.61        16
weighted avg       0.68      0.69      0.66        16


--- Confusion Matrix ---
[[2 4]
 [1 9]]

True Negatives:  2
False Positives: 4
False Negatives: 1
True Positives:  9



[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.0s finished


In [12]:
# Cell 12: Process Test Data and Generate Submission

print("TEST PREDICTIONS")
print("-"*80)

print("\n--- Processing Test Data ---")
test_features = []

for idx, row in tqdm(test.iterrows(), total=len(test), desc="Test"):
    try:
        evidence = retrieve_evidence(row['content'], row['book_name'], row['char'])
        llm_verdict, _ = check_consistency_llm(row['content'], evidence)
        features = extract_features(row, evidence, llm_verdict)
        test_features.append(features)
        
        if idx % 10 == 0:
            torch.cuda.empty_cache()
            gc.collect()
            
    except Exception as e:
        print(f"\nError processing test row {idx}: {e}")
        test_features.append({k: 0 for k in train_features_df.columns})

test_features_df = pd.DataFrame(test_features)

print(f"\nTest features extracted: {test_features_df.shape}")

# Predict
print("\n--- Making Final Predictions ---")
test_preds = clf.predict(test_features_df)

# ✅ FIXED submission format
submission = pd.DataFrame({
    'id': test['id'],
    'prediction': test_preds
})

submission.to_csv('results.csv', index=False)

print("\n" + "="*80)
print("PIPELINE COMPLETE!")
print("="*80)
print(f"\nSubmission saved to: results.csv")
print(f"Total test predictions: {len(test_preds)}")
print(f"\nPrediction distribution:")
print(f"  Consistent (1): {(test_preds == 1).sum()}")
print(f"  Contradict (0): {(test_preds == 0).sum()}")

print("\n--- First 10 Predictions ---")
print(submission.head(10))


TEST PREDICTIONS
--------------------------------------------------------------------------------

--- Processing Test Data ---


Test: 100%|██████████| 60/60 [09:19<00:00,  9.33s/it]


Test features extracted: (60, 12)

--- Making Final Predictions ---

PIPELINE COMPLETE!

Submission saved to: results.csv
Total test predictions: 60

Prediction distribution:
  Consistent (1): 52
  Contradict (0): 8

--- First 10 Predictions ---
    id  prediction
0   95           1
1  136           1
2   59           1
3   60           1
4  124           1
5  111           0
6  135           1
7   27           1
8  110           1
9   42           1



[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    0.0s finished


In [13]:
res = pd.read_csv("results.csv")

assert len(res) == len(test)
assert list(res.columns) == ["id", "prediction"]
assert set(res["prediction"]).issubset({0, 1})
assert all(res["id"] == test["id"])

print("✅ results.csv is PERFECT and ready for submission")


✅ results.csv is PERFECT and ready for submission
