# Import Library

Anda mungkin menemui library yang diimpor dua kali pada cell ini atau di bawahnya.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss, accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
import json
import ast # For parsing string representations of lists
import gc
from catboost import CatBoostClassifier
import re
import unicodedata
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from collections import Counter
from sklearn.model_selection import train_test_split

import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import warnings
import json
import re
from scipy.sparse import hstack, csr_matrix
warnings.filterwarnings('ignore')

import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

# Loading Dataset and Review Distribution

Sebaiknya lakukan EDA secara manual karena kurang efektif jika ditampilkan dengan visualisasi untuk dataset ini.

In [None]:
# --- Configuration ---
# Constants defined here; their consumers are outside this cell.
SEED = 42
N_FOLDS = 5
TARGET_COLS = ['winner_model_a', 'winner_model_b', 'winner_tie']

# --- Data loading ---
# Read the train/test CSV files and the sample submission from the local
# input directory, then report the shapes of the two data frames.
print("Loading datasets...")
train_df = pd.read_csv('../input/llm-classification-finetuning/train.csv')
test_df = pd.read_csv('../input/llm-classification-finetuning/test.csv')
sample_submission = pd.read_csv('../input/llm-classification-finetuning/sample_submission.csv')

print("Loading data...")
print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")

# Preprocessing

Saat melakukan preprocessing, Anda mungkin menemui masalah dengan karakter Unicode, sehingga diperlukan fallback untuk kembali ke ASCII.

In [None]:
# --- Enhanced Preprocessing ---
# Progress marker; the text-cleaning helpers are defined right below.
print("Starting preprocessing...")

def clean_json_like_string(text):
    """
    Flatten a string that encodes a list into a single plain string.

    The text is parsed as JSON first, so that JSON-only tokens (``null``,
    ``true``, ``false``) and escaped surrogate pairs such as
    ``\\ud83d\\ude00`` are decoded correctly. If that fails, it is parsed as a
    Python literal (e.g. ``"['a', 'b']"``). Items are joined with
    ``" [SEP] "`` to keep the boundaries between list elements.

    Example: '["hello", "world"]' -> "hello [SEP] world"

    Args:
        text: Any value. Only strings that start with '[' and end with ']'
            are considered for parsing.

    Returns:
        The joined string when ``text`` parses to a list; otherwise ``text``
        unchanged. ``None`` items (JSON ``null``) become empty strings so the
        element position is preserved.
    """
    if not (isinstance(text, str) and text.startswith('[') and text.endswith(']')):
        return text

    try:
        parsed = json.loads(text)
    except (ValueError, RecursionError):
        # Not valid JSON (json.JSONDecodeError is a ValueError): fall back to
        # a Python literal, which also accepts single-quoted strings.
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return text

    if not isinstance(parsed, list):
        return text

    # Use separator to maintain structure info
    return " [SEP] ".join("" if item is None else str(item) for item in parsed)

def robust_text_cleaning(text):
    """
    Normalise a value into a clean, single-line, UTF-8-encodable string.

    Missing values (None / NaN) become "". Otherwise the value is converted
    with ``str`` and then: NFKD-normalised, stripped of anything that cannot
    be encoded as UTF-8, stripped of Unicode category "C" characters (except
    tab, newline, carriage return and space), has literal ``\\n`` / ``\\t``
    escape sequences expanded, and has every whitespace run collapsed to a
    single space. If a Unicode error is raised along the way, the text is
    reduced to its ASCII characters instead.
    """
    if text is None or pd.isna(text):
        return ""

    text = str(text)
    allowed_whitespace = '\t\n\r '

    try:
        # Compatibility decomposition (e.g. ligatures -> plain letters).
        text = unicodedata.normalize('NFKD', text)

        # Round-trip through UTF-8 to drop unencodable code points
        # such as lone surrogates.
        text = text.encode('utf-8', 'ignore').decode('utf-8', 'ignore')

        # Keep ordinary whitespace, drop every other "C*" category character.
        kept_chars = [
            ch for ch in text
            if ch in allowed_whitespace or not unicodedata.category(ch).startswith('C')
        ]
        text = ''.join(kept_chars)

        # Turn literal backslash escapes into the characters they denote.
        text = text.replace('\\n', '\n')
        text = text.replace('\\t', '\t')

        # Collapse whitespace runs and trim both ends.
        text = re.sub(r'\s+', ' ', text)
        text = text.strip()

        if text == "":
            return ""

        # Final sanity check: raises if the result still cannot be encoded.
        text.encode('utf-8')

    except UnicodeError as e:
        # UnicodeDecodeError / UnicodeEncodeError are subclasses of UnicodeError.
        print(f"Unicode error encountered, applying fallback cleaning: {e}")
        ascii_only = ''.join(ch for ch in text if ord(ch) < 128)
        text = re.sub(r'\s+', ' ', ascii_only).strip()

    return text



## Feature Engineering

Feature engineering ini telah disesuaikan dengan feature importance yang dijelaskan di akhir notebook.

In [None]:
# --- BERT Setup ---
# Local directory holding the pretrained bert-base-uncased files.
# thanks mr.saurav
BERT_PATH = '../input/huggingface-bert-variants/bert-base-uncased/bert-base-uncased'

print(f"Loading BERT model and tokenizer from local path: {BERT_PATH}...")
try:
    # Load the tokenizer, then the model, from the local path.
    tokenizer = BertTokenizer.from_pretrained(BERT_PATH)
    bert_model = BertModel.from_pretrained(BERT_PATH)
except Exception as e:
    # Best-effort: downstream code checks for None and skips BERT features.
    print(f"Error loading BERT model from local path: {e}. BERT features will not be available.")
    tokenizer = None
    bert_model = None
else:
    print("BERT model and tokenizer loaded successfully from local path.")

# Move the model to the GPU when one is available, otherwise stay on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if bert_model:
    bert_model.to(device)
    bert_model.eval()  # switch the model to evaluation mode
print(f"BERT model on: {device}")

def get_bert_embeddings(texts, batch_size=16):
    """
    Generate one BERT embedding per text using masked mean pooling.

    Each batch is tokenized (padded, truncated to 256 tokens), run through
    the module-level ``bert_model`` without gradients, and the last hidden
    state is averaged over the non-padding tokens.

    Args:
        texts: Sequence of strings (a list or any iterable such as a pandas
            Series; it is materialised into a list).
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A 2-D numpy array with one row per input text. When the model or
        tokenizer is unavailable, an all-zero array of width 768 is returned.
        An empty input yields an array with zero rows.
    """
    if not bert_model or not tokenizer:
        print("BERT model not loaded, returning zero embeddings.")
        # Return zero embeddings of expected dimension (768 for bert-base-uncased)
        return np.zeros((len(texts), 768))

    # Slicing must yield plain lists for the tokenizer, so materialise first.
    texts = list(texts)
    if not texts:
        # np.concatenate raises on an empty list of batches; return an empty
        # matrix with the model's embedding width instead.
        return np.zeros((0, bert_model.config.hidden_size), dtype=np.float32)

    all_embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="BERT Embedding"):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
        with torch.no_grad():
            outputs = bert_model(**inputs)

        # Mean pooling: take the average of the last hidden state, ignoring padding tokens
        last_hidden_states = outputs.last_hidden_state
        attention_mask = inputs['attention_mask']
        mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).float()
        sum_embeddings = torch.sum(last_hidden_states * mask_expanded, 1)
        sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)  # Avoid division by zero
        mean_embeddings = sum_embeddings / sum_mask
        all_embeddings.append(mean_embeddings.cpu().numpy())

    return np.concatenate(all_embeddings, axis=0)
    
def extract_bert_features(df):
    """
    Build comparison features from BERT embeddings of prompt and responses.

    For every embedding dimension the element-wise difference and product of
    the two response embeddings are emitted, followed by row-wise cosine
    similarities (prompt/A, prompt/B, A/B) and the difference between the two
    prompt similarities. Returns an empty DataFrame when the BERT model or
    tokenizer is not available.
    """
    if not bert_model or not tokenizer:
        print("BERT model not available. Skipping BERT feature extraction.")
        return pd.DataFrame()

    def _rowwise_cosine(left, right):
        # left / right: 2-D arrays with one sample per row.
        dots = np.sum(left * right, axis=1)
        norms = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
        # Rows with a zero-norm vector get similarity 0 instead of dividing by zero.
        return np.divide(dots, norms, out=np.zeros_like(dots), where=norms != 0)

    print("  - Extracting BERT embeddings (this may take a while)...")

    print("    - Processing prompts...")
    emb_prompt = get_bert_embeddings(df['prompt'].tolist())
    print("    - Processing response_a...")
    emb_a = get_bert_embeddings(df['response_a'].tolist())
    print("    - Processing response_b...")
    emb_b = get_bert_embeddings(df['response_b'].tolist())

    # Difference and element-wise product of the two response embeddings.
    delta = emb_a - emb_b
    product = emb_a * emb_b

    # Collect columns in insertion order, then build the frame in one go.
    columns = {}
    for dim in range(emb_prompt.shape[1]):
        columns[f'bert_diff_a_b_{dim}'] = delta[:, dim]
        columns[f'bert_prod_a_b_{dim}'] = product[:, dim]

    sim_prompt_a = _rowwise_cosine(emb_prompt, emb_a)
    sim_prompt_b = _rowwise_cosine(emb_prompt, emb_b)
    columns['bert_sim_prompt_a'] = sim_prompt_a
    columns['bert_sim_prompt_b'] = sim_prompt_b
    columns['bert_sim_a_b'] = _rowwise_cosine(emb_a, emb_b)
    columns['bert_sim_prompt_diff_a_b'] = sim_prompt_a - sim_prompt_b

    features = pd.DataFrame(columns)
    print(f"  - Created {features.shape[1]} BERT-based features")
    return features

# --- Feature Engineering Functions ---
def extract_length_features(df):
    """
    Character-length features for the prompt and both responses.

    Produces the three raw lengths, the A/B and B/A length ratios, the signed
    and absolute length difference, and each response's length relative to
    the prompt. Ratio denominators are offset by 1 to avoid division by zero.
    """
    prompt_len = df['prompt'].str.len()
    len_a = df['response_a'].str.len()
    len_b = df['response_b'].str.len()
    signed_gap = len_a - len_b

    return pd.DataFrame({
        # Raw lengths
        'prompt_length': prompt_len,
        'response_a_length': len_a,
        'response_b_length': len_b,
        # Response-vs-response comparisons
        'len_ratio_a_b': len_a / (len_b + 1),
        'len_ratio_b_a': len_b / (len_a + 1),
        'len_diff_a_b': signed_gap,
        'len_diff_abs': signed_gap.abs(),
        # Response length relative to the prompt
        'response_a_to_prompt_ratio': len_a / (prompt_len + 1),
        'response_b_to_prompt_ratio': len_b / (prompt_len + 1),
    })

def extract_structure_features(df):
    """
    Structural features (lines, paragraphs, sentences, list items) per response.

    Args:
        df: DataFrame with string columns ``response_a`` and ``response_b``.

    Returns:
        DataFrame with, for each response, counts of newlines, paragraphs
        (blank-line separators + 1), sentence-ending punctuation runs, bullet
        list items and numbered list items, plus A/B ratios for lines and
        paragraphs.
    """
    features = pd.DataFrame()
    resp_a = df['response_a']
    resp_b = df['response_b']

    # Line and paragraph structure
    features['response_a_lines'] = resp_a.str.count('\n')
    features['response_b_lines'] = resp_b.str.count('\n')
    features['response_a_paragraphs'] = resp_a.str.count('\n\n') + 1
    features['response_b_paragraphs'] = resp_b.str.count('\n\n') + 1

    # Sentence structure
    features['response_a_sentences'] = resp_a.str.count(r'[.!?]+')
    features['response_b_sentences'] = resp_b.str.count(r'[.!?]+')

    # List structure (often indicates organized response).
    # re.MULTILINE makes '^' match at the start of every line; without it the
    # patterns could only match once, at the very beginning of the response.
    bullet_item = r'^\s*[-*•]\s'
    numbered_item = r'^\s*\d+\.\s'
    features['response_a_lists'] = resp_a.str.count(bullet_item, flags=re.MULTILINE)
    features['response_b_lists'] = resp_b.str.count(bullet_item, flags=re.MULTILINE)
    features['response_a_numbered'] = resp_a.str.count(numbered_item, flags=re.MULTILINE)
    features['response_b_numbered'] = resp_b.str.count(numbered_item, flags=re.MULTILINE)

    # Structure ratios
    features['lines_ratio_a_b'] = (features['response_a_lines'] + 1) / (features['response_b_lines'] + 1)
    # NOTE(review): the paragraph counts already include +1, so the extra +1
    # in the denominator makes this ratio asymmetric; kept as in the original.
    features['paragraphs_ratio_a_b'] = features['response_a_paragraphs'] / (features['response_b_paragraphs'] + 1)

    return features

def extract_content_features(df):
    """
    Surface-level content counters for both responses.

    For each response: number of question marks, digit runs, URL schemes,
    double-quoted spans and exclamation marks, plus the share of ASCII
    capital letters relative to (length + 1).
    """
    sides = (
        ('response_a', df['response_a']),
        ('response_b', df['response_b']),
    )

    # (column suffix, regex) pairs emitted before the caps ratio, in order:
    # engagement, specificity, references, citations.
    leading_counters = (
        ('questions', r'\?'),
        ('numbers', r'\d+'),
        ('urls', r'http[s]?://'),
        ('quotes', r'"[^"]*"'),
    )

    columns = {}
    for suffix, pattern in leading_counters:
        for side, texts in sides:
            columns[f'{side}_{suffix}'] = texts.str.count(pattern)

    # Capital letters (might indicate emphasis or poor formatting)
    for side, texts in sides:
        capitals = texts.str.count(r'[A-Z]')
        columns[f'{side}_caps_ratio'] = capitals / (texts.str.len() + 1)

    # Exclamation marks (enthusiasm vs professionalism)
    for side, texts in sides:
        columns[f'{side}_exclamations'] = texts.str.count(r'!')

    return pd.DataFrame(columns)

def extract_bias_features(df):
    """
    Indicators intended to expose length, confidence and politeness biases.

    Emits 0/1 flags for which response is longer and for a length gap above
    1000 characters, then per-response counts of confidence words and of
    politeness phrases (both matched case-insensitively on word boundaries).
    """
    text_a = df['response_a']
    text_b = df['response_b']
    size_a = text_a.str.len()
    size_b = text_b.str.len()

    confidence_words = r'(?i)\b(definitely|certainly|absolutely|clearly|obviously|undoubtedly)\b'
    polite_words = r'(?i)\b(please|thank you|sorry|excuse me|would you|could you)\b'

    return pd.DataFrame({
        # Which side is longer
        'a_longer_than_b': (size_a > size_b).astype(int),
        'b_longer_than_a': (size_b > size_a).astype(int),
        # Verbosity bias
        'extreme_length_diff': ((size_a - size_b).abs() > 1000).astype(int),
        # Confidence indicators
        'response_a_confidence': text_a.str.count(confidence_words),
        'response_b_confidence': text_b.str.count(confidence_words),
        # Politeness indicators
        'response_a_politeness': text_a.str.count(polite_words),
        'response_b_politeness': text_b.str.count(polite_words),
    })

# --- Semantic Features ---
def extract_semantic_features(df):
    """
    Regex-based counters for technical, explanatory and stylistic markers.

    Every counter is applied to ``response_a`` and ``response_b``; for a
    subset of them (code blocks, math, steps, explanations, technical words)
    a smoothed A/B ratio and a signed difference are appended.
    """
    # Each entry: (progress message, [(column suffix, regex, regex flags), ...]).
    # Groups and counters are listed in the order their columns are emitted.
    counter_groups = [
        ("  - Extracting code and technical content features...", [
            ('code_blocks', r'```|`[^`\n]+`', 0),
            ('code_inline', r'`[^`\n]+`', 0),
        ]),
        ("  - Extracting mathematical expressions...", [
            ('math', r'\$[^$\n]+\$|\\\([^)]+\\\)|\\begin\{|\\end\{', 0),
        ]),
        ("  - Extracting step-by-step indicators...", [
            ('steps', r'(?i)\b(step \d+|first[ly]*|second[ly]*|third[ly]*|fourth[ly]*|fifth[ly]*|finally|lastly|next|then)\b', 0),
        ]),
        ("  - Extracting uncertainty and confidence markers...", [
            ('uncertainty', r'(?i)\b(maybe|perhaps|might|could be|uncertain|not sure|possibly|probably|likely)\b', 0),
            ('assertions', r'(?i)\b(always|never|must|will|cannot|impossible|guaranteed|proven|fact)\b', 0),
        ]),
        ("  - Extracting educational and explanatory features...", [
            ('explanations', r'(?i)\b(because|since|therefore|thus|hence|as a result|due to|explained|example|for instance)\b', 0),
            ('answers', r'(?i)\b(answer|solution|result|conclusion|summary|in summary|to summarize)\b', 0),
        ]),
        ("  - Extracting formatting and presentation features...", [
            # Markdown headers are anchored per line, hence MULTILINE.
            ('headers', r'^#+\s', re.MULTILINE),
            ('bold', r'\*\*[^*]+\*\*|__[^_]+__', 0),
            ('italic', r'\*[^*]+\*|_[^_]+_', 0),
        ]),
        ("  - Extracting conversation and interaction features...", [
            ('greetings', r'(?i)\b(hello|hi|hey|good morning|good afternoon|good evening)\b', 0),
            ('direct_address', r'(?i)\b(you|your|yourself)\b', 0),
        ]),
        ("  - Extracting domain-specific features...", [
            ('technical', r'(?i)\b(algorithm|function|method|process|system|data|analysis|implementation)\b', 0),
            ('creative', r'(?i)\b(beautiful|amazing|wonderful|creative|artistic|inspiring|emotional)\b', 0),
        ]),
    ]

    text_a = df['response_a']
    text_b = df['response_b']
    columns = {}

    for message, counters in counter_groups:
        print(message)
        for suffix, pattern, flags in counters:
            columns[f'response_a_{suffix}'] = text_a.str.count(pattern, flags=flags)
            columns[f'response_b_{suffix}'] = text_b.str.count(pattern, flags=flags)

    print("  - Computing semantic ratios and differences...")
    # Ratios use +1 smoothing on both sides so a zero count never divides by zero.
    for key in ('code_blocks', 'math', 'steps', 'explanations', 'technical'):
        count_a = columns[f'response_a_{key}']
        count_b = columns[f'response_b_{key}']
        columns[f'{key}_ratio_a_b'] = (count_a + 1) / (count_b + 1)
        columns[f'{key}_diff_a_b'] = count_a - count_b

    features = pd.DataFrame(columns)
    print(f"  - Created {features.shape[1]} semantic features")
    return features

# --- Word overlap and similarity features ---
def extract_similarity_features(df):
    """
    Word-set overlap features between prompt, response A and response B.

    Words are lower-cased whitespace-separated tokens. Emits the Jaccard
    overlap for prompt/A, prompt/B and A/B, plus for each response the share
    of its distinct words that do not occur in the other response.
    """
    print("  - Computing word overlap features...")

    def _missing_or_empty(value):
        return pd.isna(value) or value == ""

    def _token_set(value):
        return set(str(value).lower().split())

    def _jaccard(left, right):
        """Intersection-over-union of the two word sets."""
        if _missing_or_empty(left) or _missing_or_empty(right):
            return 0.0
        left_tokens = _token_set(left)
        right_tokens = _token_set(right)
        # Whitespace-only texts pass the guard above but produce no tokens.
        if not left_tokens and not right_tokens:
            return 1.0
        if not (left_tokens and right_tokens):
            return 0.0
        return len(left_tokens & right_tokens) / len(left_tokens | right_tokens)

    def _exclusive_share(own, other):
        """Fraction of ``own``'s distinct words that are absent from ``other``."""
        if _missing_or_empty(own) or _missing_or_empty(other):
            return 0.5
        own_tokens = _token_set(own)
        if not own_tokens:
            return 0.0
        return len(own_tokens - _token_set(other)) / len(own_tokens)

    prompts = df['prompt']
    answers_a = df['response_a']
    answers_b = df['response_b']

    features = pd.DataFrame({
        # Prompt vs. each response
        'prompt_response_a_overlap': [_jaccard(p, a) for p, a in zip(prompts, answers_a)],
        'prompt_response_b_overlap': [_jaccard(p, b) for p, b in zip(prompts, answers_b)],
        # Response vs. response
        'response_a_b_overlap': [_jaccard(a, b) for a, b in zip(answers_a, answers_b)],
        # Words exclusive to one response
        'response_a_unique_ratio': [_exclusive_share(a, b) for a, b in zip(answers_a, answers_b)],
        'response_b_unique_ratio': [_exclusive_share(b, a) for a, b in zip(answers_a, answers_b)],
    })

    print(f"  - Created {features.shape[1]} similarity features")
    return features
 
def extract_advanced_text_features(df):
    """
    Extract per-response text quality, n-gram, readability and repetition features.

    All columns are filled positionally (row i of the result corresponds to
    row i of ``df``), so the result always has a 0..n-1 index regardless of
    the index carried by ``df``.

    Args:
        df: DataFrame with columns ``response_a`` and ``response_b``.

    Returns:
        DataFrame with 23 columns: five quality metrics per response and
        three quality differences, bigram max-frequency / unique-ratio per
        response, readability per response plus difference, and a word
        repetition ("coherence") ratio per response plus difference.
    """
    print("Extracting advanced text features...")
    features = pd.DataFrame()

    # NOTE: sections 1-2 of this function (TF-IDF + TruncatedSVD components
    # over "prompt [SEP] response_a [SEP] response_b", and TF-IDF cosine
    # similarities between responses and between prompt and each response)
    # were disabled by the author and are not computed here.

    # === 3. Advanced Text Quality Features ===
    print("  - Computing text quality features...")

    quality_metric_names = [
        'avg_word_length',
        'avg_sentence_length',
        'vocabulary_richness',
        'punctuation_ratio',
        'uppercase_ratio',
    ]

    def compute_text_quality(text):
        """Compute various text quality metrics; all zero for missing/empty text."""
        if pd.isna(text) or text == "":
            return dict.fromkeys(quality_metric_names, 0)

        text = str(text)
        words = text.split()
        sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]

        # Average word length (characters per whitespace-separated token)
        avg_word_len = np.mean([len(word) for word in words]) if words else 0

        # Average sentence length (words per sentence)
        avg_sent_len = np.mean([len(sent.split()) for sent in sentences]) if sentences else 0

        # Vocabulary richness (unique words / total words)
        vocab_richness = len(set(words)) / len(words) if words else 0

        # Punctuation ratio
        punct_count = len(re.findall(r'[.,;:!?]', text))
        punct_ratio = punct_count / len(text) if text else 0

        # Uppercase ratio
        upper_count = sum(1 for c in text if c.isupper())
        upper_ratio = upper_count / len(text) if text else 0

        return {
            'avg_word_length': avg_word_len,
            'avg_sentence_length': avg_sent_len,
            'vocabulary_richness': vocab_richness,
            'punctuation_ratio': punct_ratio,
            'uppercase_ratio': upper_ratio,
        }

    # Apply quality metrics
    for col in ['response_a', 'response_b']:
        quality_metrics = [compute_text_quality(text) for text in df[col]]
        for metric in quality_metric_names:
            features[f'{col}_{metric}'] = [m[metric] for m in quality_metrics]

    # Quality comparison features
    features['vocab_richness_diff'] = features['response_a_vocabulary_richness'] - features['response_b_vocabulary_richness']
    features['word_length_diff'] = features['response_a_avg_word_length'] - features['response_b_avg_word_length']
    features['sentence_length_diff'] = features['response_a_avg_sentence_length'] - features['response_b_avg_sentence_length']

    # === 4. N-gram Analysis ===
    print("  - Computing n-gram features...")

    def extract_ngram_features(text, n=2):
        """Return (highest n-gram frequency, distinct n-grams / total n-grams)."""
        if pd.isna(text) or text == "":
            return 0, 0

        words = str(text).lower().split()
        if len(words) < n:
            return 0, 0

        ngrams = [' '.join(words[i:i+n]) for i in range(len(words)-n+1)]
        ngram_counts = Counter(ngrams)

        # Most common n-gram frequency
        max_freq = max(ngram_counts.values()) if ngram_counts else 0
        # Unique n-grams ratio
        unique_ratio = len(ngram_counts) / len(ngrams) if ngrams else 0

        return max_freq, unique_ratio

    # Bigram features
    for col in ['response_a', 'response_b']:
        bigram_data = [extract_ngram_features(text, 2) for text in df[col]]
        features[f'{col}_bigram_max_freq'] = [pair[0] for pair in bigram_data]
        features[f'{col}_bigram_unique_ratio'] = [pair[1] for pair in bigram_data]

    # === 5. Readability Features ===
    print("  - Computing readability features...")

    def simple_readability_score(text):
        """Simple readability score based on sentence and word length."""
        if pd.isna(text) or text == "":
            return 0

        text = str(text)
        sentences = len(re.findall(r'[.!?]+', text))
        words = len(text.split())

        if sentences == 0 or words == 0:
            return 0

        # Simple approximation of readability
        avg_words_per_sentence = words / sentences
        avg_chars_per_word = len(text.replace(' ', '')) / words

        # Lower score = more readable
        return avg_words_per_sentence * 0.5 + avg_chars_per_word * 2

    # Assign plain lists (not Series) so values are placed by position; a
    # Series would be aligned on df's index and turn into NaN whenever that
    # index is not 0..n-1.
    features['response_a_readability'] = [simple_readability_score(text) for text in df['response_a']]
    features['response_b_readability'] = [simple_readability_score(text) for text in df['response_b']]
    features['readability_diff'] = features['response_a_readability'] - features['response_b_readability']

    # === 6. Semantic Coherence Features ===
    print("  - Computing semantic coherence features...")

    def compute_coherence(text):
        """Share of word occurrences that belong to a word used more than once."""
        if pd.isna(text) or text == "":
            return 0

        words = str(text).lower().split()
        if len(words) < 2:
            return 0

        # Word repetition score
        word_counts = Counter(words)
        repetition_score = sum(count for count in word_counts.values() if count > 1)
        return repetition_score / len(words)

    features['response_a_coherence'] = [compute_coherence(text) for text in df['response_a']]
    features['response_b_coherence'] = [compute_coherence(text) for text in df['response_b']]
    features['coherence_diff'] = features['response_a_coherence'] - features['response_b_coherence']

    print(f"  - Created {features.shape[1]} advanced text features")
    return features

def extract_response_quality_features(df):
    """
    Build response-quality features for the two candidate responses.

    For each of ``response_a`` and ``response_b`` three scores are computed
    (completeness, helpfulness, specificity) together with the A-minus-B
    difference of every score.

    Args:
        df: DataFrame holding ``response_a`` and ``response_b`` columns.

    Returns:
        A new DataFrame with nine feature columns, one row per input row.
    """
    print("Extracting response quality features...")
    features = pd.DataFrame()
    response_cols = ['response_a', 'response_b']

    def is_blank(text):
        # Missing values and empty strings both score zero everywhere below.
        return pd.isna(text) or text == ""

    # === Response Completeness Features ===
    print("  - Computing response completeness...")

    # Every pattern that matches adds one point to the completeness score.
    completion_indicators = (
        r'(?i)\b(in conclusion|to conclude|finally|in summary|overall)\b',
        r'(?i)\b(hope this helps|let me know|feel free to ask)\b',
        r'[.!]$',  # Ends with proper punctuation
    )
    # Every pattern that matches removes one point (response looks cut off).
    cutoff_indicators = (
        r'(?i)\b(continued|more on this|as mentioned)\s*$',
        r'[,:]$',  # Ends with comma or colon
    )

    def is_complete_response(text):
        """Score how finished a response looks; never negative."""
        if is_blank(text):
            return 0
        body = str(text)
        gained = sum(1 for pattern in completion_indicators if re.search(pattern, body))
        lost = sum(1 for pattern in cutoff_indicators if re.search(pattern, body))
        return max(0, gained - lost)

    for col in response_cols:
        features[f'{col}_completeness'] = df[col].apply(is_complete_response)
    features['completeness_diff'] = (
        features['response_a_completeness'] - features['response_b_completeness']
    )

    # === Response Helpfulness Indicators ===
    print("  - Computing helpfulness indicators...")

    helpful_patterns = (
        r'(?i)\b(example|for instance|such as|like this)\b',
        r'(?i)\b(step|steps|first|second|next|then)\b',
        r'(?i)\b(you can|you should|try|consider)\b',
        r'(?i)\b(here|this|these|solution|answer)\b',
    )

    def count_helpful_phrases(text):
        """Total number of helpful-phrase occurrences across all patterns."""
        body = str(text)
        return sum(len(re.findall(pattern, body)) for pattern in helpful_patterns)

    for col in response_cols:
        features[f'{col}_helpfulness'] = df[col].apply(count_helpful_phrases)
    features['helpfulness_diff'] = (
        features['response_a_helpfulness'] - features['response_b_helpfulness']
    )

    # === Response Specificity ===
    print("  - Computing response specificity...")

    def compute_specificity(text):
        """Weighted count of digits runs, capitalised words, technical and citation terms."""
        if is_blank(text):
            return 0
        body = str(text)

        def occurrences(pattern):
            return len(re.findall(pattern, body))

        numbers = occurrences(r'\d+')
        proper_nouns = occurrences(r'\b[A-Z][a-z]+\b')
        technical_terms = occurrences(
            r'(?i)\b(algorithm|method|process|system|function|parameter|variable)\b'
        )
        citations = occurrences(r'(?i)\b(according to|research shows|study|paper|source)\b')

        return numbers * 0.5 + proper_nouns * 0.3 + technical_terms * 0.8 + citations * 1.2

    for col in response_cols:
        features[f'{col}_specificity'] = df[col].apply(compute_specificity)
    features['specificity_diff'] = (
        features['response_a_specificity'] - features['response_b_specificity']
    )

    print(f"  - Created {features.shape[1]} response quality features")
    return features
    
def preprocess_data(df, is_train=True):
    """
    Clean the text columns, derive the training target and attach all
    engineered features.

    Args:
        df: Raw competition DataFrame; it is copied, not modified.
        is_train: When True, a single ``target`` column (0 = model A wins,
            1 = model B wins, 2 = tie, -1 = no winner flag set) is derived
            from the three one-hot winner columns.

    Returns:
        The cleaned copy of ``df`` with every engineered feature column
        concatenated on the right.
    """
    frame = df.copy()

    # Clean text columns with robust cleaning
    for col in ('prompt', 'response_a', 'response_b'):
        if col not in frame.columns:
            continue
        print(f"Cleaning column: {col}")
        # JSON-like cleaning first, then the robust Unicode cleaning.
        frame[col] = frame[col].apply(clean_json_like_string).apply(robust_text_cleaning)

        # Additional validation
        print(f"  - Column {col}: {frame[col].isna().sum()} NaN values")
        print(f"  - Column {col}: {(frame[col] == '').sum()} empty strings")

    if is_train:
        # Collapse the one-hot winner columns into one class label, used for
        # stratification and as the multi-class training target.
        winner_flags = [
            frame[winner_col] == 1
            for winner_col in ('winner_model_a', 'winner_model_b', 'winner_tie')
        ]
        frame['target'] = np.select(winner_flags, [0, 1, 2], default=-1)

        # Verify target creation
        print("Target distribution:")
        print(frame['target'].value_counts(normalize=True))
        unmapped_rows = frame['target'] == -1
        if unmapped_rows.any():
            print("Warning: Some rows could not be mapped to a target class!")
            print(f"Number of unmapped rows: {unmapped_rows.sum()}")

    # --- Extract all engineered features (for both train and test) ---
    print("Extracting engineered features...")
    feature_blocks = [
        extract_length_features(frame),
        extract_structure_features(frame),
        extract_content_features(frame),
        extract_bias_features(frame),
    ]

    # --- Extract semantic and similarity features ---
    print("Extracting semantic features...")
    feature_blocks.append(extract_semantic_features(frame))

    print("Extracting similarity features...")
    feature_blocks.append(extract_similarity_features(frame))

    print("Extracting advanced text features...")
    feature_blocks.append(extract_advanced_text_features(frame))

    print("Extracting response quality features...")
    feature_blocks.append(extract_response_quality_features(frame))

    # --- Extract BERT features ---
    bert_block = extract_bert_features(frame)
    if not bert_block.empty:  # Only add if BERT features were generated
        feature_blocks.append(bert_block)

    # Combine all features
    engineered_features = pd.concat(feature_blocks, axis=1)
    print(f"Created {engineered_features.shape[1]} total engineered features")

    return pd.concat([frame, engineered_features], axis=1)
    
# Run the full preprocessing pipeline on both splits.
train_processed_df = preprocess_data(train_df, is_train=True)
test_processed_df = preprocess_data(test_df, is_train=False)

# --- Updated Feature Selection ---
# Raw text columns are passed through alongside the engineered features.
text_features_cols = ['prompt', 'response_a', 'response_b']

# Everything that is not an identifier, raw text or a label column counts as
# an engineered feature.
non_feature_cols = {
    'id', 'model_a', 'model_b',
    'prompt', 'response_a', 'response_b',
    'winner_model_a', 'winner_model_b', 'winner_tie', 'target',
}
engineered_cols = [
    col for col in train_processed_df.columns if col not in non_feature_cols
]

n_text_cols = len(text_features_cols)
n_engineered_cols = len(engineered_cols)
print(f"Text features: {n_text_cols}")
print(f"Engineered features: {n_engineered_cols}")
print(f"Total features for training: {n_text_cols + n_engineered_cols}")

# Prepare feature sets (train)
X_text = train_processed_df[text_features_cols]
X_engineered = train_processed_df[engineered_cols]
y = train_processed_df['target']

# Prepare feature sets (test) using the same column selection
X_test_text = test_processed_df[text_features_cols]
X_test_engineered = test_processed_df[engineered_cols]

print(f"X_text shape: {X_text.shape}")
print(f"X_engineered shape: {X_engineered.shape}")
print(f"y shape: {y.shape}")


# Training

## Training Model Layer 1 (Stacking Catboost + XGB) 

CatBoost dan XGBoost dipilih karena, sebagai model tunggal, keduanya memberikan skor terbaik di antara semua algoritma machine learning yang pernah saya coba pada kasus ini. Selain itu, CatBoost cocok untuk melatih data klasifikasi teks.

In [None]:
# --- Lightweight Stacking Approach with CatBoost + XGBoost ---
print("\n=== Starting Lightweight Stacking Training ===")
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# Probability buffers: one row per sample, one column per target class.
oof_shape = (len(X_text), 3)
test_shape = (len(X_test_text), 3)
catboost_oof_preds = np.zeros(oof_shape)
xgb_oof_preds = np.zeros(oof_shape)
catboost_test_preds = np.zeros(test_shape)
xgb_test_preds = np.zeros(test_shape)

# Fitted per-fold models and their validation log losses.
catboost_models, xgb_models = [], []
val_scores_catboost, val_scores_xgb = [], []

# CatBoost hyper-parameters (multi-class objective, Bernoulli row sampling).
catboost_params = dict(
    iterations=2200,
    learning_rate=0.047,
    depth=5,
    loss_function='MultiClass',
    eval_metric='MultiClass',
    random_seed=SEED,
    l2_leaf_reg=6,
    verbose=100,
    early_stopping_rounds=200,
    border_count=64,
    bootstrap_type='Bernoulli',
    subsample=0.8,
    feature_border_type='GreedyLogSum',
)

# XGBoost hyper-parameters (3-class soft-probability objective).
xgb_params = dict(
    objective='multi:softprob',
    num_class=3,
    eval_metric='mlogloss',

    # Tree structure
    max_depth=6,
    min_child_weight=3,
    gamma=0.15,  # minimum split loss

    # Learning rate
    learning_rate=0.085,

    # Row / column sampling
    subsample=0.85,
    colsample_bytree=0.85,
    colsample_bylevel=0.8,
    colsample_bynode=0.8,

    # Regularization (L1 / L2)
    reg_alpha=0.6,
    reg_lambda=2,

    # Runtime settings
    random_state=SEED,
    n_jobs=-1,
    verbosity=1,
    tree_method='hist',

    # Leaf-wise growth capped at 256 leaves per tree
    grow_policy='lossguide',
    max_leaves=256,
    scale_pos_weight=1,
)

# Train one CatBoost and one XGBoost model per stratified fold, storing each
# model, its validation log loss and its out-of-fold probabilities.
for fold, (train_idx, val_idx) in enumerate(skf.split(X_text, y)):
    print(f"\n--- Fold {fold+1}/{N_FOLDS} ---")

    # Split data
    X_train_text_fold = X_text.iloc[train_idx]
    X_train_eng_fold = X_engineered.iloc[train_idx]
    X_val_text_fold = X_text.iloc[val_idx]
    X_val_eng_fold = X_engineered.iloc[val_idx]
    y_train_fold = y.iloc[train_idx]
    y_val_fold = y.iloc[val_idx]

    # Combine features: raw text columns first, engineered columns after.
    X_train_combined = pd.concat([X_train_text_fold, X_train_eng_fold], axis=1)
    X_val_combined = pd.concat([X_val_text_fold, X_val_eng_fold], axis=1)

    # === TRAIN CATBOOST MODEL ===
    print(f"Training CatBoost for fold {fold+1}...")
    catboost_model = CatBoostClassifier(**catboost_params)

    try:
        catboost_model.fit(X_train_combined, y_train_fold,
                          eval_set=[(X_val_combined, y_val_fold)],
                          text_features=text_features_cols,
                          use_best_model=True)

        # Get out-of-fold predictions
        catboost_val_preds = catboost_model.predict_proba(X_val_combined)
        catboost_oof_preds[val_idx] = catboost_val_preds

        # Evaluate CatBoost
        catboost_fold_score = log_loss(y_val_fold, catboost_val_preds)
        val_scores_catboost.append(catboost_fold_score)
        catboost_models.append(catboost_model)

        print(f"CatBoost Fold {fold+1} LogLoss: {catboost_fold_score:.4f}")

    except Exception as e:
        # Best effort: a failed fold is recorded with a sentinel score (999)
        # and a None model so the other learner can still be used.
        print(f"Error in CatBoost fold {fold+1}: {e}")
        catboost_models.append(None)
        val_scores_catboost.append(999)

    # === TRAIN LIGHTWEIGHT XGBOOST MODEL ===
    print(f"Training XGBoost for fold {fold+1}...")

    # XGBoost only sees the engineered (numeric) features, not the raw text.
    # early_stopping_rounds is given to the constructor: passing it to fit()
    # is rejected by XGBoost 2.x, and that TypeError would be swallowed by
    # the except below, silently marking every XGBoost fold as failed.
    xgb_model = xgb.XGBClassifier(
        n_estimators=750,
        early_stopping_rounds=100,
        **xgb_params
    )

    try:
        xgb_model.fit(
            X_train_eng_fold, y_train_fold,
            eval_set=[(X_val_eng_fold, y_val_fold)],
            verbose=False
        )

        # Get out-of-fold predictions
        xgb_val_preds = xgb_model.predict_proba(X_val_eng_fold)
        xgb_oof_preds[val_idx] = xgb_val_preds

        # Evaluate XGBoost
        xgb_fold_score = log_loss(y_val_fold, xgb_val_preds)
        val_scores_xgb.append(xgb_fold_score)
        xgb_models.append(xgb_model)

        print(f"XGBoost Fold {fold+1} LogLoss: {xgb_fold_score:.4f}")

    except Exception as e:
        # Same best-effort handling as for CatBoost above.
        print(f"Error in XGBoost fold {fold+1}: {e}")
        xgb_models.append(None)
        val_scores_xgb.append(999)

    # Clean up memory before the next fold
    del X_train_combined, X_val_combined
    del X_train_text_fold, X_train_eng_fold, X_val_text_fold, X_val_eng_fold
    del y_train_fold, y_val_fold
    gc.collect()

# === EVALUATE INDIVIDUAL MODELS ===
print("\n=== Individual Model Performance ===")


def _summarize_base_model(name, fold_scores, oof_preds, y_true):
    """
    Print and return the CV summary of one base model.

    Args:
        name: Label used in the printed lines (e.g. ``"CatBoost"``).
        fold_scores: Per-fold validation log losses; 999 marks a failed fold.
        oof_preds: (n_samples, n_classes) out-of-fold probability array.
        y_true: True class labels aligned with ``oof_preds``.

    Returns:
        ``(mean_score, std_score, oof_score)``. When no fold succeeded the
        mean is the sentinel 999 and the other two values are NaN.
    """
    valid_scores = [s for s in fold_scores if s < 999]
    if not valid_scores:
        print(f"{name}: No valid models trained")
        return 999, float('nan'), float('nan')

    mean_score = np.mean(valid_scores)
    std_score = np.std(valid_scores)

    # Rows of a failed fold were never written and are still all-zero; they
    # are not valid probability rows, so leave them out of the OOF score.
    filled_rows = np.asarray(oof_preds).sum(axis=1) > 0
    n_missing = int((~filled_rows).sum())
    if n_missing:
        print(f"{name}: {n_missing} rows from failed folds excluded from OOF LogLoss")
    oof_score = log_loss(np.asarray(y_true)[filled_rows], oof_preds[filled_rows])

    print(f"{name} CV LogLoss: {mean_score:.4f} (+/- {std_score:.4f})")
    print(f"{name} OOF LogLoss: {oof_score:.4f}")
    return mean_score, std_score, oof_score


# CatBoost performance
catboost_mean_score, catboost_std_score, catboost_oof_score = _summarize_base_model(
    "CatBoost", val_scores_catboost, catboost_oof_preds, y
)

# XGBoost performance
xgb_mean_score, xgb_std_score, xgb_oof_score = _summarize_base_model(
    "XGBoost", val_scores_xgb, xgb_oof_preds, y
)

## Training Layer 2 (Meta Model LightGBM classifier)

Meskipun meta layer cepat dilatih dan rawan overfit, sebaiknya tetap gunakan model dengan parameter sedang, karena jika terlalu takut overfit justru malah terkena fallback.

In [None]:
# === TRAIN META MODEL (Layer 2) ===
print("\n=== Training Meta Model (Stacking Layer 2) ===")

if catboost_mean_score < 999 and xgb_mean_score < 999:
    print("Training LightGBM meta-learner...")

    # Prepare stacking features: CatBoost class probabilities followed by
    # XGBoost class probabilities (OOF predictions from the base models).
    stacking_features = np.column_stack([catboost_oof_preds, xgb_oof_preds])
    print(f"Stacking features shape: {stacking_features.shape}")

    # Train meta model with LightGBM
    meta_model = lgb.LGBMClassifier(
        objective='multiclass',
        num_class=3,
        metric='multi_logloss',
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.05,
        feature_fraction=0.9,
        bagging_fraction=0.8,
        bagging_freq=5,
        verbose=-1,
        random_state=SEED,
        n_estimators=250,
        min_child_samples=20,
        reg_alpha=0.1,
        reg_lambda=0.1,
        early_stopping_rounds=70
    )

    # Hold out 20% of the stacking rows as the early-stopping validation set.
    meta_train_x, meta_val_x, meta_train_y, meta_val_y = train_test_split(
        stacking_features, y, test_size=0.2, random_state=SEED, stratify=y
    )

    meta_model.fit(
        meta_train_x, meta_train_y,
        eval_set=[(meta_val_x, meta_val_y)],
        callbacks=[lgb.early_stopping(70), lgb.log_evaluation(0)]
    )

    # Score on the held-out 20% only: these rows were not used for fitting.
    meta_holdout_score = log_loss(meta_val_y, meta_model.predict_proba(meta_val_x))
    print(f"Meta Model holdout LogLoss: {meta_holdout_score:.4f}")

    # Meta model predictions on all stacking rows.
    # NOTE: 80% of these rows were used to fit the meta model, so the score
    # below is optimistic compared with the holdout score printed above.
    meta_oof_preds = meta_model.predict_proba(stacking_features)
    meta_oof_score = log_loss(y, meta_oof_preds)

    print(f"Meta Model OOF LogLoss: {meta_oof_score:.4f}")

    # Compare with individual models
    print(f"Improvement over CatBoost: {catboost_oof_score - meta_oof_score:.4f}")
    print(f"Improvement over XGBoost: {xgb_oof_score - meta_oof_score:.4f}")

    # Use meta model predictions as final ensemble
    ensemble_oof_preds = meta_oof_preds
    ensemble_oof_score = meta_oof_score

elif catboost_mean_score < 999:
    print("Using only CatBoost predictions (XGBoost failed)")
    ensemble_oof_preds = catboost_oof_preds
    ensemble_oof_score = catboost_oof_score
    meta_model = None

elif xgb_mean_score < 999:
    print("Using only XGBoost predictions (CatBoost failed)")
    ensemble_oof_preds = xgb_oof_preds
    ensemble_oof_score = xgb_oof_score
    meta_model = None

else:
    print("ERROR: Both models failed to train properly!")
    raise Exception("No valid models for ensemble")

# The single-model fallbacks leave meta_model as None; only report the best
# iteration when a meta model was actually fitted.
if meta_model is not None:
    print(f"Best iteration: {meta_model.best_iteration_}")

# Test Prediction

Fallback sangat penting sebagai antisipasi jika OOF underfit.

In [None]:
# === GENERATE TEST PREDICTIONS ===
print("\n=== Generating Test Predictions ===")

# CatBoost was trained on text + engineered columns, so rebuild that layout.
X_test_combined = pd.concat([X_test_text, X_test_engineered], axis=1)

# CatBoost test predictions: average over every fold model that works.
if any(m is not None for m in catboost_models):
    print("Generating CatBoost test predictions...")
    catboost_test_preds_list = []

    for i, model in enumerate(catboost_models):
        if model is None:
            continue
        try:
            catboost_test_preds_list.append(model.predict_proba(X_test_combined))
            print(f"  CatBoost model {i+1}: Success")
        except Exception as e:
            print(f"  CatBoost model {i+1}: Error - {e}")

    if not catboost_test_preds_list:
        catboost_test_preds = np.zeros((len(X_test_text), 3))
        print("  No valid CatBoost predictions")
    else:
        catboost_test_preds = np.mean(catboost_test_preds_list, axis=0)
        print(f"  Averaged {len(catboost_test_preds_list)} CatBoost models")

# XGBoost test predictions: engineered columns only, same averaging scheme.
if any(m is not None for m in xgb_models):
    print("Generating XGBoost test predictions...")
    xgb_test_preds_list = []

    for i, model in enumerate(xgb_models):
        if model is None:
            continue
        try:
            xgb_test_preds_list.append(model.predict_proba(X_test_engineered))
            print(f"  XGBoost model {i+1}: Success")
        except Exception as e:
            print(f"  XGBoost model {i+1}: Error - {e}")

    if not xgb_test_preds_list:
        xgb_test_preds = np.zeros((len(X_test_text), 3))
        print("  No valid XGBoost predictions")
    else:
        xgb_test_preds = np.mean(xgb_test_preds_list, axis=0)
        print(f"  Averaged {len(xgb_test_preds_list)} XGBoost models")

# Ensemble test predictions
if meta_model is None:
    # No meta model: fall back to a simple ensemble of whatever is available.
    if catboost_mean_score < 999 and xgb_mean_score < 999:
        # Weight each model by the inverse of its mean CV log loss.
        catboost_weight, xgb_weight = 1 / catboost_mean_score, 1 / xgb_mean_score
        total_weight = catboost_weight + xgb_weight

        catboost_weight = catboost_weight / total_weight
        xgb_weight = xgb_weight / total_weight

        ensemble_test_preds = catboost_weight * catboost_test_preds + xgb_weight * xgb_test_preds
    elif catboost_mean_score < 999:
        ensemble_test_preds = catboost_test_preds
    else:
        ensemble_test_preds = xgb_test_preds
else:
    print("Generating stacking test predictions...")

    # Same column layout as the stacking features used for training:
    # CatBoost probabilities first, XGBoost probabilities second.
    test_stacking_features = np.column_stack([catboost_test_preds, xgb_test_preds])
    print(f"Test stacking features shape: {test_stacking_features.shape}")

    # Get meta model predictions
    ensemble_test_preds = meta_model.predict_proba(test_stacking_features)
    print("Meta model test predictions generated successfully")

print(f"Final test predictions shape: {ensemble_test_preds.shape}")

## Prepare Submission

In [None]:
# === PREPARE SUBMISSION ===
print("\n=== Preparing Submission ===")

# Start from the sample submission and overwrite its three probability
# columns with the ensemble output (column order = class index order).
submission_df = sample_submission.copy()
prob_cols = ['winner_model_a', 'winner_model_b', 'winner_tie']
for class_idx, prob_col in enumerate(prob_cols):
    submission_df[prob_col] = ensemble_test_preds[:, class_idx]

# Verify predictions sum to 1
pred_sums = submission_df[prob_cols].sum(axis=1)
print(f"Prediction sums - Min: {pred_sums.min():.6f}, Max: {pred_sums.max():.6f}")
print(f"Predictions close to 1.0: {np.allclose(pred_sums, 1.0)}")

# Display submission statistics (mean ± std of every probability column)
print(f"\nSubmission statistics:")
for prob_col in prob_cols:
    col_values = submission_df[prob_col]
    print(f"{prob_col}: {col_values.mean():.4f} ± {col_values.std():.4f}")

# Save submission
submission_df.to_csv('submission.csv', index=False)
print("\nSubmission saved as 'submission.csv'")

# Feature Importance for Further Improvement

Anda mungkin menemukan insight baru untuk feature engineering yang lebih baik di sini.

In [None]:
# === FEATURE IMPORTANCE ANALYSIS ===
print("\n=== Feature Importance Analysis ===")

if catboost_models and any(m is not None for m in catboost_models):
    print("CatBoost Feature Importance (Top 20):")

    # Importance is taken from the first fold model that trained successfully.
    valid_catboost = next(m for m in catboost_models if m is not None)
    # Same column order CatBoost was trained on: text columns, then engineered.
    feature_names = list(X_text.columns) + list(X_engineered.columns)

    if hasattr(valid_catboost, 'feature_importances_'):
        importances = valid_catboost.feature_importances_
        feature_importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False)

        print(feature_importance_df.head(20).to_string(index=False))

        # Save feature importance
        feature_importance_df.to_csv('catboost_feature_importance.csv', index=False)
        print("\nCatBoost feature importance saved as 'catboost_feature_importance.csv'")

if xgb_models and any(m is not None for m in xgb_models):
    print("\nXGBoost Feature Importance (Top 20):")

    # Importance is taken from the first fold model that trained successfully.
    valid_xgb = next(m for m in xgb_models if m is not None)

    if hasattr(valid_xgb, 'feature_importances_'):
        importances = valid_xgb.feature_importances_
        feature_importance_df = pd.DataFrame({
            'feature': list(X_engineered.columns),
            'importance': importances
        }).sort_values('importance', ascending=False)

        print(feature_importance_df.head(20).to_string(index=False))

        # Save feature importance
        feature_importance_df.to_csv('xgb_feature_importance.csv', index=False)
        print("\nXGBoost feature importance saved as 'xgb_feature_importance.csv'")

# === META MODEL ANALYSIS ===
print("\n=== Meta Model Analysis ===")

if meta_model is not None:
    print("LightGBM Meta Model Feature Importance:")

    # Stacking inputs are the 3 CatBoost class probabilities followed by the
    # 3 XGBoost class probabilities.
    stacking_feature_names = [
        f'{base_name}_class_{class_idx}'
        for base_name in ('CatBoost', 'XGBoost')
        for class_idx in range(3)
    ]

    # Get feature importance
    importances = meta_model.feature_importances_
    meta_importance_df = pd.DataFrame({
        'feature': stacking_feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)

    print(meta_importance_df.to_string(index=False))

    # Save meta model feature importance
    meta_importance_df.to_csv('meta_model_feature_importance.csv', index=False)
    print("\nMeta model feature importance saved as 'meta_model_feature_importance.csv'")

    # best_score_ is measured on the eval_set given to fit(), i.e. the
    # validation split, so it is reported as a validation score.
    print(f"\nMeta model best validation score: {meta_model.best_score_}")

# Summary and Interpretation

## Summary

In [None]:
# === FINAL SUMMARY ===
rule = "=" * 60
print("\n" + rule)
print("FINAL TRAINING SUMMARY")
print(rule)

# Dataset and feature counts
n_text_features = len(text_features_cols)
n_engineered_features = len(engineered_cols)
print(f"Training samples: {len(train_processed_df)}")
print(f"Test samples: {len(test_processed_df)}")
print(f"Text features: {n_text_features}")
print(f"Engineered features: {n_engineered_features}")
print(f"Total features: {n_text_features + n_engineered_features}")
print(f"Cross-validation folds: {N_FOLDS}")
print()

# Scores: a mean of 999 is the sentinel for "model did not train".
print("Model Performance:")
if catboost_mean_score < 999:
    print(f"  CatBoost CV LogLoss: {catboost_mean_score:.4f} ± {catboost_std_score:.4f}")
if xgb_mean_score < 999:
    print(f"  XGBoost CV LogLoss: {xgb_mean_score:.4f} ± {xgb_std_score:.4f}")
if meta_model is None:
    print(f"  Simple Ensemble OOF LogLoss: {ensemble_oof_score:.4f}")
else:
    print(f"  Stacking Model OOF LogLoss: {ensemble_oof_score:.4f}")
    print(f"  Stacking Architecture: CatBoost + XGBoost → LightGBM")
print()

# Output files, listed under the same conditions that wrote them.
print("Files generated:")
print("  - submission.csv")
if any(m is not None for m in catboost_models):
    print("  - catboost_feature_importance.csv")
if any(m is not None for m in xgb_models):
    print("  - xgb_feature_importance.csv")
if meta_model is not None:
    print("  - meta_model_feature_importance.csv")
print()
print("Training completed successfully!")
print(rule)

## Interpretation

In [None]:
# === VISUALIZATION AND INTERPRETATION ===
print("\n" + "="*60)
print("VISUALIZATION AND INTERPRETATION")
print("="*60)

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.calibration import calibration_curve, CalibrationDisplay

# --- Helper function for plotting feature importance ---
def plot_feature_importance(importance_df, title, top_n=20, filename='feature_importance.png'):
    """
    Draw a horizontal bar chart of the first ``top_n`` rows of an importance table.

    Args:
        importance_df: DataFrame with ``feature`` and ``importance`` columns;
            rows are plotted in the order given. ``None`` or an empty frame
            is skipped with a message.
        title: Chart title; ``(Top {top_n})`` is appended.
        top_n: Number of leading rows to plot; also scales the figure height.
        filename: Path the figure is saved to.

    Returns:
        None. Plotting errors are printed rather than raised.
    """
    if importance_df is None or importance_df.empty:
        print(f"  Skipping '{title}' - importance data not available.")
        return

    fig = None
    try:
        fig = plt.figure(figsize=(10, max(6, top_n * 0.3)))  # Adjust height based on top_n
        sns.barplot(x='importance', y='feature', data=importance_df.head(top_n), palette='viridis')
        plt.title(f'{title} (Top {top_n})')
        plt.tight_layout()
        plt.savefig(filename)
        plt.show()
        print(f"  Saved {filename}")
    except Exception as e:
        print(f"  Error plotting {title}: {e}")
    finally:
        # Release the figure on success and on failure so repeated calls do
        # not accumulate open figures.
        if fig is not None:
            plt.close(fig)

# --- Feature Importance CatBoost ---
# Reload the importance table saved as CSV and chart it. A missing file is
# reported separately from any other load/plot failure.
print("\nPlotting CatBoost Feature Importance...")
try:
    cb_fi_df = pd.read_csv('catboost_feature_importance.csv')
    plot_feature_importance(
        importance_df=cb_fi_df,
        title='CatBoost Feature Importance',
        filename='catboost_importance_plot.png',
    )
except FileNotFoundError:
    print("  catboost_feature_importance.csv not found. Skipping plot.")
except Exception as e:
    print(f"  Error loading/plotting CatBoost importance: {e}")


# --- Feature Importance XGBoost ---
# Same flow as above, for the XGBoost importance table.
print("\nPlotting XGBoost Feature Importance...")
try:
    xgb_fi_df = pd.read_csv('xgb_feature_importance.csv')
    plot_feature_importance(
        importance_df=xgb_fi_df,
        title='XGBoost Feature Importance',
        filename='xgb_importance_plot.png',
    )
except FileNotFoundError:
    print("  xgb_feature_importance.csv not found. Skipping plot.")
except Exception as e:
    print(f"  Error loading/plotting XGBoost importance: {e}")


# --- Meta-Model Feature Importance ---
# Only attempted when a stacking meta-model exists.
print("\nPlotting Meta-Model Feature Importance...")
if meta_model is None:
    print("  Meta-model not trained. Skipping meta-model importance plot.")
else:
    try:
        meta_fi_df = pd.read_csv('meta_model_feature_importance.csv')
        plot_feature_importance(
            importance_df=meta_fi_df,
            title='Meta-Model Feature Importance (Base Model Weights)',
            top_n=len(meta_fi_df),  # show every meta-feature, not just the top 20
            filename='meta_model_importance_plot.png',
        )
    except FileNotFoundError:
        print("  meta_model_feature_importance.csv not found. Skipping plot.")
    except Exception as e:
        print(f"  Error loading/plotting Meta-Model importance: {e}")


# --- Cross-Validation Scores (CV Scores) ---
# Box plot of per-fold LogLoss for each base model that has usable scores.
print("\nPlotting Cross-Validation Scores...")
try:
    cv_scores_data = []
    model_names = []
    # (display label, name of the variable holding that model's fold scores)
    for model_label, scores_var in (('CatBoost', 'val_scores_catboost'),
                                    ('XGBoost', 'val_scores_xgb')):
        # Look the list up by name so a model whose training cell never ran
        # is skipped instead of raising NameError.
        fold_scores = locals().get(scores_var)
        # Usable means: defined, non-empty, and best fold below 999.
        if fold_scores and min(fold_scores) < 999:
            cv_scores_data.extend(fold_scores)
            model_names.extend([model_label] * len(fold_scores))

    if not cv_scores_data:
        print("  No valid CV scores to plot.")
    else:
        cv_df = pd.DataFrame({'Model': model_names, 'LogLoss': cv_scores_data})
        plt.figure(figsize=(8, 6))
        sns.boxplot(x='Model', y='LogLoss', data=cv_df, palette='pastel')
        plt.title('Cross-Validation LogLoss by Model')
        plt.ylabel('LogLoss (Lower is Better)')
        plt.tight_layout()
        plt.savefig('cv_scores_boxplot.png')
        plt.show()
        print("  Saved cv_scores_boxplot.png")
except Exception as e:
    print(f"  Error plotting CV scores: {e}")


# --- Confusion Matrix for the OOF Ensemble ---
# Compares the arg-max class of the out-of-fold ensemble probabilities with
# the true labels. Runs only when both arrays exist in this session.
print("\nPlotting OOF Ensemble Confusion Matrix...")
if 'ensemble_oof_preds' in locals() and 'y' in locals():
    try:
        class_names = ['Model A Wins', 'Model B Wins', 'Tie'] # Matches the target encoding 0, 1, 2
        y_pred_labels_oof = np.argmax(ensemble_oof_preds, axis=1)
        # Pin the label set: without it, a class absent from both y and the
        # predictions yields a smaller matrix that no longer lines up with
        # the three display labels.
        cm = confusion_matrix(y, y_pred_labels_oof, labels=list(range(len(class_names))))

        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
        fig, ax = plt.subplots(figsize=(8, 7))
        disp.plot(ax=ax, cmap='Blues', values_format='d')
        plt.title(f'Ensemble OOF Predictions Confusion Matrix\nOOF LogLoss: {ensemble_oof_score:.4f}')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig('ensemble_oof_confusion_matrix.png')
        plt.show()
        print("  Saved ensemble_oof_confusion_matrix.png")
    except Exception as e:
        # Best-effort visualisation: report and continue with the next plot.
        print(f"  Error plotting confusion matrix: {e}")
else:
    print("  Ensemble OOF predictions or true labels not available for confusion matrix.")

# --- Calibration Curves for the OOF Ensemble ---
# One-vs-rest reliability curve per class, all drawn on a shared axis.
# Runs only when both the OOF predictions and the true labels exist.
print("\nPlotting OOF Ensemble Calibration Curves...")
if 'ensemble_oof_preds' in locals() and 'y' in locals():
    try:
        fig, ax = plt.subplots(1, 1, figsize=(10, 8))
        class_names = ['Model A Wins', 'Model B Wins', 'Tie']
        colors = ['blue', 'red', 'green']

        for i in range(ensemble_oof_preds.shape[1]):
            # Treat class i as the positive class against the rest.
            prob_true, prob_pred = calibration_curve(y == i, ensemble_oof_preds[:, i], n_bins=10, strategy='uniform')
            disp = CalibrationDisplay(prob_true, prob_pred, ensemble_oof_preds[:, i])
            # ref_line=False: the diagonal is drawn once below. Leaving the
            # default on makes the display add its own "Perfectly calibrated"
            # line, which the manual one then duplicates in plot and legend.
            disp.plot(ax=ax, name=f'Class: {class_names[i]}', color=colors[i], ref_line=False)

        ax.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
        ax.set_title(f'Ensemble OOF Calibration Curves\nOOF LogLoss: {ensemble_oof_score:.4f}')
        ax.set_xlabel("Mean Predicted Probability (Positive Class)")
        ax.set_ylabel("Fraction of Positives (Positive Class)")
        ax.legend(loc="lower right")
        plt.tight_layout()
        plt.savefig('ensemble_oof_calibration_curves.png')
        plt.show()
        print("  Saved ensemble_oof_calibration_curves.png")
    except Exception as e:
        # Best-effort visualisation: report and continue.
        print(f"  Error plotting calibration curves: {e}")
else:
    print("  Ensemble OOF predictions or true labels not available for calibration curves.")


# Closing banner for the visualization cell: blank line, rule, heading, rule.
print("\n" + "="*60, "VISUALIZATION COMPLETED", "="*60, sep="\n")