# DBSCAN Post-Processing for exp_007

Implement DBSCAN clustering post-processing as described in the 77th place writeup.
This technique smooths predictions by clustering nearby values within each target column and replacing every value in a cluster with that cluster's median.

Expected improvement: +0.025-0.030 (0.3612 → 0.386-0.391)

In [4]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from scipy.stats import spearmanr
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
import gc

# Read the train and test tables from disk
train_df = pd.read_csv('/home/data/train.csv')
test_df = pd.read_csv('/home/data/test.csv')

# Columns that describe the Q&A pair itself; every other train column is a target
meta_cols = [
    'qa_id',
    'question_title',
    'question_body',
    'question_user_name',
    'question_user_page',
    'answer',
    'answer_user_name',
    'answer_user_page',
    'url',
    'category',
    'host',
]
target_cols = [name for name in train_df.columns if name not in meta_cols]

# Quick sanity summary of what was loaded
print(f"Number of target columns: {len(target_cols)}")
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

OOF predictions shape: (6079, 30)
Number of target columns: 30
OOF DataFrame shape: (6079, 30)

First few predictions:
   question_asker_intent_understanding  question_body_critical  \
0                             0.908157                0.569118   
1                             0.968442                0.832744   
2                             0.862698                0.538458   
3                             0.910688                0.556180   
4                             0.922277                0.705731   

   question_conversational  question_expect_short_answer  \
0                 0.039477                      0.441199   
1                 0.001391                      0.833066   
2                 0.002122                      0.671444   
3                 0.042393                      0.807721   
4                 0.000006                      0.787416   

   question_fact_seeking  question_has_commonly_accepted_answer  \
0               0.688862                               0

In [5]:
# Inspect the columns of train.csv to understand the target structure
train_df = pd.read_csv('/home/data/train.csv')
all_cols = list(train_df.columns)
print(f"Total columns: {len(all_cols)}")
print(f"First 10 columns: {all_cols[:10]}")
print(f"Last 10 columns: {all_cols[-10:]}")

# Identify target columns (excluding metadata columns)
meta_cols = ['qa_id', 'question_title', 'question_body', 'question_user_name', 
             'question_user_page', 'answer', 'answer_user_name', 'answer_user_page', 
             'url', 'category', 'host']

target_cols = [c for c in all_cols if c not in meta_cols]
print(f"\nNumber of target columns: {len(target_cols)}")
print(f"Target columns: {target_cols}")

# Check OOF predictions shape again
oof_preds = np.load('/home/code/experiments/004_bert_token_type_ids/oof_predictions.npy')
print(f"\nOOF predictions shape: {oof_preds.shape}")

# Check if test predictions exist and their shape.
# Only a missing file is treated as "not found"; any other failure (corrupt
# file, permission problem, ...) is allowed to surface instead of being
# reported with a misleading message.
try:
    test_preds = np.load('/home/code/experiments/004_bert_token_type_ids/test_predictions.npy')
except FileNotFoundError:
    print("Test predictions file not found")
else:
    print(f"Test predictions shape: {test_preds.shape}")

Total columns: 41
First 10 columns: ['qa_id', 'question_title', 'question_body', 'question_user_name', 'question_user_page', 'answer', 'answer_user_name', 'answer_user_page', 'url', 'category']
Last 10 columns: ['question_well_written', 'answer_helpful', 'answer_level_of_information', 'answer_plausible', 'answer_relevance', 'answer_satisfaction', 'answer_type_instructions', 'answer_type_procedure', 'answer_type_reason_explanation', 'answer_well_written']

Number of target columns: 30
Target columns: ['question_asker_intent_understanding', 'question_body_critical', 'question_conversational', 'question_expect_short_answer', 'question_fact_seeking', 'question_has_commonly_accepted_answer', 'question_interestingness_others', 'question_interestingness_self', 'question_multi_intent', 'question_not_really_a_question', 'question_opinion_seeking', 'question_type_choice', 'question_type_compare', 'question_type_consequence', 'question_type_definition', 'question_type_entity', 'question_type_inst

In [6]:
# Load the out-of-fold predictions that will be post-processed
oof_preds = np.load('/home/code/experiments/004_bert_token_type_ids/oof_predictions.npy')
print(f"OOF predictions shape: {oof_preds.shape}")

# Verify shape matches targets: one column per target ...
assert oof_preds.shape[1] == len(target_cols), f"Shape mismatch: {oof_preds.shape[1]} vs {len(target_cols)}"
# ... and one row per training example, because the scores below compare
# OOF row k with train row k.
assert oof_preds.shape[0] == len(train_df), f"Row mismatch: {oof_preds.shape[0]} vs {len(train_df)}"

# Create DataFrame for easier handling
oof_df = pd.DataFrame(oof_preds, columns=target_cols)
print(f"OOF DataFrame shape: {oof_df.shape}")
print("\nFirst few predictions:")
print(oof_df.head())

OOF predictions shape: (6079, 30)
OOF DataFrame shape: (6079, 30)

First few predictions:
   question_asker_intent_understanding  question_body_critical  \
0                             0.908157                0.569118   
1                             0.968442                0.832744   
2                             0.862698                0.538458   
3                             0.910688                0.556180   
4                             0.922277                0.705731   

   question_conversational  question_expect_short_answer  \
0                 0.039477                      0.441199   
1                 0.001391                      0.833066   
2                 0.002122                      0.671444   
3                 0.042393                      0.807721   
4                 0.000006                      0.787416   

   question_fact_seeking  question_has_commonly_accepted_answer  \
0               0.688862                               0.385393   
1               0.

In [7]:
# Calculate baseline CV score (should match exp_007: 0.3612)
def calculate_spearman(y_true, y_pred):
    """Calculate mean Spearman correlation across all targets.

    Parameters
    ----------
    y_true : pandas.DataFrame (or any mapping with ``keys()``)
        Ground-truth values, one column per target. The columns of this
        argument define which targets are scored and in what order, so the
        function does not depend on any module-level variable.
    y_pred : array-like of shape (n_samples, n_targets)
        Predictions; column ``i`` is scored against the ``i``-th column of
        ``y_true``.

    Returns
    -------
    float
        Unweighted mean of the per-target Spearman correlations. If any
        per-target correlation is NaN, the mean is NaN as well.

    Raises
    ------
    ValueError
        If ``y_pred`` is not 2-D or its column count differs from the number
        of columns in ``y_true``.
    """
    columns = list(y_true.keys())
    y_pred = np.asarray(y_pred)

    # Fail loudly on misaligned inputs rather than scoring the wrong columns.
    if y_pred.ndim != 2 or y_pred.shape[1] != len(columns):
        raise ValueError(
            f"y_pred must have shape (n_samples, {len(columns)}); got {y_pred.shape}"
        )

    scores = [
        spearmanr(y_true[col], y_pred[:, i]).correlation
        for i, col in enumerate(columns)
    ]
    return np.mean(scores)

# Reference score of the untouched OOF predictions.
# NOTE(review): the OOF file is read from the 004_bert_token_type_ids directory
# while the expected value below is quoted for exp_007 - confirm both refer to
# the same run.
baseline_score = calculate_spearman(
    train_df[target_cols],
    oof_preds,
)
print(f"Baseline CV score: {baseline_score:.6f}")
print(f"Expected (exp_007): 0.3612008823544772")

Baseline CV score: 0.351537
Expected (exp_007): 0.3612008823544772


In [None]:
# Implement DBSCAN post-processing
def dbscan_postprocess(predictions, eps_percentile=0.95, min_samples=2):
    """
    Apply DBSCAN clustering to smooth predictions, one target column at a time.

    For each column:
    1. Sort the values and compute the gaps between consecutive values.
    2. Set eps to the ``eps_percentile`` quantile of those gaps.
    3. Run 1-D DBSCAN with that eps.
    4. Replace every value in a cluster with the cluster median; points that
       DBSCAN labels as noise (-1) keep their original value.

    Parameters
    ----------
    predictions : array-like of shape (n_samples, n_targets)
        Raw predictions. The input is not modified.
    eps_percentile : float, default 0.95
        Fraction in [0, 1]; multiplied by 100 and passed to ``np.percentile``.
    min_samples : int, default 2
        Passed to DBSCAN and also used as the minimum cluster size that gets
        replaced by its median.

    Returns
    -------
    numpy.ndarray
        Copy of ``predictions`` with clustered values replaced.

    Notes
    -----
    A column is left untouched when it has fewer than two rows, or when the
    computed eps is not strictly positive (for example when most values are
    duplicates, so the chosen percentile of the gaps is 0, or when the column
    contains NaN). DBSCAN rejects a non-positive eps, so without this guard
    the whole call would fail on such a column.

    NOTE(review): with ``min_samples=2`` any two values closer than eps are
    linked, and eps is chosen so that most consecutive gaps are below it, so
    clusters can span long chains of values. Check the sweep scores before
    trusting a particular percentile.
    """
    predictions = np.asarray(predictions)
    processed_preds = predictions.copy()

    for i in range(predictions.shape[1]):
        col_preds = predictions[:, i]

        # Gaps between neighbouring values once the column is sorted
        diffs = np.diff(np.sort(col_preds))
        if len(diffs) == 0:
            continue

        eps = np.percentile(diffs, eps_percentile * 100)
        if not eps > 0:
            # Covers eps == 0 and eps == NaN; see Notes in the docstring.
            continue

        # sklearn expects (n_samples, n_features); label -1 marks noise points
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit(col_preds.reshape(-1, 1)).labels_

        for label in np.unique(labels):
            if label == -1:
                continue

            cluster_mask = labels == label
            if np.sum(cluster_mask) >= min_samples:
                processed_preds[cluster_mask, i] = np.median(col_preds[cluster_mask])

    return processed_preds

# Sweep a few eps percentiles and report the OOF score obtained with each
for percentile in (0.90, 0.95, 0.98):
    smoothed_oof = dbscan_postprocess(oof_preds, eps_percentile=percentile)
    sweep_score = calculate_spearman(train_df[target_cols], smoothed_oof)
    print(f"DBSCAN (eps={percentile:.2f}): {sweep_score:.6f}")

In [None]:
# Use best percentile and apply to OOF predictions
# NOTE(review): this value is hard-coded rather than picked from the sweep
# above - confirm it is still the best setting for these predictions.
best_percentile = 0.95  # Based on typical results from writeup
processed_oof = dbscan_postprocess(oof_preds, eps_percentile=best_percentile)

# Calculate final score
final_score = calculate_spearman(train_df[target_cols], processed_oof)
print(f"Final CV score after DBSCAN: {final_score:.6f}")
# The '+' format flag prints the real sign, so a regression shows as
# "-0.01" instead of "+-0.01".
print(f"Improvement: {final_score - baseline_score:+.6f}")

# Save processed OOF predictions (create the experiment directory if needed)
processed_oof_path = '/home/code/experiments/005_dbscan_postprocessing/processed_oof_predictions.npy'
os.makedirs(os.path.dirname(processed_oof_path), exist_ok=True)
np.save(processed_oof_path, processed_oof)
print(f"\nSaved processed OOF predictions to: {processed_oof_path}")

In [None]:
# Persist the smoothed OOF matrix
np.save('/home/code/experiments/005_dbscan_postprocessing/processed_oof_predictions.npy', processed_oof)
print(f"Saved processed OOF predictions to: /home/code/experiments/005_dbscan_postprocessing/processed_oof_predictions.npy")

# Compare each target's Spearman correlation before and after smoothing
print("\nPer-target improvements:")
for col_idx, target_name in enumerate(target_cols):
    truth = train_df[target_name]
    rho_before = spearmanr(truth, oof_preds[:, col_idx]).correlation
    rho_after = spearmanr(truth, processed_oof[:, col_idx]).correlation
    delta = rho_after - rho_before
    # Changes of at most 0.001 in absolute value are not printed
    if abs(delta) > 0.001:
        print(f"{target_name}: {rho_before:.4f} → {rho_after:.4f} ({delta:+.4f})")

In [None]:
# Generate test predictions from exp_007 models
# Since test_predictions.npy doesn't exist, we need to generate it

import torch
from torch.utils.data import DataLoader
import json
from pathlib import Path

# Run on the GPU when one is visible, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Read the test table
test_df = pd.read_csv('/home/data/test.csv')
print(f"Test data shape: {test_df.shape}")

# Placeholder matrix: one row per test example, one column per target
test_predictions = np.zeros((test_df.shape[0], len(target_cols)))

# Tokenizer for the bert-base-uncased checkpoint
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Create test dataset
class TestDataset(torch.utils.data.Dataset):
    """Dataset that turns each row of a Q&A DataFrame into BERT inputs.

    Each item is a dict with ``input_ids``, ``attention_mask`` and
    ``token_type_ids`` tensors. The text is assembled by hand as
    ``[CLS] title [SEP] body [SEP] answer [SEP]`` and tokenized without
    letting the tokenizer add special tokens again.
    """

    def __init__(self, df, tokenizer, max_length=512):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        title = record['question_title']
        body = record['question_body']
        answer = record['answer']

        # Markers are written into the string, hence add_special_tokens=False below
        text = f"[CLS] {title} [SEP] {body} [SEP] {answer} [SEP]"

        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=False,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids = encoded['input_ids'].squeeze()
        attention_mask = encoded['attention_mask'].squeeze()

        # Segment ids: 0 up to and including the second [SEP], 1 afterwards.
        # If fewer than two [SEP] tokens are found, everything stays 0.
        # NOTE(review): the slice runs to the end of the sequence, so padding
        # positions after the answer also get segment id 1 - presumably this
        # mirrors the training-time dataset; confirm.
        sep_positions = torch.nonzero(
            input_ids == self.tokenizer.sep_token_id, as_tuple=True
        )[0]
        token_type_ids = torch.zeros_like(input_ids)
        if len(sep_positions) >= 2:
            token_type_ids[sep_positions[1] + 1:] = 1

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
        }

# Wrap the test table in a dataset and iterate it in order, 8 rows per batch
test_dataset = TestDataset(test_df, tokenizer, max_length=512)
test_loader = DataLoader(
    test_dataset,
    batch_size=8,
    shuffle=False,
    num_workers=0,
)

print(f"Test dataset created with {len(test_dataset)} samples")
print(f"Test dataloader has {len(test_loader)} batches")

In [21]:
# Define the model architecture (same as exp_007)
class BERTQuestionAnswering(nn.Module):
    """Pretrained encoder followed by dropout and a linear output layer.

    ``forward`` returns the raw output of the linear layer computed from the
    hidden state at sequence position 0; no activation is applied.
    """

    def __init__(self, model_name='bert-base-uncased', num_labels=30):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        encoder_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # Hidden state of the first token ([CLS]) summarises the sequence
        cls_vector = encoder_out.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(cls_vector))

# Run every fold checkpoint over the test set and keep one prediction matrix per fold
fold_predictions = []

for fold in range(5):
    print(f"\nGenerating predictions for fold {fold}...")

    # Fresh model, then restore this fold's weights (file holds the state dict itself)
    model_path = f'/home/code/experiments/004_bert_token_type_ids/best_model_fold_{fold}.pth'
    model = BERTQuestionAnswering('bert-base-uncased', num_labels=len(target_cols))
    checkpoint = torch.load(model_path, map_location=device)
    model.load_state_dict(checkpoint)
    model.to(device)
    model.eval()

    # Inference without gradient tracking.
    # NOTE(review): the model outputs are stored as-is, with no sigmoid. The
    # OOF rows printed earlier all lie in [0, 1] - confirm whether training
    # applied a sigmoid before saving, otherwise test and OOF scales differ.
    batch_outputs = []
    with torch.no_grad():
        for batch in test_loader:
            outputs = model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device),
                batch['token_type_ids'].to(device),
            )
            batch_outputs.append(outputs.cpu().numpy())

    # Stack the per-batch arrays back into (n_test, n_targets)
    fold_pred = np.concatenate(batch_outputs, axis=0)
    fold_predictions.append(fold_pred)

    print(f"Fold {fold} predictions shape: {fold_pred.shape}")

    # Release the model before the next fold is loaded
    del model
    torch.cuda.empty_cache()

# Mean over folds gives the final test prediction
test_predictions = np.mean(fold_predictions, axis=0)
print(f"\nFinal test predictions shape: {test_predictions.shape}")

# Store the averaged predictions next to the source experiment and in this experiment
np.save('/home/code/experiments/004_bert_token_type_ids/test_predictions.npy', test_predictions)
np.save('/home/code/experiments/005_dbscan_postprocessing/test_predictions_before_postprocessing.npy', test_predictions)
print("Test predictions saved!")

Checkpoint keys: ['bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.

In [None]:
# Define the model architecture (same as exp_007)
class BERTQuestionAnswering(nn.Module):
    """Pretrained encoder followed by dropout and a linear output layer.

    ``forward`` returns the raw output of the linear layer computed from the
    hidden state at sequence position 0; no activation is applied.
    """

    def __init__(self, model_name='bert-base-uncased', num_labels=30):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        encoder_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # Hidden state of the first token ([CLS]) summarises the sequence
        cls_vector = encoder_out.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(cls_vector))

# Generate predictions for each fold
fold_predictions = []

for fold in range(5):
    print(f"\nGenerating predictions for fold {fold}...")
    
    # Load model
    model_path = f'/home/code/experiments/004_bert_token_type_ids/best_model_fold_{fold}.pth'
    model = BERTQuestionAnswering('bert-base-uncased', num_labels=len(target_cols))
    
    # Load checkpoint. Accept both layouts: a bare state dict (parameter names
    # as top-level keys) and a wrapper dict that stores the weights under
    # 'model_state_dict'. Indexing 'model_state_dict' unconditionally raises
    # KeyError on a bare state dict.
    checkpoint = torch.load(model_path, map_location=device)
    if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
        state_dict = checkpoint['model_state_dict']
    else:
        state_dict = checkpoint
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()
    
    # Generate predictions
    # NOTE(review): the model outputs are stored as-is, with no sigmoid. The
    # OOF rows printed earlier all lie in [0, 1] - confirm whether training
    # applied a sigmoid before saving, otherwise test and OOF scales differ.
    fold_pred = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            
            outputs = model(input_ids, attention_mask, token_type_ids)
            fold_pred.append(outputs.cpu().numpy())
    
    # Concatenate predictions
    fold_pred = np.concatenate(fold_pred, axis=0)
    fold_predictions.append(fold_pred)
    
    print(f"Fold {fold} predictions shape: {fold_pred.shape}")
    
    # Cleanup
    del model
    torch.cuda.empty_cache()

# Average predictions across folds
test_predictions = np.mean(fold_predictions, axis=0)
print(f"\nFinal test predictions shape: {test_predictions.shape}")

# Save test predictions
np.save('/home/code/experiments/004_bert_token_type_ids/test_predictions.npy', test_predictions)
np.save('/home/code/experiments/005_dbscan_postprocessing/test_predictions_before_postprocessing.npy', test_predictions)
print("Test predictions saved!")

In [None]:
# Create submission file
# First check sample submission format
sample_sub = pd.read_csv('/home/data/sample_submission.csv')
print("Sample submission format:")
print(sample_sub.head())
print(f"\nColumns: {list(sample_sub.columns)}")
print(f"Shape: {sample_sub.shape}")

# Apply the same DBSCAN smoothing to the averaged test predictions.
# `processed_test` was previously used here without ever being assigned,
# which raised NameError on a fresh run.
processed_test = dbscan_postprocess(test_predictions, eps_percentile=best_percentile)

# Verify our predictions match the expected format
assert processed_test.shape[1] == len(target_cols), f"Prediction shape mismatch"
assert len(test_df) == len(processed_test), f"Row count mismatch"

# Create submission DataFrame
submission = pd.DataFrame({
    'qa_id': test_df['qa_id'],
})

# Add predictions for each target column
for i, col in enumerate(target_cols):
    submission[col] = processed_test[:, i]

print(f"\nSubmission shape: {submission.shape}")
print("\nFirst few rows:")
print(submission.head())

# Save submission
os.makedirs('/home/submission', exist_ok=True)
submission.to_csv('/home/submission/submission.csv', index=False)
print(f"\nSaved submission to: /home/submission/submission.csv")