# DBSCAN Post-Processing for exp_007

Implement DBSCAN clustering post-processing as described in the 77th place writeup.
This technique smooths predictions by clustering nearby values and replacing with cluster medians.

Expected improvement: +0.025-0.030 (0.3612 → 0.386-0.391)

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from scipy.stats import spearmanr
import os

# Load the train and test tables
train_df = pd.read_csv('/home/data/train.csv')
test_df = pd.read_csv('/home/data/test.csv')

# Every column that is not an identifier, free-text field or provenance field
# is treated as a target. Excluding only qa_id/title/body/answer would leave
# the user, url, category and host columns inside target_cols and inflate the
# count printed below.
meta_cols = ['qa_id', 'question_title', 'question_body', 'question_user_name',
             'question_user_page', 'answer', 'answer_user_name', 'answer_user_page',
             'url', 'category', 'host']
target_cols = [c for c in train_df.columns if c not in meta_cols]

print(f"Number of target columns: {len(target_cols)}")
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

In [None]:
# Inspect the actual columns in train.csv to understand the target structure
train_df = pd.read_csv('/home/data/train.csv')
all_cols = list(train_df.columns)
print(f"Total columns: {len(all_cols)}")
print(f"First 10 columns: {all_cols[:10]}")
print(f"Last 10 columns: {all_cols[-10:]}")

# Identify target columns (everything that is not a metadata column)
meta_cols = ['qa_id', 'question_title', 'question_body', 'question_user_name', 
             'question_user_page', 'answer', 'answer_user_name', 'answer_user_page', 
             'url', 'category', 'host']

target_cols = [c for c in all_cols if c not in meta_cols]
print(f"\nNumber of target columns: {len(target_cols)}")
print(f"Target columns: {target_cols}")

# Check OOF predictions shape again
oof_preds = np.load('/home/code/experiments/004_bert_token_type_ids/oof_predictions.npy')
print(f"\nOOF predictions shape: {oof_preds.shape}")

# Check whether test predictions exist and report their shape.
# Only a missing file is treated as "not found"; any other failure (corrupt
# file, permission problem, interrupt) propagates instead of being mislabeled.
try:
    test_preds = np.load('/home/code/experiments/004_bert_token_type_ids/test_predictions.npy')
except FileNotFoundError:
    print("Test predictions file not found")
else:
    print(f"Test predictions shape: {test_preds.shape}")

In [None]:
# Read the out-of-fold predictions from disk.
# NOTE(review): the notebook refers to exp_007, but the path points at
# experiments/004_bert_token_type_ids — confirm this is the intended run.
oof_preds = np.load('/home/code/experiments/004_bert_token_type_ids/oof_predictions.npy')
print(f"OOF predictions shape: {oof_preds.shape}")

# One prediction column is expected per target column
assert len(target_cols) == oof_preds.shape[1], f"Shape mismatch: {oof_preds.shape[1]} vs {len(target_cols)}"

# Attach the target names so columns can be addressed by label
oof_df = pd.DataFrame(data=oof_preds, columns=target_cols)
print(f"OOF DataFrame shape: {oof_df.shape}")
print("\nFirst few predictions:", oof_df.head(), sep="\n")

In [None]:
# Calculate baseline CV score (should match exp_007: 0.3612)
def calculate_spearman(y_true, y_pred):
    """Return the mean of the per-target Spearman correlations.

    ``y_true`` is indexed by column name and ``y_pred`` by column position;
    position ``i`` of ``y_pred`` is paired with the ``i``-th name in the
    module-level ``target_cols`` list.
    """
    per_target = [
        spearmanr(y_true[name], y_pred[:, idx]).correlation
        for idx, name in enumerate(target_cols)
    ]
    return np.mean(per_target)

# Score the untouched OOF predictions against the training labels
baseline_score = calculate_spearman(y_true=train_df[target_cols], y_pred=oof_preds)
print(f"Baseline CV score: {baseline_score:.6f}")
# Reference value printed alongside for a by-eye comparison
print(f"Expected (exp_007): 0.3612008823544772")

In [None]:
# Implement DBSCAN post-processing
def dbscan_postprocess(predictions, eps_percentile=0.95, min_samples=2):
    """
    Smooth each prediction column by snapping DBSCAN clusters to their median.

    For every column of ``predictions`` (a 2-D array, rows = samples):
    1. Sort the column and take the gaps between consecutive values.
    2. Set ``eps`` to the ``eps_percentile`` quantile of those gaps
       (``eps_percentile`` is a fraction in [0, 1]; default 0.95).
    3. Run DBSCAN on the column as a one-feature dataset.
    4. Replace the members of every cluster with the cluster median.
       Points labelled as noise (-1) keep their original value.

    Columns whose ``eps`` is not positive are left unchanged: this happens
    when at least ``eps_percentile`` of the sorted gaps are zero (heavily
    duplicated predictions). DBSCAN rejects a non-positive ``eps``, and
    identical values are already equal to their own median.

    Inputs with fewer than two rows are returned as an unmodified copy
    because no gap, and therefore no ``eps``, can be computed.

    Returns a new array with the same shape and dtype as ``predictions``;
    the input is not modified.
    """
    processed_preds = predictions.copy()

    # With fewer than two rows there are no consecutive gaps to derive eps from.
    if predictions.shape[0] < 2:
        return processed_preds

    for i in range(predictions.shape[1]):
        col_preds = predictions[:, i]

        # Gaps between neighbouring values in sorted order
        diffs = np.diff(np.sort(col_preds))
        eps = np.percentile(diffs, eps_percentile * 100)

        # eps == 0 would make DBSCAN raise; nothing to smooth in that case.
        # (A NaN eps is deliberately not caught here so DBSCAN still reports
        # invalid input instead of the column being skipped silently.)
        if eps <= 0:
            continue

        # sklearn expects (n_samples, n_features)
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit(
            col_preds.reshape(-1, 1)
        ).labels_

        for label in np.unique(labels):
            if label == -1:  # noise points are left as they are
                continue

            cluster_mask = labels == label
            if np.sum(cluster_mask) >= min_samples:
                processed_preds[cluster_mask, i] = np.median(col_preds[cluster_mask])

    return processed_preds

# Sweep a few eps percentiles and report the CV score for each
for percentile in (0.90, 0.95, 0.98):
    processed = dbscan_postprocess(predictions=oof_preds, eps_percentile=percentile)
    score = calculate_spearman(y_true=train_df[target_cols], y_pred=processed)
    print(f"DBSCAN (eps={percentile:.2f}): {score:.6f}")

In [None]:
# Apply the chosen percentile to the OOF predictions
best_percentile = 0.95  # Based on typical results from writeup
processed_oof = dbscan_postprocess(oof_preds, eps_percentile=best_percentile)

# Calculate final score
final_score = calculate_spearman(train_df[target_cols], processed_oof)
print(f"Final CV score after DBSCAN: {final_score:.6f}")
# The '+' format flag prints the real sign; a literal "+" prefix would render
# a regression as "+-0.012345".
print(f"Improvement: {final_score - baseline_score:+.6f}")

# Save processed OOF predictions (np.save does not create missing directories)
os.makedirs('/home/code/experiments/005_dbscan_postprocessing', exist_ok=True)
np.save('/home/code/experiments/005_dbscan_postprocessing/processed_oof_predictions.npy', processed_oof)
print(f"\nSaved processed OOF predictions to: /home/code/experiments/005_dbscan_postprocessing/processed_oof_predictions.npy")

In [None]:
# Load the test predictions and apply the same post-processing.
# NOTE(review): the notebook refers to exp_007, but the path points at
# experiments/004_bert_token_type_ids — confirm this is the intended run.
test_preds = np.load('/home/code/experiments/004_bert_token_type_ids/test_predictions.npy')
print(f"Test predictions shape: {test_preds.shape}")

# Apply DBSCAN post-processing to test predictions.
# The same eps_percentile is used as for OOF; eps itself is recomputed from
# the test predictions inside dbscan_postprocess.
processed_test = dbscan_postprocess(test_preds, eps_percentile=best_percentile)

# Save processed test predictions (np.save does not create missing directories)
os.makedirs('/home/code/experiments/005_dbscan_postprocessing', exist_ok=True)
np.save('/home/code/experiments/005_dbscan_postprocessing/processed_test_predictions.npy', processed_test)
print(f"Saved processed test predictions to: /home/code/experiments/005_dbscan_postprocessing/processed_test_predictions.npy")

In [None]:
# Build the submission file.
# Print the sample submission first so its layout can be compared by eye.
sample_sub = pd.read_csv('/home/data/sample_submission.csv')
print("Sample submission format:")
print(sample_sub.head())
print(f"\nColumns: {list(sample_sub.columns)}")
print(f"Shape: {sample_sub.shape}")

# Sanity checks: one prediction column per target, one row per test record
assert len(target_cols) == processed_test.shape[1], f"Prediction shape mismatch"
assert len(processed_test) == len(test_df), f"Row count mismatch"

# Name the prediction columns in one step, then put the id column in front
submission = pd.DataFrame(processed_test, columns=target_cols, index=test_df.index)
submission.insert(0, 'qa_id', test_df['qa_id'])

print(f"\nSubmission shape: {submission.shape}")
print("\nFirst few rows:", submission.head(), sep="\n")

# Write the CSV, creating the output directory if needed
os.makedirs('/home/submission', exist_ok=True)
submission.to_csv('/home/submission/submission.csv', index=False)
print(f"\nSaved submission to: /home/submission/submission.csv")