## Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings

# Ignore every warning category from here on. NOTE(review): this also hides
# pandas chained-assignment and matplotlib deprecation warnings raised by the
# functions below, so real problems may go unnoticed.
warnings.filterwarnings('ignore')
# Apply seaborn's "whitegrid" style to the plots created later in the file.
sns.set_style("whitegrid")

## Blending Function

In [None]:
def blend_submissions(weight_dict, output_path="submission.csv", visualize=True):
    """Blend several submission CSVs into one weighted-average submission.

    Every CSV must provide an ``id`` column and an ``exam_score`` column.
    Rows whose score is NaN or infinite are dropped per file, the files are
    inner-joined on ``id`` and the blended score of a row is
    ``sum(weight_i * score_i) / sum(weight_i)``. The blended ``id`` /
    ``exam_score`` table is written to ``output_path``.

    Args:
        weight_dict: Mapping of CSV path -> blending weight.
        output_path: Destination of the blended submission CSV.
        visualize: When true and more than one file was loaded, call
            ``create_visualization`` to plot the score distributions.

    Returns:
        DataFrame with ``id``, the per-file ``weighted_pred*`` columns, the
        blended ``exam_score`` and one ``model_<n>`` column per input file
        holding that file's raw prediction for the row's ``id``.

    Raises:
        ValueError: If ``weight_dict`` is empty, its weights sum to zero, or
            a file contains the same ``id`` more than once.
    """
    if not weight_dict:
        raise ValueError("weight_dict must contain at least one submission file")

    total_weight = sum(weight_dict.values())
    if total_weight == 0:
        # Dividing by zero would turn every blended score into inf/NaN and the
        # cleaning step below would then silently produce an empty submission.
        raise ValueError("Blending weights must not sum to zero")

    print("Starting Blending Pipeline")
    print("=" * 70)

    dfs = []
    stats = {}

    print("\nLoading submissions...")
    for idx, (path, weight) in enumerate(weight_dict.items(), 1):
        file_name = Path(path).name
        df = pd.read_csv(path)

        if df['exam_score'].isna().any():
            print(f"  Warning: Found {df['exam_score'].isna().sum()} NaN values in {file_name}")
            df = df.dropna(subset=['exam_score'])

        finite_mask = np.isfinite(df['exam_score'])
        if not finite_mask.all():
            print(f"  Warning: Found infinite values in {file_name}")
            df = df[finite_mask]

        if df['id'].duplicated().any():
            # Duplicate ids would multiply rows in the merge and make the
            # per-model lookup below ambiguous.
            raise ValueError(f"Duplicate ids found in {file_name}")

        # assign() returns a new frame, so the weighted column is never
        # written into a filtered view of the original data.
        df = df.assign(weighted_pred=df['exam_score'] * weight)
        dfs.append(df)

        stats[f'Model_{idx}'] = {
            'file': file_name,
            'weight': weight,
            'mean': df['exam_score'].mean(),
            'std': df['exam_score'].std()
        }
        print(f"  Loaded {file_name}: weight={weight}")

    print("\nMerging predictions...")
    merged = dfs[0][['id', 'weighted_pred']].copy()

    # Inner join: only ids present (with a valid score) in every file survive.
    for i, df in enumerate(dfs[1:], 1):
        merged = merged.merge(
            df[['id', 'weighted_pred']],
            on='id',
            suffixes=('', f'_{i}')
        )

    weight_cols = [c for c in merged.columns if 'weighted_pred' in c]
    merged['exam_score'] = merged[weight_cols].sum(axis=1) / total_weight

    merged = merged.dropna(subset=['exam_score'])
    merged = merged[np.isfinite(merged['exam_score'])].copy()

    # Look each model's raw score up by id. Assigning the id-indexed Series
    # directly would align it against merged's positional index instead and
    # yield NaN / shifted values.
    for i, df in enumerate(dfs, 1):
        merged[f'model_{i}'] = merged['id'].map(df.set_index('id')['exam_score'])

    result = merged[['id', 'exam_score']].copy()

    print("\nStatistics:")
    print("-" * 70)
    for name, s in stats.items():
        pct = (s['weight'] / total_weight) * 100
        print(f"{name}: {s['file']}")
        print(f"  Weight: {s['weight']:.2f} ({pct:.1f}%) | Mean: {s['mean']:.4f} | Std: {s['std']:.4f}")

    print(f"\nBlended: Mean={result['exam_score'].mean():.4f}, Std={result['exam_score'].std():.4f}")
    print("-" * 70)

    if visualize and len(dfs) > 1:
        create_visualization(dfs, result, stats, total_weight)

    result.to_csv(output_path, index=False)
    print(f"\nBlended submission saved: {output_path}")
    print(f"Total samples: {len(result)}")

    return merged

In [None]:
def _finite_scores(frame):
    """Return the non-missing, finite ``exam_score`` values of ``frame``."""
    scores = frame['exam_score'].dropna()
    return scores[np.isfinite(scores)]


def _hist_with_kde(ax, scores, hist_kwargs, kde_kwargs):
    """Draw a density-normalised histogram of ``scores`` and its KDE on ``ax``."""
    if scores.empty:
        return
    # density=True puts the bars on the same scale as the KDE curve; with raw
    # counts the curve is squashed flat against the x-axis.
    ax.hist(scores, bins=50, density=True, **hist_kwargs)
    # A KDE needs at least two distinct values (the estimator fails on
    # zero-variance data), so constant predictions only get the histogram.
    if scores.nunique() > 1:
        # The underscore label keeps the curve out of the legend; otherwise
        # each curve would add an entry named after the Series.
        scores.plot.kde(ax=ax, label='_nolegend_', **kde_kwargs)


def create_visualization(dfs, result, stats, total_weight):
    """Plot each model's score distribution against the blended result.

    The left panel overlays density histograms and KDE curves, the right
    panel shows notched box plots. The figure is saved to
    ``blending_analysis.png`` and shown.

    Args:
        dfs: List of per-model DataFrames with an ``exam_score`` column.
        result: DataFrame with the blended ``exam_score`` column.
        stats: Per-model statistics; accepted for interface compatibility
            and not used by the plots.
        total_weight: Sum of blending weights; accepted for interface
            compatibility and not used by the plots.
    """
    print("\nGenerating visualization...")

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))
    colors = plt.cm.Set2(np.linspace(0, 1, len(dfs)))

    model_scores = [_finite_scores(df) for df in dfs]
    result_data = _finite_scores(result)

    # Distribution comparison
    for i, (scores, color) in enumerate(zip(model_scores, colors), 1):
        _hist_with_kde(
            axes[0], scores,
            hist_kwargs={'alpha': 0.4, 'color': color,
                         'label': f'Model {i}', 'edgecolor': 'white'},
            kde_kwargs={'color': color, 'linewidth': 2},
        )

    _hist_with_kde(
        axes[0], result_data,
        hist_kwargs={'alpha': 0.6, 'color': 'crimson', 'label': 'Blended',
                     'edgecolor': 'darkred', 'linewidth': 1.5},
        kde_kwargs={'color': 'darkred', 'linewidth': 2.5, 'linestyle': '--'},
    )

    axes[0].set_xlabel('Exam Score', fontsize=11, fontweight='bold')
    axes[0].set_ylabel('Density', fontsize=11, fontweight='bold')
    axes[0].set_title('Score Distribution: Models vs Ensemble', fontsize=12, fontweight='bold')
    axes[0].legend()
    axes[0].grid(alpha=0.3)

    # Box plot comparison
    box_data = [scores.values for scores in model_scores]
    box_data.append(result_data.values)
    box_labels = [f'Model {i}' for i in range(1, len(dfs) + 1)] + ['Blended']

    bp = axes[1].boxplot(box_data, patch_artist=True, notch=True, widths=0.6)
    # Label the ticks afterwards: boxplot's ``labels`` keyword is deprecated
    # in recent matplotlib releases, set_xticklabels works on all of them.
    axes[1].set_xticklabels(box_labels)

    for patch, color in zip(bp['boxes'][:-1], colors):
        patch.set_facecolor(color)
        patch.set_alpha(0.7)
    bp['boxes'][-1].set_facecolor('crimson')
    bp['boxes'][-1].set_alpha(0.8)

    axes[1].set_ylabel('Exam Score', fontsize=11, fontweight='bold')
    axes[1].set_title('Distribution Comparison (Box Plot)', fontsize=12, fontweight='bold')
    axes[1].grid(alpha=0.3, axis='y')

    plt.tight_layout()
    plt.savefig('blending_analysis.png', dpi=200, bbox_inches='tight')
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

    print("Saved: blending_analysis.png")

## OOF Generation Function

In [None]:
def generate_oof(blended_df, output_path="oof_predictions.csv", visualize=True):
    """Export the blended score together with the per-model predictions.

    Columns whose name starts with ``model_`` are treated as individual model
    predictions. With more than one such column, their row-wise standard
    deviation is stored in a ``diversity`` column and summary metrics are
    printed.

    Args:
        blended_df: DataFrame with ``id``, ``exam_score`` and ``model_*``
            columns.
        output_path: Destination of the exported CSV.
        visualize: When true and more than one model column exists, call
            ``create_oof_visualization``.

    Returns:
        The exported DataFrame, or ``None`` when ``blended_df`` has no
        ``model_*`` columns (nothing is written in that case).
    """
    rule = "=" * 70
    print("\n" + rule)
    print("Generating OOF Predictions")
    print(rule)

    model_cols = [name for name in blended_df.columns if name.startswith('model_')]
    if not model_cols:
        print("No individual model predictions found")
        return None

    oof = blended_df[['id', 'exam_score', *model_cols]].copy()

    n_models = len(model_cols)
    if n_models > 1:
        # Row-wise spread of the individual predictions.
        spread = oof[model_cols].std(axis=1)
        oof['diversity'] = spread
        mean_spread = spread.mean()

        print("\nOOF Metrics:")
        print(f"  Models included: {n_models}")
        print(f"  Samples: {len(oof)}")
        print(f"  Mean diversity: {mean_spread:.4f}")
        print(f"  Agreement score: {(1 / (1 + mean_spread)):.4f}")

        if visualize:
            create_oof_visualization(oof, model_cols, spread)

    oof.to_csv(output_path, index=False)
    print(f"\nOOF predictions saved: {output_path}")

    return oof

In [None]:
def create_oof_visualization(oof, model_cols, pred_std):
    """Plot prediction diversity and model-vs-ensemble agreement.

    The left panel is a histogram of ``pred_std`` with its mean marked, the
    right panel scatters each model column against the ensemble score with a
    diagonal reference line. The figure is saved to ``oof_analysis.png`` and
    shown.

    Args:
        oof: DataFrame containing ``exam_score`` and the ``model_cols``.
        model_cols: Names of the per-model prediction columns in ``oof``.
        pred_std: Series of row-wise standard deviations across the models.
    """
    print("\nGenerating OOF visualization...")

    _, (spread_ax, scatter_ax) = plt.subplots(1, 2, figsize=(16, 6))
    palette = plt.cm.Set2(np.linspace(0, 1, len(model_cols)))

    label_font = {'fontsize': 11, 'fontweight': 'bold'}
    title_font = {'fontsize': 12, 'fontweight': 'bold'}

    # Left panel: distribution of the per-row spread, NaN/inf removed.
    finite_spread = pred_std.dropna()
    finite_spread = finite_spread[np.isfinite(finite_spread)]
    mean_spread = finite_spread.mean()

    spread_ax.hist(finite_spread, bins=50, color='steelblue', edgecolor='navy', alpha=0.7)
    spread_ax.axvline(mean_spread, color='red', linestyle='--', linewidth=2,
                      label=f'Mean: {mean_spread:.4f}')
    spread_ax.set_xlabel('Standard Deviation Across Models', **label_font)
    spread_ax.set_ylabel('Frequency', **label_font)
    spread_ax.set_title('Prediction Diversity Distribution', **title_font)
    spread_ax.legend()
    spread_ax.grid(alpha=0.3)

    # Right panel: only rows where the ensemble and every model have a value.
    complete = oof.dropna(subset=['exam_score', *model_cols])
    ensemble = complete['exam_score']

    for number, (col, shade) in enumerate(zip(model_cols, palette), start=1):
        scatter_ax.scatter(complete[col], ensemble,
                           alpha=0.4, s=20, color=shade, label=f'Model {number}')

    # Diagonal spanning the full value range of ensemble and model scores.
    model_lows = [complete[col].min() for col in model_cols]
    model_highs = [complete[col].max() for col in model_cols]
    low = min(ensemble.min(), min(model_lows))
    high = max(ensemble.max(), max(model_highs))
    scatter_ax.plot([low, high], [low, high],
                    'r--', linewidth=2, label='Perfect Agreement')

    scatter_ax.set_xlabel('Individual Model Predictions', **label_font)
    scatter_ax.set_ylabel('Ensemble Prediction', **label_font)
    scatter_ax.set_title('Models vs Ensemble', **title_font)
    scatter_ax.legend()
    scatter_ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.savefig('oof_analysis.png', dpi=200, bbox_inches='tight')
    plt.show()

    print("Saved: oof_analysis.png")

## Execute Pipeline

In [None]:
# Submission files to blend, keyed by path, each with its blending weight.
weight_dict = {
    "/kaggle/input/student-test-scores-vault/submission.csv": 2.7,
    "/kaggle/input/student-test-scores-vault/submission (1).csv": 0.1,
}

# Step 1: blend the submissions and write the final submission file.
blended_df = blend_submissions(
    weight_dict,
    output_path="submission.csv",
    visualize=True,
)

# Step 2: export the blended score alongside the per-model predictions.
oof_df = generate_oof(
    blended_df,
    output_path="oof_predictions.csv",
    visualize=True,
)

# Closing banner: blank line, rule, message, rule.
print("\n".join(("", "=" * 70, "Pipeline Complete", "=" * 70)))