## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy.optimize import minimize
from sklearn.metrics import mean_squared_error
import warnings

# Suppress every warning for the remainder of the session (applies globally,
# including warnings raised by pandas / numpy / matplotlib).
warnings.filterwarnings('ignore')
# Use seaborn's "whitegrid" style for all subsequent plots.
sns.set_style("whitegrid")

## Submissions Analysis

In [None]:
def analyze_submissions(file_paths):
    """Load submission CSVs and summarise each file's ``exam_score`` column.

    Every file is read with ``pandas.read_csv`` and must contain ``id`` and
    ``exam_score`` columns. A two-line summary is printed per file.

    Args:
        file_paths: Iterable of paths to submission CSV files.

    Returns:
        A ``(dfs, analysis_df)`` tuple: ``dfs`` is the list of loaded
        DataFrames in input order; ``analysis_df`` has one row per file with
        the columns ``model``, ``filename``, ``samples``, ``missing``,
        ``duplicates``, ``mean``, ``std``, ``min`` and ``max``.
    """
    print("Submission Analysis")
    print("=" * 70)

    frames = []
    records = []

    for position, csv_path in enumerate(file_paths, start=1):
        frame = pd.read_csv(csv_path)
        base_name = Path(csv_path).name
        scores = frame['exam_score']

        record = dict(
            model=f'Model_{position}',
            filename=base_name,
            samples=len(frame),
            missing=scores.isna().sum(),
            # Duplicates are counted on the id column, not on whole rows.
            duplicates=frame['id'].duplicated().sum(),
            mean=scores.mean(),
            std=scores.std(),
            min=scores.min(),
            max=scores.max(),
        )

        records.append(record)
        frames.append(frame)

        print(f"{record['model']}: {base_name}")
        print(f"  Samples: {record['samples']} | Mean: {record['mean']:.6f} | Std: {record['std']:.6f}")

    summary = pd.DataFrame(records)
    print("=" * 70)

    return frames, summary


def calculate_correlation(dfs):
    """Align submissions on ``id`` and report how correlated their predictions are.

    Each DataFrame's ``exam_score`` column is renamed ``model_<n>`` (1-based,
    in input order) and the frames are inner-joined on ``id``, so only ids
    present in every submission contribute to the correlation.

    Args:
        dfs: List of DataFrames, each with ``id`` and ``exam_score`` columns.

    Returns:
        A ``(corr_matrix, merged)`` tuple: the pairwise correlation matrix of
        the ``model_*`` columns and the merged frame (``id`` plus one
        ``model_*`` column per submission).
    """
    print("\nPrediction Correlation Analysis")
    print("-" * 70)

    merged = dfs[0][['id', 'exam_score']].copy()
    merged.columns = ['id', 'model_1']

    for i, df in enumerate(dfs[1:], 2):
        temp = df[['id', 'exam_score']].copy()
        temp.columns = ['id', f'model_{i}']
        merged = merged.merge(temp, on='id', how='inner')

    model_cols = [c for c in merged.columns if c.startswith('model_')]
    corr_matrix = merged[model_cols].corr()

    print("\nCorrelation Matrix:")
    print(corr_matrix.round(4))

    # Strict upper triangle = each unordered model pair exactly once.
    pairwise = corr_matrix.values[np.triu_indices_from(corr_matrix.values, k=1)]
    # With a single model there are no pairs; avoid taking the mean of an
    # empty array and report NaN explicitly instead.
    avg_corr = pairwise.mean() if pairwise.size else float('nan')
    print(f"\nAverage Correlation: {avg_corr:.4f}")

    if np.isnan(avg_corr):
        # NaN compares False against every threshold, so without this branch
        # an undefined average would be reported as "good diversity".
        print("Average correlation is undefined (fewer than two models, "
              "no shared ids, or constant predictions). Diversity cannot be assessed.")
    elif avg_corr > 0.95:
        print("High correlation detected. Limited ensemble benefit expected.")
    elif avg_corr > 0.85:
        print("Moderate correlation. Some ensemble benefit expected.")
    else:
        print("Good diversity detected. Strong ensemble potential.")

    return corr_matrix, merged

## Weight Optimization

In [None]:
def optimize_weights(merged_df, method='hill_climb', validation_split=0.2):
    """Compute blending weights for the ``model_*`` columns of ``merged_df``.

    Args:
        merged_df: DataFrame holding one ``model_*`` column per submission.
        method: ``'equal'`` for uniform weights, ``'hill_climb'`` or
            ``'scipy'`` to delegate to the corresponding optimiser. Any other
            value falls back to uniform weights (a notice is printed).
        validation_split: Fraction of rows held out as the validation set.
            The split is a random permutation drawn from NumPy's global RNG,
            so it is only reproducible if the caller seeds ``np.random``.
            ``0`` (or a fraction too small to yield a single row) uses all
            rows for both train and validation.

    Returns:
        1-D NumPy array of weights, one per ``model_*`` column.
    """
    print(f"\nWeight Optimization: {method.upper()}")
    print("=" * 70)

    model_cols = [c for c in merged_df.columns if c.startswith('model_')]
    n_models = len(model_cols)

    n_val = int(len(merged_df) * validation_split) if validation_split > 0 else 0

    if n_val > 0:
        indices = np.random.permutation(len(merged_df))
        train_df = merged_df.iloc[indices[n_val:]].copy()
        val_df = merged_df.iloc[indices[:n_val]].copy()
        print(f"Train: {len(train_df)} | Validation: {len(val_df)}")
    else:
        train_df = val_df = merged_df.copy()
        if validation_split > 0:
            # A positive split on a small frame rounds down to zero rows;
            # an empty validation set would break the optimisers, so reuse
            # the full frame instead.
            print("Validation split yields no rows; using all rows for both train and validation.")
            print(f"Train: {len(train_df)} | Validation: {len(val_df)}")

    if method == 'equal':
        weights = np.ones(n_models) / n_models

    elif method == 'hill_climb':
        weights = hill_climb_optimization(train_df, val_df, model_cols)

    elif method == 'scipy':
        weights = scipy_optimization(train_df, val_df, model_cols)

    else:
        # Keep the lenient fallback, but make it visible rather than silent.
        print(f"Unknown method '{method}'; falling back to equal weights.")
        weights = np.ones(n_models) / n_models

    print(f"\nOptimized Weights: {weights}")
    return weights


def hill_climb_optimization(train_df, val_df, model_cols, iterations=1000):
    """Random-perturbation hill climb over non-negative weights that sum to 1.

    Starting from equal weights, each iteration adds Gaussian noise
    (standard deviation 0.1) to the current best weights, clips negatives to
    zero, renormalises, and keeps the candidate if its MSE on ``val_df`` is
    strictly lower than the best seen so far.

    NOTE(review): the target is the row-wise mean of the model columns in
    ``val_df`` (no ground-truth column is used), so the initial equal-weight
    blend scores exactly 0 against it. Because MSE is never negative, no
    candidate can be strictly better and the function returns the equal
    weights. Confirm whether a true target was meant to be supplied.

    Args:
        train_df: Training split; not used by this function.
        val_df: DataFrame containing the ``model_cols`` columns to score on.
        model_cols: Names of the prediction columns to blend.
        iterations: Number of random candidates to try.

    Returns:
        1-D NumPy array with the best weights found.
    """
    n_models = len(model_cols)
    best_weights = np.ones(n_models) / n_models
    # Loop-invariant: extract the prediction matrix once, not per iteration.
    val_preds = val_df[model_cols].values
    target = val_df[model_cols].mean(axis=1)
    best_score = mean_squared_error(target, val_df[model_cols].mean(axis=1))

    for _ in range(iterations):
        candidate = best_weights + np.random.randn(n_models) * 0.1
        candidate = np.maximum(candidate, 0)
        total = candidate.sum()
        if total <= 0:
            # Every weight was clipped to zero; normalising would divide by
            # zero and yield NaN weights, so discard this candidate.
            continue
        candidate = candidate / total

        pred = (val_preds * candidate).sum(axis=1)
        score = mean_squared_error(target, pred)

        if score < best_score:
            best_score = score
            best_weights = candidate

    print(f"Validation MSE: {best_score:.6f}")
    return best_weights


def scipy_optimization(train_df, val_df, model_cols):
    """Fit blending weights with SciPy's SLSQP solver.

    Minimises the MSE between the weighted blend of ``model_cols`` on
    ``val_df`` and the row-wise mean of those same columns, with every weight
    bounded to ``[0, 1]`` and an equality constraint that the weights sum to 1.
    The search starts from equal weights.

    NOTE(review): the target is the equal-weight mean of the model columns,
    so the starting point already (near-)minimises the objective — confirm
    whether a ground-truth target was intended.

    Args:
        train_df: Training split; not used by this function.
        val_df: DataFrame containing the ``model_cols`` columns to score on.
        model_cols: Names of the prediction columns to blend.

    Returns:
        1-D NumPy array of the solver's weights, renormalised to sum to 1.
    """
    count = len(model_cols)
    reference = val_df[model_cols].mean(axis=1)

    def blend_mse(raw_weights):
        # Normalise inside the objective so the score reflects a proper blend
        # even when the solver's iterate does not sum exactly to 1.
        scaled = raw_weights / raw_weights.sum()
        blended = (val_df[model_cols].values * scaled).sum(axis=1)
        return mean_squared_error(reference, blended)

    def sum_to_one(w):
        return np.sum(w) - 1

    start = np.ones(count) / count
    weight_bounds = [(0, 1)] * count
    unit_sum = {'type': 'eq', 'fun': sum_to_one}

    solution = minimize(
        blend_mse,
        start,
        method='SLSQP',
        bounds=weight_bounds,
        constraints=unit_sum,
    )
    final_weights = solution.x / solution.x.sum()

    print(f"Validation MSE: {solution.fun:.6f}")
    return final_weights

## Blending

In [None]:
def blend_submissions(file_paths, weights, output_path="submission.csv", visualize=True):
    """Blend submission CSVs into one weighted prediction and write it to disk.

    Each file is read, rows whose ``exam_score`` is missing or non-finite are
    dropped, and the remaining scores are multiplied by the file's weight.
    The files are inner-joined on ``id`` and the weighted scores are summed
    into the blended ``exam_score``. Weights are applied as given; they are
    not renormalised here.

    Args:
        file_paths: Paths to the submission CSVs (``id`` and ``exam_score``
            columns required).
        weights: One weight per entry in ``file_paths``, in the same order.
        output_path: Destination CSV for the ``id`` / ``exam_score`` result.
        visualize: When true, call ``visualize_blend`` on the result.

    Returns:
        The merged DataFrame: ``id``, the per-model ``weighted_pred*``
        columns, the blended ``exam_score`` and one ``model_<n>`` column with
        each submission's original score.

    Raises:
        ValueError: If ``file_paths`` and ``weights`` differ in length.
    """
    file_paths = list(file_paths)
    weights = list(weights)
    if len(file_paths) != len(weights):
        # zip() would silently drop the surplus files or weights and produce
        # a blend of fewer models than the caller asked for.
        raise ValueError(
            f"Expected one weight per submission file, got "
            f"{len(file_paths)} files and {len(weights)} weights"
        )

    print("\nBlending Submissions")
    print("=" * 70)

    dfs = []

    for idx, (path, weight) in enumerate(zip(file_paths, weights), 1):
        df = pd.read_csv(path)
        df = df.dropna(subset=['exam_score'])
        # Copy after filtering so the new column is written to an independent
        # frame rather than to a filtered selection of the loaded one.
        df = df[np.isfinite(df['exam_score'])].copy()
        df['weighted_pred'] = df['exam_score'] * weight
        dfs.append(df)
        print(f"Model {idx}: {Path(path).name} (weight: {weight:.4f})")

    merged = dfs[0][['id', 'weighted_pred']].copy()

    # The first frame keeps the bare 'weighted_pred' name; each later frame's
    # column is suffixed with its position so the names stay distinct.
    for i, df in enumerate(dfs[1:], 1):
        merged = merged.merge(df[['id', 'weighted_pred']], on='id', suffixes=('', f'_{i}'))

    weight_cols = [c for c in merged.columns if 'weighted_pred' in c]
    merged['exam_score'] = merged[weight_cols].sum(axis=1)

    # Attach every model's unweighted score for side-by-side inspection.
    for i, df in enumerate(dfs):
        temp = df[['id', 'exam_score']].copy()
        temp.columns = ['id', f'model_{i+1}']
        merged = merged.merge(temp, on='id', how='left')

    merged = merged.dropna()
    result = merged[['id', 'exam_score']].copy()

    print(f"\nBlended Results:")
    print(f"  Samples: {len(result)}")
    print(f"  Mean: {result['exam_score'].mean():.6f}")
    print(f"  Std: {result['exam_score'].std():.6f}")

    if visualize:
        visualize_blend(dfs, result, weights)

    result.to_csv(output_path, index=False)
    print(f"\nSaved: {output_path}")

    return merged

In [None]:
def visualize_blend(dfs, result, weights):
    """Plot per-model and blended ``exam_score`` distributions side by side.

    The left panel overlays a 50-bin histogram per model plus the blend; the
    right panel shows one box per model plus the blend. The figure is saved
    as ``blending_analysis.png`` and then displayed.

    Args:
        dfs: List of model DataFrames, each with an ``exam_score`` column.
        result: DataFrame with the blended ``exam_score`` column.
        weights: Indexable sequence of per-model weights, used in the labels.
    """
    model_count = len(dfs)
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(15, 5))
    palette = plt.cm.tab10(np.linspace(0, 1, model_count))

    # --- Left panel: overlaid histograms -----------------------------------
    for position, frame in enumerate(dfs):
        scores = frame['exam_score'].dropna()
        hist_ax.hist(
            scores,
            bins=50,
            alpha=0.4,
            color=palette[position],
            label=f'Model {position + 1} (w={weights[position]:.3f})',
        )

    hist_ax.hist(
        result['exam_score'],
        bins=50,
        alpha=0.6,
        color='black',
        label='Blended',
        linewidth=2,
    )
    hist_ax.set_xlabel('Exam Score')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title('Distribution Comparison')
    hist_ax.legend()
    hist_ax.grid(alpha=0.3)

    # --- Right panel: box plots --------------------------------------------
    series_for_boxes = [frame['exam_score'].dropna().values for frame in dfs]
    series_for_boxes.append(result['exam_score'].values)
    tick_names = [f'M{k}\n{weights[k-1]:.3f}' for k in range(1, model_count + 1)]
    tick_names = tick_names + ['Blend']

    drawn = box_ax.boxplot(series_for_boxes, labels=tick_names, patch_artist=True)
    boxes = drawn['boxes']
    # Colour the model boxes to match their histograms; the blend is black.
    for box, shade in zip(boxes[:-1], palette):
        box.set_facecolor(shade)
        box.set_alpha(0.6)
    blend_box = boxes[-1]
    blend_box.set_facecolor('black')
    blend_box.set_alpha(0.6)

    box_ax.set_ylabel('Exam Score')
    box_ax.set_title('Box Plot Comparison')
    box_ax.grid(alpha=0.3, axis='y')

    plt.tight_layout()
    plt.savefig('blending_analysis.png', dpi=200)
    plt.show()

## Execute Pipeline

In [None]:
# Submission CSVs to blend. The numeric filename prefixes presumably encode
# each submission's leaderboard score — TODO confirm.
file_paths = [
    "/kaggle/input/s6e1-student-test-scores/8.54461_sub.csv",
    "/kaggle/input/s6e1-student-test-scores/8.54462_sub.csv",
    "/kaggle/input/s6e1-student-test-scores/8.54465_sub.csv",
    "/kaggle/input/s6e1-student-test-scores/8.54466_sub.csv",
    "/kaggle/input/s6e1-student-test-scores/8.54476_sub.csv",
]

# Step 1: load every submission and print per-file summary statistics.
dfs, analysis_df = analyze_submissions(file_paths)

# Step 2: align the submissions on id and inspect pairwise correlation.
corr_matrix, merged_df = calculate_correlation(dfs)

# Step 3: compute weights with each available method, keyed by method name.
# Each call draws its own random 20% validation split.
methods = ['equal', 'hill_climb', 'scipy']
all_weights = {}

for method in methods:
    weights = optimize_weights(merged_df, method=method, validation_split=0.2)
    all_weights[method] = weights

# Step 4: pick which method's weights drive the final blend.
chosen_method = 'hill_climb'
final_weights = all_weights[chosen_method]

# Step 5: blend, plot, and write submission.csv.
blended_df = blend_submissions(file_paths, final_weights, output_path="submission.csv", visualize=True)

print("\n" + "=" * 70)
print("Pipeline Complete")
print("=" * 70)