# 🧠 Kaggle Competition Template – Ensemble Learning (Multiclass)

This notebook is a template for structured-data (tabular) competitions that are tackled with ensemble learning.  
It supports a progression from baseline models to boosting models to voting/stacking ensembles.

**Pipeline:**
1. EDA (Quick)
2. Preprocessing
3. Feature Engineering
4. Modeling (Multiple Models)
5. Ensemble (Voting/Stacking)
6. Evaluation (Accuracy/F1)
7. Submission
8. Experiment Log


In [None]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Optional ensemble models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

import warnings
warnings.filterwarnings("ignore")


In [None]:
# === Basic Info ===
# NOTE(review): `df_train` is not created anywhere in the visible notebook;
# it has to be loaded (e.g. with pd.read_csv) before this cell is run.
print("Dataset shape:", df_train.shape)
display(df_train.head(3))
df_train.info()

# === Missing Values ===
# Count NaNs per column and keep only the columns that actually have some.
missing = df_train.isna().sum().sort_values(ascending=False)
missing = missing[missing > 0]
if missing.empty:
    # Nothing to plot: skip the chart instead of drawing an empty figure.
    print("No missing values found.")
else:
    plt.figure(figsize=(10,5))
    sns.barplot(x=missing.values, y=missing.index)
    plt.title("Missing Values per Feature")
    plt.show()

# === Target Distribution ===
# Bar chart of raw class counts, followed by the class proportions as text.
plt.figure(figsize=(6,4))
sns.countplot(x=df_train['target'])
plt.title("Target Distribution")
plt.xlabel("Class")
plt.ylabel("Count")
plt.show()

print("Class proportions:")
print(df_train['target'].value_counts(normalize=True).round(3))


In [None]:
# === Feature Summary ===
# Column lists reused by the EDA cells below.
# * "number" covers every numeric width (int32, float32, ...), not only
#   int64/float64; timedeltas are excluded because they are not plain numbers.
# * "category" columns are treated as categorical alongside "object" columns.
# * The label column is removed so that these lists hold features only.
num_cols = (
    df_train.select_dtypes(include="number", exclude="timedelta")
    .columns.drop("target", errors="ignore")
)
cat_cols = (
    df_train.select_dtypes(include=["object", "category"])
    .columns.drop("target", errors="ignore")
)

print(f"Numerical features: {len(num_cols)}")
print(f"Categorical features: {len(cat_cols)}")


In [None]:
# === Correlation (Numerical) ===
# Pairwise correlation matrix of the columns listed in `num_cols`,
# rendered as an un-annotated heatmap.
corr = df_train[num_cols].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=False, cmap="coolwarm")
plt.title("Numerical Feature Correlation")
plt.show()


In [None]:
# === Numerical Features vs Target ===
# One box plot per numerical column (first six only), grouped by class label.
for col in num_cols[:6]:
    fig, ax = plt.subplots(figsize=(5, 3))
    sns.boxplot(data=df_train, x="target", y=col, ax=ax)
    ax.set_title(f"{col} vs Target")
    plt.show()


In [None]:
# === Categorical Features vs Target ===
# One count plot per categorical column (first five only), with bars split
# by class label; x tick labels are rotated to keep long category names legible.
for col in cat_cols[:5]:
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.countplot(data=df_train, x=col, hue="target", ax=ax)
    ax.set_title(f"{col} vs Target")
    plt.xticks(rotation=30)
    plt.show()


In [None]:
# === Outlier Detection ===
# Side-by-side box plots of every column in `num_cols` on a single axis.
fig, ax = plt.subplots(figsize=(12, 4))
sns.boxplot(data=df_train[num_cols], ax=ax)
ax.set_title("Outlier Overview (Numerical Features)")
plt.show()


In [None]:
# === Quick Feature Importance (RF baseline) ===
# Fit a throwaway random forest on the numerical columns only (NaNs replaced
# by 0) and plot the ten largest impurity-based importances.
le = LabelEncoder()
y_encoded = le.fit_transform(df_train['target'])

# Never feed the label to the model: when the target is numeric it is part of
# `num_cols`, which would leak the answer and dominate the importances.
rf_feature_cols = [col for col in num_cols if col != 'target']

rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(df_train[rf_feature_cols].fillna(0), y_encoded)

importances = pd.Series(rf.feature_importances_, index=rf_feature_cols).sort_values(ascending=False)
top_importances = importances.head(10)
plt.figure(figsize=(8,4))
sns.barplot(x=top_importances.values, y=top_importances.index)
plt.title("Top 10 Feature Importances (Quick RF Baseline)")
plt.show()


In [None]:
def feature_engineering(df, reference_year=2025):
    """Return a copy of ``df`` with derived feature columns added.

    The input frame is never modified. Each derived column is only created
    when the columns it depends on are present:

    * ``device_age`` = ``reference_year - release_year``; rows with a missing
      ``release_year`` get an age of 0.
    * ``performance_score`` = ``cpu_tier * gpu_tier``.

    Args:
        df: Input DataFrame (train or test).
        reference_year: Year that device ages are measured against.
            Defaults to 2025, the value that used to be hard-coded.

    Returns:
        A new DataFrame containing the original columns plus any derived ones.
    """
    df = df.copy()

    # Generic rules
    if "release_year" in df.columns:
        # Filling with the reference year itself makes missing years age 0.
        df["device_age"] = reference_year - df["release_year"].fillna(reference_year)

    # Interaction (optional)
    if all(col in df.columns for col in ("cpu_tier", "gpu_tier")):
        df["performance_score"] = df["cpu_tier"] * df["gpu_tier"]

    # Domain-specific placeholders
    # (edit per competition)
    # if "income" in df.columns and "expenses" in df.columns:
    #     df["savings_ratio"] = df["income"] / (df["expenses"] + 1)

    return df


In [None]:
def build_preprocessor(X, num_strategy="median", scaler="standard", cat_strategy="most_frequent"):
    """Build an unfitted ColumnTransformer for the columns of ``X``.

    Numerical columns are imputed and scaled; categorical columns are imputed
    and one-hot encoded (categories unseen during fit are ignored at
    transform time). Columns that fall in neither group are not passed to
    either pipeline.

    Args:
        X: Feature DataFrame; only its dtypes and column names are inspected.
        num_strategy: ``SimpleImputer`` strategy for numerical columns.
        scaler: ``"standard"`` selects ``StandardScaler``; any other value
            selects ``MinMaxScaler``.
        cat_strategy: ``SimpleImputer`` strategy for categorical columns.

    Returns:
        The unfitted ``ColumnTransformer``.
    """
    # "number" matches every numeric width (int32, float32, ...), so columns
    # that were downcast to save memory are no longer silently left out.
    # Timedeltas are excluded because they are not plain numbers.
    num_cols = X.select_dtypes(include="number", exclude="timedelta").columns
    # pandas "category" columns are categorical data too, not only "object".
    cat_cols = X.select_dtypes(include=["object", "category"]).columns

    num_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy=num_strategy)),
        ("scaler", StandardScaler() if scaler == "standard" else MinMaxScaler())
    ])

    cat_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy=cat_strategy)),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocessor = ColumnTransformer([
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols)
    ])
    return preprocessor


In [None]:
def build_models(random_state=42):
    """Create the unfitted base estimators used by the ensembles.

    Args:
        random_state: Seed shared by every estimator that is given one here.
            Defaults to 42, the value that used to be hard-coded.

    Returns:
        Dict mapping a short name (``"logreg"``, ``"rf"``, ``"xgb"``,
        ``"gb"``, ``"svc"``) to an unfitted classifier.
    """
    models = {
        "logreg": LogisticRegression(max_iter=500),
        "rf": RandomForestClassifier(n_estimators=300, random_state=random_state),
        # NOTE(review): `use_label_encoder` is kept as in the original; newer
        # XGBoost releases may ignore or warn about it - confirm against the
        # installed version.
        "xgb": XGBClassifier(
            n_estimators=400, learning_rate=0.05, random_state=random_state,
            use_label_encoder=False, eval_metric='mlogloss'
        ),
        "gb": GradientBoostingClassifier(n_estimators=300, random_state=random_state),
        # probability=True fits an internal cross-validated calibration that
        # shuffles the data; seeding it keeps soft-voting/stacking results
        # reproducible from run to run.
        "svc": SVC(probability=True, kernel='rbf', random_state=random_state)
    }
    return models


In [None]:
def build_voting_ensemble(models):
    """Wrap the named estimators in a soft-voting classifier.

    Args:
        models: Dict mapping estimator name to estimator.

    Returns:
        An unfitted ``VotingClassifier`` configured with ``voting='soft'``.
    """
    return VotingClassifier(estimators=list(models.items()), voting='soft')

def build_stacking_ensemble(models):
    """Wrap the named estimators in a stacking classifier.

    Args:
        models: Dict mapping estimator name to estimator.

    Returns:
        An unfitted ``StackingClassifier`` whose final estimator is a
        ``LogisticRegression`` with ``max_iter=500``.
    """
    meta_learner = LogisticRegression(max_iter=500)
    return StackingClassifier(
        estimators=list(models.items()),
        final_estimator=meta_learner,
    )


In [None]:
def evaluate_model(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training split and report validation metrics.

    Prints accuracy, weighted F1 and the full classification report for the
    validation split.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.

    Returns:
        Tuple ``(accuracy, weighted_f1)`` computed on the validation split.
    """
    model.fit(X_train, y_train)
    val_predictions = model.predict(X_val)

    # Compute every metric first, then emit the report in one place.
    acc = accuracy_score(y_val, val_predictions)
    f1 = f1_score(y_val, val_predictions, average="weighted")
    report = classification_report(y_val, val_predictions)

    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("\nClassification Report:")
    print(report)
    return acc, f1


In [None]:
def generate_submission(model, preprocessor, df_train, df_test,
                        id_col="id", target_col="target",
                        output_path="submission.csv"):
    """Refit on the full training data, predict the test set and save a CSV.

    The preprocessor is fitted on all training features, the model is fitted
    on the processed training data, and the predictions for ``df_test`` are
    written to ``output_path`` together with the test ids.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        preprocessor: Transformer exposing ``fit_transform`` and ``transform``.
        df_train: Training DataFrame containing ``target_col``.
        df_test: Test DataFrame containing ``id_col``.
        id_col: Name of the identifier column copied into the submission.
        target_col: Name of the label column in ``df_train``; also used as
            the prediction column name in the submission.
        output_path: Destination of the submission CSV.

    Returns:
        The submission DataFrame that was written to disk.
    """
    X_train = df_train.drop(columns=[target_col])
    X_test = df_test.copy()

    # Train on integer class ids 0..K-1 and map the predictions back to the
    # original labels afterwards: some estimators (e.g. XGBClassifier) reject
    # string labels or ids that do not start at 0.
    y_codes, classes = pd.factorize(df_train[target_col], sort=True)

    X_train_proc = preprocessor.fit_transform(X_train, y_codes)
    X_test_proc = preprocessor.transform(X_test)

    model.fit(X_train_proc, y_codes)
    pred_codes = np.asarray(model.predict(X_test_proc), dtype=int)
    y_pred = classes.take(pred_codes)

    submission = pd.DataFrame({
        id_col: df_test[id_col].to_numpy(),
        target_col: y_pred
    })
    submission.to_csv(output_path, index=False)
    print(f"✅ Submission saved: {output_path}")
    return submission


In [None]:
import csv

def log_experiment(version, model_name, ensemble_type, acc, f1, notes,
                   log_path="experiment_log.csv"):
    """Append one experiment result as a row of a CSV log.

    A header row is written first when the log file does not exist yet or is
    empty, so the file is self-describing. Existing non-empty logs are only
    appended to.

    Args:
        version: Experiment/version label.
        model_name: Name of the evaluated model.
        ensemble_type: Free-text description of the ensemble setup.
        acc: Accuracy score to record.
        f1: F1 score to record.
        notes: Free-text notes.
        log_path: Path of the CSV log file.
    """
    import os  # local import: the notebook's import cell does not provide it

    needs_header = not os.path.exists(log_path) or os.path.getsize(log_path) == 0

    # Explicit UTF-8 so non-ASCII notes do not depend on the platform default.
    with open(log_path, "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(["version", "model_name", "ensemble_type", "acc", "f1", "notes"])
        writer.writerow([version, model_name, ensemble_type, acc, f1, notes])


In [None]:
# === 1. Feature Engineering ===
df_train = feature_engineering(df_train)
df_test = feature_engineering(df_test)

# === 2. Split Data ===
# NOTE(review): assumes `id` is a pure row identifier (generate_submission
# copies it straight into the submission); it is dropped when present so the
# models cannot fit on it.
X = df_train.drop(columns=["target"]).drop(columns=["id"], errors="ignore")

# Encode labels as integer ids 0..K-1: the stack contains XGBClassifier, which
# does not accept string labels or ids that do not start at 0.
# Use label_encoder.inverse_transform(...) to map predictions back.
label_encoder = LabelEncoder()
y = pd.Series(
    label_encoder.fit_transform(df_train["target"]),
    index=df_train.index,
    name="target",
)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# === 3. Preprocess ===
# Fit on the training split only; the validation split is just transformed.
preprocessor = build_preprocessor(X_train)
X_train_proc = preprocessor.fit_transform(X_train, y_train)
X_val_proc = preprocessor.transform(X_val)

# === 4. Build Base Models ===
models = build_models()

# === 5. Build Ensemble ===
voting_model = build_voting_ensemble(models)
stacking_model = build_stacking_ensemble(models)

# === 6. Evaluate (example with stacking) ===
acc, f1 = evaluate_model(stacking_model, X_train_proc, y_train, X_val_proc, y_val)

# === 7. Log Experiment ===
log_experiment("v1", "stacking", "LogReg Meta", acc, f1, "Baseline ensemble")
