In [None]:
import numpy as np 
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [6]:
# -------------------- IMPORTS --------------------
import pandas as pd
import numpy as np
!pip install catboost
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.base import BaseEstimator, clone

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor

# -------------------- USER INPUT --------------------
train_file = "/kaggle/input/titanic/train.csv"
test_file = "/kaggle/input/titanic/test.csv"
target_column = "Survived"  # set your target column name

# -------------------- LOAD DATA --------------------
train = pd.read_csv(train_file)
test = pd.read_csv(test_file)

X = train.drop(columns=[target_column])
y = train[target_column]
X_test = test.copy()

# Detect classification vs regression
if y.dtype == 'object' or len(y.unique()) <= 20:
    problem_type = 'classification'
else:
    problem_type = 'regression'

print(f"Problem type detected: {problem_type}")

# -------------------- HANDLE MISSING VALUES --------------------
# Fill numerical NaNs with median, categorical with "missing"
for col in X.columns:
    if X[col].dtype in [np.float64, np.int64]:
        median = X[col].median()
        X[col].fillna(median, inplace=True)
        if col in X_test.columns:
            X_test[col].fillna(median, inplace=True)
    else:
        X[col].fillna("missing", inplace=True)
        if col in X_test.columns:
            X_test[col].fillna("missing", inplace=True)

# -------------------- ENCODE CATEGORICALS --------------------
for col in X.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    
    # Map test set, unseen labels get -1
    mapping = {label: idx for idx, label in enumerate(le.classes_)}
    X_test[col] = X_test[col].map(mapping).fillna(-1).astype(int)

# -------------------- STACKING CLASS --------------------
class StackingEnsemble(BaseEstimator):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = clone(meta_model)
        self.n_folds = n_folds
    
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_features_ = np.zeros((X.shape[0], len(self.base_models)))
        
        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)
        
        for i, model in enumerate(self.base_models):
            for train_idx, holdout_idx in kf.split(X, y):
                instance = clone(model)
                instance.fit(X.iloc[train_idx], y.iloc[train_idx])
                self.base_models_[i].append(instance)
                self.meta_features_[holdout_idx, i] = instance.predict(X.iloc[holdout_idx])
        
        self.meta_model.fit(self.meta_features_, y)
        return self
    
    def predict(self, X):
        meta_features = np.column_stack([
            np.mean([model.predict(X) for model in base_models_fold], axis=0)
            for base_models_fold in self.base_models_
        ])
        return self.meta_model.predict(meta_features)

# -------------------- BASE MODELS --------------------
if problem_type == 'regression':
    base_models = [
        lgb.LGBMRegressor(n_estimators=500, learning_rate=0.05, num_leaves=31, random_state=42),
        xgb.XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=6, random_state=42),
        CatBoostRegressor(iterations=500, learning_rate=0.05, depth=6, verbose=0, random_state=42)
    ]
    meta_model = LinearRegression()
else:
    base_models = [
        lgb.LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=31, random_state=42),
        xgb.XGBClassifier(n_estimators=500, learning_rate=0.05, max_depth=6, use_label_encoder=False, eval_metric='logloss', random_state=42),
        CatBoostClassifier(iterations=500, learning_rate=0.05, depth=6, verbose=0, random_state=42)
    ]
    meta_model = LogisticRegression(max_iter=1000)

# -------------------- TRAIN STACKING MODEL --------------------
stack_model = StackingEnsemble(base_models=base_models, meta_model=meta_model)
stack_model.fit(X, y)

# -------------------- PREDICTIONS --------------------
preds = stack_model.predict(X_test)

# -------------------- PERFORMANCE --------------------
preds_train = stack_model.predict(X)
if problem_type == 'regression':
    print("Train R2:", r2_score(y, preds_train))
    print("Train RMSE:", mean_squared_error(y, preds_train, squared=False))
else:
    print("Train Accuracy:", accuracy_score(y, preds_train))
    print("Train F1 Score:", f1_score(y, preds_train, average='weighted'))

# -------------------- OUTPUT --------------------
output = pd.DataFrame(preds, columns=[target_column])
output.to_csv("stacking_predictions.csv", index=False)
print("Predictions saved to stacking_predictions.csv")


Problem type detected: classification
[LightGBM] [Info] Number of positive: 268, number of negative: 444
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000559 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 955
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376404 -> initscore=-0.504838
[LightGBM] [Info] Start training from score -0.504838
[LightGBM] [Info] Number of positive: 275, number of negative: 438
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000696 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 965
[LightGBM] [Info] Number of data points in the train set: 713, number of used feature