In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool

import optuna

# ---------------------------
# 1. Feature Engineering
# ---------------------------
def feature_engineering(df, is_train=True):
    """Add engineered Titanic features to ``df`` in place and return it.

    Extracts ``Title`` from ``Name``, imputes ``Age``, ``Fare`` and
    ``Embarked``, adds ``FamilySize``, ``IsAlone`` and ``Age_Class``, reduces
    ``Cabin`` to its first character, and drops the raw text columns.

    Args:
        df: Passenger frame providing the ``PassengerId``, ``Name``, ``Age``,
            ``Fare``, ``Embarked``, ``SibSp``, ``Parch``, ``Pclass``,
            ``Cabin`` and ``Ticket`` columns. It is modified in place.
        is_train: If true, ``PassengerId`` is dropped together with ``Name``
            and ``Ticket``; otherwise ``PassengerId`` is kept.

    Returns:
        The same ``df`` object after modification.
    """
    # Raw string: "\." is an invalid escape sequence in a normal literal.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                   'Rev', 'Sir', 'Jonkheer', 'Dona']
    df['Title'] = df['Title'].replace(rare_titles, 'Rare')
    df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

    # Impute missing Age with the median of the passenger's Title group.
    # Known ages are never overwritten. Rows without a usable group median
    # (no Title extracted, or no age known in the group) fall back to the
    # overall median.
    title_median_age = df.groupby('Title')['Age'].transform('median')
    df['Age'] = df['Age'].fillna(title_median_age).fillna(df['Age'].median())

    # Fill missing Fare with the median fare of this frame
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # Fill missing Embarked with the most frequent port
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

    # Family size counts the passenger; IsAlone flags a family of one
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = 1
    df.loc[df['FamilySize'] > 1, 'IsAlone'] = 0

    # Interaction feature: Age * Class
    df['Age_Class'] = df['Age'] * df['Pclass']

    # Keep only the first character of Cabin; missing cabins become 'U'.
    # Assign the result back instead of a chained inplace fillna, which may
    # operate on a temporary and leave the column unchanged.
    df['Cabin'] = df['Cabin'].fillna('U').str[0]

    # Drop the raw text columns; PassengerId survives only for the test set
    dropped = ['PassengerId', 'Name', 'Ticket'] if is_train else ['Name', 'Ticket']
    df.drop(columns=dropped, inplace=True)

    return df

def encode_features(train, test):
    """Label-encode the object columns of ``train`` and ``test`` together.

    Both frames are stacked so each object column is encoded with one shared
    mapping, then split back at the original train length.

    Returns:
        A ``(train, test)`` tuple of encoded copies.
    """
    n_train = train.shape[0]
    stacked = pd.concat([train, test], sort=False)
    object_columns = stacked.select_dtypes(include=['object']).columns
    for name in object_columns:
        # A fresh encoder per column, fitted on train and test values at once
        stacked[name] = LabelEncoder().fit_transform(stacked[name])
    encoded_train = stacked.iloc[:n_train].copy()
    encoded_test = stacked.iloc[n_train:].copy()
    return encoded_train, encoded_test

# ---------------------------
# 2. Load Data & Prepare
# ---------------------------
# Read the training data and engineer its features
train = feature_engineering(pd.read_csv('/kaggle/input/titanic/train.csv'), is_train=True)

# Read the test data; its PassengerId values are saved for the submission file
test = pd.read_csv('/kaggle/input/titanic/test.csv')
test_passenger_ids = test['PassengerId'].copy()
test = feature_engineering(test, is_train=False)

# Split the target from the features; PassengerId is not a model feature
X = train.drop(columns='Survived')
y = train['Survived']
X_test = test.drop(columns='PassengerId')

# Encode categorical features with a mapping shared by train and test
X, X_test = encode_features(X, X_test)

# Stratified, shuffled cross validation reused by every model
NFOLDS = 5
cv = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)

# ---------------------------
# 3. Hyperparameter Optimization
# ---------------------------
# LightGBM Objective
def objective_lgb(trial, X, y, cv):
    """Optuna objective: mean cross-validated log loss of a LightGBM model.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature frame; rows are selected positionally with ``.iloc``.
        y: Target series aligned with ``X``.
        cv: Splitter exposing ``split(X, y)``.

    Returns:
        Mean validation log loss over the folds (lower is better).
    """
    # suggest_float replaces the deprecated suggest_loguniform/suggest_uniform.
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # NOTE(review): LightGBM only applies `subsample` when
        # `subsample_freq` > 0 -- confirm whether bagging is intended.
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }
    cv_scores = []
    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        train_set = lgb.Dataset(X_train, y_train)
        val_set = lgb.Dataset(X_val, y_val, reference=train_set)
        # Early stopping is configured through a callback: LightGBM >= 4.0 no
        # longer accepts the early_stopping_rounds / verbose_eval keywords.
        model = lgb.train(
            params, train_set, num_boost_round=1000,
            valid_sets=[val_set],
            callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
        )
        preds = model.predict(X_val, num_iteration=model.best_iteration)
        cv_scores.append(log_loss(y_val, preds))
    return np.mean(cv_scores)

# Search the LightGBM hyperparameters (minimizing CV log loss)
study_lgb = optuna.create_study(direction='minimize')
study_lgb.optimize(lambda trial: objective_lgb(trial, X, y, cv), n_trials=30)
# best_trial.params holds only the sampled values. Re-attach the fixed
# settings used during tuning; without 'objective', lgb.train() falls back to
# its default regression objective when these params are reused later.
best_params_lgb = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    **study_lgb.best_trial.params,
}
print("Best LGB params:", best_params_lgb)

# XGBoost Objective
def objective_xgb(trial, X, y, cv):
    """Optuna objective: mean cross-validated log loss of an XGBoost model.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature frame; rows are selected positionally with ``.iloc``.
        y: Target series aligned with ``X``.
        cv: Splitter exposing ``split(X, y)``.

    Returns:
        Mean validation log loss over the folds (lower is better).
    """
    # suggest_float replaces the deprecated suggest_loguniform/suggest_uniform.
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'booster': 'gbtree',
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
    }
    cv_scores = []
    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)
        model = xgb.train(params, dtrain, num_boost_round=1000,
                          evals=[(dval, 'eval')],
                          early_stopping_rounds=50,
                          verbose_eval=False)
        # iteration_range is half-open, so the end is best_iteration + 1. It
        # replaces the ntree_limit argument that XGBoost 2.0 removed.
        preds = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
        cv_scores.append(log_loss(y_val, preds))
    return np.mean(cv_scores)

# Search the XGBoost hyperparameters (minimizing CV log loss)
study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(lambda trial: objective_xgb(trial, X, y, cv), n_trials=30)
# best_trial.params holds only the sampled values. Re-attach the fixed
# settings used during tuning; without 'objective', xgb.train() falls back to
# its default squared-error regression when these params are reused later.
best_params_xgb = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'booster': 'gbtree',
    **study_xgb.best_trial.params,
}
print("Best XGB params:", best_params_xgb)

# CatBoost Objective
def objective_cat(trial, X, y, cv):
    """Optuna objective: mean cross-validated log loss of a CatBoost model.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature frame; rows are selected positionally with ``.iloc``.
        y: Target series aligned with ``X``.
        cv: Splitter exposing ``split(X, y)``.

    Returns:
        Mean validation log loss over the folds (lower is better).
    """
    # suggest_float replaces the deprecated suggest_loguniform.
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-2, 10.0, log=True),
        'loss_function': 'Logloss',
        'verbose': False,
        'iterations': 1000,
    }
    cv_scores = []
    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        train_pool = Pool(X_train, y_train)
        val_pool = Pool(X_val, y_val)
        model = CatBoostClassifier(**params)
        # The validation pool drives early stopping for this fold
        model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50, verbose=False)
        preds = model.predict_proba(X_val)[:, 1]
        cv_scores.append(log_loss(y_val, preds))
    return np.mean(cv_scores)

# Search the CatBoost hyperparameters (minimizing CV log loss)
study_cat = optuna.create_study(direction='minimize')
study_cat.optimize(
    lambda trial: objective_cat(trial, X, y, cv),
    n_trials=30,
)
# Only the sampled values are stored; fixed settings are passed again later
best_params_cat = study_cat.best_params
print("Best CatBoost params:", best_params_cat)

# ---------------------------
# 4. Out-Of-Fold Predictions for Stacking
# ---------------------------
def get_oof_predictions(models, X, y, cv):
    """Return out-of-fold positive-class predictions for each base model.

    For every fold a fresh model is trained on the training part (with early
    stopping on the held-out part) and its predictions for the held-out rows
    are stored. The module-level ``best_params_lgb``, ``best_params_xgb`` and
    ``best_params_cat`` dictionaries supply the hyperparameters.

    Args:
        models: Mapping whose keys name the base models (``'lgb'``, ``'xgb'``,
            ``'cat'``); the values are not used.
        X: Feature frame; rows are selected positionally with ``.iloc``.
        y: Target series aligned with ``X``.
        cv: Splitter exposing ``split(X, y)``.

    Returns:
        Array of shape ``(len(X), len(models))``, one column per key in
        iteration order.

    Raises:
        ValueError: If a key is not one of the supported model names.
    """
    model_names = list(models.keys())
    # Fail loudly: an unhandled name would otherwise leave an all-zero column
    unknown = [name for name in model_names if name not in ('lgb', 'xgb', 'cat')]
    if unknown:
        raise ValueError(f"Unsupported base model name(s): {unknown}")

    oof_preds = np.zeros((X.shape[0], len(model_names)))
    for i, name in enumerate(model_names):
        for train_idx, val_idx in cv.split(X, y):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            if name == 'lgb':
                train_set = lgb.Dataset(X_train, y_train)
                val_set = lgb.Dataset(X_val, y_val, reference=train_set)
                # LightGBM >= 4.0 takes early stopping as a callback, not as
                # early_stopping_rounds / verbose_eval keywords.
                model = lgb.train(
                    best_params_lgb, train_set, num_boost_round=1000,
                    valid_sets=[val_set],
                    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
                )
                fold_pred = model.predict(X_val, num_iteration=model.best_iteration)

            elif name == 'xgb':
                dtrain = xgb.DMatrix(X_train, label=y_train)
                dval = xgb.DMatrix(X_val, label=y_val)
                model = xgb.train(best_params_xgb, dtrain, num_boost_round=1000,
                                  evals=[(dval, 'eval')],
                                  early_stopping_rounds=50,
                                  verbose_eval=False)
                # Half-open range: best_iteration + 1 includes the best tree.
                # Replaces ntree_limit, which XGBoost 2.0 removed.
                fold_pred = model.predict(
                    dval, iteration_range=(0, model.best_iteration + 1))

            else:  # 'cat'
                # Fixed settings act as defaults; tuned values take precedence
                cat_params = {'iterations': 1000, 'loss_function': 'Logloss',
                              'verbose': False, **best_params_cat}
                model = CatBoostClassifier(**cat_params)
                model.fit(Pool(X_train, y_train), eval_set=Pool(X_val, y_val),
                          early_stopping_rounds=50, verbose=False)
                fold_pred = model.predict_proba(X_val)[:, 1]

            oof_preds[val_idx, i] = fold_pred
    return oof_preds

# Names of the base learners, in the column order used for stacking
base_model_names = ['lgb', 'xgb', 'cat']
oof_train = get_oof_predictions({name: None for name in base_model_names}, X, y, cv)
print("OOF predictions shape:", oof_train.shape)

# The meta-model learns from the out-of-fold predictions
meta_model = LogisticRegression()
meta_model.fit(oof_train, y)

# ---------------------------
# 5. Train Final Models on Full Data
# ---------------------------
# Each final model is refit on all training rows for a fixed 1000 rounds.
# NOTE(review): the OOF models stop early while these do not -- confirm that
# this mismatch is intended.
final_lgb = lgb.train(
    best_params_lgb,
    lgb.Dataset(X, y),
    num_boost_round=1000,
)

final_xgb = xgb.train(
    best_params_xgb,
    xgb.DMatrix(X, label=y),
    num_boost_round=1000,
)

final_cat = CatBoostClassifier(**best_params_cat, iterations=1000, loss_function='Logloss', verbose=False)
final_cat.fit(X, y)

# Trained models keyed by base model name
final_models = dict(zip(base_model_names, [final_lgb, final_xgb, final_cat]))

# ---------------------------
# 6. Predict on Test Set & Blend
# ---------------------------
def get_test_predictions(models, X_test):
    """Return each trained base model's positive-class prediction for ``X_test``.

    Args:
        models: Mapping from model name (``'lgb'``, ``'xgb'``, ``'cat'``) to
            the trained model object.
        X_test: Feature frame to predict on.

    Returns:
        Array of shape ``(len(X_test), len(models))``, one column per model in
        iteration order.

    Raises:
        ValueError: If a key is not one of the supported model names.
    """
    preds = np.zeros((X_test.shape[0], len(models)))
    for i, (name, model) in enumerate(models.items()):
        if name == 'lgb':
            preds[:, i] = model.predict(X_test, num_iteration=model.best_iteration)
        elif name == 'xgb':
            dtest = xgb.DMatrix(X_test)
            # best_iteration may be undefined when the booster was trained
            # without early stopping (recent XGBoost raises AttributeError);
            # in that case predict with every tree.
            best_iteration = getattr(model, 'best_iteration', None)
            if best_iteration is None:
                preds[:, i] = model.predict(dtest)
            else:
                # Half-open range; replaces the removed ntree_limit argument
                preds[:, i] = model.predict(
                    dtest, iteration_range=(0, best_iteration + 1))
        elif name == 'cat':
            preds[:, i] = model.predict_proba(X_test)[:, 1]
        else:
            # An unhandled name would otherwise leave an all-zero column
            raise ValueError(f"Unsupported base model name: {name!r}")
    return preds

# Base-model probabilities for the test rows, one column per model
test_preds = get_test_predictions(final_models, X_test)
# The meta-model blends them into one positive-class probability
stacked_test = meta_model.predict_proba(test_preds)[:, 1]

# A passenger is labelled as survived when the probability exceeds 0.5
final_test_pred = np.where(stacked_test > 0.5, 1, 0)

# ---------------------------
# 7. Create Submission
# ---------------------------
submission = pd.DataFrame({"PassengerId": test_passenger_ids})
submission["Survived"] = final_test_pred
submission.to_csv("submission.csv", index=False)
print("Submission file created!")


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool

import optuna

# ---------------------------
# 1. Feature Engineering
# ---------------------------
def feature_engineering(df, is_train=True):
    """Add engineered Titanic features to ``df`` in place and return it.

    Extracts ``Title`` from ``Name``, imputes ``Age``, ``Fare`` and
    ``Embarked``, adds ``FamilySize``, ``IsAlone`` and ``Age_Class``, reduces
    ``Cabin`` to its first character, and drops the raw text columns.

    Args:
        df: Passenger frame providing the ``PassengerId``, ``Name``, ``Age``,
            ``Fare``, ``Embarked``, ``SibSp``, ``Parch``, ``Pclass``,
            ``Cabin`` and ``Ticket`` columns. It is modified in place.
        is_train: If true, ``PassengerId`` is dropped together with ``Name``
            and ``Ticket``; otherwise ``PassengerId`` is kept.

    Returns:
        The same ``df`` object after modification.
    """
    # Raw string: "\." is an invalid escape sequence in a normal literal.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                   'Rev', 'Sir', 'Jonkheer', 'Dona']
    df['Title'] = df['Title'].replace(rare_titles, 'Rare')
    df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

    # Impute missing Age with the median of the passenger's Title group.
    # Known ages are never overwritten. Rows without a usable group median
    # (no Title extracted, or no age known in the group) fall back to the
    # overall median.
    title_median_age = df.groupby('Title')['Age'].transform('median')
    df['Age'] = df['Age'].fillna(title_median_age).fillna(df['Age'].median())

    # Fill missing Fare with the median fare of this frame
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # Fill missing Embarked with the most frequent port
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

    # Family size counts the passenger; IsAlone flags a family of one
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = 1
    df.loc[df['FamilySize'] > 1, 'IsAlone'] = 0

    # Interaction feature: Age * Class
    df['Age_Class'] = df['Age'] * df['Pclass']

    # Keep only the first character of Cabin; missing cabins become 'U'.
    # Assign the result back instead of a chained inplace fillna, which may
    # operate on a temporary and leave the column unchanged.
    df['Cabin'] = df['Cabin'].fillna('U').str[0]

    # Drop the raw text columns; PassengerId survives only for the test set
    dropped = ['PassengerId', 'Name', 'Ticket'] if is_train else ['Name', 'Ticket']
    df.drop(columns=dropped, inplace=True)

    return df

def encode_features(train, test):
    """Label-encode the object columns of ``train`` and ``test`` together.

    Both frames are stacked so each object column is encoded with one shared
    mapping, then split back at the original train length.

    Returns:
        A ``(train, test)`` tuple of encoded copies.
    """
    n_train = train.shape[0]
    stacked = pd.concat([train, test], sort=False)
    object_columns = stacked.select_dtypes(include=['object']).columns
    for name in object_columns:
        # A fresh encoder per column, fitted on train and test values at once
        stacked[name] = LabelEncoder().fit_transform(stacked[name])
    encoded_train = stacked.iloc[:n_train].copy()
    encoded_test = stacked.iloc[n_train:].copy()
    return encoded_train, encoded_test

# ---------------------------
# 2. Load Data & Prepare
# ---------------------------
# Read the training data and engineer its features
train = feature_engineering(pd.read_csv('/kaggle/input/titanic/train.csv'), is_train=True)

# Read the test data; its PassengerId values are saved for the submission file
test = pd.read_csv('/kaggle/input/titanic/test.csv')
test_passenger_ids = test['PassengerId'].copy()
test = feature_engineering(test, is_train=False)

# Split the target from the features; PassengerId is not a model feature
X = train.drop(columns='Survived')
y = train['Survived']
X_test = test.drop(columns='PassengerId')

# Encode categorical features with a mapping shared by train and test
X, X_test = encode_features(X, X_test)

# Stratified, shuffled cross validation reused by every model
NFOLDS = 5
cv = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)

# ---------------------------
# 3. Hyperparameter Optimization
# ---------------------------
# LightGBM Objective
def objective_lgb(trial, X, y, cv):
    """Optuna objective: mean cross-validated log loss of a LightGBM model.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature frame; rows are selected positionally with ``.iloc``.
        y: Target series aligned with ``X``.
        cv: Splitter exposing ``split(X, y)``.

    Returns:
        Mean validation log loss over the folds (lower is better).
    """
    # suggest_float replaces the deprecated suggest_loguniform/suggest_uniform.
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # NOTE(review): LightGBM only applies `subsample` when
        # `subsample_freq` > 0 -- confirm whether bagging is intended.
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }
    cv_scores = []
    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        train_set = lgb.Dataset(X_train, y_train)
        val_set = lgb.Dataset(X_val, y_val, reference=train_set)
        # Early stopping is configured through a callback: LightGBM >= 4.0 no
        # longer accepts the early_stopping_rounds / verbose_eval keywords.
        model = lgb.train(
            params, train_set, num_boost_round=1000,
            valid_sets=[val_set],
            callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
        )
        preds = model.predict(X_val, num_iteration=model.best_iteration)
        cv_scores.append(log_loss(y_val, preds))
    return np.mean(cv_scores)

# Search the LightGBM hyperparameters (minimizing CV log loss)
study_lgb = optuna.create_study(direction='minimize')
study_lgb.optimize(lambda trial: objective_lgb(trial, X, y, cv), n_trials=30)
# best_trial.params holds only the sampled values. Re-attach the fixed
# settings used during tuning; without 'objective', lgb.train() falls back to
# its default regression objective when these params are reused later.
best_params_lgb = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    **study_lgb.best_trial.params,
}
print("Best LGB params:", best_params_lgb)

# XGBoost Objective
def objective_xgb(trial, X, y, cv):
    """Optuna objective: mean cross-validated log loss of an XGBoost model.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature frame; rows are selected positionally with ``.iloc``.
        y: Target series aligned with ``X``.
        cv: Splitter exposing ``split(X, y)``.

    Returns:
        Mean validation log loss over the folds (lower is better).
    """
    # suggest_float replaces the deprecated suggest_loguniform/suggest_uniform.
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'booster': 'gbtree',
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
    }
    cv_scores = []
    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)
        model = xgb.train(params, dtrain, num_boost_round=1000,
                          evals=[(dval, 'eval')],
                          early_stopping_rounds=50,
                          verbose_eval=False)
        # iteration_range is half-open, so the end is best_iteration + 1. It
        # replaces the ntree_limit argument that XGBoost 2.0 removed.
        preds = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
        cv_scores.append(log_loss(y_val, preds))
    return np.mean(cv_scores)

# Search the XGBoost hyperparameters (minimizing CV log loss)
study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(lambda trial: objective_xgb(trial, X, y, cv), n_trials=30)
# best_trial.params holds only the sampled values. Re-attach the fixed
# settings used during tuning; without 'objective', xgb.train() falls back to
# its default squared-error regression when these params are reused later.
best_params_xgb = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'booster': 'gbtree',
    **study_xgb.best_trial.params,
}
print("Best XGB params:", best_params_xgb)

# CatBoost Objective
def objective_cat(trial, X, y, cv):
    """Optuna objective: mean cross-validated log loss of a CatBoost model.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature frame; rows are selected positionally with ``.iloc``.
        y: Target series aligned with ``X``.
        cv: Splitter exposing ``split(X, y)``.

    Returns:
        Mean validation log loss over the folds (lower is better).
    """
    # suggest_float replaces the deprecated suggest_loguniform.
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-2, 10.0, log=True),
        'loss_function': 'Logloss',
        'verbose': False,
        'iterations': 1000,
    }
    cv_scores = []
    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        train_pool = Pool(X_train, y_train)
        val_pool = Pool(X_val, y_val)
        model = CatBoostClassifier(**params)
        # The validation pool drives early stopping for this fold
        model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50, verbose=False)
        preds = model.predict_proba(X_val)[:, 1]
        cv_scores.append(log_loss(y_val, preds))
    return np.mean(cv_scores)

# Search the CatBoost hyperparameters (minimizing CV log loss)
study_cat = optuna.create_study(direction='minimize')
study_cat.optimize(
    lambda trial: objective_cat(trial, X, y, cv),
    n_trials=30,
)
# Only the sampled values are stored; fixed settings are passed again later
best_params_cat = study_cat.best_params
print("Best CatBoost params:", best_params_cat)

# ---------------------------
# 4. Out-Of-Fold Predictions for Stacking
# ---------------------------
def get_oof_predictions(models, X, y, cv):
    """Return out-of-fold positive-class predictions for each base model.

    For every fold a fresh model is trained on the training part (with early
    stopping on the held-out part) and its predictions for the held-out rows
    are stored. The module-level ``best_params_lgb``, ``best_params_xgb`` and
    ``best_params_cat`` dictionaries supply the hyperparameters.

    Args:
        models: Mapping whose keys name the base models (``'lgb'``, ``'xgb'``,
            ``'cat'``); the values are not used.
        X: Feature frame; rows are selected positionally with ``.iloc``.
        y: Target series aligned with ``X``.
        cv: Splitter exposing ``split(X, y)``.

    Returns:
        Array of shape ``(len(X), len(models))``, one column per key in
        iteration order.

    Raises:
        ValueError: If a key is not one of the supported model names.
    """
    model_names = list(models.keys())
    # Fail loudly: an unhandled name would otherwise leave an all-zero column
    unknown = [name for name in model_names if name not in ('lgb', 'xgb', 'cat')]
    if unknown:
        raise ValueError(f"Unsupported base model name(s): {unknown}")

    oof_preds = np.zeros((X.shape[0], len(model_names)))
    for i, name in enumerate(model_names):
        for train_idx, val_idx in cv.split(X, y):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            if name == 'lgb':
                train_set = lgb.Dataset(X_train, y_train)
                val_set = lgb.Dataset(X_val, y_val, reference=train_set)
                # LightGBM >= 4.0 takes early stopping as a callback, not as
                # early_stopping_rounds / verbose_eval keywords.
                model = lgb.train(
                    best_params_lgb, train_set, num_boost_round=1000,
                    valid_sets=[val_set],
                    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
                )
                fold_pred = model.predict(X_val, num_iteration=model.best_iteration)

            elif name == 'xgb':
                dtrain = xgb.DMatrix(X_train, label=y_train)
                dval = xgb.DMatrix(X_val, label=y_val)
                model = xgb.train(best_params_xgb, dtrain, num_boost_round=1000,
                                  evals=[(dval, 'eval')],
                                  early_stopping_rounds=50,
                                  verbose_eval=False)
                # Half-open range: best_iteration + 1 includes the best tree.
                # Replaces ntree_limit, which XGBoost 2.0 removed.
                fold_pred = model.predict(
                    dval, iteration_range=(0, model.best_iteration + 1))

            else:  # 'cat'
                # Fixed settings act as defaults; tuned values take precedence
                cat_params = {'iterations': 1000, 'loss_function': 'Logloss',
                              'verbose': False, **best_params_cat}
                model = CatBoostClassifier(**cat_params)
                model.fit(Pool(X_train, y_train), eval_set=Pool(X_val, y_val),
                          early_stopping_rounds=50, verbose=False)
                fold_pred = model.predict_proba(X_val)[:, 1]

            oof_preds[val_idx, i] = fold_pred
    return oof_preds

# Names of the base learners, in the column order used for stacking
base_model_names = ['lgb', 'xgb', 'cat']
oof_train = get_oof_predictions({name: None for name in base_model_names}, X, y, cv)
print("OOF predictions shape:", oof_train.shape)

# The meta-model learns from the out-of-fold predictions
meta_model = LogisticRegression()
meta_model.fit(oof_train, y)

# ---------------------------
# 5. Train Final Models on Full Data
# ---------------------------
# Each final model is refit on all training rows for a fixed 1000 rounds
# (no early stopping used here).
# NOTE(review): the OOF models stop early while these do not -- confirm that
# this mismatch is intended.
final_lgb = lgb.train(
    best_params_lgb,
    lgb.Dataset(X, y),
    num_boost_round=1000,
)

final_xgb = xgb.train(
    best_params_xgb,
    xgb.DMatrix(X, label=y),
    num_boost_round=1000,
)

final_cat = CatBoostClassifier(**best_params_cat, iterations=1000, loss_function='Logloss', verbose=False)
final_cat.fit(X, y)

# Trained models keyed by base model name
final_models = dict(zip(base_model_names, [final_lgb, final_xgb, final_cat]))

# ---------------------------
# 6. Predict on Test Set & Blend
# ---------------------------
def get_test_predictions(models, X_test):
    """Return each trained base model's positive-class prediction for ``X_test``.

    Args:
        models: Mapping from model name (``'lgb'``, ``'xgb'``, ``'cat'``) to
            the trained model object.
        X_test: Feature frame to predict on.

    Returns:
        Array of shape ``(len(X_test), len(models))``, one column per model in
        iteration order.

    Raises:
        ValueError: If a key is not one of the supported model names.
    """
    preds = np.zeros((X_test.shape[0], len(models)))
    for i, (name, model) in enumerate(models.items()):
        if name == 'lgb':
            preds[:, i] = model.predict(X_test, num_iteration=model.best_iteration)
        elif name == 'xgb':
            dtest = xgb.DMatrix(X_test)
            # best_iteration may be undefined when the booster was trained
            # without early stopping (recent XGBoost raises AttributeError);
            # in that case predict with every tree.
            best_iteration = getattr(model, 'best_iteration', None)
            if best_iteration is None:
                preds[:, i] = model.predict(dtest)
            else:
                # Half-open range; replaces the removed ntree_limit argument
                preds[:, i] = model.predict(
                    dtest, iteration_range=(0, best_iteration + 1))
        elif name == 'cat':
            preds[:, i] = model.predict_proba(X_test)[:, 1]
        else:
            # An unhandled name would otherwise leave an all-zero column
            raise ValueError(f"Unsupported base model name: {name!r}")
    return preds

# Base-model probabilities for the test rows, one column per model
test_preds = get_test_predictions(final_models, X_test)
# The meta-model blends them into one positive-class probability
stacked_test = meta_model.predict_proba(test_preds)[:, 1]

# A passenger is labelled as survived when the probability exceeds 0.5
final_test_pred = np.where(stacked_test > 0.5, 1, 0)

# ---------------------------
# 7. Create Submission
# ---------------------------
submission = pd.DataFrame({"PassengerId": test_passenger_ids})
submission["Survived"] = final_test_pred
submission.to_csv("submission.csv", index=False)
print("Submission file created!")


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool

import optuna

# ---------------------------
# 1. Feature Engineering
# ---------------------------
def feature_engineering(df, is_train=True):
    """Add engineered Titanic features to ``df`` in place and return it.

    Extracts ``Title`` from ``Name``, imputes ``Age``, ``Fare`` and
    ``Embarked``, adds ``FamilySize``, ``IsAlone`` and ``Age_Class``, reduces
    ``Cabin`` to its first character, and drops the raw text columns.

    Args:
        df: Passenger frame providing the ``PassengerId``, ``Name``, ``Age``,
            ``Fare``, ``Embarked``, ``SibSp``, ``Parch``, ``Pclass``,
            ``Cabin`` and ``Ticket`` columns. It is modified in place.
        is_train: If true, ``PassengerId`` is dropped together with ``Name``
            and ``Ticket``; otherwise ``PassengerId`` is kept.

    Returns:
        The same ``df`` object after modification.
    """
    # Raw string: "\." is an invalid escape sequence in a normal literal.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                   'Rev', 'Sir', 'Jonkheer', 'Dona']
    df['Title'] = df['Title'].replace(rare_titles, 'Rare')
    df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

    # Impute missing Age with the median of the passenger's Title group.
    # Known ages are never overwritten. Rows without a usable group median
    # (no Title extracted, or no age known in the group) fall back to the
    # overall median.
    title_median_age = df.groupby('Title')['Age'].transform('median')
    df['Age'] = df['Age'].fillna(title_median_age).fillna(df['Age'].median())

    # Fill missing Fare with the median fare of this frame
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())

    # Fill missing Embarked with the most frequent port
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

    # Family size counts the passenger; IsAlone flags a family of one
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = 1
    df.loc[df['FamilySize'] > 1, 'IsAlone'] = 0

    # Interaction feature: Age * Class
    df['Age_Class'] = df['Age'] * df['Pclass']

    # Keep only the first character of Cabin; missing cabins become 'U'.
    # Assign the result back instead of a chained inplace fillna, which may
    # operate on a temporary and leave the column unchanged.
    df['Cabin'] = df['Cabin'].fillna('U').str[0]

    # Drop the raw text columns; PassengerId survives only for the test set
    dropped = ['PassengerId', 'Name', 'Ticket'] if is_train else ['Name', 'Ticket']
    df.drop(columns=dropped, inplace=True)

    return df

def encode_features(train, test):
    """Label-encode the object columns of ``train`` and ``test`` together.

    Both frames are stacked so each object column is encoded with one shared
    mapping, then split back at the original train length.

    Returns:
        A ``(train, test)`` tuple of encoded copies.
    """
    n_train = train.shape[0]
    stacked = pd.concat([train, test], sort=False)
    object_columns = stacked.select_dtypes(include=['object']).columns
    for name in object_columns:
        # A fresh encoder per column, fitted on train and test values at once
        stacked[name] = LabelEncoder().fit_transform(stacked[name])
    encoded_train = stacked.iloc[:n_train].copy()
    encoded_test = stacked.iloc[n_train:].copy()
    return encoded_train, encoded_test

# ---------------------------
# 2. Load Data & Prepare
# ---------------------------
# Read the training data and engineer its features
train = feature_engineering(pd.read_csv('/kaggle/input/titanic/train.csv'), is_train=True)

# Read the test data; its PassengerId values are saved for the submission file
test = pd.read_csv('/kaggle/input/titanic/test.csv')
test_passenger_ids = test['PassengerId'].copy()
test = feature_engineering(test, is_train=False)

# Split the target from the features; PassengerId is not a model feature
X = train.drop(columns='Survived')
y = train['Survived']
X_test = test.drop(columns='PassengerId')

# Encode categorical features with a mapping shared by train and test
X, X_test = encode_features(X, X_test)

# Stratified, shuffled cross validation reused by every model
NFOLDS = 5
cv = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)

# ---------------------------
# 3. Hyperparameter Optimization
# ---------------------------
# LightGBM Objective
def objective_lgb(trial, X, y, cv):
    """Optuna objective: mean cross-validated log loss of a LightGBM model.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature frame; rows are selected positionally with ``.iloc``.
        y: Target series aligned with ``X``.
        cv: Splitter exposing ``split(X, y)``.

    Returns:
        Mean validation log loss over the folds (lower is better).
    """
    # suggest_float replaces the deprecated suggest_loguniform/suggest_uniform.
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        # NOTE(review): LightGBM only applies `subsample` when
        # `subsample_freq` > 0 -- confirm whether bagging is intended.
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }
    cv_scores = []
    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        train_set = lgb.Dataset(X_train, y_train)
        val_set = lgb.Dataset(X_val, y_val, reference=train_set)
        # verbose=False keeps the callback from logging on every fold of
        # every trial.
        model = lgb.train(
            params, train_set, num_boost_round=1000,
            valid_sets=[val_set],
            callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
        )
        preds = model.predict(X_val, num_iteration=model.best_iteration)
        cv_scores.append(log_loss(y_val, preds))
    return np.mean(cv_scores)

# Search the LightGBM hyperparameters (minimizing CV log loss)
study_lgb = optuna.create_study(direction='minimize')
study_lgb.optimize(lambda trial: objective_lgb(trial, X, y, cv), n_trials=30)
# best_trial.params holds only the sampled values. Re-attach the fixed
# settings used during tuning; without 'objective', lgb.train() falls back to
# its default regression objective when these params are reused later.
best_params_lgb = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    **study_lgb.best_trial.params,
}
print("Best LGB params:", best_params_lgb)

# XGBoost Objective
def objective_xgb(trial, X, y, cv):
    """Optuna objective: mean cross-validated log loss of an XGBoost model.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature frame; rows are selected positionally with ``.iloc``.
        y: Target series aligned with ``X``.
        cv: Splitter exposing ``split(X, y)``.

    Returns:
        Mean validation log loss over the folds (lower is better).
    """
    # suggest_float replaces the deprecated suggest_loguniform/suggest_uniform.
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'booster': 'gbtree',
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
    }
    cv_scores = []
    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)
        model = xgb.train(params, dtrain, num_boost_round=1000,
                          evals=[(dval, 'eval')],
                          early_stopping_rounds=50,
                          verbose_eval=False)
        # iteration_range is half-open: the end must be best_iteration + 1,
        # otherwise the best tree itself is left out of the prediction.
        preds = model.predict(dval, iteration_range=(0, model.best_iteration + 1))
        cv_scores.append(log_loss(y_val, preds))
    return np.mean(cv_scores)


# Search the XGBoost hyperparameters (minimizing CV log loss)
study_xgb = optuna.create_study(direction='minimize')
study_xgb.optimize(lambda trial: objective_xgb(trial, X, y, cv), n_trials=30)
# best_trial.params holds only the sampled values. Re-attach the fixed
# settings used during tuning; without 'objective', xgb.train() falls back to
# its default squared-error regression when these params are reused later.
best_params_xgb = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'booster': 'gbtree',
    **study_xgb.best_trial.params,
}
print("Best XGB params:", best_params_xgb)

# CatBoost Objective
def objective_cat(trial, X, y, cv):
    """Optuna objective: mean cross-validated log loss of a CatBoost model.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature frame; rows are selected positionally with ``.iloc``.
        y: Target series aligned with ``X``.
        cv: Splitter exposing ``split(X, y)``.

    Returns:
        Mean validation log loss over the folds (lower is better).
    """
    # suggest_float replaces the deprecated suggest_loguniform.
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-2, 10.0, log=True),
        'loss_function': 'Logloss',
        'verbose': False,
        'iterations': 1000,
    }
    cv_scores = []
    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        train_pool = Pool(X_train, y_train)
        val_pool = Pool(X_val, y_val)
        model = CatBoostClassifier(**params)
        # The validation pool drives early stopping for this fold
        model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50, verbose=False)
        preds = model.predict_proba(X_val)[:, 1]
        cv_scores.append(log_loss(y_val, preds))
    return np.mean(cv_scores)

# Search the CatBoost hyperparameters (minimizing CV log loss)
study_cat = optuna.create_study(direction='minimize')
study_cat.optimize(
    lambda trial: objective_cat(trial, X, y, cv),
    n_trials=30,
)
# Only the sampled values are stored; fixed settings are passed again later
best_params_cat = study_cat.best_params
print("Best CatBoost params:", best_params_cat)

# ---------------------------
# 4. Out-Of-Fold Predictions for Stacking
# ---------------------------
# Collect out-of-fold predictions from the base models; they become the
# training features of the stacking meta-model.
def get_oof_predictions(models, X, y, cv):
    """Return out-of-fold positive-class predictions for each base model.

    For every fold a fresh model is trained on the training part (with early
    stopping on the held-out part) and its predictions for the held-out rows
    are stored. The module-level ``best_params_lgb``, ``best_params_xgb`` and
    ``best_params_cat`` dictionaries supply the hyperparameters.

    Args:
        models: Mapping whose keys name the base models (``'lgb'``, ``'xgb'``,
            ``'cat'``); the values are not used.
        X: Feature frame; rows are selected positionally with ``.iloc``.
        y: Target series aligned with ``X``.
        cv: Splitter exposing ``split(X, y)``.

    Returns:
        Array of shape ``(len(X), len(models))``, one column per key in
        iteration order.

    Raises:
        ValueError: If a key is not one of the supported model names.
    """
    model_names = list(models.keys())
    # Fail loudly: an unhandled name would otherwise leave an all-zero column
    unknown = [name for name in model_names if name not in ('lgb', 'xgb', 'cat')]
    if unknown:
        raise ValueError(f"Unsupported base model name(s): {unknown}")

    oof_preds = np.zeros((X.shape[0], len(model_names)))
    for i, name in enumerate(model_names):
        for train_idx, val_idx in cv.split(X, y):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            # Every requested model needs its own branch: a column that is
            # never filled stays zero and would mislead the meta-model.
            if name == 'lgb':
                train_set = lgb.Dataset(X_train, y_train)
                val_set = lgb.Dataset(X_val, y_val, reference=train_set)
                model = lgb.train(
                    best_params_lgb, train_set, num_boost_round=1000,
                    valid_sets=[val_set],
                    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
                )
                fold_pred = model.predict(X_val, num_iteration=model.best_iteration)

            elif name == 'xgb':
                dtrain = xgb.DMatrix(X_train, label=y_train)
                dval = xgb.DMatrix(X_val, label=y_val)
                model = xgb.train(best_params_xgb, dtrain, num_boost_round=1000,
                                  evals=[(dval, 'eval')],
                                  early_stopping_rounds=50,
                                  verbose_eval=False)
                # Half-open range: best_iteration + 1 includes the best tree
                fold_pred = model.predict(
                    dval, iteration_range=(0, model.best_iteration + 1))

            else:  # 'cat'
                # Fixed settings act as defaults; tuned values take precedence
                cat_params = {'iterations': 1000, 'loss_function': 'Logloss',
                              'verbose': False, **best_params_cat}
                model = CatBoostClassifier(**cat_params)
                model.fit(Pool(X_train, y_train), eval_set=Pool(X_val, y_val),
                          early_stopping_rounds=50, verbose=False)
                fold_pred = model.predict_proba(X_val)[:, 1]

            oof_preds[val_idx, i] = fold_pred
    return oof_preds


# Names of the base learners, in the column order used for stacking
base_model_names = ['lgb', 'xgb', 'cat']
oof_train = get_oof_predictions({name: None for name in base_model_names}, X, y, cv)
print("OOF predictions shape:", oof_train.shape)

# The meta-model learns from the out-of-fold predictions
meta_model = LogisticRegression()
meta_model.fit(oof_train, y)

# ---------------------------
# 5. Train Final Models on Full Data
# ---------------------------
# Each final model is refit on all training rows for a fixed 1000 rounds
# (no early stopping used here).
final_lgb = lgb.train(
    best_params_lgb,
    lgb.Dataset(X, y),
    num_boost_round=1000,
)

final_xgb = xgb.train(
    best_params_xgb,
    xgb.DMatrix(X, label=y),
    num_boost_round=1000,
)

final_cat = CatBoostClassifier(**best_params_cat, iterations=1000, loss_function='Logloss', verbose=False)
final_cat.fit(X, y)

# Trained models keyed by base model name
final_models = dict(zip(base_model_names, [final_lgb, final_xgb, final_cat]))

# ---------------------------
# 6. Predict on Test Set & Blend
# ---------------------------
def get_test_predictions(models, X_test):
    """Return each trained base model's positive-class prediction for ``X_test``.

    Args:
        models: Mapping from model name (``'lgb'``, ``'xgb'``, ``'cat'``) to
            the trained model object.
        X_test: Feature frame to predict on.

    Returns:
        Array of shape ``(len(X_test), len(models))``, one column per model in
        iteration order.

    Raises:
        ValueError: If a key is not one of the supported model names.
    """
    preds = np.zeros((X_test.shape[0], len(models)))
    for i, (name, model) in enumerate(models.items()):
        if name == 'lgb':
            preds[:, i] = model.predict(X_test, num_iteration=model.best_iteration)
        elif name == 'xgb':
            dtest = xgb.DMatrix(X_test)
            # best_iteration may be undefined when the booster was trained
            # without early stopping (recent XGBoost raises AttributeError);
            # in that case predict with every tree.
            best_iteration = getattr(model, 'best_iteration', None)
            if best_iteration is None:
                preds[:, i] = model.predict(dtest)
            else:
                # Half-open range: best_iteration + 1 includes the best tree
                preds[:, i] = model.predict(
                    dtest, iteration_range=(0, best_iteration + 1))
        elif name == 'cat':
            preds[:, i] = model.predict_proba(X_test)[:, 1]
        else:
            # An unhandled name would otherwise leave an all-zero column
            raise ValueError(f"Unsupported base model name: {name!r}")
    return preds


# Base-model probabilities for the test rows, one column per model
test_preds = get_test_predictions(final_models, X_test)
# The meta-model blends them into one positive-class probability
stacked_test = meta_model.predict_proba(test_preds)[:, 1]

# A passenger is labelled as survived when the probability exceeds 0.5
final_test_pred = np.where(stacked_test > 0.5, 1, 0)

# ---------------------------
# 7. Create Submission
# ---------------------------
submission = pd.DataFrame({"PassengerId": test_passenger_ids})
submission["Survived"] = final_test_pred
submission.to_csv("submission.csv", index=False)
print("Submission file created!")
