# Titanic New Version

### Import Cell

In [1]:
!conda info --envs

# conda environments:
#
                         C:\ProgramData\anaconda3
base                  *  C:\ProgramData\anaconda3\envs\aiml
                         C:\ProgramData\anaconda3\envs\crisis
                         C:\ProgramData\anaconda3\envs\highres



In [2]:
%config IPCompleter.greedy=True

In [4]:
# Cell magic for visualization
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Core imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import sklearn as skl
import pickle


# Models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Optimization
import optuna

# Interpretation
import shap
shap.initjs()

ImportError: Error importing numpy: you should not try to import numpy from
        its source directory; please exit the numpy source tree, and relaunch
        your python interpreter from there.

In [None]:
import warnings
warnings.filterwarnings("ignore")

### Import Training and Test Datasets

In [None]:
# Load the training split and the prediction (test) split from the local Datasets/ folder.
train  = pd.read_csv('Datasets/train.csv')
test = pd.read_csv('Datasets/test.csv')

In [None]:
# Initial checks
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
train.head()

In [None]:
# Keep the test PassengerId column: 'PassengerId' is dropped from `test` in a later
# cell, and this series is needed to build the submission files at the end.
X_Pred_Passenger = test['PassengerId']

## Exploratory Data Analysis

In [None]:
train.describe()

In [None]:
train["Survived"].value_counts()

In [None]:
# Separate the features
# Numerical group: every numeric column except the target, the passenger ID and Pclass
# (Pclass is numeric in the raw data but is handled as a categorical feature below).
numerical_features = train.select_dtypes(include=['number'])
numerical_features = numerical_features.drop(columns=['Survived','PassengerId','Pclass'])
# Categorical group: the non-numeric columns of the training and the prediction (test) data.
categorical_features = train.select_dtypes(exclude=['number'])
categorical_features_pred = test.select_dtypes(exclude=['number'])
# categorical_features_pred = categorical_features_pred.to_frame()
categorical_features["Pclass"] = train["Pclass"]
categorical_features_pred["Pclass"] = test["Pclass"]
# Name and Ticket are removed from the categorical groups.
categorical_features = categorical_features.drop(columns=['Name','Ticket'])
categorical_features_pred = categorical_features_pred.drop(columns=['Name','Ticket'])
target_column = "Survived"

### Numerical Data Heatmap Feature Correlations

In [None]:
def heatmap(numerical_features):
    """Draw an annotated correlation heatmap for the given numeric columns.

    Parameters
    ----------
    numerical_features : pandas.DataFrame
        Frame whose pairwise column correlations (``DataFrame.corr``) are plotted.
    """
    _, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(numerical_features.corr(), annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
    ax.set_title('Correlation Heatmap of Numerical Features')
    plt.show()

In [None]:
heatmap(numerical_features)

### Bar Plots of Numerical Features vs. Target Variable

In [None]:
def bar_plots_numerical_features(train, numerical_features, target_column):
    """Draw one bar plot per numeric feature, showing its mean within each target class.

    Parameters
    ----------
    train : pandas.DataFrame
        Data containing both the features and the target column.
    numerical_features : iterable of str
        Column names to plot (iterating a DataFrame yields its column names).
    target_column : str
        Name of the target column, used for the x axis.
    """
    class_labels = ['Not Survived', 'Survived']  # Assuming 0 and 1 are your target labels
    for column_name in numerical_features:
        _, ax = plt.subplots(figsize=(8, 6))
        sns.barplot(data=train, x=target_column, y=column_name, ax=ax)
        ax.set_title(f'Mean {column_name} by {target_column}')
        ax.set_xlabel(target_column)
        ax.set_ylabel(column_name)
        ax.set_xticks([0, 1])
        ax.set_xticklabels(class_labels)
        plt.show()

In [None]:
bar_plots_numerical_features(train, numerical_features, target_column)

### Bar Plots of Median Numerical Features vs. Target Variable

In [None]:
def bar_plots_numerical_features_median(train, numerical_features, target_column):
    """Draw one bar plot per numeric feature, showing its median within each target class.

    Parameters
    ----------
    train : pandas.DataFrame
        Data containing both the features and the target column.
    numerical_features : iterable of str
        Column names to plot (iterating a DataFrame yields its column names).
    target_column : str
        Name of the target column, used for the x axis.
    """
    for feature in numerical_features:
        plt.figure(figsize=(8, 6))
        # estimator=np.median makes the bar height the per-class median of the feature.
        sns.barplot(data=train, x=target_column, y=feature, estimator=np.median)
        plt.title(f'Median {feature} by {target_column}')
        plt.xlabel(target_column)
        plt.ylabel(feature)
        # Relabel the two x ticks; assumes the target takes exactly the values 0 and 1 -- TODO confirm.
        plt.xticks([0, 1], ['Negative Class', 'Positive Class'])
        plt.show()

In [None]:
bar_plots_numerical_features_median(train, numerical_features, target_column)

### Distribution of Numeric Features with Histograms

In [None]:
def histograms(train, numerical_features, target_column):
    """Plot a stacked histogram of every numeric feature, coloured by the target.

    Parameters
    ----------
    train : pandas.DataFrame
        Data containing both the features and the target column.
    numerical_features : iterable of str
        Column names to plot (iterating a DataFrame yields its column names).
    target_column : str
        Name of the target column, used as the ``hue`` variable.
    """
    print("\n--- Histograms of Numeric Features ---")
    for feature in numerical_features:
        plt.figure(figsize=(8, 6))
        ax = sns.histplot(data=train, x=feature, hue=target_column, kde=False, multiple="stack")
        plt.title(f'Histogram of {feature} by {target_column}')
        plt.xlabel(feature)
        plt.ylabel('Frequency')
        # Relabel the legend seaborn already built for `hue` instead of calling
        # plt.legend(labels=[...]): the labels-only form pairs the labels with the
        # first artists found on the axes, which does not reliably correspond to
        # the hue levels, so colours and labels can end up mismatched.
        legend = ax.get_legend()
        if legend is not None:
            legend.set_title(target_column)
            for text, label in zip(legend.get_texts(), ['No', 'Yes']):  # Assuming 0 is 'No', 1 is 'Yes'
                text.set_text(label)
        plt.show()

In [None]:
histograms(train, numerical_features, target_column)

### Distribution of Numeric Features with KDE Plots

In [None]:
def kde_plots(train, numerical_features, target_column):
    """Plot a stacked, filled KDE of every numeric feature, coloured by the target.

    Parameters
    ----------
    train : pandas.DataFrame
        Data containing both the features and the target column.
    numerical_features : iterable of str
        Column names to plot (iterating a DataFrame yields its column names).
    target_column : str
        Name of the target column, used as the ``hue`` variable.
    """
    print("\n--- KDE Plots of Numeric Features ---")
    for feature in numerical_features:
        plt.figure(figsize=(7, 5))
        ax = sns.kdeplot(data=train, x=feature, hue=target_column, fill=True, alpha=.5, multiple="stack")
        plt.title(f'KDE Plot of {feature} by {target_column}')
        plt.xlabel(feature)
        plt.ylabel('Density')
        # Relabel the legend seaborn already built for `hue` instead of calling
        # plt.legend(labels=[...]): the labels-only form pairs the labels with the
        # first artists found on the axes, which does not reliably correspond to
        # the hue levels, so colours and labels can end up mismatched.
        legend = ax.get_legend()
        if legend is not None:
            legend.set_title(target_column)
            for text, label in zip(legend.get_texts(), ['No', 'Yes']):  # Assuming 0 is 'No', 1 is 'Yes'
                text.set_text(label)
        plt.show()

In [None]:
kde_plots(train, numerical_features, target_column)

### Combined Histograms and KDE Plots

In [None]:
def historgram_kde_plots(train, numerical_features, target_column):
    """Plot a stacked histogram with a KDE overlay for every numeric feature.

    Parameters
    ----------
    train : pandas.DataFrame
        Data containing both the features and the target column.
    numerical_features : iterable of str
        Column names to plot (iterating a DataFrame yields its column names).
    target_column : str
        Name of the target column, used as the ``hue`` variable.
    """
    print("\n--- Combined Histograms and KDE Plots of Numeric Features ---")
    for feature in numerical_features:
        plt.figure(figsize=(10, 6))
        ax = sns.histplot(data=train, x=feature, hue=target_column, kde=True, multiple="stack")
        plt.title(f'Histogram and KDE of {feature} by {target_column}')
        plt.xlabel(feature)
        plt.ylabel('Density / Frequency')
        # Relabel the legend seaborn already built for `hue` instead of calling
        # plt.legend(labels=[...]): the labels-only form pairs the labels with the
        # first artists found on the axes (here that includes the KDE lines), so
        # colours and labels can end up mismatched.
        legend = ax.get_legend()
        if legend is not None:
            legend.set_title(target_column)
            for text, label in zip(legend.get_texts(), ['No', 'Yes']):  # Assuming 0 is 'No', 1 is 'Yes'
                text.set_text(label)
        plt.show()

In [None]:
historgram_kde_plots(train, numerical_features, target_column)

### Pair Plots

In [None]:
def pair_plits(train) :
    """Draw a seaborn pair plot of a fixed set of columns, coloured by 'Survived'.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns 'Survived', 'Pclass', 'Age', 'Fare', 'SibSp' and 'Parch'.
    """
    sns.pairplot(train[['Survived', 'Pclass', 'Age', 'Fare', 'SibSp', 'Parch']], hue='Survived')
    # y=1.02 places the figure title slightly above the top of the grid.
    plt.suptitle('Pairwise Relationships of Features', y=1.02)
    plt.show()

In [None]:
pair_plits(train)

### Target vs. Numerical Feature Analysis

In [None]:
# --- Target vs. Numerical Feature Analysis ---

def box_plots_numerical_features(train, numerical_features, target_column):
    """Show a box plot, then a violin plot, of each numeric feature per target class.

    All box plots are drawn first, followed by all violin plots.

    Parameters
    ----------
    train : pandas.DataFrame
        Data containing both the features and the target column.
    numerical_features : iterable of str
        Column names to plot (iterating a DataFrame yields its column names).
    target_column : str
        Name of the target column, used for the x axis.
    """
    tick_positions = [0, 1]
    tick_labels = ['Negative Class', 'Positive Class']

    def _decorate_and_show(title_prefix, column_name):
        # Shared title / axis labelling for both plot families.
        plt.title(f'{title_prefix} of {column_name} by {target_column}')
        plt.xlabel(target_column)
        plt.ylabel(column_name)
        plt.xticks(tick_positions, tick_labels)
        plt.show()

    print("--- Boxplots of Numerical Features by Target ---")
    for column_name in numerical_features:
        plt.figure(figsize=(7, 5))
        sns.boxplot(data=train, x=target_column, y=column_name, hue=target_column)
        _decorate_and_show('Boxplot', column_name)

    print("\n--- Violin Plots of Numerical Features by Target ---")
    for column_name in numerical_features:
        plt.figure(figsize=(7, 5))
        sns.violinplot(data=train, x=target_column, y=column_name, split=True, inner="quart")
        _decorate_and_show('Violin Plot', column_name)

In [None]:
box_plots_numerical_features(train, numerical_features, target_column)

## Feature Engineering

In [None]:
# Collect every distinct title found in the training names, in order of first appearance.
# A title is the text that starts two characters after the first ',' and ends just
# before the first '.' of the name.

all_titles = list(dict.fromkeys(
    full_name[full_name.find(',') + 2:full_name.find('.')]
    for full_name in train["Name"]
))

print(all_titles)

In [None]:
# Function to return the first listed substring that occurs in a string
def substrings_in_string(big_string, substrings):
    """Return the first entry of ``substrings`` that occurs inside ``big_string``.

    Candidates are tried in the order given, so an earlier entry wins when
    several of them match. Returns ``np.nan`` when no candidate is found.
    """
    return next(
        (candidate for candidate in substrings if big_string.find(candidate) != -1),
        np.nan,
    )

In [None]:
# Derive a 'Title' column from each passenger name, then collapse rare titles.

def extract_titles(names):
    """Return the title of every name: the text between the first ', ' and the next '.'.

    An exact extraction is used instead of a substring search over the list of
    known titles, because a substring search returns 'Mr' for every name that
    merely contains it (all 'Mrs' names, or names with a quoted "Mr ..." alias).
    Names that do not follow the '<surname>, <title>. <rest>' pattern yield NaN.
    """
    return names.str.extract(r',\s*([^.]+)\.', expand=False).str.strip()

train['Title'] = extract_titles(train['Name'])
test['Title'] = extract_titles(test['Name'])

#replacing all titles with mr, mrs, miss, master
def replace_titles(x):
    """Map the raw title of one passenger row to a small set of title groups.

    Parameters
    ----------
    x : pandas.Series
        A row providing at least the 'Title' and 'Sex' fields.

    Returns
    -------
    The grouped title, or the raw title unchanged when no rule applies.
    """
    title=x['Title']
    # 'Sir', 'Lady' and 'Dona' are folded into existing groups so that they do not
    # create one-hot columns of their own (assumes the standard Kaggle Titanic
    # files, where these titles occur -- TODO confirm).
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col', 'Sir']:
        return 'Rare_Mr'
    # NOTE(review): the extracted title is 'the Countess', which the 'Countess'
    # entry below does not match. It is deliberately left unmapped because a later
    # cell drops the resulting 'Title_the Countess' dummy column by name.
    elif title in ['Countess', 'Mme', 'Lady', 'Dona']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        # The 'Sex' column holds lowercase values, so compare against 'male'
        # (the previous 'Male' never matched and turned every 'Dr' into 'Mrs').
        if x['Sex']=='male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title
train['Title']=train.apply(replace_titles, axis=1)
test['Title']=test.apply(replace_titles, axis=1)

# A title group that never occurs in the training data would add a one-hot column
# the model has not seen; treat such titles as missing.
test.loc[~test['Title'].isin(train['Title'].dropna().unique()), 'Title'] = np.nan

categorical_features['Title'] = train['Title']
categorical_features_pred['Title'] = test['Title']
print(train.head())

In [None]:
test.head()

In [None]:
# Turning cabin number into Deck
# The deck is the first letter from `cabin_list` (in list order) that occurs in the
# cabin string.
# NOTE(review): presumably missing cabins are float NaN, whose str() is 'nan'; the
# case-sensitive search never matches the 'NaN' entry, so those rows get np.nan -- confirm.
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'NaN']

train['Deck']=train['Cabin'].map(lambda x: substrings_in_string(str(x), cabin_list))
test['Deck']=test['Cabin'].map(lambda x: substrings_in_string(str(x), cabin_list))

# 'Deck' is also added to the categorical groups here; later cells drop it again
# from the categorical groups and from train/test.
categorical_features['Deck'] = train['Deck']
categorical_features_pred['Deck'] = test['Deck']
print(train['Deck'].head())

In [None]:
test.head()

In [None]:
# Creating new family_size column
# Family_Size = SibSp + Parch + 1 (the +1 presumably counts the passenger themself).
train['Family_Size']=train['SibSp']+train['Parch']+1
test['Family_Size']=test['SibSp']+test['Parch']+1
# Only the training values are added to `numerical_features` (used for the EDA plots).
numerical_features['Family_Size'] = train['Family_Size']

In [None]:
test.head()

In [None]:
# Remove the raw 'Cabin' column and the derived 'Deck' column from both categorical groups.
categorical_features = categorical_features.drop(columns = ['Cabin','Deck'])
categorical_features_pred = categorical_features_pred.drop(columns = ['Cabin','Deck'])

In [None]:
categorical_features.head(15)

In [None]:
categorical_features_pred.head(15)

In [None]:
numerical_features.head(10)

In [None]:
train.head(10)

In [None]:
# Drop the identifier / free-text columns and the cabin-derived columns from both frames.
# Not idempotent: re-running this cell raises KeyError because the columns are already gone.
train = train.drop(columns = ['Name','Ticket','PassengerId','Cabin','Deck'])
test = test.drop(columns = ['Name','Ticket','PassengerId','Cabin','Deck'])

In [None]:
train.head(10)

In [None]:
test.head()

In [None]:
def encode_categorical_features(categorical_features):
    """One-hot encode every column of ``categorical_features``.

    Each column is expanded with ``pd.get_dummies`` using the column name as the
    prefix; the original columns are not kept. The name of each processed column
    is printed. The input frame is left unmodified.

    Returns
    -------
    pandas.DataFrame
        The indicator columns of all input columns, in input column order,
        on the same row index as the input.
    """
    # Start from a column-less slice so the row index is carried over even when
    # there is nothing to encode.
    encoded_parts = [categorical_features.iloc[:, :0]]
    for column_name in categorical_features.columns:
        print(column_name)
        encoded_parts.append(
            pd.get_dummies(categorical_features[column_name], prefix=column_name)
        )
    return pd.concat(encoded_parts, axis=1)

In [None]:
# print(type(categorical_features_pred))
# categorical_features_pred = categorical_features_pred.to_frame()
# print(type(categorical_features_pred))
# categorical_features_pred.head()

In [None]:
# Feature frame without the target. A later cell rebuilds X from `train` by concatenation.
X = train.drop([target_column], axis=1)
# X_Pred is the same object as `test` here (no copy is made).
X_Pred = test
print(type(X_Pred))
y = train[target_column]
# One-hot encode the categorical groups of the training and the prediction data separately;
# the two results only share columns for categories that occur in both data sets.
categorical_features_encoded = encode_categorical_features(categorical_features)
categorical_features_encoded_pred = encode_categorical_features(categorical_features_pred)
categorical_features_encoded.head()

In [None]:
# Append the one-hot columns to the full frames. X is built from `train`, so at this
# point it still contains 'Survived' and the raw categorical columns; a later cell drops them.
X = pd.concat([train, categorical_features_encoded], axis=1)
X_Pred = pd.concat([test, categorical_features_encoded_pred], axis=1)
X.head()

In [None]:
# Add new feature Is_Alone: True exactly where Family_Size equals 1, False otherwise.

X['Is_Alone'] = X['Family_Size'].eq(1)

X_Pred['Is_Alone'] = X_Pred['Family_Size'].eq(1)

In [None]:
X.isna().sum()

In [None]:
X_Pred.isna().sum()

In [None]:
# Flag (1/0) the rows whose 'Age' is missing, before any imputation overwrites that information.
X['Age_Missing'] = X['Age'].isnull().astype(int)
X_Pred['Age_Missing'] = X_Pred['Age'].isnull().astype(int)

### Impute Age with the per-class median (disabled: the cells below are commented out)

In [None]:
# # Handling the missing values

# import warnings
# warnings.filterwarnings("ignore")

# def impute_age(cols):
#     Age = cols[0]
#     Pclass = cols[1]

#     if pd.isnull(Age):
#         if Pclass == 1:
#             return X[X['Pclass'] == 1]['Age'].median()
#         elif Pclass == 2:
#             return X[X['Pclass'] == 2]['Age'].median()
#         else:
#             return X[X['Pclass'] == 3]['Age'].median()
#     return Age

# X['Age'] = X[['Age', 'Pclass']].apply(impute_age, axis=1)
# X_Pred['Age'] = X_Pred[['Age', 'Pclass']].apply(impute_age, axis=1)

# print(f"Number of NaN values in Age after Pclass-based imputation: {X['Age'].isna().sum()}")
# print(f"Number of NaN values in Age after Pclass-based imputation: {X_Pred['Age'].isna().sum()}")

In [None]:
# X = X.drop(columns=['Survived','Sex','Pclass','Title'])
# X_Pred = X_Pred.drop(columns=['Sex','Pclass','Title'])

In [None]:
# # Get indices of rows with missing 'Embarked' values
# dropped_indices = X[X['Embarked'].isna()].index

# # Drop rows from X
# X.dropna(subset=['Embarked'], inplace=True)

# # Drop corresponding rows from y using the same indices
# y.drop(index=dropped_indices, inplace=True)
# categorical_features_encoded.drop(index=dropped_indices, inplace=True)

In [None]:
# X = X.drop(columns=['Embarked'])
# X_Pred = X_Pred.drop(columns=['Embarked'])
# X.drop(columns=['Title_the Countess'], inplace=True)

### Handle missing values: mode for Embarked, random-forest regression for Age

In [None]:
# Fill Embarked with mode
# Only X is filled here. The Embarked one-hot columns were created in an earlier cell
# (before this fill), and the raw 'Embarked' column is dropped in a later cell.

X['Embarked'] = X['Embarked'].fillna(X['Embarked'].mode()[0])

In [None]:
# Remove the 'Title_the Countess' one-hot column from the training features
# (presumably because X_Pred has no such column -- confirm).
# Reassigning and ignoring a missing column keeps the cell safe to re-run; the
# in-place drop raised KeyError on the second execution.
X = X.drop(columns=['Title_the Countess'], errors='ignore')

In [None]:
# Drop the target and the raw categorical columns, keeping only their one-hot encodings.
# Not idempotent: re-running this cell raises KeyError because the columns are already gone.
X = X.drop(columns=['Survived','Sex','Embarked','Pclass','Title'])
X_Pred = X_Pred.drop(columns=['Sex','Embarked','Pclass','Title'])

In [None]:
# Impute Values using Random Forest Regression

def impute_age(X, X_Pred):
    """Fill the missing 'Age' values of both frames with a random-forest regressor.

    The regressor is trained on the rows of ``X`` whose 'Age' is known, using every
    other column of ``X`` as a predictor. It then predicts 'Age' for the rows of
    ``X`` and of ``X_Pred`` where it is missing.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features including an 'Age' column. Modified in place.
    X_Pred : pandas.DataFrame
        Prediction features with the same columns as ``X``. Modified in place.

    Returns
    -------
    None
    """
    age_is_known = X['Age'].notnull()

    # .drop() returns new frames, so the filtered selections are never mutated in
    # place (the in-place drop on a slice triggered SettingWithCopyWarning).
    known_features = X.loc[age_is_known].drop(columns=['Age'])
    known_ages = X.loc[age_is_known, 'Age']

    print("\nTraining RandomForestRegressor to predict missing 'Age' values...")
    rfr = skl.ensemble.RandomForestRegressor(n_estimators=100, random_state=42)
    rfr.fit(known_features, known_ages)
    print("Training complete.")

    for frame in (X, X_Pred):
        age_is_missing = frame['Age'].isnull()
        # predict() rejects an empty input, so skip frames with nothing to impute.
        if age_is_missing.any():
            frame.loc[age_is_missing, 'Age'] = rfr.predict(
                frame.loc[age_is_missing].drop(columns=['Age'])
            )

    print("\nMissing Age values after imputation:", X['Age'].isnull().sum())
    print("Shape of X after age imputation:", X.shape)

    print("\nMissing Age values after imputation:", X_Pred['Age'].isnull().sum())
    print("Shape of X_Pred after age imputation:", X_Pred.shape)

impute_age(X, X_Pred)

In [None]:
X.isna().sum()

In [None]:
X_Pred.isna().sum()

In [None]:
# Fill missing 'Fare' with mean
# NOTE(review): the mean is computed from X_Pred itself, not from the training
# features X -- confirm this is intended.

X_Pred['Fare'] = X_Pred['Fare'].fillna(X_Pred['Fare'].mean())

### Mutual Information Scores

In [None]:
print(X.shape)
print(y.shape)
print(y.index.equals(X.index))  # Should return True

In [None]:
print(X_Pred.shape)

In [None]:
def mutual_information_scores(X, y):
    """Estimate and print the mutual information between each feature of ``X`` and ``y``.

    The scores are computed for the frame passed as ``X``; previously the argument
    was ignored and a module-level frame was scored instead.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature frame; its column names label the scores.
    y : array-like
        Class labels aligned with the rows of ``X``.

    Returns
    -------
    None
        The scores are printed in descending order.
    """
    # Calculate Mutual Information Scores
    mutual_info = skl.feature_selection.mutual_info_classif(X, y, random_state=42)

    # Create a Series of Feature Names and their MI Scores
    mi_scores = pd.Series(mutual_info, index=X.columns)

    # Sort the MI scores in descending order
    mi_scores_sorted = mi_scores.sort_values(ascending=False)

    # Display the Mutual Information Scores
    print("--- Feature-Target Mutual Information Scores ---")
    print(mi_scores_sorted)

In [None]:
mutual_information_scores(X, y)

### Target vs. Categorical Feature Analysis (using Barplots for proportions)

In [None]:
def bar_plots_categorical_features(train, categorical_features, target_column):
    """Plot, for every categorical feature, the share of each target class per category.

    Parameters
    ----------
    train : pandas.DataFrame
        Data containing both the categorical features and the target column.
    categorical_features : iterable of str
        Column names to plot (iterating a DataFrame yields its column names).
    target_column : str
        Name of the target column, used as the ``hue`` variable.
    """
    print("\n--- Bar Plots of Target Proportions by Categorical Features ---")
    for feature in categorical_features:
        # Percentage of each target value within every category of `feature`.
        category_proportions = train.groupby(feature)[target_column].value_counts(normalize=True).mul(100).rename('Percentage').reset_index()
        plt.figure(figsize=(7, 5))
        ax = sns.barplot(data=category_proportions, x=feature, y='Percentage', hue=target_column)
        plt.title(f'Proportion of {target_column} by {feature}')
        plt.xlabel(feature)
        plt.ylabel('Percentage (%)')
        # Relabel the legend seaborn already built for `hue` instead of calling
        # plt.legend(labels=[...]): the labels-only form pairs the labels with the
        # first bars found on the axes, so both entries can show the same colour.
        legend = ax.get_legend()
        if legend is not None:
            legend.set_title(target_column)
            for text, label in zip(legend.get_texts(), ['Negative Class', 'Positive Class']):
                text.set_text(label)
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

In [None]:
bar_plots_categorical_features(train, categorical_features, target_column)

In [None]:
categorical_features.head()

In [None]:
# X = X.drop(columns=['Survived','Sex','Embarked','Pclass','Title'])
# X_Pred = X_Pred.drop(columns=['Sex','Embarked','Pclass','Title'])

In [None]:
X.head()

In [None]:
X_Pred.head()

In [None]:
numerical_features.head()

In [None]:
# Capping the Outliers
# Every listed column is clipped to the [5th, 95th] percentile range measured on the
# training features X; the same limits are then applied to X_Pred.

# column name -> label used in the printed message
capping_columns = {
    'Age': 'Age',
    'SibSp': 'SibSp',
    'Parch': 'Parch',
    'Fare': 'Fare',
    'Family_Size': 'Family Size',
}
capping_limits = {}  # column name -> (lower limit, upper limit)

for column, label in capping_columns.items():
    lower_limit = X[column].quantile(0.05)  # 5th percentile
    upper_limit = X[column].quantile(0.95)  # 95th percentile
    X[column] = np.clip(X[column], lower_limit, upper_limit)
    X_Pred[column] = np.clip(X_Pred[column], lower_limit, upper_limit)
    capping_limits[column] = (lower_limit, upper_limit)
    # An f-string is required here: concatenating a str with the float limits raises TypeError.
    print(f"Capping the {label} between {lower_limit} and {upper_limit}")

In [None]:
# Standardise the numeric columns. The scaler is fitted on the training features only.
scaler = skl.preprocessing.StandardScaler()
X[['Age','SibSp','Parch','Fare','Family_Size']] = scaler.fit_transform(X[['Age','SibSp','Parch','Fare','Family_Size']])
# X_Pred[['Age','SibSp','Parch','Fare','Family_Size']] = scaler.fit_transform(X_Pred[['Age','SibSp','Parch','Fare','Family_Size']])

# Persist the fitted scaler to disk.
with open("standard_scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Apply the already-fitted scaler to the prediction features (transform only, no refit).
X_Pred[['Age','SibSp','Parch','Fare','Family_Size']] = scaler.transform(X_Pred[['Age','SibSp','Parch','Fare','Family_Size']])


In [None]:
X.dtypes

In [None]:
X_Pred.dtypes

## Training

In [None]:
# Split data
# 80/20 hold-out split with a fixed seed.
# NOTE(review): no `stratify=` is passed, so class proportions may differ between the two parts.
X_train, X_test, y_train, y_test = skl.model_selection.train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train.head()

In [None]:
# Define StratifiedKFold
# 5 shuffled folds with a fixed seed; this splitter is used by plot_folds and optimize_model.
skf = skl.model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Function to visualize folds
def plot_folds(folds, method_name, y_labels):
    """Scatter-plot which sample positions fall into the test part of each fold.

    Parameters
    ----------
    folds : cross-validation splitter
        Object providing ``split(X, y)`` and ``get_n_splits()`` (e.g. StratifiedKFold).
    method_name : str
        Name shown in the plot title.
    y_labels : array-like
        Labels to split on; their length defines the number of samples.
    """
    n_splits = folds.get_n_splits()
    # StratifiedKFold derives the splits from y and only needs X for its length, so a
    # placeholder removes the dependency on a module-level feature frame.
    placeholder_features = np.zeros(len(y_labels))

    plt.figure(figsize=(8, 5))
    for i, (train_idx, test_idx) in enumerate(folds.split(placeholder_features, y_labels)):
        plt.scatter(test_idx, [i] * len(test_idx), label=f"Fold {i+1}", alpha=0.7)
    # One y tick per fold, whatever number of splits the splitter uses.
    plt.yticks(range(n_splits), [f"Fold {i+1}" for i in range(n_splits)])
    plt.xlabel("Sample Index")
    plt.ylabel("Fold Number")
    plt.title(f"{method_name} Class Distribution")
    plt.legend()
    plt.show()

# Visualize StratifiedKFold before model optimization
plot_folds(skf, "StratifiedKFold", y_train)

In [None]:
# Give the training split a fresh 0..n-1 index (the old index is discarded).
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

In [None]:
def optimize_model(trial, model_type):
    """Optuna objective: mean stratified-CV accuracy of one sampled model configuration.

    Hyperparameters are sampled from ``trial``; the resulting model is fitted and
    scored on every fold produced by the module-level ``skf`` splitter over the
    module-level ``X_train`` / ``y_train``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    model_type : str
        One of "Logistic Regression", "Decision Tree", "Random Forest",
        "XGBoost", "LightGBM" or "CatBoost".

    Returns
    -------
    float
        Mean accuracy over the folds.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported names.
    """
    # suggest_float(..., log=True) / suggest_float(...) replace the deprecated
    # suggest_loguniform / suggest_uniform; the sampled ranges are unchanged.
    if model_type == "Logistic Regression":
        params = {
            "C": trial.suggest_float("C", 1e-3, 10, log=True),
            "max_iter": trial.suggest_int("max_iter", 100, 1000),
            "penalty": trial.suggest_categorical("penalty", ["l2"]),
            "solver": trial.suggest_categorical("solver", ["lbfgs", "saga", "newton-cg"]),
            "random_state": 42,
            "n_jobs": -1
        }
        model = skl.linear_model.LogisticRegression(**params)

    elif model_type == "Decision Tree":
        params = {
            "max_depth": trial.suggest_int("max_depth", 2, 30),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "criterion": trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"]),
            "random_state": 42
        }
        model = skl.tree.DecisionTreeClassifier(**params)

    elif model_type == "Random Forest":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 3, 30),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
            "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
            "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
            "random_state": 42,
            "n_jobs": -1
        }
        model = skl.ensemble.RandomForestClassifier(**params)

    elif model_type == "XGBoost":
        # The deprecated `use_label_encoder` flag is no longer passed.
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 3, 15),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "gamma": trial.suggest_float("gamma", 1e-8, 10.0, log=True),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
            "eval_metric": "logloss",
            "n_jobs": -1,
            "random_state": 42,
        }
        model = XGBClassifier(**params)

    elif model_type == "LightGBM":
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
            "max_depth": trial.suggest_int("max_depth", 3, 30),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 7, 512),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
            "random_state": 42,
            "n_jobs": -1
        }
        model = LGBMClassifier(**params)

    elif model_type == "CatBoost":
        params = {
            "iterations": trial.suggest_int("iterations", 100, 1000),
            "depth": trial.suggest_int("depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0, log=True),
            "border_count": trial.suggest_int("border_count", 32, 255),
            "random_strength": trial.suggest_float("random_strength", 1e-9, 10.0),
            "verbose": 0,
            "random_seed": 42,
        }
        model = CatBoostClassifier(**params)

    else:
        # Fail with a clear message instead of an unbound `model` further down.
        raise ValueError(f"Unsupported model_type: {model_type!r}")

    # Manual cross-validation over the shared StratifiedKFold splitter.
    accuracies = []
    for train_idx, test_idx in skf.split(X_train, y_train):
        X_train_fold, X_test_fold = X_train.iloc[train_idx], X_train.iloc[test_idx]
        y_train_fold, y_test_fold = y_train.iloc[train_idx], y_train.iloc[test_idx]

        model.fit(X_train_fold, y_train_fold)
        accuracies.append(model.score(X_test_fold, y_test_fold))

    return np.mean(accuracies)

In [None]:
# Optimize each model
# One Optuna study (20 trials, maximising the CV accuracy) is run per model type.
# Only study.best_params is stored, i.e. the values sampled through the trial; the
# fixed settings used inside optimize_model are not part of it.

optimized_models = {}
for model_name in [
    "Logistic Regression",
    "Decision Tree",
    "Random Forest",
    "XGBoost",
    # "LightGBM",
    "CatBoost"
]:
    print(f"Optimizing {model_name}...")
    # A seeded sampler makes the sequence of sampled hyperparameters repeatable
    # across kernel restarts, like the seeded models and splitters elsewhere.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    # Bind the current name as a default so the objective does not rely on the loop variable.
    study.optimize(
        lambda trial, name=model_name: optimize_model(trial, name),
        n_trials=20,
        show_progress_bar=True,
    )
    optimized_models[model_name] = study.best_params
    print(f"Best params for {model_name}: {study.best_params}\n")

In [None]:
# Build the final models from the tuned hyperparameters.
# `optimized_models` holds only the sampled values, so the fixed settings used
# during tuning (seed, eval metric, n_jobs) are added back here. Without them the
# final models would be unseeded and differ from the configurations that were scored.
lr_params = {**optimized_models["Logistic Regression"], "random_state": 42}
dt_params = {**optimized_models["Decision Tree"], "random_state": 42}
rf_params = {**optimized_models["Random Forest"], "random_state": 42, "n_jobs": -1}
xgb_params = {**optimized_models["XGBoost"], "eval_metric": "logloss", "n_jobs": -1, "random_state": 42}
cat_params = {**optimized_models["CatBoost"], "verbose": 0, "random_seed": 42}

# Separate estimator instances for the stacking ensemble.
base_model = skl.ensemble.RandomForestClassifier(**rf_params)
meta_model = skl.linear_model.LogisticRegression(**lr_params)

models = {
    "Logistic Regression": skl.linear_model.LogisticRegression(**lr_params),
    "Decision Tree": skl.tree.DecisionTreeClassifier(**dt_params),
    "Random Forest": skl.ensemble.RandomForestClassifier(**rf_params),
    "XGBoost": XGBClassifier(**xgb_params),
    # "LightGBM": LGBMClassifier(**optimized_models["LightGBM"], verbose=-1),
    "CatBoost": CatBoostClassifier(**cat_params),
    "Stacked Model": skl.ensemble.StackingClassifier(
        estimators=[("rf", base_model)],
        final_estimator=meta_model,
        cv=5  # Stratified cross-validation
    )
}

In [None]:
# base_model = skl.ensemble.RandomForestClassifier(**optimized_models["Random Forest"])
# meta_model = skl.linear_model.LogisticRegression(**optimized_models["Logistic Regression"])

# # Implement stacking
# stacked_model = skl.ensemble.StackingClassifier(
#     estimators=[("rf", base_model)],
#     final_estimator=meta_model,
#     cv=5  # Stratified cross-validation
# )

In [None]:
# # Select best-performing model (replace with actual best model)
# best_model = models["XGBoost"]  # Example

# # Fit SHAP explainer
# explainer = shap.Explainer(best_model, X_train)
# shap_values = explainer(X_test)

# # Plot feature importance
# shap.summary_plot(shap_values, X_test, feature_names=X.columns)

In [None]:
X_Pred.head()

In [None]:
# Safety net: fill any 'Fare' value still missing in the prediction features with the
# column median. Assign the result back instead of using inplace=True on a column
# selection, which may act on a temporary copy and leave X_Pred unchanged.
X_Pred['Fare'] = X_Pred['Fare'].fillna(X_Pred['Fare'].median())

In [None]:
X_Pred.isna().sum()

In [None]:
X_Pred_Passenger.head()

In [None]:
# Train & evaluate
# Fit every model on the training split and report its accuracy on the hold-out split.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = skl.metrics.accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")

In [None]:
# Convert boolean columns (e.g. the one-hot indicators) to integers in both splits.
# This runs after the models were fitted in the previous cell; the cell that follows
# passes these frames to SHAP.
bool_cols = X_test.select_dtypes(include=['bool']).columns
X_test[bool_cols] = X_test[bool_cols].astype(int)
X_train[bool_cols] = X_train[bool_cols].astype(int)

In [None]:
print(X_test.dtypes)

In [None]:
# Apply SHAP
# Explain the fitted random forest, using the training split as background data.
explainer = shap.Explainer(models["Random Forest"], X_train)
shap_values = explainer(X_test) # This is the full Explanation object
print(shap_values.shape)

# Visualize feature importance
print(f"SHAP Summary for Random Forest")

# shap.summary_plot(shap_values, X_test)
# NOTE(review): the [..., k] indexing assumes the last axis of the explanation
# indexes the classes (0 = not survived, 1 = survived) -- confirm against the printed shape.
print("Survival Dependence")
shap.plots.beeswarm(shap_values[..., 1])  # Select SHAP values for the survival class
print("Non-Survival Dependence")
shap.plots.beeswarm(shap_values[..., 0])  # Select SHAP values for the non-survival class

In [None]:
# Loop through all trained models and save predictions separately
for model_name, model in models.items():
    predictions = model.predict(X_Pred)  # Generate predictions using X_Pred

    # Create a submission DataFrame with required format
    # (PassengerId comes from the series saved before the column was dropped from `test`).
    submission_df = pd.DataFrame({"PassengerId": X_Pred_Passenger, "Survived": predictions})

    # Save each model's predictions as a separate CSV file.
    # The file name is built from the model name as-is, so it may contain spaces.
    filename = f"{model_name}_submission.csv"
    submission_df.to_csv(filename, index=False)
    
    print(f"Saved: {filename}")


In [None]:
# Persist the fitted random forest to disk.
# Only load this pickle from a trusted source: unpickling can execute arbitrary code.
with open('random_forest.pkl', 'wb') as f:
    pickle.dump(models["Random Forest"], f)