In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.ticker import MaxNLocator
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

# Set styling for matplotlib
# These three calls configure matplotlib/seaborn defaults for the figures
# drawn later in this notebook; they are applied in this order on purpose.
plt.style.use('fivethirtyeight')
sns.set_palette("deep")
sns.set_style("whitegrid")

# Load the datasets
# The CSV locations are hard-coded to the Kaggle input directory, so this
# cell only runs where /kaggle/input/titanic/ exists.
print("Loading and preparing data...")
train_df = pd.read_csv('/kaggle/input/titanic/train.csv')
test_df = pd.read_csv('/kaggle/input/titanic/test.csv')

# First: apply basic feature engineering to make sure 'Title' is created
def extract_title(df):
    """Add a consolidated ``Title`` column derived from the ``Name`` column.

    The honorific is the first run of letters that is preceded by a space
    and followed by a period (e.g. ``"Braund, Mr. Owen"`` -> ``"Mr"``).
    Raw honorifics are then collapsed into a handful of groups
    (``Mr``, ``Miss``, ``Mrs``, ``Master``, ``Officer``, ``Royalty``).

    Args:
        df: DataFrame with a string ``Name`` column.

    Returns:
        The same DataFrame object, modified in place with a ``Title``
        column. Names without a recognisable honorific, or with an
        honorific missing from the mapping, get ``NaN``.
    """
    # Raw string: in a plain literal "\." is an invalid escape sequence,
    # which newer Python versions warn about (and will eventually reject).
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Consolidate rare titles
    title_mapping = {
        'Mr': 'Mr',
        'Miss': 'Miss',
        'Mrs': 'Mrs',
        'Master': 'Master',
        'Dr': 'Officer',
        'Rev': 'Officer',
        'Col': 'Officer',
        'Major': 'Officer',
        'Mlle': 'Miss',
        'Mme': 'Mrs',
        'Don': 'Royalty',
        'Sir': 'Royalty',
        'Lady': 'Royalty',
        'Countess': 'Royalty',
        'Jonkheer': 'Royalty',
        'Capt': 'Officer',
        'Ms': 'Miss',
        'Dona': 'Royalty',
    }
    df['Title'] = df['Title'].map(title_mapping)
    return df

# Run the title extraction over the training frame, then the test frame
train_df, test_df = [extract_title(frame) for frame in (train_df, test_df)]

# Helpers for the EDA figure
def _survival_percentages(df, column):
    """Return the per-group percentage split between Survived == 0 and 1.

    Rows are the distinct values of ``column``; the columns are always
    ``[0, 1]`` (a missing outcome is filled with 0) so that the two-colour
    palette lines up with the same outcome in every chart.
    """
    counts = df.groupby([column, 'Survived']).size().unstack().fillna(0)
    counts = counts.reindex(columns=[0, 1], fill_value=0)
    return counts.div(counts.sum(axis=1), axis=0) * 100


def _label_bar_segments(ax, horizontal=False, min_size=5):
    """Write the percentage in the middle of every sufficiently large bar segment."""
    for patch in ax.patches:
        width, height = patch.get_width(), patch.get_height()
        size = width if horizontal else height
        if size > min_size:  # Only label segments that are large enough to read
            x, y = patch.get_xy()
            ax.text(x + width / 2, y + height / 2, f'{size:.1f}%', ha='center', va='center')


# Function for comprehensive EDA visualizations
def visualize_titanic_data(df):
    """Draw a 3x3 grid of survival charts and display it with ``plt.show()``.

    Panels: survival by gender, by class, age density, fare box plot,
    family size (count and survival rate), port of embarkation, title,
    and an age-band x class survival heatmap.

    Args:
        df: DataFrame containing ``Survived``, ``Sex``, ``Pclass``, ``Age``,
            ``Fare``, ``SibSp``, ``Parch``, ``Embarked``, ``Title`` and
            ``PassengerId``. The frame is not modified; all derived columns
            are added to an internal copy.

    Returns:
        None.
    """
    survival_colors = ['#ff6b6b', '#4ecdc4']

    # Work on a copy so that plotting never changes the caller's data.
    plot_df = df.copy()
    plot_df['FamilySize'] = plot_df['SibSp'] + plot_df['Parch'] + 1
    plot_df['Embarked'] = plot_df['Embarked'].fillna('S')  # Fill missing values for visualization
    plot_df['AgeBin'] = pd.cut(plot_df['Age'], bins=[0, 12, 18, 35, 60, 100],
                               labels=['Child', 'Teen', 'Young Adult', 'Adult', 'Senior'])

    # Create a figure with subplots
    fig = plt.figure(figsize=(20, 16))
    gs = gridspec.GridSpec(3, 3)

    # 1. Survival by Gender
    ax1 = plt.subplot(gs[0, 0])
    _survival_percentages(plot_df, 'Sex').plot(kind='bar', stacked=True, ax=ax1, color=survival_colors)
    ax1.set_title('Survival Rate by Gender', fontsize=14, fontweight='bold')
    ax1.set_ylabel('Percentage (%)', fontweight='bold')
    # Fixed labels: relies on both 'female' and 'male' being present (sorted group order).
    ax1.set_xticklabels(['Female', 'Male'], rotation=0)
    _label_bar_segments(ax1)

    # 2. Survival by Class
    ax2 = plt.subplot(gs[0, 1])
    _survival_percentages(plot_df, 'Pclass').plot(kind='bar', stacked=True, ax=ax2, color=survival_colors)
    ax2.set_title('Survival Rate by Class', fontsize=14, fontweight='bold')
    ax2.set_ylabel('Percentage (%)', fontweight='bold')
    # Fixed labels: relies on classes 1, 2 and 3 all being present.
    ax2.set_xticklabels(['1st Class', '2nd Class', '3rd Class'], rotation=0)
    _label_bar_segments(ax2)

    # 3. Age Distribution by Survival
    ax3 = plt.subplot(gs[0, 2])
    sns.kdeplot(data=plot_df, x='Age', hue='Survived', fill=True, common_norm=False, alpha=0.7,
                palette=survival_colors, ax=ax3)
    ax3.set_title('Age Distribution by Survival', fontsize=14, fontweight='bold')
    ax3.set_xlabel('Age', fontweight='bold')
    ax3.set_ylabel('Density', fontweight='bold')

    # 4. Fare Distribution by Survival
    ax4 = plt.subplot(gs[1, 0])
    # Cap fares at the 99th percentile so a few extreme values do not flatten the boxes
    fare_cap = plot_df['Fare'].quantile(0.99)
    fare_df = plot_df.assign(Fare=plot_df['Fare'].clip(upper=fare_cap))
    sns.boxplot(x='Pclass', y='Fare', hue='Survived', data=fare_df, palette=survival_colors, ax=ax4)
    ax4.set_title('Fare Distribution by Class and Survival', fontsize=14, fontweight='bold')
    ax4.set_xlabel('Passenger Class', fontweight='bold')
    ax4.set_ylabel('Fare (£)', fontweight='bold')

    # 5. Family Size vs Survival
    ax5 = plt.subplot(gs[1, 1])
    family_counts = (plot_df.groupby(['FamilySize', 'Survived']).size().unstack().fillna(0)
                     .reindex(columns=[0, 1], fill_value=0))
    family_totals = family_counts.sum(axis=1)
    family_survival_rate = family_counts[1] / family_totals * 100

    # Bar chart for counts
    ax5.bar(family_counts.index, family_totals, color='#b8e994')
    ax5.set_xlabel('Family Size', fontweight='bold')
    ax5.set_ylabel('Count', fontweight='bold', color='#6a8d73')
    ax5.yaxis.set_major_locator(MaxNLocator(integer=True))

    # Line chart for survival rate on a secondary y-axis
    ax5_twin = ax5.twinx()
    ax5_twin.plot(family_counts.index, family_survival_rate, 'o-', color='#ff6b6b', linewidth=3, markersize=8)
    ax5_twin.set_ylabel('Survival Rate (%)', fontweight='bold', color='#ff6b6b')
    ax5_twin.grid(False)

    ax5.set_title('Family Size: Count vs Survival Rate', fontsize=14, fontweight='bold')

    # 6. Embarked vs Survival
    ax6 = plt.subplot(gs[1, 2])
    embarked_pct = _survival_percentages(plot_df, 'Embarked')
    embarked_pct.plot(kind='bar', stacked=True, ax=ax6, color=survival_colors)

    # Map port codes to names
    port_names = {'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'}
    ax6.set_xticklabels([port_names.get(code, code) for code in embarked_pct.index], rotation=0)
    ax6.set_title('Survival Rate by Port of Embarkation', fontsize=14, fontweight='bold')
    ax6.set_ylabel('Percentage (%)', fontweight='bold')
    _label_bar_segments(ax6)

    # 7. Title vs Survival (sorted by survival rate, highest first)
    ax7 = plt.subplot(gs[2, 0:2])
    title_pct = _survival_percentages(plot_df, 'Title').sort_values(1, ascending=False)
    title_pct.plot(kind='barh', stacked=True, ax=ax7, color=survival_colors, width=0.7)
    ax7.set_title('Survival Rate by Title', fontsize=14, fontweight='bold')
    ax7.set_xlabel('Percentage (%)', fontweight='bold')
    _label_bar_segments(ax7, horizontal=True)

    # 8. Age-Class Heatmap for Survival Rate
    ax8 = plt.subplot(gs[2, 2])
    age_class_survival = pd.crosstab(index=[plot_df['AgeBin']],
                                     columns=[plot_df['Pclass'], plot_df['Survived']],
                                     values=plot_df['PassengerId'],
                                     aggfunc='count').fillna(0)

    # Calculate survival rate per (age band, class). A missing (class, outcome)
    # column counts as zero passengers, so a cell is only left at 0 when it
    # has no passengers or no survivors.
    class_ids = (1, 2, 3)
    survival_matrix = np.zeros((len(age_class_survival), len(class_ids)))
    for row_idx, (_, row) in enumerate(age_class_survival.iterrows()):
        for col_idx, pclass in enumerate(class_ids):
            survived = row.get((pclass, 1), 0)
            total = survived + row.get((pclass, 0), 0)
            if total > 0:
                survival_matrix[row_idx, col_idx] = survived / total * 100

    # Plot heatmap
    sns.heatmap(survival_matrix, annot=True, fmt='.1f', cmap='RdYlGn',
                xticklabels=['1st Class', '2nd Class', '3rd Class'],
                yticklabels=age_class_survival.index, ax=ax8)
    ax8.set_title('Survival Rate (%) by Age and Class', fontsize=14, fontweight='bold')

    # Lay out the panels first, then reserve room at the top for the overall title
    plt.tight_layout()
    plt.subplots_adjust(top=0.95)
    fig.suptitle('Titanic Survival Analysis', fontsize=20, fontweight='bold', y=0.98)
    plt.show()

# Function for interactive Plotly visualizations
def create_interactive_visualizations(df):
    """Show two interactive Plotly figures for the given passenger frame.

    The first is an Age-vs-Fare scatter faceted by ``Pclass``, coloured by
    ``Survived`` and sized by ``FamilySize``. The second is a correlation
    heatmap over the numeric/encoded columns listed below, so ``df`` must
    already contain ``FamilySize``, ``Sex_encoded`` and ``Embarked_encoded``.

    Returns:
        None; both figures are displayed with ``.show()``.
    """
    # 1. Interactive Survival by Age and Class
    scatter_fig = px.scatter(
        df,
        x='Age',
        y='Fare',
        color='Survived',
        size='FamilySize',
        facet_col='Pclass',
        hover_name='Name',
        hover_data=['Sex', 'Ticket', 'Cabin', 'Embarked'],
        title='Survival by Age, Fare, Class and Family Size',
        color_continuous_scale=['#ff6b6b', '#4ecdc4'],
        labels={'Survived': 'Survived', 'Pclass': 'Passenger Class'},
        height=600,
    )
    scatter_fig.update_layout(title_font_size=20)
    scatter_fig.show()

    # 2. Interactive Survival Correlation Heatmap
    corr_columns = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch',
                    'Fare', 'Sex_encoded', 'Embarked_encoded', 'FamilySize']
    corr = df[corr_columns].corr()
    corr_values = corr.values

    heatmap_trace = go.Heatmap(
        z=corr_values,
        x=corr.columns,
        y=corr.columns,
        colorscale='RdBu_r',
        zmin=-1,
        zmax=1,
        # Print the rounded coefficient inside each cell
        text=np.round(corr_values, 2),
        texttemplate="%{text}",
        textfont={"size": 10},
    )
    heatmap_fig = go.Figure(data=heatmap_trace)
    heatmap_fig.update_layout(title='Feature Correlation Heatmap',
                              title_font_size=20,
                              height=600)
    heatmap_fig.show()

# Enhanced Feature Engineering
def enhance_features(df):
    """Return a copy of ``df`` with the engineered Titanic features added.

    Added columns: ``Title`` (only if absent), ``FamilySize``, ``IsAlone``,
    ``FamilyType``, ``AgeBand``, ``Deck``, ``FareBand``, ``Age*Class``,
    ``TicketPrefix``, ``TicketLength``, ``Surname`` and ``FamilySizeClass``.

    Args:
        df: DataFrame with ``Name``, ``SibSp``, ``Parch``, ``Age``,
            ``Cabin``, ``Fare``, ``Pclass`` and ``Ticket`` columns.

    Returns:
        A new DataFrame; the input is left untouched.

    Raises:
        ValueError: propagated from ``pd.qcut`` when ``Fare`` has too few
            distinct values to form five quantile bins.
    """
    # Make a copy to avoid modifying the original
    df = df.copy()

    # Reuse the shared title logic instead of keeping a second copy of the
    # regex and mapping here; extract_title mutates our private copy only.
    if 'Title' not in df.columns:
        df = extract_title(df)

    # Family size and type
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    df['FamilyType'] = pd.cut(df['FamilySize'], bins=[0, 1, 4, 20], labels=['Solo', 'Small', 'Large'])

    # Age bands (missing ages stay NaN here)
    df['AgeBand'] = pd.cut(df['Age'], bins=[0, 12, 18, 35, 60, 100],
                           labels=['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior'])

    # Extract cabin deck: first character of the cabin code, 'U' for unknown
    df['Deck'] = df['Cabin'].str.slice(0, 1).fillna('U')

    # Fare bands: quintiles of this frame's own fares (median-filled)
    df['FareBand'] = pd.qcut(df['Fare'].fillna(df['Fare'].median()), 5,
                             labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])

    # Combine Age and Class (missing ages replaced by this frame's median age)
    df['Age*Class'] = df['Age'].fillna(df['Age'].median()) * df['Pclass']

    # Extract ticket prefix: first run of letters, 'NUM' for purely numeric tickets
    df['TicketPrefix'] = df['Ticket'].str.extract('([A-Za-z]+)', expand=False).fillna('NUM')

    # Extract ticket length which may indicate social status
    df['TicketLength'] = df['Ticket'].astype(str).str.len()

    # Create surname feature: everything before the first comma in the name
    df['Surname'] = df['Name'].str.split(',').str[0]

    # Interaction terms
    df['FamilySizeClass'] = df['FamilySize'] * df['Pclass']

    return df

# Helper for the target-based features in preprocess_data
def _leave_one_out_rate(keys, target):
    """Mean of ``target`` within each ``keys`` group, excluding the row itself.

    Rows whose group has a single member (or whose key is missing) get NaN,
    because there is no other row to take the mean over.
    """
    grouped = target.groupby(keys)
    group_sum = grouped.transform('sum')
    group_count = grouped.transform('count')
    others = (group_count - 1).where(group_count > 1)
    return (group_sum - target) / others


# Improved data preprocessing function
def preprocess_data(train_df, test_df):
    """Impute, engineer and encode features for the train and test frames.

    Steps, in order:
      1. KNN-impute ``Age`` (fitted on the training frame only).
      2. Fill missing ``Embarked`` / ``Fare`` in both frames using training
         statistics.
      3. Run ``enhance_features`` so that age- and fare-derived columns are
         built from the imputed values.
      4. Label-encode the categorical columns into ``<col>_encoded`` using
         the union of train and test categories.
      5. If ``Survived`` is available, add ``TitleSurvivalRate``,
         ``DeckSurvivalRate`` and ``FamilySurvivalRate``.

    Training rows receive leave-one-out survival rates so a row's own label
    never feeds its own feature (a plain group mean would encode the label
    exactly for any passenger with a unique surname). Test rows receive the
    full training-group mean. Groups with no usable information fall back
    to the overall training survival rate.

    Args:
        train_df: training DataFrame (may or may not contain ``Survived``).
        test_df: test DataFrame with the same raw columns.

    Returns:
        Tuple ``(train_processed, test_processed)`` of new DataFrames; the
        inputs are not modified.
    """
    train_base = train_df.copy()
    test_base = test_df.copy()

    # Impute missing ages with KNN, fitting on the training set only and
    # applying the same fitted imputer to the test set. 'Age' must stay the
    # first column because column 0 of the result is read back.
    age_columns = ['Age', 'Pclass', 'SibSp', 'Parch', 'Fare']
    knn_imputer = KNNImputer(n_neighbors=5)
    train_base['Age'] = knn_imputer.fit_transform(train_base[age_columns])[:, 0]
    test_base['Age'] = knn_imputer.transform(test_base[age_columns])[:, 0]

    # Fill other missing values in BOTH frames with training statistics
    embarked_mode = train_base['Embarked'].mode()[0]
    fare_median = train_base['Fare'].median()
    for frame in (train_base, test_base):
        frame['Embarked'] = frame['Embarked'].fillna(embarked_mode)
        frame['Fare'] = frame['Fare'].fillna(fare_median)

    # Feature engineering runs after imputation so AgeBand / Age*Class agree
    # with the imputed Age instead of being derived from the raw gaps.
    train_processed = enhance_features(train_base)
    test_processed = enhance_features(test_base)

    # Encode categorical features
    for col in ['Sex', 'Embarked', 'Title', 'Deck', 'TicketPrefix', 'FamilyType', 'AgeBand', 'FareBand']:
        if col in train_processed.columns:
            train_labels = train_processed[col].astype(str)
            test_labels = test_processed[col].astype(str)

            # Fit on the combined values to ensure consistent codes in both
            # datasets and to avoid unseen-label errors on the test set
            encoder = LabelEncoder().fit(pd.concat([train_labels, test_labels]))

            train_processed[f'{col}_encoded'] = encoder.transform(train_labels)
            test_processed[f'{col}_encoded'] = encoder.transform(test_labels)

    # Add survival features (only possible when the training labels exist)
    if 'Survived' in train_df.columns:
        target = train_processed['Survived']
        overall_rate = target.mean()

        rate_features = [
            ('Title', 'TitleSurvivalRate'),
            ('Deck', 'DeckSurvivalRate'),
            ('Surname', 'FamilySurvivalRate'),
        ]
        for key_col, rate_col in rate_features:
            group_rate = target.groupby(train_processed[key_col]).mean()
            train_processed[rate_col] = (
                _leave_one_out_rate(train_processed[key_col], target).fillna(overall_rate)
            )
            # Groups unseen in training map to NaN and get the overall rate
            test_processed[rate_col] = test_processed[key_col].map(group_rate).fillna(overall_rate)

    return train_processed, test_processed

# Function for feature selection
def select_features(df, importance_threshold=0.01):
    """Return the modelling feature names that are present in ``df``.

    The candidate list is fixed; the result keeps the candidates' order and
    simply drops any name that is not a column of ``df``.

    Args:
        df: DataFrame whose columns are checked.
        importance_threshold: currently unused; accepted for interface
            compatibility.

    Returns:
        List of column names.
    """
    candidate_columns = [
        # Base features that are likely important
        'Pclass', 'Sex_encoded', 'Age', 'Fare', 'Embarked_encoded',
        'Title_encoded', 'FamilySize', 'IsAlone', 'TitleSurvivalRate',
        'DeckSurvivalRate', 'FamilySurvivalRate',
        # Engineered features
        'Age*Class', 'FamilySizeClass', 'Deck_encoded',
        'AgeBand_encoded', 'FareBand_encoded',
    ]

    # Keep only the candidates that exist in the dataframe
    present = df.columns
    return [column for column in candidate_columns if column in present]

# Function to train and evaluate multiple models
def train_evaluate_models(X_train, y_train, X_val, y_val):
    """Fit four classifiers, score them on the validation split and plot the results.

    Three figures are shown: one confusion matrix per model, the Random
    Forest feature importances (only when ``X_train`` has column names),
    and a grouped bar chart comparing accuracy/precision/recall/F1.

    Args:
        X_train: training features (a DataFrame is needed for named
            feature importances).
        y_train: training labels.
        X_val: validation features.
        y_val: validation labels. The positive class must be the label
            whose string form is ``'1'``, since precision/recall/F1 are
            read from that entry of the classification report.

    Returns:
        Tuple ``(results, models)``: ``results`` maps model name to a dict
        with ``accuracy``, ``precision``, ``recall`` and ``f1``; ``models``
        maps model name to the fitted estimator.
    """
    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000, C=0.1),
        'Random Forest': RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, max_depth=3, random_state=42),
        'SVM': SVC(probability=True, random_state=42)
    }

    results = {}
    feature_importances = None

    # Size the confusion-matrix grid from the number of models rather than
    # assuming exactly four.
    n_cols = 2
    n_rows = (len(models) + n_cols - 1) // n_cols
    cm_fig, cm_axes = plt.subplots(n_rows, n_cols, figsize=(15, 10), squeeze=False)
    flat_axes = cm_axes.ravel()

    for ax, (name, model) in zip(flat_axes, models.items()):
        # Train model
        model.fit(X_train, y_train)

        # Get predictions
        y_pred = model.predict(X_val)

        # Calculate metrics
        accuracy = accuracy_score(y_val, y_pred)
        report = classification_report(y_val, y_pred, output_dict=True)
        cm = confusion_matrix(y_val, y_pred)

        # Add to results (metrics of the positive class, keyed '1' in the report)
        positive = report['1']
        results[name] = {
            'accuracy': accuracy,
            'precision': positive['precision'],
            'recall': positive['recall'],
            'f1': positive['f1-score']
        }

        # Confusion Matrix
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
                    xticklabels=['Not Survived', 'Survived'],
                    yticklabels=['Not Survived', 'Survived'], ax=ax)
        ax.set_title(f'{name} (Accuracy: {accuracy:.3f})', fontsize=12, fontweight='bold')
        ax.set_xlabel('Predicted Label')
        ax.set_ylabel('True Label')

        # Store feature importances for Random Forest; column names are only
        # available when X_train is a DataFrame
        if name == 'Random Forest' and hasattr(model, 'feature_importances_'):
            if hasattr(X_train, 'columns'):
                feature_importances = pd.Series(model.feature_importances_, index=X_train.columns)

    # Hide any grid cell left over when the model count is odd
    for spare_ax in flat_axes[len(models):]:
        spare_ax.set_visible(False)

    cm_fig.tight_layout()
    cm_fig.suptitle('Confusion Matrices for Different Models', fontsize=16, fontweight='bold', y=1.02)
    plt.show()

    # Plot feature importances for Random Forest
    if feature_importances is not None:
        imp_fig, imp_ax = plt.subplots(figsize=(12, 8))
        feature_importances.sort_values(ascending=False).plot(kind='bar', ax=imp_ax)
        imp_ax.set_title('Feature Importances from Random Forest', fontsize=14, fontweight='bold')
        imp_ax.set_ylabel('Importance')
        imp_fig.tight_layout()
        plt.show()

    # Plot comparative metrics
    metrics = ['accuracy', 'precision', 'recall', 'f1']
    comp_data = pd.DataFrame({model_name: [results[model_name][metric] for metric in metrics]
                              for model_name in models}, index=metrics)

    # Pass the axes explicitly: DataFrame.plot() without ``ax`` opens its own
    # figure, which would leave a blank one behind and ignore this figsize.
    comp_fig, comp_ax = plt.subplots(figsize=(12, 6))
    comp_data.plot(kind='bar', ax=comp_ax, rot=0)
    comp_ax.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
    comp_ax.set_ylabel('Score')
    comp_ax.set_ylim(0, 1)
    comp_ax.legend(title='Models')
    comp_fig.tight_layout()
    plt.show()

    return results, models

# Function to create ensemble model
def create_ensemble(models):
    """Build a soft-voting ``VotingClassifier`` from a name -> estimator mapping.

    The returned ensemble is not fitted here; the caller has to call
    ``fit`` on it.
    """
    print("Creating ensemble model...")
    # VotingClassifier expects a list of (name, estimator) pairs
    named_estimators = list(models.items())
    return VotingClassifier(estimators=named_estimators, voting='soft')

# ROC curve and precision-recall curve
def plot_model_curves(models, X_val, y_val):
    """Plot ROC and precision-recall curves for every model side by side.

    Models without a ``predict_proba`` method are skipped.

    Args:
        models: mapping of model name to a fitted estimator.
        X_val: validation features.
        y_val: validation labels.

    Returns:
        None; the figure is displayed with ``plt.show()``.
    """
    # Score each model once and reuse the probabilities for both panels
    # (column 1 of predict_proba is the second class, i.e. the positive one
    # for 0/1 labels).
    positive_scores = {
        name: model.predict_proba(X_val)[:, 1]
        for name, model in models.items()
        if hasattr(model, 'predict_proba')
    }

    fig, (roc_ax, pr_ax) = plt.subplots(1, 2, figsize=(15, 6))

    for name, scores in positive_scores.items():
        # ROC curve
        fpr, tpr, _ = roc_curve(y_val, scores)
        roc_auc = auc(fpr, tpr)
        roc_ax.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.3f})')

        # Precision-Recall curve
        precision, recall, _ = precision_recall_curve(y_val, scores)
        pr_ax.plot(recall, precision, label=f'{name}')

    # Diagonal reference line for the ROC panel
    roc_ax.plot([0, 1], [0, 1], 'k--')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('ROC Curves', fontsize=14, fontweight='bold')
    roc_ax.legend(loc='lower right')

    pr_ax.set_xlabel('Recall')
    pr_ax.set_ylabel('Precision')
    pr_ax.set_title('Precision-Recall Curves', fontsize=14, fontweight='bold')
    pr_ax.legend(loc='best')

    fig.tight_layout()
    plt.show()

# Main execution flow
# Runs top to bottom: EDA figures -> preprocessing -> feature selection ->
# train/validation split and scaling -> individual models -> soft-voting
# ensemble -> submission.csv -> printed summary.
print("Starting Titanic survival analysis...")

# Run EDA visualizations on the training data with basic feature engineering
print("Generating EDA visualizations...")
train_with_features = enhance_features(train_df)
# The correlation heatmap in create_interactive_visualizations reads these
# two encoded columns, so they are added here for the EDA frame only.
train_with_features['Sex_encoded'] = LabelEncoder().fit_transform(train_with_features['Sex'])
train_with_features['Embarked_encoded'] = LabelEncoder().fit_transform(train_with_features['Embarked'].fillna('S'))
visualize_titanic_data(train_with_features)

# Interactive visualizations
print("Creating interactive visualizations...")
create_interactive_visualizations(train_with_features)

# Preprocess data
# Starts again from the raw train_df/test_df; the EDA frame above is not reused.
print("Preprocessing data...")
train_processed, test_processed = preprocess_data(train_df, test_df)

# Select features
features = select_features(train_processed)
print(f"Selected {len(features)} features: {', '.join(features)}")

# Prepare training data
X = train_processed[features]
y = train_df['Survived']
X_test = test_processed[features]

# Split data for validation
# NOTE(review): preprocess_data derives its survival-rate features from the
# whole training set before this split, so the validation scores below may
# be optimistic — confirm before relying on them.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale data - Modified to preserve DataFrame structure
# The scaler is fitted on the training split only and reused for the
# validation and test sets. The rebuilt frames get a fresh default index
# while y_train/y_val keep their original one, so rows correspond by
# position, not by index label.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Train models
print("Training multiple models...")
results, models = train_evaluate_models(X_train_scaled, y_train, X_val_scaled, y_val)

# Plot model performance curves
print("Plotting model performance curves...")
plot_model_curves(models, X_val_scaled, y_val)

# Create and train ensemble model
# The ensemble is built from the four estimators returned above and then
# fitted on the scaled training split.
ensemble = create_ensemble(models)
ensemble.fit(X_train_scaled, y_train)
ensemble_pred = ensemble.predict(X_val_scaled)
ensemble_accuracy = accuracy_score(y_val, ensemble_pred)
print(f'Ensemble Model Validation Accuracy: {ensemble_accuracy:.4f}')
print('Ensemble Classification Report:\n', classification_report(y_val, ensemble_pred))

# Make final predictions with ensemble
# The ensemble used here was fitted on the 80% training split only; it is
# not refitted on the full training set before predicting.
print("Making final predictions...")
final_predictions = ensemble.predict(X_test_scaled)

# Create submission file
# Written to the current working directory as submission.csv.
submission = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': final_predictions.astype(int)})
submission.to_csv('submission.csv', index=False)
print('Submission file created.')

# Summary of model performance
print("\nModel Performance Summary:")
for model_name, metrics in results.items():
    print(f"{model_name}: Accuracy = {metrics['accuracy']:.4f}, F1 = {metrics['f1']:.4f}")
print(f"Ensemble Model: Accuracy = {ensemble_accuracy:.4f}")