In [4]:
%pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, 
    roc_auc_score
)
from sklearn.pipeline import Pipeline

# --- Import new tools for Strategy 1 ---
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# --- 1. Load Data ---
print("Loading train.csv...")
try:
    df = pd.read_csv('train.csv')
except FileNotFoundError:
    print("Error: train.csv not found.")
    # In a real script, you might exit here
    # exit() 
else:
    print("Data loaded successfully.")

    # --- 2. Basic Cleaning & Define X/y ---
    print("Cleaning data and defining features (X) and target (y)...")
    
    cols_to_drop = ['Unnamed: 0', 'company_name', 'fyear']
    cols_to_drop_existing = [col for col in cols_to_drop if col in df.columns]
    df = df.drop(columns=cols_to_drop_existing)
    
    df = df.dropna(subset=['status_label'])

    target = 'status_label'
    y = df[target].map({'alive': 0, 'failed': 1})
    X = df.drop(columns=[target])

    # --- 3. Split Data into Training and Validation Sets ---
    print("Splitting data into 80% train and 20% validation sets...")
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, 
        test_size=0.2,
        random_state=42,
        stratify=y  # Crucial for imbalanced data
    )

    # --- 4. Define Preprocessing Pipelines ---
    print("Defining preprocessing pipelines...")
    
    numerical_features = [f'X{i}' for i in range(1, 19)]
    categorical_features = ['Division', 'MajorGroup']

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'
    )

    # -----------------------------------------------------------------
    # --- 5. Define the FULL Pipeline (Preprocessor + SMOTE + Model) ---
    # -----------------------------------------------------------------
    
    print("\n--- Building Full Pipeline with SMOTE ---")
    
    # Define the model. Note we REMOVED class_weight='balanced'
    # SMOTE will handle the imbalance instead.
    model = RandomForestClassifier(
        random_state=42,
        # class_weight='balanced', # <-- REMOVED
        n_estimators=150,      # Number of trees in the forest
        max_depth=10,          # Limits the depth of each tree
        n_jobs=-1              # Use all available CPU cores
    )
    
    # Create the imbalanced-learn pipeline
    # This chains the preprocessor, the SMOTE sampler, and the model
    # SMOTE will *only* be applied during .fit() and *only* to the training data
    full_pipeline = ImbPipeline(steps=[
        ('preprocessor', preprocessor),
        ('sampler', SMOTE(random_state=42)),
        ('model', model)
    ])

    # -----------------------------------------------------------------
    # --- 6. Train the Model ---
    # -----------------------------------------------------------------
    
    print("\n--- Training Model with SMOTE ---")
    
    # Train the entire pipeline on the *raw* training data.
    # The pipeline handles all preprocessing and sampling internally.
    full_pipeline.fit(X_train, y_train)
    
    print("Model training complete.")

    # --- 7. Evaluate the Model ---
    print("\n--- Model Evaluation on Validation Set ---")
    
    # Predict on the *raw* validation data.
    # The pipeline will automatically apply the preprocessor (but not SMOTE)
    y_pred = full_pipeline.predict(X_val)
    
    # Get prediction probabilities for the 'failed' class (class 1)
    y_pred_proba = full_pipeline.predict_proba(X_val)[:, 1]

    # --- Print Metrics ---
    
    # Accuracy
    print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
    print("--------------------------------------------------")

    # ROC-AUC Score
    print(f"ROC-AUC Score: {roc_auc_score(y_val, y_pred_proba):.4f}")
    print("--------------------------------------------------")

    # Confusion Matrix
    print("Confusion Matrix:")
    print(confusion_matrix(y_val, y_pred))
    print("--------------------------------------------------")

    # Classification Report
    print("Classification Report:")
    print(classification_report(y_val, y_pred, target_names=['alive (0)', 'failed (1)']))
    print("--------------------------------------------------")

Loading train.csv...
Data loaded successfully.
Cleaning data and defining features (X) and target (y)...
Splitting data into 80% train and 20% validation sets...
Defining preprocessing pipelines...

--- Building Full Pipeline with SMOTE ---

--- Training Model with SMOTE ---
Model training complete.

--- Model Evaluation on Validation Set ---
Accuracy: 0.7909
--------------------------------------------------
ROC-AUC Score: 0.7114
--------------------------------------------------
Confusion Matrix:
[[9555 2162]
 [ 464  377]]
--------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

   alive (0)       0.95      0.82      0.88     11717
  failed (1)       0.15      0.45      0.22       841

    accuracy                           0.79     12558
   macro avg       0.55      0.63      0.55     12558
weighted avg       0.90      0.79      0.84     12558

--------------------------------------------------


In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV # Import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline # <-- Make sure this is imported
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, 
    roc_auc_score
)

# --- Import imblearn tools ---
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# --- 1. Load Data ---
print("Loading train.csv...")
try:
    df = pd.read_csv('train.csv')
except FileNotFoundError:
    print("Error: train.csv not found.")
    # exit() 
else:
    print("Data loaded successfully.")

    # --- 2. Basic Cleaning & Define X/y ---
    print("Cleaning data and defining features (X) and target (y)...")
    
    cols_to_drop = ['Unnamed: 0', 'company_name', 'fyear']
    cols_to_drop_existing = [col for col in cols_to_drop if col in df.columns]
    df = df.drop(columns=cols_to_drop_existing)
    
    df = df.dropna(subset=['status_label'])

    target = 'status_label'
    y = df[target].map({'alive': 0, 'failed': 1})
    X = df.drop(columns=[target])

    # --- 3. Split Data into Training and Validation Sets ---
    print("Splitting data into 80% train and 20% validation sets...")
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, 
        test_size=0.2,
        random_state=42,
        stratify=y 
    )

    # --- 4. Define Preprocessing Pipelines ---
    print("Defining preprocessing pipelines...")
    
    numerical_features = [f'X{i}' for i in range(1, 19)]
    categorical_features = ['Division', 'MajorGroup']

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'
    )

    # -----------------------------------------------------------------
    # --- 5. Define the FULL Pipeline and Parameter Grid ---
    # -----------------------------------------------------------------
    
    print("\n--- Building Full Pipeline for GridSearchCV ---")
    
    # Define the model (we'll set params in the grid)
    model = RandomForestClassifier(random_state=42, n_jobs=-1)
    
    # Create the full pipeline
    full_pipeline = ImbPipeline(steps=[
        ('preprocessor', preprocessor),
        ('sampler', SMOTE(random_state=42)),
        ('model', model)
    ])

    # --- Define the Parameter Grid ---
    # We prefix parameters with 'model__' to tell the pipeline
    # to apply them to the 'model' step.
    # NOTE: This is a small grid to run quickly. 
    # For a real search, you'd try more values.
    param_grid = {
        'model__n_estimators': [100, 200],      # Number of trees
        'model__max_depth': [10, 20],           # Max depth of trees
        'model__min_samples_leaf': [1, 5]       # Min samples at a leaf node
        # 'sampler__k_neighbors': [3, 5]        # You can even tune SMOTE
    }

    # -----------------------------------------------------------------
    # --- 6. Set up and Run GridSearchCV ---
    # -----------------------------------------------------------------

    print("\n--- Starting Hyperparameter Tuning (GridSearchCV) ---")
    
    # Set up the Grid Search
    # We optimize for 'f1_macro' (average F1 of both classes)
    # cv=3 is a 3-fold cross-validation.
    grid_search = GridSearchCV(
        estimator=full_pipeline, 
        param_grid=param_grid, 
        scoring='f1_macro', # <-- Optimize for F1-score!
        cv=3, 
        n_jobs=-1,      # Use all cores (can be slow)
        verbose=2       # Shows progress
    )
    
    # Train the grid search
    grid_search.fit(X_train, y_train)

    print("\nTuning complete.")
    print(f"Best parameters found: {grid_search.best_params_}")
    print(f"Best cross-validation F1-macro score: {grid_search.best_score_:.4f}")

    # -----------------------------------------------------------------
    # --- 7. Evaluate the BEST Model ---
    # -----------------------------------------------------------------

    print("\n--- Model Evaluation of Best Model on Validation Set ---")
    
    # 'grid_search' object now contains the best model found
    best_model = grid_search.best_estimator_ 

    # Predict using the best model
    y_pred = best_model.predict(X_val)
    y_pred_proba = best_model.predict_proba(X_val)[:, 1]

    # --- Print Metrics ---
    
    print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
    print("--------------------------------------------------")
    print(f"ROC-AUC Score: {roc_auc_score(y_val, y_pred_proba):.4f}")
    print("--------------------------------------------------")
    print("Confusion Matrix:")
    print(confusion_matrix(y_val, y_pred))
    print("--------------------------------------------------")
    print("Classification Report:")
    print(classification_report(y_val, y_pred, target_names=['alive (0)', 'failed (1)']))
    print("--------------------------------------------------")

Loading train.csv...
Data loaded successfully.
Cleaning data and defining features (X) and target (y)...
Splitting data into 80% train and 20% validation sets...
Defining preprocessing pipelines...

--- Building Full Pipeline for GridSearchCV ---

--- Starting Hyperparameter Tuning (GridSearchCV) ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits

Tuning complete.
Best parameters found: {'model__max_depth': 20, 'model__min_samples_leaf': 1, 'model__n_estimators': 100}
Best cross-validation F1-macro score: 0.5852

--- Model Evaluation of Best Model on Validation Set ---
Accuracy: 0.8824
--------------------------------------------------
ROC-AUC Score: 0.7680
--------------------------------------------------
Confusion Matrix:
[[10792   925]
 [  552   289]]
--------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

   alive (0)       0.95      0.92      0.94     11717
  failed (1)       0.24      0.34     

In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV # Import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline # <-- We use the standard pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, 
    roc_auc_score
)

# --- Import the new model ---
import xgboost as xgb

# --- 1. Load Data ---
print("Loading train.csv...")
try:
    df = pd.read_csv('train.csv')
except FileNotFoundError:
    print("Error: train.csv not found.")
    # exit() 
else:
    print("Data loaded successfully.")

    # --- 2. Basic Cleaning & Define X/y ---
    print("Cleaning data and defining features (X) and target (y)...")
    
    cols_to_drop = ['Unnamed: 0', 'company_name', 'fyear']
    cols_to_drop_existing = [col for col in cols_to_drop if col in df.columns]
    df = df.drop(columns=cols_to_drop_existing)
    
    df = df.dropna(subset=['status_label'])

    target = 'status_label'
    y = df[target].map({'alive': 0, 'failed': 1})
    X = df.drop(columns=[target])

    # --- 3. Split Data into Training and Validation Sets ---
    print("Splitting data into 80% train and 20% validation sets...")
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, 
        test_size=0.2,
        random_state=42,
        stratify=y 
    )

    # --- 4. Define Preprocessing Pipelines ---
    print("Defining preprocessing pipelines...")
    
    numerical_features = [f'X{i}' for i in range(1, 19)]
    categorical_features = ['Division', 'MajorGroup']

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'
    )

    # -----------------------------------------------------------------
    # --- 5. Calculate scale_pos_weight & Define XGBoost Pipeline ---
    # -----------------------------------------------------------------
    
    # Calculate scale_pos_weight for XGBoost
    # This is the ratio of (count of 'alive') / (count of 'failed')
    # We calculate it *only* on the y_train set
    scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]
    print(f"\nCalculated scale_pos_weight for XGBoost: {scale_pos_weight:.2f}")

    # Define the XGBoost model, passing in the imbalance parameter
    model_xgb = xgb.XGBClassifier(
        random_state=42,
        scale_pos_weight=scale_pos_weight, # <-- This handles imbalance
        n_jobs=-1,
        eval_metric='logloss' # Suppresses a warning
    )
    
    # Create a *standard* sklearn pipeline (NO SMOTE)
    full_pipeline_xgb = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_xgb)
    ])

    # --- Define the Parameter Grid for XGBoost ---
    # Note: 'learning_rate' is a key parameter for boosting models
    param_grid_xgb = {
        'model__n_estimators': [100, 200],      # Number of trees
        'model__max_depth': [5, 10],            # Max depth of trees
        'model__learning_rate': [0.1, 0.05]     # Step size shrinkage
    }

    # -----------------------------------------------------------------
    # --- 6. Set up and Run GridSearchCV for XGBoost ---
    # -----------------------------------------------------------------

    print("\n--- Starting Hyperparameter Tuning for XGBoost ---")
    
    grid_search_xgb = GridSearchCV(
        estimator=full_pipeline_xgb, 
        param_grid=param_grid_xgb, 
        scoring='f1_macro', # <-- Still optimizing for F1
        cv=3, 
        n_jobs=-1,
        verbose=2
    )
    
    # Train the grid search on the raw training data
    grid_search_xgb.fit(X_train, y_train)

    print("\nTuning complete.")
    print(f"Best XGBoost parameters found: {grid_search_xgb.best_params_}")
    print(f"Best cross-validation F1-macro score: {grid_search_xgb.best_score_:.4f}")

    # -----------------------------------------------------------------
    # --- 7. Evaluate the BEST XGBoost Model ---
    # -----------------------------------------------------------------

    print("\n--- Model Evaluation of Best XGBoost Model on Validation Set ---")
    
    best_model_xgb = grid_search_xgb.best_estimator_ 

    # Predict using the best model
    y_pred = best_model_xgb.predict(X_val)
    y_pred_proba = best_model_xgb.predict_proba(X_val)[:, 1]

    # --- Print Metrics ---
    
    print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
    print("--------------------------------------------------")
    print(f"ROC-AUC Score: {roc_auc_score(y_val, y_pred_proba):.4f}")
    print("--------------------------------------------------")
    print("Confusion Matrix:")
    print(confusion_matrix(y_val, y_pred))
    print("--------------------------------------------------")
    print("Classification Report:")
    print(classification_report(y_val, y_pred, target_names=['alive (0)', 'failed (1)']))
    print("--------------------------------------------------")

Loading train.csv...
Data loaded successfully.
Cleaning data and defining features (X) and target (y)...
Splitting data into 80% train and 20% validation sets...
Defining preprocessing pipelines...

Calculated scale_pos_weight for XGBoost: 13.94

--- Starting Hyperparameter Tuning for XGBoost ---
Fitting 3 folds for each of 8 candidates, totalling 24 fits

Tuning complete.
Best XGBoost parameters found: {'model__learning_rate': 0.1, 'model__max_depth': 10, 'model__n_estimators': 200}
Best cross-validation F1-macro score: 0.6673

--- Model Evaluation of Best XGBoost Model on Validation Set ---
Accuracy: 0.9137
--------------------------------------------------
ROC-AUC Score: 0.8466
--------------------------------------------------
Confusion Matrix:
[[11077   640]
 [  444   397]]
--------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

   alive (0)       0.96      0.95      0.95     11717
  failed (1)       0.38    

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, 
    roc_auc_score,
    precision_recall_curve,  # <-- For threshold tuning
    f1_score                 # <-- For threshold tuning
)

# --- Import XGBoost ---
import xgboost as xgb

# --- 1. Load Data ---
print("Loading train.csv...")
try:
    df = pd.read_csv('train.csv')
except FileNotFoundError:
    print("Error: train.csv not found.")
    # exit() 
else:
    print("Data loaded successfully.")

    # --- 2. Basic Cleaning & Define X/y ---
    print("Cleaning data and defining features (X) and target (y)...")
    
    cols_to_drop = ['Unnamed: 0', 'company_name', 'fyear']
    cols_to_drop_existing = [col for col in cols_to_drop if col in df.columns]
    df = df.drop(columns=cols_to_drop_existing)
    
    df = df.dropna(subset=['status_label'])

    target = 'status_label'
    y = df[target].map({'alive': 0, 'failed': 1})
    X = df.drop(columns=[target])

    # --- 3. Split Data into Training and Validation Sets ---
    print("Splitting data into 80% train and 20% validation sets...")
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, 
        test_size=0.2,
        random_state=42,
        stratify=y 
    )

    # --- 4. Define Preprocessing Pipelines ---
    print("Defining preprocessing pipelines...")
    
    numerical_features = [f'X{i}' for i in range(1, 19)]
    categorical_features = ['Division', 'MajorGroup']

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'
    )

    # -----------------------------------------------------------------
    # --- 5. Define and Train the BEST XGBoost Model ---
    # -----------------------------------------------------------------
    
    # Calculate scale_pos_weight
    scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]
    print(f"\nCalculated scale_pos_weight for XGBoost: {scale_pos_weight:.2f}")

    # Define the XGBoost model with the BEST parameters from Strategy 3
    print("Defining best XGBoost model...")
    best_model_xgb = xgb.XGBClassifier(
        random_state=42,
        scale_pos_weight=scale_pos_weight,
        n_jobs=-1,
        eval_metric='logloss',
        learning_rate=0.1,  # <-- Best param
        max_depth=10,       # <-- Best param
        n_estimators=200    # <-- Best param
    )
    
    # Create the *standard* sklearn pipeline
    full_pipeline_xgb = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', best_model_xgb)
    ])

    # -----------------------------------------------------------------
    # --- 6. Train the Best Model ---
    # -----------------------------------------------------------------

    print("\n--- Training Best XGBoost Model ---")
    full_pipeline_xgb.fit(X_train, y_train)
    print("Training complete.")

    # -----------------------------------------------------------------
    # --- 7. Evaluation with Default Threshold (0.5) ---
    # -----------------------------------------------------------------

    print("\n--- Model Evaluation with Default 0.5 Threshold ---")
    
    # Get standard predictions and probabilities
    y_pred_default = full_pipeline_xgb.predict(X_val)
    y_pred_proba = full_pipeline_xgb.predict_proba(X_val)[:, 1]

    print(f"Accuracy: {accuracy_score(y_val, y_pred_default):.4f}")
    print(f"ROC-AUC Score: {roc_auc_score(y_val, y_pred_proba):.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_val, y_pred_default))
    print("Classification Report (Default Threshold):")
    print(classification_report(y_val, y_pred_default, target_names=['alive (0)', 'failed (1)']))
    print("--------------------------------------------------")

    # -----------------------------------------------------------------
    # --- STRATEGY 4: Find and Apply Optimal Threshold ---
    # -----------------------------------------------------------------

    print("\n--- Strategy 4: Finding Optimal Threshold ---")

    # We already have the probabilities from the step above (y_pred_proba)
    
    # Calculate precision, recall, and thresholds
    precision, recall, thresholds = precision_recall_curve(y_val, y_pred_proba)

    # Calculate F1-score for all thresholds
    # We add a small epsilon (1e-9) to avoid division by zero
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-9)
    
    # Find the threshold that gives the maximum F1-score
    # Note: thresholds array is 1 shorter than precision/recall,
    # so we index f1_scores up to the length of thresholds.
    best_f1_idx = np.argmax(f1_scores[:-1]) # Get index of best F1
    best_threshold = thresholds[best_f1_idx]
    best_f1 = f1_scores[best_f1_idx]

    print(f"Best F1-Score found: {best_f1:.4f}")
    print(f"Optimal Threshold: {best_threshold:.4f}")

    # --- Evaluate with the NEW threshold ---
    print("\n--- Model Evaluation with Optimal Threshold ---")

    # Get new predictions based on the optimal threshold
    y_pred_optimal = (y_pred_proba >= best_threshold).astype(int)

    print(f"Accuracy: {accuracy_score(y_val, y_pred_optimal):.4f}")
    print("--------------------------------------------------")
    print("Confusion Matrix (Optimal Threshold):")
    print(confusion_matrix(y_val, y_pred_optimal))
    print("--------------------------------------------------")
    print("Classification Report (Optimal Threshold):")
    print(classification_report(y_val, y_pred_optimal, target_names=['alive (0)', 'failed (1)']))
    print("--------------------------------------------------")

Loading train.csv...
Data loaded successfully.
Cleaning data and defining features (X) and target (y)...
Splitting data into 80% train and 20% validation sets...
Defining preprocessing pipelines...

Calculated scale_pos_weight for XGBoost: 13.94
Defining best XGBoost model...

--- Training Best XGBoost Model ---
Training complete.

--- Model Evaluation with Default 0.5 Threshold ---
Accuracy: 0.9137
ROC-AUC Score: 0.8466
Confusion Matrix:
[[11077   640]
 [  444   397]]
Classification Report (Default Threshold):
              precision    recall  f1-score   support

   alive (0)       0.96      0.95      0.95     11717
  failed (1)       0.38      0.47      0.42       841

    accuracy                           0.91     12558
   macro avg       0.67      0.71      0.69     12558
weighted avg       0.92      0.91      0.92     12558

--------------------------------------------------

--- Strategy 4: Finding Optimal Threshold ---
Best F1-Score found: 0.4285
Optimal Threshold: 0.5278

---

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV # Import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, 
    roc_auc_score,
    precision_recall_curve, # For final threshold tuning
    f1_score                # For final threshold tuning
)

# --- Import the new model ---
import xgboost as xgb

# --- 1. Load Data ---
print("Loading train.csv...")
try:
    df = pd.read_csv('train.csv')
except FileNotFoundError:
    print("Error: train.csv not found.")
    # exit() 
else:
    print("Data loaded successfully.")

    # --- 2. Basic Cleaning & Define X/y ---
    print("Cleaning data and defining features (X) and target (y)...")
    
    cols_to_drop = ['Unnamed: 0', 'company_name', 'fyear']
    cols_to_drop_existing = [col for col in cols_to_drop if col in df.columns]
    df = df.drop(columns=cols_to_drop_existing)
    
    df = df.dropna(subset=['status_label'])

    target = 'status_label'
    y = df[target].map({'alive': 0, 'failed': 1})
    X = df.drop(columns=[target])

    # --- 3. Split Data into Training and Validation Sets ---
    print("Splitting data into 80% train and 20% validation sets...")
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, 
        test_size=0.2,
        random_state=42,
        stratify=y 
    )

    # --- 4. Define Preprocessing Pipelines ---
    print("Defining preprocessing pipelines...")
    
    numerical_features = [f'X{i}' for i in range(1, 19)]
    categorical_features = ['Division', 'MajorGroup']

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'
    )

    # -----------------------------------------------------------------
    # --- 5. Calculate scale_pos_weight & Define XGBoost Pipeline ---
    # -----------------------------------------------------------------
    
    # Calculate scale_pos_weight for XGBoost
    scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]
    print(f"\nCalculated scale_pos_weight for XGBoost: {scale_pos_weight:.2f}")

    # Define the XGBoost model
    model_xgb = xgb.XGBClassifier(
        random_state=42,
        scale_pos_weight=scale_pos_weight, # <-- Handles imbalance
        n_jobs=-1,
        eval_metric='logloss' 
    )
    
    # Create a *standard* sklearn pipeline (NO SMOTE)
    full_pipeline_xgb = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_xgb)
    ])

    # --- Define the EXPANDED Parameter Grid for XGBoost ---
    # This grid is more focused based on our last results.
    param_grid_xgb_expanded = {
        'model__n_estimators': [200, 300],      # Test higher estimators
        'model__max_depth': [8, 10, 12],        # Explore around the previous best (10)
        'model__learning_rate': [0.1, 0.05],    # 0.1 was good, 0.05 is also common
        'model__min_child_weight': [1, 5]       # New param to control overfitting
    }
    # Total fits: 2 * 3 * 2 * 2 = 24 candidates. With cv=3, this is 72 model fits.

    # -----------------------------------------------------------------
    # --- 6. Set up and Run EXPANDED GridSearchCV for XGBoost ---
    # -----------------------------------------------------------------

    print("\n--- Starting EXPANDED Hyperparameter Tuning for XGBoost (Strategy 5) ---")
    
    grid_search_xgb = GridSearchCV(
        estimator=full_pipeline_xgb, 
        param_grid=param_grid_xgb_expanded, 
        scoring='f1_macro', # <-- Still optimizing for F1
        cv=3, 
        n_jobs=-1,
        verbose=2
    )
    
    # Train the grid search on the raw training data
    grid_search_xgb.fit(X_train, y_train)

    print("\nTuning complete.")
    print(f"Best XGBoost parameters found: {grid_search_xgb.best_params_}")
    print(f"Best cross-validation F1-macro score: {grid_search_xgb.best_score_:.4f}")

    # -----------------------------------------------------------------
    # --- 7. Evaluate the NEW BEST XGBoost Model ---
    # -----------------------------------------------------------------

    print("\n--- Model Evaluation of NEW Best XGBoost Model (Default 0.5 Threshold) ---")
    
    best_model_xgb = grid_search_xgb.best_estimator_ 

    # Predict using the new best model
    y_pred_default = best_model_xgb.predict(X_val)
    y_pred_proba = best_model_xgb.predict_proba(X_val)[:, 1]

    # --- Print Metrics (Default Threshold) ---
    print(f"Accuracy: {accuracy_score(y_val, y_pred_default):.4f}")
    print(f"ROC-AUC Score: {roc_auc_score(y_val, y_pred_proba):.4f}")
    print("Classification Report (Default Threshold):")
    print(classification_report(y_val, y_pred_default, target_names=['alive (0)', 'failed (1)']))
    print("--------------------------------------------------")

    # -----------------------------------------------------------------
    # --- 8. Apply Optimal Threshold Tuning to the NEW Best Model ---
    # -----------------------------------------------------------------
    
    print("\n--- Applying Optimal Threshold Tuning to NEW Best Model ---")

    precision, recall, thresholds = precision_recall_curve(y_val, y_pred_proba)
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-9)
    
    best_f1_idx = np.argmax(f1_scores[:-1])
    best_threshold = thresholds[best_f1_idx]
    best_f1 = f1_scores[best_f1_idx]

    print(f"Best F1-Score found: {best_f1:.4f}")
    print(f"Optimal Threshold: {best_threshold:.4f}")

    # --- Evaluate with the NEW threshold ---
    print("\n--- Final Model Evaluation (New Model + Optimal Threshold) ---")

    y_pred_optimal = (y_pred_proba >= best_threshold).astype(int)

    print(f"Accuracy: {accuracy_score(y_val, y_pred_optimal):.4f}")
    print("--------------------------------------------------")
    print("Confusion Matrix (Optimal Threshold):")
    print(confusion_matrix(y_val, y_pred_optimal))
    print("--------------------------------------------------")
    print("Classification Report (Optimal Threshold):")
    print(classification_report(y_val, y_pred_optimal, target_names=['alive (0)', 'failed (1)']))
    print("--------------------------------------------------")

Loading train.csv...
Data loaded successfully.
Cleaning data and defining features (X) and target (y)...
Splitting data into 80% train and 20% validation sets...
Defining preprocessing pipelines...

Calculated scale_pos_weight for XGBoost: 13.94

--- Starting EXPANDED Hyperparameter Tuning for XGBoost (Strategy 5) ---
Fitting 3 folds for each of 24 candidates, totalling 72 fits

Tuning complete.
Best XGBoost parameters found: {'model__learning_rate': 0.1, 'model__max_depth': 10, 'model__min_child_weight': 5, 'model__n_estimators': 300}
Best cross-validation F1-macro score: 0.6735

--- Model Evaluation of NEW Best XGBoost Model (Default 0.5 Threshold) ---
Accuracy: 0.9274
ROC-AUC Score: 0.8572
Classification Report (Default Threshold):
              precision    recall  f1-score   support

   alive (0)       0.96      0.96      0.96     11717
  failed (1)       0.45      0.42      0.43       841

    accuracy                           0.93     12558
   macro avg       0.71      0.69    

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, 
    roc_auc_score,
    precision_recall_curve,
    f1_score
)

# --- Import the new model ---
import lightgbm as lgb

# --- 1. Load Data ---
print("Loading train.csv...")
try:
    df = pd.read_csv('train.csv')
except FileNotFoundError:
    print("Error: train.csv not found.")
    # exit() 
else:
    print("Data loaded successfully.")

    # --- 2. Basic Cleaning & Define X/y ---
    print("Cleaning data and defining features (X) and target (y)...")
    
    cols_to_drop = ['Unnamed: 0', 'company_name', 'fyear']
    cols_to_drop_existing = [col for col in cols_to_drop if col in df.columns]
    df = df.drop(columns=cols_to_drop_existing)
    
    df = df.dropna(subset=['status_label'])

    target = 'status_label'
    y = df[target].map({'alive': 0, 'failed': 1})
    X = df.drop(columns=[target])

    # --- 3. Split Data into Training and Validation Sets ---
    print("Splitting data into 80% train and 20% validation sets...")
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, 
        test_size=0.2,
        random_state=42,
        stratify=y 
    )

    # --- 4. Define Preprocessing Pipelines ---
    print("Defining preprocessing pipelines...")
    
    numerical_features = [f'X{i}' for i in range(1, 19)]
    categorical_features = ['Division', 'MajorGroup']

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'
    )

    # -----------------------------------------------------------------
    # --- 5. Define LightGBM Pipeline ---
    # -----------------------------------------------------------------
    
    print("\nDefining LightGBM model pipeline...")
    
    # Define the LightGBM model
    # We use class_weight='balanced' to handle imbalance
    model_lgb = lgb.LGBMClassifier(
        random_state=42,
        class_weight='balanced', # <-- This handles imbalance
        n_jobs=-1,
        verbose=-1 # Suppresses LightGBM's internal warnings
    )
    
    # Create a *standard* sklearn pipeline
    full_pipeline_lgb = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_lgb)
    ])

    # --- Define the Parameter Grid for LightGBM ---
    # Note: 'num_leaves' is a key parameter for LGBM, often
    # tuned instead of 'max_depth'.
    param_grid_lgb = {
        'model__n_estimators': [200, 300],
        'model__learning_rate': [0.1, 0.05],
        'model__num_leaves': [31, 50, 70],      # Default is 31
        'model__min_child_samples': [10, 20]  # Equivalent to min_child_weight
    }
    # Total fits: 2 * 2 * 3 * 2 = 24 candidates
    print(f"Grid search will test {2*2*3*2} candidates.")

    # -----------------------------------------------------------------
    # --- 6. Set up and Run GridSearchCV for LightGBM ---
    # -----------------------------------------------------------------

    print("\n--- Starting Hyperparameter Tuning for LightGBM ---")
    
    grid_search_lgb = GridSearchCV(
        estimator=full_pipeline_lgb, 
        param_grid=param_grid_lgb, 
        scoring='f1_macro', # <-- Still optimizing for F1
        cv=3, 
        n_jobs=-1,
        verbose=2
    )
    
    # Train the grid search on the raw training data
    grid_search_lgb.fit(X_train, y_train)

    print("\nTuning complete.")
    print(f"Best LightGBM parameters found: {grid_search_lgb.best_params_}")
    print(f"Best cross-validation F1-macro score: {grid_search_lgb.best_score_:.4f}")

    # -----------------------------------------------------------------
    # --- 7. Evaluate the BEST LightGBM Model ---
    # -----------------------------------------------------------------

    print("\n--- Model Evaluation of Best LightGBM Model (Default 0.5 Threshold) ---")
    
    best_model_lgb = grid_search_lgb.best_estimator_ 

    # Predict using the new best model
    y_pred_default = best_model_lgb.predict(X_val)
    y_pred_proba = best_model_lgb.predict_proba(X_val)[:, 1]

    # --- Print Metrics (Default Threshold) ---
    print(f"Accuracy: {accuracy_score(y_val, y_pred_default):.4f}")
    print(f"ROC-AUC Score: {roc_auc_score(y_val, y_pred_proba):.4f}")
    print("Classification Report (Default Threshold):")
    print(classification_report(y_val, y_pred_default, target_names=['alive (0)', 'failed (1)']))
    print("--------------------------------------------------")

    # -----------------------------------------------------------------
    # --- 8. Apply Optimal Threshold Tuning to the NEW Best Model ---
    # -----------------------------------------------------------------
    
    print("\n--- Applying Optimal Threshold Tuning to Best LightGBM Model ---")

    precision, recall, thresholds = precision_recall_curve(y_val, y_pred_proba)
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-9)
    
    best_f1_idx = np.argmax(f1_scores[:-1])
    best_threshold = thresholds[best_f1_idx]
    best_f1 = f1_scores[best_f1_idx]

    print(f"Best F1-Score found: {best_f1:.4f}")
    print(f"Optimal Threshold: {best_threshold:.4f}")

    # --- Evaluate with the NEW threshold ---
    print("\n--- Final Model Evaluation (LGBM + Optimal Threshold) ---")

    y_pred_optimal = (y_pred_proba >= best_threshold).astype(int)

    print(f"Accuracy: {accuracy_score(y_val, y_pred_optimal):.4f}")
    print("--------------------------------------------------")
    print("Confusion Matrix (Optimal Threshold):")
    print(confusion_matrix(y_val, y_pred_optimal))
    print("--------------------------------------------------")
    print("Classification Report (Optimal Threshold):")
    print(classification_report(y_val, y_pred_optimal, target_names=['alive (0)', 'failed (1)']))
    print("--------------------------------------------------")

Loading train.csv...
Data loaded successfully.
Cleaning data and defining features (X) and target (y)...
Splitting data into 80% train and 20% validation sets...
Defining preprocessing pipelines...

Defining LightGBM model pipeline...
Grid search will test 24 candidates.

--- Starting Hyperparameter Tuning for LightGBM ---
Fitting 3 folds for each of 24 candidates, totalling 72 fits

Tuning complete.
Best LightGBM parameters found: {'model__learning_rate': 0.1, 'model__min_child_samples': 10, 'model__n_estimators': 300, 'model__num_leaves': 70}
Best cross-validation F1-macro score: 0.6716

--- Model Evaluation of Best LightGBM Model (Default 0.5 Threshold) ---
Accuracy: 0.9155
ROC-AUC Score: 0.8613
Classification Report (Default Threshold):
              precision    recall  f1-score   support

   alive (0)       0.96      0.95      0.95     11717
  failed (1)       0.39      0.44      0.41       841

    accuracy                           0.92     12558
   macro avg       0.67      0.



In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, 
    roc_auc_score,
    precision_recall_curve,
    f1_score
)

# --- Import Ensemble Tools ---
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression # Our "meta-model"

# --- Import Base Models ---
import xgboost as xgb
import lightgbm as lgb

# --- 1. Load Data ---
print("Loading train.csv...")
try:
    df = pd.read_csv('train.csv')
except FileNotFoundError:
    print("Error: train.csv not found.")
    # exit() 
else:
    print("Data loaded successfully.")

    # --- 2. Basic Cleaning & Define X/y ---
    print("Cleaning data and defining features (X) and target (y)...")
    
    cols_to_drop = ['Unnamed: 0', 'company_name', 'fyear']
    cols_to_drop_existing = [col for col in cols_to_drop if col in df.columns]
    df = df.drop(columns=cols_to_drop_existing)
    
    df = df.dropna(subset=['status_label'])

    target = 'status_label'
    y = df[target].map({'alive': 0, 'failed': 1})
    X = df.drop(columns=[target])

    # --- 3. Split Data into Training and Validation Sets ---
    print("Splitting data into 80% train and 20% validation sets...")
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, 
        test_size=0.2,
        random_state=42,
        stratify=y 
    )

    # --- 4. Define Preprocessing Pipelines ---
    print("Defining preprocessing pipelines...")
    
    numerical_features = [f'X{i}' for i in range(1, 19)]
    categorical_features = ['Division', 'MajorGroup']

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'
    )
    
    # --- 5. Calculate scale_pos_weight for XGBoost ---
    scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]
    print(f"\nCalculated scale_pos_weight for XGBoost: {scale_pos_weight:.2f}")


    # -----------------------------------------------------------------
    # --- 6. Define the Stacking Classifier Pipeline ---
    # -----------------------------------------------------------------
    
    print("\nDefining Stacking Classifier pipeline...")
    
    # --- Define our best models with their optimal parameters ---
    
    best_model_xgb = xgb.XGBClassifier(
        random_state=42,
        scale_pos_weight=scale_pos_weight,
        learning_rate=0.1,
        max_depth=10,
        min_child_weight=5,
        n_estimators=300,
        n_jobs=-1,
        eval_metric='logloss'
    )

    best_model_lgb = lgb.LGBMClassifier(
        random_state=42,
        class_weight='balanced',
        learning_rate=0.1,
        min_child_samples=10,
        n_estimators=300,
        num_leaves=70,
        n_jobs=-1,
        verbose=-1
    )
    
    # --- Create the list of base estimators ---
    estimators = [
        ('xgb', best_model_xgb),
        ('lgbm', best_model_lgb)
    ]
    
    # --- Create the Stacking Classifier ---
    # It will use Logistic Regression to combine the outputs
    stacking_model = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(),
        cv=3, # Cross-validation for training the meta-model
        n_jobs=-1
    )
    
    # --- Create the FULL pipeline (Preprocessor + Stacker) ---
    full_pipeline_stack = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('stacker', stacking_model)
    ])

    # -----------------------------------------------------------------
    # --- 7. Train the Stacking Classifier ---
    # -----------------------------------------------------------------
    
    print("\n--- Training Stacking Classifier ---")
    # This will take some time as it's training multiple models
    
    full_pipeline_stack.fit(X_train, y_train)
    
    print("Training complete.")

    # -----------------------------------------------------------------
    # --- 8. Evaluate the Stacking Classifier ---
    # -----------------------------------------------------------------

    print("\n--- Model Evaluation of Stacking Model (Default 0.5 Threshold) ---")
    
    y_pred_default = full_pipeline_stack.predict(X_val)
    y_pred_proba = full_pipeline_stack.predict_proba(X_val)[:, 1]

    # --- Print Metrics (Default Threshold) ---
    print(f"Accuracy: {accuracy_score(y_val, y_pred_default):.4f}")
    print(f"ROC-AUC Score: {roc_auc_score(y_val, y_pred_proba):.4f}")
    print("Classification Report (Default Threshold):")
    print(classification_report(y_val, y_pred_default, target_names=['alive (0)', 'failed (1)']))
    print("--------------------------------------------------")

    # -----------------------------------------------------------------
    # --- 9. Apply Optimal Threshold Tuning to the Stacking Model ---
    # -----------------------------------------------------------------
    
    print("\n--- Applying Optimal Threshold Tuning to Stacking Model ---")

    precision, recall, thresholds = precision_recall_curve(y_val, y_pred_proba)
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-9)
    
    best_f1_idx = np.argmax(f1_scores[:-1])
    best_threshold = thresholds[best_f1_idx]
    best_f1 = f1_scores[best_f1_idx]

    print(f"Best F1-Score found: {best_f1:.4f}")
    print(f"Optimal Threshold: {best_threshold:.4f}")

    # --- Evaluate with the NEW threshold ---
    print("\n--- Final Model Evaluation (Stacking Model + Optimal Threshold) ---")

    y_pred_optimal = (y_pred_proba >= best_threshold).astype(int)

    print(f"Accuracy: {accuracy_score(y_val, y_pred_optimal):.4f}")
    print("--------------------------------------------------")
    print("Confusion Matrix (Optimal Threshold):")
    print(confusion_matrix(y_val, y_pred_optimal))
    print("--------------------------------------------------")
    print("Classification Report (Optimal Threshold):")
    print(classification_report(y_val, y_pred_optimal, target_names=['alive (0)', 'failed (1)']))
    print("--------------------------------------------------")

Loading train.csv...
Data loaded successfully.
Cleaning data and defining features (X) and target (y)...
Splitting data into 80% train and 20% validation sets...
Defining preprocessing pipelines...

Calculated scale_pos_weight for XGBoost: 13.94

Defining Stacking Classifier pipeline...

--- Training Stacking Classifier ---
Training complete.

--- Model Evaluation of Stacking Model (Default 0.5 Threshold) ---
Accuracy: 0.9398
ROC-AUC Score: 0.8683
Classification Report (Default Threshold):
              precision    recall  f1-score   support

   alive (0)       0.95      0.98      0.97     11717
  failed (1)       0.59      0.32      0.42       841

    accuracy                           0.94     12558
   macro avg       0.77      0.65      0.69     12558
weighted avg       0.93      0.94      0.93     12558

--------------------------------------------------

--- Applying Optimal Threshold Tuning to Stacking Model ---
Best F1-Score found: 0.4501
Optimal Threshold: 0.2858

--- Final M



In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, 
    roc_auc_score,
    precision_recall_curve,
    f1_score
)

# --- Import Ensemble Tools ---
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression # Our "meta-model"

# --- Import Base Models ---
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier # <-- 1. Import CatBoost

# --- 1. Load Data ---
print("Loading train.csv...")
try:
    df = pd.read_csv('train.csv')
except FileNotFoundError:
    print("Error: train.csv not found.")
    # exit() 
else:
    print("Data loaded successfully.")

    # --- 2. Basic Cleaning & Define X/y ---
    print("Cleaning data and defining features (X) and target (y)...")
    
    cols_to_drop = ['Unnamed: 0', 'company_name', 'fyear']
    cols_to_drop_existing = [col for col in cols_to_drop if col in df.columns]
    df = df.drop(columns=cols_to_drop_existing)
    
    df = df.dropna(subset=['status_label'])

    target = 'status_label'
    y = df[target].map({'alive': 0, 'failed': 1})
    X = df.drop(columns=[target])

    # --- 3. Split Data into Training and Validation Sets ---
    print("Splitting data into 80% train and 20% validation sets...")
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, 
        test_size=0.2,
        random_state=42,
        stratify=y 
    )

    # --- 4. Define Preprocessing Pipelines ---
    print("Defining preprocessing pipelines...")
    
    numerical_features = [f'X{i}' for i in range(1, 19)]
    categorical_features = ['Division', 'MajorGroup']

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'
    )
    
    # --- 5. Calculate scale_pos_weight for XGBoost ---
    scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]
    print(f"\nCalculated scale_pos_weight for XGBoost: {scale_pos_weight:.2f}")


    # -----------------------------------------------------------------
    # --- 6. Define the Stacking Classifier Pipeline ---
    # -----------------------------------------------------------------
    
    print("\nDefining Stacking Classifier pipeline with XGB, LGBM, and CatBoost...")
    
    # --- Define our best models with their optimal parameters ---
    
    best_model_xgb = xgb.XGBClassifier(
        random_state=42,
        scale_pos_weight=scale_pos_weight,
        learning_rate=0.1,
        max_depth=10,
        min_child_weight=5,
        n_estimators=300,
        n_jobs=-1,
        eval_metric='logloss'
    )

    best_model_lgb = lgb.LGBMClassifier(
        random_state=42,
        class_weight='balanced',
        learning_rate=0.1,
        min_child_samples=10,
        n_estimators=300,
        num_leaves=70,
        n_jobs=-1,
        verbose=-1
    )
    
    # --- 2. Define the new CatBoost model ---
    # We use auto_class_weights to handle imbalance
    model_cat = CatBoostClassifier(
        random_state=42,
        auto_class_weights='Balanced',
        n_estimators=300,
        learning_rate=0.1,
        verbose=0 # Suppress training output
    )
    
    # --- 3. Create the new list of base estimators ---
    estimators = [
        ('xgb', best_model_xgb),
        ('lgbm', best_model_lgb),
        ('cat', model_cat) # <-- The new addition
    ]
    
    # --- Create the Stacking Classifier ---
    stacking_model = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(),
        cv=3, # Cross-validation for training the meta-model
        n_jobs=-1
    )
    
    # --- Create the FULL pipeline (Preprocessor + Stacker) ---
    full_pipeline_stack = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('stacker', stacking_model)
    ])

    # -----------------------------------------------------------------
    # --- 7. Train the Stacking Classifier ---
    # -----------------------------------------------------------------
    
    print("\n--- Training Stacking Classifier (XGB+LGBM+CAT) ---")
    # This will take even longer now
    
    full_pipeline_stack.fit(X_train, y_train)
    
    print("Training complete.")

    # -----------------------------------------------------------------
    # --- 8. Evaluate the Stacking Classifier ---
    # -----------------------------------------------------------------

    print("\n--- Model Evaluation of Stacking Model (Default 0.5 Threshold) ---")
    
    y_pred_default = full_pipeline_stack.predict(X_val)
    y_pred_proba = full_pipeline_stack.predict_proba(X_val)[:, 1]

    # --- Print Metrics (Default Threshold) ---
    print(f"Accuracy: {accuracy_score(y_val, y_pred_default):.4f}")
    print(f"ROC-AUC Score: {roc_auc_score(y_val, y_pred_proba):.4f}")
    print("Classification Report (Default Threshold):")
    print(classification_report(y_val, y_pred_default, target_names=['alive (0)', 'failed (1)']))
    print("--------------------------------------------------")

    # -----------------------------------------------------------------
    # --- 9. Apply Optimal Threshold Tuning to the Stacking Model ---
    # -----------------------------------------------------------------
    
    print("\n--- Applying Optimal Threshold Tuning to Stacking Model ---")

    precision, recall, thresholds = precision_recall_curve(y_val, y_pred_proba)
    f1_scores = (2 * precision * recall) / (precision + recall + 1e-9)
    
    best_f1_idx = np.argmax(f1_scores[:-1])
    best_threshold = thresholds[best_f1_idx]
    best_f1 = f1_scores[best_f1_idx]

    print(f"Best F1-Score found: {best_f1:.4f}")
    print(f"Optimal Threshold: {best_threshold:.4f}")

    # --- Evaluate with the NEW threshold ---
    print("\n--- Final Model Evaluation (Stacking Model + Optimal Threshold) ---")

    y_pred_optimal = (y_pred_proba >= best_threshold).astype(int)

    print(f"Accuracy: {accuracy_score(y_val, y_pred_optimal):.4f}")
    print("--------------------------------------------------")
    print("Confusion Matrix (Optimal Threshold):")
    print(confusion_matrix(y_val, y_pred_optimal))
    print("--------------------------------------------------")
    print("Classification Report (Optimal Threshold):")
    print(classification_report(y_val, y_pred_optimal, target_names=['alive (0)', 'failed (1)']))
    print("--------------------------------------------------")

Loading train.csv...
Data loaded successfully.
Cleaning data and defining features (X) and target (y)...
Splitting data into 80% train and 20% validation sets...
Defining preprocessing pipelines...

Calculated scale_pos_weight for XGBoost: 13.94

Defining Stacking Classifier pipeline with XGB, LGBM, and CatBoost...

--- Training Stacking Classifier (XGB+LGBM+CAT) ---
Training complete.

--- Model Evaluation of Stacking Model (Default 0.5 Threshold) ---




Accuracy: 0.9398
ROC-AUC Score: 0.8716
Classification Report (Default Threshold):
              precision    recall  f1-score   support

   alive (0)       0.95      0.99      0.97     11717
  failed (1)       0.60      0.30      0.40       841

    accuracy                           0.94     12558
   macro avg       0.78      0.64      0.68     12558
weighted avg       0.93      0.94      0.93     12558

--------------------------------------------------

--- Applying Optimal Threshold Tuning to Stacking Model ---
Best F1-Score found: 0.4467
Optimal Threshold: 0.2600

--- Final Model Evaluation (Stacking Model + Optimal Threshold) ---
Accuracy: 0.9189
--------------------------------------------------
Confusion Matrix (Optimal Threshold):
[[11129   588]
 [  430   411]]
--------------------------------------------------
Classification Report (Optimal Threshold):
              precision    recall  f1-score   support

   alive (0)       0.96      0.95      0.96     11717
  failed (1)    

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix, 
    roc_auc_score,
    precision_recall_curve,
    f1_score,
    make_scorer
)
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.combine import SMOTETomek, SMOTEENN
from imblearn.pipeline import Pipeline as ImbPipeline

# --- Ensemble Tools ---
from sklearn.ensemble import StackingClassifier, VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# --- Base Models ---
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings('ignore')

# ============================================================
# 1. LOAD DATA
# ============================================================
print("="*70)
print("  ADVANCED F1-SCORE OPTIMIZATION PIPELINE")
print("="*70)

print("\n[1/8] Loading data...")
try:
    df = pd.read_csv('train.csv')
    print(f"‚úì Data loaded: {df.shape}")
except FileNotFoundError:
    print("‚ùå Error: train.csv not found.")
    exit()

# ============================================================
# 2. DATA CLEANING & ANALYSIS
# ============================================================
print("\n[2/8] Data preprocessing...")

cols_to_drop = ['Unnamed: 0', 'company_name', 'fyear']
cols_to_drop_existing = [col for col in cols_to_drop if col in df.columns]
df = df.drop(columns=cols_to_drop_existing)
df = df.dropna(subset=['status_label'])

# Analyze class imbalance
class_counts = df['status_label'].value_counts()
imbalance_ratio = class_counts['alive'] / class_counts['failed']
print(f"‚úì Class distribution: Alive={class_counts['alive']}, Failed={class_counts['failed']}")
print(f"‚úì Imbalance ratio: {imbalance_ratio:.1f}:1")

# Prepare features and target
target = 'status_label'
y = df[target].map({'alive': 0, 'failed': 1})
X = df.drop(columns=[target])

# ============================================================
# 3. TRAIN-VALIDATION SPLIT
# ============================================================
print("\n[3/8] Splitting data...")
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"‚úì Train: {len(y_train)} | Val: {len(y_val)}")
print(f"‚úì Train failed class: {sum(y_train==1)} ({sum(y_train==1)/len(y_train)*100:.1f}%)")

# ============================================================
# 4. ENHANCED FEATURE ENGINEERING
# ============================================================
print("\n[4/8] Feature engineering...")

numerical_features = [f'X{i}' for i in range(1, 19)]
categorical_features = ['Division', 'MajorGroup']

# Enhanced numerical pipeline with feature selection
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)

print("‚úì Preprocessing pipeline configured")

# ============================================================
# 5. ADVANCED MODEL CONFIGURATIONS
# ============================================================
print("\n[5/8] Configuring models...")

# Calculate optimal class weights
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f"‚úì Base scale_pos_weight: {scale_pos_weight:.2f}")

# XGBoost - Tuned for high recall with controlled precision
def get_xgb_model(weight_multiplier=1.0):
    return xgb.XGBClassifier(
        random_state=42,
        scale_pos_weight=scale_pos_weight * weight_multiplier,
        learning_rate=0.02,
        max_depth=5,
        min_child_weight=1,
        n_estimators=1000,
        subsample=0.7,
        colsample_bytree=0.7,
        gamma=0.1,
        reg_alpha=1.0,
        reg_lambda=2.0,
        max_delta_step=1,  # Helps with imbalanced data
        n_jobs=-1,
        eval_metric='logloss'
    )

# LightGBM - Tuned for precision
def get_lgb_model():
    return lgb.LGBMClassifier(
        random_state=42,
        class_weight='balanced',
        learning_rate=0.02,
        min_child_samples=20,
        n_estimators=1000,
        num_leaves=31,
        max_depth=5,
        subsample=0.7,
        colsample_bytree=0.7,
        reg_alpha=1.0,
        reg_lambda=2.0,
        min_split_gain=0.1,
        is_unbalance=True,
        n_jobs=-1,
        verbose=-1
    )

# CatBoost - Robust to imbalance
def get_cat_model():
    return CatBoostClassifier(
        random_state=42,
        auto_class_weights='Balanced',
        n_estimators=1000,
        learning_rate=0.02,
        depth=5,
        l2_leaf_reg=5,
        bootstrap_type='Bernoulli',
        subsample=0.7,
        rsm=0.7,
        verbose=0
    )

# Random Forest - Different ensemble approach
def get_rf_model():
    return RandomForestClassifier(
        random_state=42,
        n_estimators=500,
        max_depth=10,
        min_samples_split=20,
        min_samples_leaf=10,
        max_features='sqrt',
        class_weight='balanced_subsample',
        n_jobs=-1
    )

print("‚úì Models configured")

# ============================================================
# 6. MULTIPLE TRAINING STRATEGIES
# ============================================================
print("\n[6/8] Training ensemble models with different strategies...")
print("     (This will take several minutes...)\n")

models = {}
strategies_info = []

# STRATEGY 1: BorderlineSMOTE (focuses on borderline cases)
print("   [1/6] BorderlineSMOTE + Enhanced Stacking...")
try:
    borderline_smote = BorderlineSMOTE(random_state=42, k_neighbors=7, m_neighbors=10)
    stacking_1 = StackingClassifier(
        estimators=[
            ('xgb', get_xgb_model(1.5)),
            ('lgbm', get_lgb_model()),
            ('cat', get_cat_model()),
            ('rf', get_rf_model())
        ],
        final_estimator=LogisticRegression(
            class_weight='balanced', 
            max_iter=2000, 
            C=0.05,
            penalty='l2',
            solver='saga'
        ),
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        n_jobs=-1
    )
    models['BorderlineSMOTE'] = ImbPipeline([
        ('preprocessor', preprocessor),
        ('sampler', borderline_smote),
        ('stacker', stacking_1)
    ])
    models['BorderlineSMOTE'].fit(X_train, y_train)
    print("       ‚úì Complete")
except Exception as e:
    print(f"       ‚úó Error: {e}")

# STRATEGY 2: SMOTEENN (SMOTE + Edited Nearest Neighbors)
print("   [2/6] SMOTEENN + Enhanced Stacking...")
try:
    smoteenn = SMOTEENN(random_state=42)
    stacking_2 = StackingClassifier(
        estimators=[
            ('xgb', get_xgb_model(2.0)),
            ('lgbm', get_lgb_model()),
            ('cat', get_cat_model()),
            ('rf', get_rf_model())
        ],
        final_estimator=LogisticRegression(
            class_weight='balanced', 
            max_iter=2000, 
            C=0.05,
            penalty='l2'
        ),
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        n_jobs=-1
    )
    models['SMOTEENN'] = ImbPipeline([
        ('preprocessor', preprocessor),
        ('sampler', smoteenn),
        ('stacker', stacking_2)
    ])
    models['SMOTEENN'].fit(X_train, y_train)
    print("       ‚úì Complete")
except Exception as e:
    print(f"       ‚úó Error: {e}")

# STRATEGY 3: Hybrid Undersampling + ADASYN
print("   [3/6] Hybrid (Undersample + ADASYN) + Stacking...")
try:
    rus = RandomUnderSampler(sampling_strategy=0.4, random_state=42)
    adasyn = ADASYN(random_state=42, n_neighbors=7)
    stacking_3 = StackingClassifier(
        estimators=[
            ('xgb', get_xgb_model(1.2)),
            ('lgbm', get_lgb_model()),
            ('cat', get_cat_model())
        ],
        final_estimator=LogisticRegression(
            class_weight='balanced', 
            max_iter=2000, 
            C=0.1
        ),
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        n_jobs=-1
    )
    models['Hybrid'] = ImbPipeline([
        ('preprocessor', preprocessor),
        ('undersampler', rus),
        ('oversampler', adasyn),
        ('stacker', stacking_3)
    ])
    models['Hybrid'].fit(X_train, y_train)
    print("       ‚úì Complete")
except Exception as e:
    print(f"       ‚úó Error: {e}")

# STRATEGY 4: Weighted Voting (No resampling, pure weighting)
print("   [4/6] Weighted Voting (No Resampling)...")
try:
    voting = VotingClassifier(
        estimators=[
            ('xgb', get_xgb_model(2.5)),
            ('lgbm', get_lgb_model()),
            ('cat', get_cat_model())
        ],
        voting='soft',
        weights=[2, 1, 1],  # Give more weight to XGB
        n_jobs=-1
    )
    models['WeightedVoting'] = Pipeline([
        ('preprocessor', preprocessor),
        ('voter', voting)
    ])
    models['WeightedVoting'].fit(X_train, y_train)
    print("       ‚úì Complete")
except Exception as e:
    print(f"       ‚úó Error: {e}")

# STRATEGY 5: SMOTETomek + Simple Stacking
print("   [5/6] SMOTETomek + 3-Model Stacking...")
try:
    smotetomek = SMOTETomek(random_state=42)
    stacking_5 = StackingClassifier(
        estimators=[
            ('xgb', get_xgb_model(1.8)),
            ('lgbm', get_lgb_model()),
            ('cat', get_cat_model())
        ],
        final_estimator=LogisticRegression(
            class_weight='balanced',
            max_iter=2000,
            C=0.08,
            penalty='l1',
            solver='saga'
        ),
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        n_jobs=-1
    )
    models['SMOTETomek'] = ImbPipeline([
        ('preprocessor', preprocessor),
        ('sampler', smotetomek),
        ('stacker', stacking_5)
    ])
    models['SMOTETomek'].fit(X_train, y_train)
    print("       ‚úì Complete")
except Exception as e:
    print(f"       ‚úó Error: {e}")

# STRATEGY 6: ADASYN + 4-Model Deep Stacking
print("   [6/6] ADASYN + 4-Model Deep Stacking...")
try:
    adasyn_deep = ADASYN(random_state=42, n_neighbors=10)
    stacking_6 = StackingClassifier(
        estimators=[
            ('xgb1', get_xgb_model(1.5)),
            ('xgb2', get_xgb_model(2.0)),
            ('lgbm', get_lgb_model()),
            ('cat', get_cat_model())
        ],
        final_estimator=LogisticRegression(
            class_weight='balanced',
            max_iter=2000,
            C=0.03,
            penalty='elasticnet',
            solver='saga',
            l1_ratio=0.5
        ),
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        n_jobs=-1
    )
    models['ADASYN_Deep'] = ImbPipeline([
        ('preprocessor', preprocessor),
        ('sampler', adasyn_deep),
        ('stacker', stacking_6)
    ])
    models['ADASYN_Deep'].fit(X_train, y_train)
    print("       ‚úì Complete")
except Exception as e:
    print(f"       ‚úó Error: {e}")

# ============================================================
# 7. COMPREHENSIVE EVALUATION
# ============================================================
print("\n[7/8] Evaluating all models...\n")

def evaluate_with_threshold_optimization(model, X_val, y_val, model_name):
    """Evaluate model with optimal threshold for F1-score"""
    try:
        y_pred_proba = model.predict_proba(X_val)[:, 1]
        
        # Find optimal threshold
        precision, recall, thresholds = precision_recall_curve(y_val, y_pred_proba)
        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)
        best_idx = np.argmax(f1_scores[:-1])
        best_threshold = thresholds[best_idx]
        
        # Make predictions
        y_pred_opt = (y_pred_proba >= best_threshold).astype(int)
        
        # Calculate metrics
        f1_opt = f1_score(y_val, y_pred_opt)
        cm = confusion_matrix(y_val, y_pred_opt)
        tn, fp, fn, tp = cm.ravel()
        
        precision_opt = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall_opt = tp / (tp + fn) if (tp + fn) > 0 else 0
        
        print(f"   {model_name:20s} | F1: {f1_opt:.4f} | P: {precision_opt:.4f} | R: {recall_opt:.4f} | Thresh: {best_threshold:.4f}")
        
        return {
            'model': model_name,
            'f1_score': f1_opt,
            'precision': precision_opt,
            'recall': recall_opt,
            'accuracy': accuracy_score(y_val, y_pred_opt),
            'roc_auc': roc_auc_score(y_val, y_pred_proba),
            'threshold': best_threshold,
            'tp': tp, 'fp': fp, 'tn': tn, 'fn': fn,
            'model_obj': model
        }
    except Exception as e:
        print(f"   {model_name:20s} | Error: {e}")
        return None

print("   " + "="*75)
print(f"   {'Model':<20s} | {'F1-Score':<10s} | {'Precision':<10s} | {'Recall':<10s} | {'Threshold'}")
print("   " + "="*75)

results = []
for name, model in models.items():
    result = evaluate_with_threshold_optimization(model, X_val, y_val, name)
    if result:
        results.append(result)

# ============================================================
# 8. FINAL RESULTS & RECOMMENDATIONS
# ============================================================
print("\n[8/8] Final Results\n")
print("="*70)
print("  F1-SCORE RANKING")
print("="*70)

if results:
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('f1_score', ascending=False)
    
    print(f"\n{'Rank':<6} {'Model':<22} {'F1':<10} {'Precision':<10} {'Recall':<10} {'ROC-AUC':<10}")
    print("-"*70)
    
    for i, row in enumerate(results_df.itertuples(), 1):
        medal = "ü•á" if i == 1 else "ü•à" if i == 2 else "ü•â" if i == 3 else "  "
        print(f"{medal} {i:<4} {row.model:<22} {row.f1_score:<10.4f} {row.precision:<10.4f} {row.recall:<10.4f} {row.roc_auc:<10.4f}")
    
    # Best model details
    best = results_df.iloc[0]
    print(f"\n{'='*70}")
    print(f"üèÜ CHAMPION MODEL: {best['model']}")
    print(f"{'='*70}")
    print(f"\nüìä Performance Metrics:")
    print(f"   F1-Score:        {best['f1_score']:.4f}")
    print(f"   Precision:       {best['precision']:.4f}")
    print(f"   Recall:          {best['recall']:.4f}")
    print(f"   Accuracy:        {best['accuracy']:.4f}")
    print(f"   ROC-AUC:         {best['roc_auc']:.4f}")
    print(f"   Optimal Thresh:  {best['threshold']:.4f}")
    
    print(f"\nüéØ Confusion Matrix:")
    print(f"                Predicted")
    print(f"              Alive  Failed")
    print(f"   Actual Alive  {best['tn']:5d}  {best['fp']:5d}")
    print(f"   Actual Failed {best['fn']:5d}  {best['tp']:5d}")
    
    # Calculate improvement
    baseline_f1 = 0.4314
    improvement = ((best['f1_score'] - baseline_f1) / baseline_f1) * 100
    
    print(f"\n‚ú® Performance vs Baseline:")
    print(f"   Baseline F1:     {baseline_f1:.4f}")
    print(f"   Current F1:      {best['f1_score']:.4f}")
    if improvement > 0:
        print(f"   Improvement:     +{improvement:.1f}% üöÄ")
    else:
        print(f"   Change:          {improvement:.1f}%")
    
    print(f"\nüí° Key Insights:")
    print(f"   ‚Ä¢ Detected {best['tp']} out of {best['tp'] + best['fn']} failures ({best['recall']*100:.1f}% recall)")
    print(f"   ‚Ä¢ {best['precision']*100:.1f}% of predicted failures were correct")
    print(f"   ‚Ä¢ Use threshold = {best['threshold']:.4f} for optimal F1-score")

print(f"\n{'='*70}")
print("Pipeline complete! Models ready for production use.")
print(f"{'='*70}\n")

  ADVANCED F1-SCORE OPTIMIZATION PIPELINE

[1/8] Loading data...
‚úì Data loaded: (62789, 24)

[2/8] Data preprocessing...
‚úì Class distribution: Alive=58586, Failed=4203
‚úì Imbalance ratio: 13.9:1

[3/8] Splitting data...
‚úì Train: 50231 | Val: 12558
‚úì Train failed class: 3362 (6.7%)

[4/8] Feature engineering...
‚úì Preprocessing pipeline configured

[5/8] Configuring models...
‚úì Base scale_pos_weight: 13.94
‚úì Models configured

[6/8] Training ensemble models with different strategies...
     (This will take several minutes...)

   [1/6] BorderlineSMOTE + Enhanced Stacking...
       ‚úì Complete
   [2/6] SMOTEENN + Enhanced Stacking...
       ‚úì Complete
   [3/6] Hybrid (Undersample + ADASYN) + Stacking...
       ‚úì Complete
   [4/6] Weighted Voting (No Resampling)...
       ‚úì Complete
   [5/6] SMOTETomek + 3-Model Stacking...
       ‚úì Complete
   [6/6] ADASYN + 4-Model Deep Stacking...
       ‚úì Complete

[7/8] Evaluating all models...

   Model                | F1-S