# <div style="padding: 30px;color:white;margin:10;font-size:60%;text-align:left;display:fill;border-radius:10px;background-color:#FFFFFF;overflow:hidden;background-color:#FFCE30"><b><span style='color:#FFFFFF'>1 |</span></b> <b>INTRODUCTION</b></div>

### <b><span style='color:#FFCE30'> 1.1 |</span> Objective</b>
- In this notebook we will perform some simple EDA to check the dataset
- Thereafter we will use a pipeline to perform the necessary data preprocessing with minimal feature engineering
- We will then combine permutation importance, SHAP values and XGBoost feature importance to select the key features for the model (we keep up to 15)
- Finally, we train an XGBoost model with hyperparameter optimisation and submit the results

# <div style="padding: 30px;color:white;margin:10;font-size:60%;text-align:left;display:fill;border-radius:10px;background-color:#FFFFFF;overflow:hidden;background-color:#FFCE30"><b><span style='color:#FFFFFF'>2 |</span></b> <b>LOAD DATA</b></div>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

# Set the display options to show all columns without truncation
pd.set_option('display.max_columns', None)

In [None]:
# Read the competition files: training data, test data and the submission template.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/playground-series-s4e3/train.csv",
        "/kaggle/input/playground-series-s4e3/test.csv",
        '/kaggle/input/playground-series-s4e3/sample_submission.csv',
    )
)

In [None]:
train.head()

In [None]:
test.head()

In [None]:
sample_submission.head()

# <div style="padding: 30px;color:white;margin:10;font-size:60%;text-align:left;display:fill;border-radius:10px;background-color:#FFFFFF;overflow:hidden;background-color:#FFCE30"><b><span style='color:#FFFFFF'>3 |</span></b> <b>EDA</b></div>
- No missing data in either the train or the test set
- No categorical data
- Last 7 columns of train data are target variable to predict

### <b><span style='color:#FFCE30'> 3.1 |</span> Train data</b>

In [None]:
train.describe().T.style.background_gradient(cmap='Oranges').format("{:.2f}")

In [None]:
train.info()

In [None]:
cols = train.columns
cols

In [None]:
# Feature columns whose train and test distributions are compared below.
cols = ['X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum',
       'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity',
       'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer',
       'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness',
       'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index',
       'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas',
       'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index',
       'SigmoidOfAreas']

colors = ['lightblue', 'orange']  # [train colour, test colour]

num_plots = len(cols)
num_cols = 3
num_rows = -(-num_plots // num_cols)  # ceiling division

# squeeze=False keeps `axes` two-dimensional even when the grid has a single
# row, so the plotting loop and the clean-up loop can index it the same way.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(21, 5 * num_rows), squeeze=False)
grid_axes = axes.ravel()  # row-major view: one entry per grid cell

# One panel per feature, train and test histograms overlaid.
for ax, feature in zip(grid_axes, cols):
    sns.histplot(train[feature], kde=True, color=colors[0], label='Train', alpha=0.5, bins=30, ax=ax)
    sns.histplot(test[feature], kde=True, color=colors[1], label='Test', alpha=0.5, bins=30, ax=ax)

    ax.set_title(f'Distribution of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.legend()

# Hide the grid cells that did not receive a feature (none when the number of
# features is a multiple of num_cols).
for spare_ax in grid_axes[num_plots:]:
    spare_ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Target columns (one binary indicator per defect type).
target_cols = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
       'Dirtiness', 'Bumps', 'Other_Faults']

# Size the grid from the number of targets instead of allocating a fixed
# 9x4 layout for seven plots.
n_target_cols = 4
n_target_rows = -(-len(target_cols) // n_target_cols)  # ceiling division
target_fig, target_axes = plt.subplots(
    n_target_rows, n_target_cols,
    figsize=(20, 5 * n_target_rows),
    squeeze=False,  # always 2-D, whatever the number of rows
)
target_axes = target_axes.ravel()

for ax, column in zip(target_axes, target_cols):
    sns.histplot(data=train, x=column, kde=True, bins=30, ax=ax)
    ax.set_title(f'{column} distribution')

# Hide the unused cells of the last row.
for spare_ax in target_axes[len(target_cols):]:
    spare_ax.axis('off')

# The layout only needs to be computed once, after every panel is drawn.
target_fig.tight_layout()

In [None]:
# Pairwise correlations over every column of the training frame
# (rows containing missing values are dropped first).
df_corr = train.dropna().corr()

# Annotated heatmap on a large canvas so the per-cell values stay readable.
corr_fig, corr_ax = plt.subplots(figsize=(30, 20))
sns.heatmap(df_corr, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=corr_ax)

plt.show()

### <b><span style='color:#FFCE30'> 3.2 |</span> Test Data</b>

In [None]:
test.describe().T.style.background_gradient(cmap='Oranges').format("{:.2f}")

In [None]:
test.info()

# <div style="padding: 30px;color:white;margin:10;font-size:60%;text-align:left;display:fill;border-radius:10px;background-color:#FFFFFF;overflow:hidden;background-color:#FFCE30"><b><span style='color:#FFFFFF'>4 |</span></b> <b>DATA PREPROCESSING</b></div>

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

### <b><span style='color:#FFCE30'> 4.1 |</span> Drop Columns</b>

In [None]:
class DropColumn(BaseEstimator, TransformerMixin):
    """Stateless transformer that removes the given columns from a DataFrame.

    Parameters
    ----------
    cols : list of column labels, optional
        Columns to drop. ``None`` (the default) drops nothing. The default is
        ``None`` rather than ``[]`` so that instances never share one mutable
        list object.
    """

    def __init__(self, cols=None):
        # Stored exactly as received: scikit-learn's get_params()/clone()
        # expect __init__ to assign parameters without modifying them.
        self.cols = cols

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` without the configured columns.

        Raises ``KeyError`` (from pandas) if a configured column is missing.
        """
        cols_to_drop = [] if self.cols is None else self.cols
        return X.drop(cols_to_drop, axis=1)

# Quick check: the 'id' column is removed from the training frame.
DropColumn(cols=['id']).fit_transform(train)

# <div style="padding: 30px;color:white;margin:10;font-size:60%;text-align:left;display:fill;border-radius:10px;background-color:#FFFFFF;overflow:hidden;background-color:#FFCE30"><b><span style='color:#FFFFFF'>5 |</span></b> <b>ASSEMBLING THE PIPELINE</b></div>

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline

In [None]:
# Preprocessing shared by every model:
#   'drop' removes the identifier column (id is not a useful feature);
#   'prep' is a ColumnTransformer with no transformers of its own, so every
#          remaining column is passed through, returned as a pandas DataFrame.
named_preprocessing_pipeline = Pipeline([
    ('drop', DropColumn(cols=['id'])),
    ('prep', ColumnTransformer([], remainder='passthrough').set_output(transform='pandas')),
])

named_preprocessing_pipeline

In [None]:
label_cols = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

In [None]:
# Drop observations with multiple labels: keep rows with at most one positive
# target. `.copy()` makes the result an independent frame, so the `label`
# column added to `train` afterwards is written to this frame itself and not
# to a slice of the original (which pandas flags as chained assignment).
train = train.loc[train[label_cols].sum(axis=1) <= 1].copy()

In [None]:
# Collapse the one-hot target columns into a single class id for multi-class
# classification. An extra trailing column is 1 exactly when no target is set,
# so rows without any fault get the index following the last entry of label_cols.
label_matrix = train[label_cols].to_numpy(copy=True)
no_label_flag = 1 - label_matrix.sum(axis=1, keepdims=True)
sparse_labels = np.hstack([label_matrix, no_label_flag])
train['label'] = sparse_labels.argmax(axis=1)

In [None]:
# Features: everything except the one-hot targets and the derived class id.
X = train.drop(columns=[*label_cols, 'label'])
# Target: the class id as a plain NumPy array (indexed positionally in the CV loop).
y = train['label'].to_numpy()

In [None]:
X

In [None]:
y

In [None]:
#checking output on train df
df_train = named_preprocessing_pipeline.fit_transform(X)
df_train.info()

In [None]:
from sklearn.preprocessing import  StandardScaler,RobustScaler

# Using Standard Scaler or Robust Scaler to scale numeric variables

class StandardScalerNamed(StandardScaler, TransformerMixin):
    """StandardScaler whose ``transform`` returns a labelled DataFrame.

    The output keeps both the column names and the row index of the input
    frame, so it stays aligned with the data it was computed from.
    """

    def get_feature_names_out(self, X=None, y=None):
        """Return the output feature names.

        When ``X`` is a DataFrame its column names are returned as a list.
        Otherwise (``None`` or a sequence of names, which is how scikit-learn
        itself calls this method) the call is delegated to ``StandardScaler``.
        """
        if hasattr(X, "columns"):
            return X.columns.tolist()
        return super().get_feature_names_out(X)

    def transform(self, X, y=None):
        """Scale ``X`` (a DataFrame) and return a DataFrame with the same labels.

        ``y`` is accepted for signature compatibility and ignored. It is not
        forwarded to the parent: the second positional parameter of
        ``StandardScaler.transform`` is ``copy``, not a target.
        """
        transformed = super().transform(X)
        # index=X.index: without it the result would get a fresh 0..n-1 index
        # and silently misalign with frames whose index has gaps.
        return pd.DataFrame(transformed, columns=X.columns, index=X.index)


class RobustScalerNamed(RobustScaler, TransformerMixin):
    """RobustScaler whose ``transform`` returns a labelled DataFrame.

    The output keeps both the column names and the row index of the input
    frame, so it stays aligned with the data it was computed from.
    """

    def get_feature_names_out(self, X=None, y=None):
        """Return the output feature names.

        When ``X`` is a DataFrame its column names are returned as a list.
        Otherwise (``None`` or a sequence of names, which is how scikit-learn
        itself calls this method) the call is delegated to ``RobustScaler``.
        """
        if hasattr(X, "columns"):
            return X.columns.tolist()
        return super().get_feature_names_out(X)

    def transform(self, X, y=None):
        """Scale ``X`` (a DataFrame) and return a DataFrame with the same labels.

        ``y`` is accepted for signature compatibility and ignored.
        ``RobustScaler.transform`` takes ``X`` only, so forwarding ``y`` to it
        would raise ``TypeError``.
        """
        transformed = super().transform(X)
        # index=X.index: without it the result would get a fresh 0..n-1 index
        # and silently misalign with frames whose index has gaps.
        return pd.DataFrame(transformed, columns=X.columns, index=X.index)

In [None]:
# Full feature pipeline: the preprocessing steps followed by robust scaling,
# with the scaler configured to return a pandas DataFrame.
modelling_pipeline = Pipeline([
    *named_preprocessing_pipeline.steps,
    ('scale', RobustScaler().set_output(transform='pandas')),
])
modelling_pipeline

In [None]:
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline

In [None]:
# Baseline XGBoost classifier: multi-class probabilities, evaluated with
# multi-class log loss; every other setting is left at the library default.
xgb_baseline_params = {
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
}
xgb_model = XGBClassifier(**xgb_baseline_params)

# Preprocessing + scaling in front of the classifier.
xgb_pipeline = make_pipeline(modelling_pipeline, xgb_model)
xgb_pipeline

In [None]:
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score

# Stratified 10-fold cross-validation of the baseline pipeline.
n_splits = 10
stratkf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# One one-vs-rest AUC per fold.
cv_results = []

for fold, (train_idx, val_idx) in enumerate(stratkf.split(X, y)):
    # X is a DataFrame (positional .iloc), y a NumPy array (plain indexing).
    X_train, y_train = X.iloc[train_idx], y[train_idx]
    X_val, y_val = X.iloc[val_idx], y[val_idx]

    # Refit the whole pipeline on this fold's training part.
    xgb_pipeline.fit(X_train, y_train)

    # Class probabilities feed the AUC, hard predictions feed the F1 score.
    y_val_pred_prob = xgb_pipeline.predict_proba(X_val)
    y_pred = xgb_pipeline.predict(X_val)

    f1 = f1_score(y_val, y_pred, average='weighted')
    roc_auc = roc_auc_score(y_val, y_val_pred_prob, multi_class='ovr')

    fold_number = fold + 1
    print(f'Fold {fold_number}, AUC Score on Validation Set: {roc_auc}')
    print(f'Fold {fold_number}, F1 Score on Validation Set: {f1}')
    print('-' * 70)

    cv_results.append(roc_auc)

# Mean AUC over all folds.
average_cv_result = sum(cv_results) / n_splits
print(f'\nAverage AUC-score across {n_splits} folds: {average_cv_result}')


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import  confusion_matrix

# Hold out 20% of the data and refit the baseline pipeline on the rest.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

xgb_pipeline.fit(X_train,y_train)

predictions_xgb = xgb_pipeline.predict(X_val)

# Class id -> display name. Ids 0..6 follow the order of the target columns;
# the last id stands for rows without any fault.
class_names = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults','None']

# Passing `labels` fixes the matrix at one row/column per class name. Without
# it the matrix only covers classes that occur in y_val or the predictions,
# and the display labels no longer line up if a class is absent.
cm_xgb = confusion_matrix(y_val, predictions_xgb, labels=list(range(len(class_names))))

disp = ConfusionMatrixDisplay(confusion_matrix=cm_xgb, display_labels=class_names)

cm_fig, cm_ax = plt.subplots(figsize=(10, 10))

# Plot the confusion matrix on the prepared axes
disp.plot(ax=cm_ax)
plt.show()

- The model seems to have difficulty separating 'Other_Faults' from 'Bumps'

# <div style="padding: 30px;color:white;margin:10;font-size:60%;text-align:left;display:fill;border-radius:10px;background-color:#FFFFFF;overflow:hidden;background-color:#FFCE30"><b><span style='color:#FFFFFF'>6 |</span></b> <b>FEATURE ANALYSIS</b></div>

### <b><span style='color:#FFCE30'> 6.1 |</span> Permutation Importance</b>

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance of the already-fitted pipeline, scored on X / y.
perm = PermutationImportance(xgb_pipeline, random_state=42)
perm.fit(X, y)

# Weights table, labelled with the column names of X.
eli5.show_weights(perm, feature_names=list(X.columns))

### <b><span style='color:#FFCE30'> 6.2 |</span> SHAP Values</b>

In [None]:
import shap

# Split data into training and validation sets
# (same arguments as the split used for the confusion matrix).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Assuming xgb_pipeline is a Pipeline with XGBoost as its last step
# This is the estimator object held by the pipeline, not a copy of it.
xgb_model = xgb_pipeline.steps[-1][1]

# NOTE(review): the estimator is refitted in place, directly on X_train, so the
# pipeline's 'drop' and 'scale' steps are bypassed. The model explained below is
# therefore trained on unscaled inputs with every column of X (presumably still
# including 'id' — confirm), and xgb_pipeline's final step no longer matches the
# output of its own preprocessing. The cells that follow label their results
# with X.columns and rely on this column layout, so a fix has to change them at
# the same time.
my_model = xgb_model.fit(X_train,y_train)

# Tree-based SHAP explanation of the refitted model on the validation rows.
explainer = shap.TreeExplainer(my_model)
shap_values = explainer.shap_values(X_val)

shap.summary_plot(shap_values, X_val)

### <b><span style='color:#FFCE30'> 6.3 |</span> Feature Importance</b>

In [None]:
# Number of features shown in the bar chart.
TOP = 20

# Importances reported by the fitted XGBoost model, labelled with the columns of X.
feature_importance = xgb_model.feature_importances_
feature_names = X.columns

# Ascending order, so the most important features sit at the end.
sorted_idx = np.argsort(feature_importance)
top_idx = sorted_idx[-TOP:]
bar_positions = np.arange(len(sorted_idx))[-TOP:]

# Horizontal bars for the TOP most important features only.
fig = plt.figure(figsize=(10, 8))
plt.barh(bar_positions, feature_importance[top_idx], align='center')
plt.yticks(bar_positions, feature_names[top_idx])
plt.title(f'Feature Importance - Top {TOP}')
plt.show()

In [None]:
# Normalise the SHAP output to a list with one (n_samples, n_features) array
# per class. Depending on the SHAP version, multi-class output is either a
# list of such arrays or a single (n_samples, n_features, n_classes) array.
# Using whatever number of classes is present avoids hard-coding a class
# count that can go out of sync with the labels.
if isinstance(shap_values, list):
    per_class_shap = [np.asarray(class_values) for class_values in shap_values]
else:
    shap_array = np.asarray(shap_values)
    if shap_array.ndim == 3:
        per_class_shap = [shap_array[:, :, k] for k in range(shap_array.shape[-1])]
    else:
        per_class_shap = [shap_array]

# Stack all classes row-wise in one step (no repeated concat inside a loop).
shap_df = pd.DataFrame(np.concatenate(per_class_shap, axis=0), columns=X.columns)


# Feature Importance: feature name -> model importance, highest first.
feature_importance_dict = dict(zip(X.columns, feature_importance))
sorted_feature_importance = dict(
    sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True)
)

# Combine all rankings (each list runs from most to least important feature)
features = list(X.columns)
perm_ranking = [features[index] for index in perm.feature_importances_.argsort()[::-1]]
# SHAP score of a feature = mean absolute SHAP value over all rows and classes.
mean_abs_shap = shap_df.abs().mean().to_numpy()
shap_ranking = [features[index] for index in mean_abs_shap.argsort()[::-1]]
feature_importance_ranking = list(sorted_feature_importance)


# Create DataFrame: one column per method, row i holds the feature ranked i-th.
data = {
    "Permutation Importance Rank": perm_ranking,
    "SHAP Values Rank": shap_ranking,
    "Feature Importance Rank": feature_importance_ranking
}

df = pd.DataFrame(data)
df.head(20)


In [None]:
# 1-based position of every feature in each of the three rankings.
rank_lookups = [
    {name: position for position, name in enumerate(ranking, start=1)}
    for ranking in (perm_ranking, shap_ranking, feature_importance_ranking)
]

# Combined score = sum of the three positions (a lower score means more important).
combined_rank = {
    feature: sum(lookup[feature] for lookup in rank_lookups)
    for feature in X.columns
}

# Tabulate the scores and order them from best (lowest) to worst.
combined_rank_df = pd.DataFrame(
    list(combined_rank.items()),
    columns=['Feature Name', 'Importance Rank Score'],
).sort_values(by='Importance Rank Score')

# Keep the 15 best-ranked features.
top_15_features = combined_rank_df.head(15)

top_15_features


In [None]:
top_15_features['Feature Name'].tolist()

# <div style="padding: 30px;color:white;margin:10;font-size:60%;text-align:left;display:fill;border-radius:10px;background-color:#FFFFFF;overflow:hidden;background-color:#FFCE30"><b><span style='color:#FFFFFF'>7 |</span></b> <b>XGBOOST MODEL</b></div>

In [None]:
# Feature subset kept for the XGBoost experiments (a fixed list of 15 columns).
selected_columns = [
    'Steel_Plate_Thickness',
    'Length_of_Conveyer',
    'Orientation_Index',
    'Outside_X_Index',
    'Edges_Y_Index',
    'Minimum_of_Luminosity',
    'LogOfAreas',
    'Pixels_Areas',
    'X_Perimeter',
    'Log_Y_Index',
    'Luminosity_Index',
    'TypeOfSteel_A300',
    'Empty_Index',
    'Edges_Index',
    'Log_X_Index',
]

# Restrict the feature frame to that subset.
X_xgb = X.loc[:, selected_columns]
X_xgb

In [None]:
X_xgb.info()

In [None]:
X_xgb.isnull().sum()

# <div style="padding: 30px;color:white;margin:10;font-size:60%;text-align:left;display:fill;border-radius:10px;background-color:#FFFFFF;overflow:hidden;background-color:#FFCE30"><b><span style='color:#FFFFFF'>8 |</span></b> <b>HYPERPARAMETER TUNING</b></div>

In [None]:
# import optuna
# from sklearn.model_selection import  cross_val_score

# import warnings
# # Set global warning filter
# warnings.filterwarnings("ignore")

In [None]:
# def objective_xgb(trial):
#     """Define the objective function for XGBClassifier"""

#     params = {
#     'max_depth': trial.suggest_int('max_depth', 5, 10),
#     'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
#     'n_estimators': trial.suggest_int('n_estimators', 150, 1000),
#     'subsample': trial.suggest_float('subsample', 0.01, 1.0),
#     'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
#     'objective': 'multi:softprob',  # Multiclass classification
#     'num_class': 7,  # Specify the number of classes
#     'eval_metric': 'mlogloss',  # Use 'mlogloss' for multiclass log loss optimization
#     'verbosity': 0,  # Set verbosity to 0 for less output
#     }

#     xgb_model = XGBClassifier(**params)

#     # Assuming 'skf' is your StratifiedKFold object
#     skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

#     # Change scoring to 'neg_log_loss'
#     cv = -cross_val_score(xgb_model, X_xgb, y, cv=skf, scoring='neg_log_loss').mean()

#     return cv

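# # Create an Optuna study object
# # The objective returns the positive log loss (lower is better), so the
# # study has to minimise it; 'maximize' would select the worst trial.
# study = optuna.create_study(direction='minimize')
# study.optimize(objective_xgb, n_trials=50)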

# # Get the best parameters
# best_params_xgb = study.best_params
# print("Best Hyperparameters for XGBoost:", best_params_xgb)

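- Best Hyperparameters for XGBoost: {'max_depth': 10, 'learning_rate': 0.9472450516762687, 'n_estimators': 688, 'subsample': 0.09079473120748476, 'colsample_bytree': 0.37465855668049536}
- Best is trial 30 with value: 8.044855124346752.
- Caveat: these values come from a study that was run with `direction='maximize'` while the objective returned the positive log loss. Trial 30 is therefore the trial with the *highest* log loss (8.04) of the search, not the lowest, and these parameters should not be used as tuned values; the study needs to be re-run with `direction='minimize'`.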

In [None]:
# # Define the list of selected columns you want to keep
# selected_columns = [
#   'Steel_Plate_Thickness',
#  'Length_of_Conveyer',
#  'Orientation_Index',
#  'Outside_X_Index',
#  'Edges_Y_Index',
#  'Minimum_of_Luminosity',
#  'LogOfAreas',
#  'Pixels_Areas',
#  'X_Perimeter',
#  'Log_Y_Index',
#  'Luminosity_Index',
#  'TypeOfSteel_A300',
#  'Empty_Index',
#  'Edges_Index',
#  'Log_X_Index', 'id'
# ] 

# # Select only the desired columns from the DataFrame
# X_xgb = X[selected_columns]
# X_xgb

In [None]:
# #XGBoost best parameters {'max_depth': 10, 'learning_rate': 0.9472450516762687, 'n_estimators': 688, 'subsample': 0.09079473120748476, 'colsample_bytree': 0.37465855668049536}
# xgb_params_optuna = {'max_depth': 10, 'learning_rate': 0.9472450516762687, 'n_estimators': 688, 'subsample': 0.09079473120748476, 'colsample_bytree': 0.37465855668049536,                    
#                     'objective': 'multi:softprob',
#                     'eval_metric': 'mlogloss'
                    
#                     }


# # XGBoost baseline model
# xgb_model = XGBClassifier(**xgb_params_optuna)

# xgb_pipeline = make_pipeline(modelling_pipeline, xgb_model)
# xgb_pipeline

In [None]:
# from sklearn.model_selection import cross_validate, StratifiedKFold
# from sklearn.metrics import f1_score, roc_auc_score

# # Number of folds
# n_splits = 10

# # Adjusting parameters of StratifiedKFold
# stratkf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# # Cross-validation results
# cv_results = []

# # Stratified k-fold cross-validation
# for fold, (train_idx, val_idx) in enumerate(stratkf.split(X_xgb, y)):
#     X_train, X_val = X_xgb.iloc[train_idx], X_xgb.iloc[val_idx]
#     y_train, y_val = y[train_idx], y[val_idx]
    
#     # Fit the model
#     xgb_pipeline.fit(X_train, y_train)

#     # Predictions on the validation set
#     y_val_pred_prob = xgb_pipeline.predict_proba(X_val)
#     y_pred = xgb_pipeline.predict(X_val)
        
#     # Calculate evaluation metrics
#     f1 = f1_score(y_val, y_pred, average='weighted')
#     roc_auc = roc_auc_score(y_val, y_val_pred_prob, multi_class='ovr')

#     print(f'Fold {fold + 1}, AUC Score on Validation Set: {roc_auc}')
#     print(f'Fold {fold + 1}, F1 Score on Validation Set: {f1}')
#     print('-' * 70)

#     # Results
#     cv_results.append(roc_auc)

# # Average cross-validation result
# average_cv_result = sum(cv_results) / n_splits
# print(f'\nAverage AUC-score across {n_splits} folds: {average_cv_result}')


In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import ConfusionMatrixDisplay
# from sklearn.metrics import  confusion_matrix

# X_train, X_val, y_train, y_val = train_test_split(X_xgb, y, test_size=0.2, random_state=42)

# xgb_pipeline.fit(X_train,y_train)

# predictions_xgb = xgb_pipeline.predict(X_val)

# cm_xgb = confusion_matrix(y_val, predictions_xgb)

# disp = ConfusionMatrixDisplay(confusion_matrix=cm_xgb, display_labels = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults','None'])

# plt.figure(figsize=(10, 10))

# # Plot the confusion matrix
# disp.plot(ax=plt.gca())  # Use the current axes
# plt.show()

# <div style="padding: 30px;color:white;margin:10;font-size:60%;text-align:left;display:fill;border-radius:10px;background-color:#FFFFFF;overflow:hidden;background-color:#FFCE30"><b><span style='color:#FFFFFF'>9 |</span></b> <b>SUBMISSION</b></div>

In [None]:
# # Select only the desired columns from the DataFrame
# test_xgb = test[selected_columns]

In [None]:
# Four XGBoost parameter sets used as members of the final ensemble.
# All of them select the GPU in the same way: "device": "cuda" together with
# "tree_method": "hist". "device_type" is not an XGBoost parameter, and
# "gpu_hist" is the deprecated spelling of this combination.
params_1 = {
    "random_state": 18,
    "n_estimators": 1800,
    "learning_rate": 0.006,
    "gamma": 0.44,
    "subsample": 0.7,
    "colsample_bytree": 0.38,
    "max_depth": 5,
    "min_child_weight": 4,
    "reg_lambda": 1.8e-06,
    "reg_alpha": 0.54,
    "booster": "gbtree",
    "verbosity": 0,
    "device": "cuda",
    "tree_method": "hist",
    "grow_policy": "depthwise",
}

params_2 = {
    "n_estimators": 703,
    "learning_rate": 0.023358116742747285,
    "gamma": 0.24997920132991797,
    "subsample": 0.8841265541346639,
    "colsample_bytree": 0.362499715714305,
    "max_depth": 5,
    "min_child_weight": 5,
    "reg_lambda": 2.9660886967874625,
    "reg_alpha": 0.00011509254946941848,
    "booster": "gbtree",
    "verbosity": 0,
    "grow_policy": "depthwise",
    "device": "cuda",
    "tree_method": "hist",
}

params_3 = {
    "verbosity": 0,
    "learning_rate": 0.02767540293640535,
    "n_estimators": 494,
    "reg_alpha": 1.5855453969671037e-06,
    "reg_lambda": 1.4155529076600075,
    "max_depth": 5,
    "colsample_bytree": 0.46589178614541227,
    "subsample": 0.8504122771965839,
    "min_child_weight": 3,
    "device": "cuda",
    "tree_method": "hist",
    "random_state": 18,
}

params_4 = {
    "n_estimators": 1235,
    "learning_rate": 0.008352405007099802,
    "gamma": 0.6499918347241912,
    "subsample": 0.9116532305497375,
    "colsample_bytree": 0.49334879814671045,
    "max_depth": 7,
    "min_child_weight": 1,
    "reg_lambda": 1.7005084366184795,
    "reg_alpha": 0.0059679946773570774,
    "device": "cuda",
    "tree_method": "hist"
}

In [None]:
# One XGBoost classifier per parameter set.
xgb_model_1, xgb_model_2, xgb_model_3, xgb_model_4 = (
    XGBClassifier(**param_set)
    for param_set in (params_1, params_2, params_3, params_4)
)

# Put the shared preprocessing + scaling pipeline in front of each classifier.
xgb_pipeline_1, xgb_pipeline_2, xgb_pipeline_3, xgb_pipeline_4 = (
    make_pipeline(modelling_pipeline, tuned_model)
    for tuned_model in (xgb_model_1, xgb_model_2, xgb_model_3, xgb_model_4)
)


In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble: the members' predicted class probabilities are
# averaged using the weights below (listed in the same order as the members).
ensemble_members = [
    ('xgb1', xgb_pipeline_1),
    ('xgb2', xgb_pipeline_2),
    ('xgb3', xgb_pipeline_3),
    ('xgb4', xgb_pipeline_4),
]
ensemble_model = VotingClassifier(
    estimators=ensemble_members,
    voting='soft',
    weights=[0.4, 0.4, 0.1, 0.1],
)

ensemble_model

In [None]:
# Train the ensemble on the full training set.
ensemble_model.fit(X, y)

# Predict class probabilities for the test set and discard the last column,
# leaving one probability column per entry of label_cols.
class_probabilities = ensemble_model.predict_proba(test)
predictions = class_probabilities[:, :-1]

# Fill the submission template and write it to disk.
sample_submission[label_cols] = predictions
sample_submission.to_csv('submission_baseline_xgb.csv', index=False)

In [None]:
sample_submission

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's white-grid look for this figure.
sns.set(style="whitegrid")

# Probability columns of the submission (every column except 'id').
columns_to_plot = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

# One filled density curve per class, all drawn on the same axes.
density_fig, density_ax = plt.subplots(figsize=(12, 8))
for column in columns_to_plot:
    sns.kdeplot(data=sample_submission[column], label=column, fill=True, ax=density_ax)

density_ax.set_title('Density Plot of Predicted Probabilities for Each Class')
density_ax.set_xlabel('Predicted Probabilities')
density_ax.set_ylabel('Density')
density_ax.legend()
plt.show()