## Import

In [1]:
import numpy as np
np.random.seed(42)

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.ensemble import VotingClassifier

from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from collections import Counter


from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


import warnings
warnings.filterwarnings("ignore")

sns.set_theme()
sns.set_context("paper")
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


## Read in the data

In [2]:
# Load the competition files in one pass: labelled training rows, unlabelled
# test rows, and the sample submission template.
train_df, X_test, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "playground-series-s4e3/train.csv",
        "playground-series-s4e3/test.csv",
        "playground-series-s4e3/sample_submission.csv",
    )
)

In [3]:
# Preview the first five rows of the training frame.
train_df.head()

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,0,584,590,909972,909977,16,8,5,2274,113,...,-0.5,-0.0104,0.1417,0,0,0,1,0,0,0
1,1,808,816,728350,728372,433,20,54,44478,70,...,0.7419,-0.2997,0.9491,0,0,0,0,0,0,1
2,2,39,192,2212076,2212144,11388,705,420,1311391,29,...,-0.0105,-0.0944,1.0,0,0,1,0,0,0,0
3,3,781,789,3353146,3353173,210,16,29,3202,114,...,0.6667,-0.0402,0.4025,0,0,1,0,0,0,0
4,4,1540,1560,618457,618502,521,72,67,48231,82,...,0.9158,-0.2455,0.9998,0,0,0,0,0,0,1


In [4]:
# Show each column's dtype and non-null count for the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19219 entries, 0 to 19218
Data columns (total 35 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     19219 non-null  int64  
 1   X_Minimum              19219 non-null  int64  
 2   X_Maximum              19219 non-null  int64  
 3   Y_Minimum              19219 non-null  int64  
 4   Y_Maximum              19219 non-null  int64  
 5   Pixels_Areas           19219 non-null  int64  
 6   X_Perimeter            19219 non-null  int64  
 7   Y_Perimeter            19219 non-null  int64  
 8   Sum_of_Luminosity      19219 non-null  int64  
 9   Minimum_of_Luminosity  19219 non-null  int64  
 10  Maximum_of_Luminosity  19219 non-null  int64  
 11  Length_of_Conveyer     19219 non-null  int64  
 12  TypeOfSteel_A300       19219 non-null  int64  
 13  TypeOfSteel_A400       19219 non-null  int64  
 14  Steel_Plate_Thickness  19219 non-null  int64  
 15  Ed

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
train_df.describe()

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
count,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,...,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0
mean,9609.0,709.854675,753.857641,1849756.0,1846605.0,1683.987616,95.654665,64.124096,191846.7,84.808419,...,0.102742,-0.138382,0.571902,0.076279,0.059837,0.178573,0.029554,0.025235,0.247828,0.341225
std,5548.191747,531.544189,499.836603,1903554.0,1896295.0,3730.319865,177.821382,101.054178,442024.7,28.800344,...,0.487681,0.120344,0.332219,0.26545,0.23719,0.383005,0.169358,0.156844,0.431762,0.474133
min,0.0,0.0,4.0,6712.0,6724.0,6.0,2.0,1.0,250.0,0.0,...,-0.9884,-0.885,0.119,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4804.5,49.0,214.0,657468.0,657502.0,89.0,15.0,14.0,9848.0,70.0,...,-0.2727,-0.1925,0.2532,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,9609.0,777.0,796.0,1398169.0,1398179.0,168.0,25.0,23.0,18238.0,90.0,...,0.1111,-0.1426,0.4729,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,14413.5,1152.0,1165.0,2368032.0,2362511.0,653.0,64.0,61.0,67978.0,105.0,...,0.5294,-0.084,0.9994,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,19218.0,1705.0,1713.0,12987660.0,12987690.0,152655.0,7553.0,903.0,11591410.0,196.0,...,0.9917,0.6421,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Understand the features

- Pastry: Pastry refers to small patches or irregularities on the surface of the steel plate, typically caused by imperfections in the manufacturing process or handling during transport. These imperfections can affect the surface smoothness and appearance of the steel plate.

- Z_Scratch: Z-scratches are narrow scratches or marks on the surface of the steel plate that run parallel to the rolling direction. Various factors, such as handling, machining, or contact with abrasive materials during production or transportation, can cause these scratches.

- K_Scatch (K-scratch; note that the column is spelled `K_Scatch` in the dataset): K-scratches are similar to Z-scratches but run perpendicular to the rolling direction. They can also be caused by handling, machining, or contact with abrasive materials during manufacturing or transportation processes.

- Stains: Stains refer to discolored or contaminated areas on the surface of the steel plate. These stains can result from various sources, such as rust, oil, grease, or other foreign substances that come into contact with the steel surface during processing, storage, or handling.

- Dirtiness: Dirtiness indicates the presence of dirt or particulate matter on the surface of the steel plate. This can include various types of debris or contaminants that accumulate during manufacturing, handling, or storage processes.

- Bumps: Bumps are raised or protruding areas on the surface of the steel plate. These can be caused by irregularities in the manufacturing process, such as uneven rolling or cooling, or by physical damage during handling or transportation.

- Other_Faults: This category likely encompasses a broader range of faults or defects not explicitly categorized in the other fault types listed. It could include various types of surface imperfections, irregularities, or abnormalities that affect the quality or usability of the steel plate.

Here is some further information about the features:

The dataset "Steel Plates Faults" contains 27 features that describe each fault in detail. Here is an explanation of some of the features based on the information gathered from the search results:

- Location Features:

X_Minimum: The minimum x-coordinate of the fault.
X_Maximum: The maximum x-coordinate of the fault.
Y_Minimum: The minimum y-coordinate of the fault.
Y_Maximum: The maximum y-coordinate of the fault.

- Size Features:

Pixels_Areas: Area of the fault in pixels.
X_Perimeter: Perimeter along the x-axis of the fault.
Y_Perimeter: Perimeter along the y-axis of the fault.

- Luminosity Features:

Sum_of_Luminosity: Sum of luminosity values in the fault area.
Minimum_of_Luminosity: Minimum luminosity value in the fault area.
Maximum_of_Luminosity: Maximum luminosity value in the fault area.

- Material and Index Features:

TypeOfSteel_A300: Type of steel (A300).
TypeOfSteel_A400: Type of steel (A400).
Steel_Plate_Thickness: Thickness of the steel plate.
Edges_Index, Empty_Index, Square_Index, Outside_X_Index, Edges_X_Index, Edges_Y_Index, Outside_Global_Index: Various index values related to edges and geometry.

- Logarithmic Features:

LogOfAreas: Logarithm of the area of the fault.
Log_X_Index, Log_Y_Index: Logarithmic indices related to X and Y coordinates.
- Statistical Features:

Orientation_Index: Index describing orientation.
Luminosity_Index: Index related to luminosity.
SigmoidOfAreas: Sigmoid function applied to areas.

### Things I can try

1. MultiOutputClassifier 
https://www.kaggle.com/code/fchmiel/xgboost-baseline-multilabel-classification
2. 

## EDA

In [6]:
# Remove the row identifier from both the training and the test frame.
train_df, X_test = (
    frame.drop(columns='id') for frame in (train_df, X_test)
)

In [7]:
# The seven binary fault indicators are the prediction targets; every other
# column is a feature.
target_columns = [
    'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
    'Dirtiness', 'Bumps', 'Other_Faults',
]
y_train = train_df[target_columns]
X_train = train_df.drop(columns=target_columns)

In [22]:
# Percentage of observations that carry more than one fault label.
# Multi-label rows do occur, but they make up only a small share of the data.
labels_per_row = y_train.sum(axis=1)
count_multi_labels = (labels_per_row > 1).sum()
count_multi_labels / len(y_train) * 100

0.10926687132525106

In [8]:
class Preprocess(BaseEstimator, TransformerMixin):
    """Append hand-crafted geometry, luminosity and index features to a frame.

    The only statistic learned from the data is the min/max of
    ``Steel_Plate_Thickness``. It is captured in ``fit`` and reused in
    ``transform`` so that every frame (train, validation, test) is rescaled
    with the same range instead of its own.

    Parameters
    ----------
    epsilon : float, default=1e-6
        Small constant added to denominators and log arguments to avoid
        division by zero and ``log(0)``.
    """

    def __init__(self, epsilon=1e-6):
        self.epsilon = epsilon

    def fit(self, X, y=None):
        """Record the thickness range of ``X`` and return ``self``.

        ``y`` is accepted only for scikit-learn API compatibility.
        """
        thickness = X['Steel_Plate_Thickness']
        self.thickness_min_ = thickness.min()
        self.thickness_max_ = thickness.max()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns added.

        The input frame is left untouched. ``fit`` must have been called
        first, otherwise the thickness attributes are missing.
        """
        epsilon = self.epsilon  # Avoid Division by Zero
        X = X.copy()  # work on a copy so re-running a cell never double-applies

        # Location Features
        X['X_Distance'] = X['X_Maximum'] - X['X_Minimum']
        X['Y_Distance'] = X['Y_Maximum'] - X['Y_Minimum']

        # Density Feature (epsilon guards a zero total perimeter)
        X['Density'] = X['Pixels_Areas'] / (X['X_Perimeter'] + X['Y_Perimeter'] + epsilon)

        # Relative Perimeter Feature
        X['Relative_Perimeter'] = X['X_Perimeter'] / (X['X_Perimeter'] + X['Y_Perimeter'] + epsilon)

        # Circularity Feature (epsilon guards a zero X perimeter)
        X['Circularity'] = X['Pixels_Areas'] / (X['X_Perimeter'] ** 2 + epsilon)

        # Symmetry Index Feature
        X['Symmetry_Index'] = np.abs(X['X_Distance'] - X['Y_Distance']) / (X['X_Distance'] + X['Y_Distance'] + epsilon)

        # Color Contrast Feature
        X['Color_Contrast'] = X['Maximum_of_Luminosity'] - X['Minimum_of_Luminosity']

        # Combined Geometric Index Feature
        X['Combined_Geometric_Index'] = X['Edges_Index'] * X['Square_Index']

        # Interaction Term Feature
        X['X_Distance*Pixels_Areas'] = X['X_Distance'] * X['Pixels_Areas']

        # Additional Features
        X['sin_orientation'] = np.sin(X['Orientation_Index'])
        X['Edges_Index2'] = np.exp(X['Edges_Index'] + epsilon)
        X['X_Maximum2'] = np.sin(X['X_Maximum'])
        X['Y_Minimum2'] = np.sin(X['Y_Minimum'])
        # np.where evaluates both branches; the zero-denominator rows are
        # replaced by 0 afterwards.
        X['Aspect_Ratio_Pixels'] = np.where(X['Y_Perimeter'] == 0, 0, X['X_Perimeter'] / X['Y_Perimeter'])
        X['Aspect_Ratio'] = np.where(X['Y_Distance'] == 0, 0, X['X_Distance'] / X['Y_Distance'])

        # Average Luminosity Feature
        # NOTE(review): this averages the *sum* and the minimum luminosity;
        # Maximum_of_Luminosity may have been intended - confirm before changing.
        X['Average_Luminosity'] = (X['Sum_of_Luminosity'] + X['Minimum_of_Luminosity']) / 2

        # Normalized Steel Thickness Feature (range learned in fit)
        thickness_range = self.thickness_max_ - self.thickness_min_
        if thickness_range == 0:
            thickness_range = 1  # constant column: map to 0 instead of NaN
        X['Normalized_Steel_Thickness'] = (X['Steel_Plate_Thickness'] - self.thickness_min_) / thickness_range

        # Logarithmic Features
        X['Log_Perimeter'] = np.log(X['X_Perimeter'] + X['Y_Perimeter'] + epsilon)
        X['Log_Luminosity'] = np.log(X['Sum_of_Luminosity'] + epsilon)
        X['Log_Aspect_Ratio'] = np.log(X['Aspect_Ratio'] ** 2 + epsilon)

        # Statistical Features
        X['Combined_Index'] = X['Orientation_Index'] * X['Luminosity_Index']
        X['Sigmoid_Areas'] = 1 / (1 + np.exp(-X['LogOfAreas'] + epsilon))
        return X


# Fit on the training frame only, then apply the same fitted transformer to
# the test frame so both share the training thickness range.
preprocess = Preprocess()
X_train = preprocess.fit_transform(X_train)
X_test = preprocess.transform(X_test)