In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the competition data. The dataset directory is defined once so that
# moving the data only requires changing a single line.
DATA_DIR = Path('/home/tomruge/Schreibtisch/Data/Kaggle/playground-series-s4e3')
df_train = pd.read_csv(DATA_DIR / 'train.csv')
df_test = pd.read_csv(DATA_DIR / 'test.csv')


In [2]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.feature_selection import SelectKBest, f_classif
from xgboost import XGBClassifier
import optuna
from sklearn.metrics import roc_auc_score

class model_class:
    """Feature engineering, cross-validation, tuning and prediction with XGBoost.

    The constructor drops the ``drop`` columns from both frames and applies
    ``preprocess_data`` to each of them. The caller's frames are not modified.

    Attributes:
        df_train: Preprocessed training frame (still contains the target columns).
        df_test: Preprocessed test frame.
        target: List of target column names.
        drop: List of column names removed from both frames.
    """

    def __init__(self, df_train, df_test, target=None, drop=None):
        """Store the configuration and preprocess both frames.

        Args:
            df_train: Training frame containing the raw feature and target columns.
            df_test: Test frame containing the raw feature columns.
            target: Names of the target columns (defaults to no targets).
            drop: Names of columns to remove from both frames (defaults to none).
        """
        # ``None`` defaults avoid sharing one mutable list between instances.
        self.target = target if target is not None else []
        self.drop = drop if drop is not None else []
        self.df_train = df_train.drop(columns=self.drop)
        self.df_test = df_test.drop(columns=self.drop)
        # The min/max scaling range is taken from the training frame and reused
        # for the test frame, so the same raw thickness maps to the same scaled
        # value in both frames.
        thickness = self.df_train['Steel_Plate_Thickness']
        thickness_range = (thickness.min(), thickness.max())
        self.df_train = self.preprocess_data(self.df_train, thickness_range)
        self.df_test = self.preprocess_data(self.df_test, thickness_range)

    def preprocess_data(self, data, thickness_range=None):
        """Return a copy of ``data`` with the engineered feature columns appended.

        Args:
            data: Frame containing the raw columns referenced below.
            thickness_range: Optional ``(min, max)`` pair used to scale
                ``Steel_Plate_Thickness``. When omitted, the range is computed
                from ``data`` itself.

        Returns:
            A new DataFrame; the input frame is left untouched.

        Note:
            Several ratios divide by raw columns without a guard, so zeros in
            the denominator produce ``inf``/``NaN`` values.
            ``_Compactness_X``/``Circularity`` and ``_Color_Range``/``Color_Contrast``
            are computed with identical formulas; both are kept so the feature
            set stays unchanged.
        """
        data = data.copy()

        # taken from here: https://www.kaggle.com/competitions/playground-series-s4e3/discussion/482475
        data['_Aspect_Ratio'] = (data['X_Maximum'] - data['X_Minimum']) / (abs(data['Y_Maximum'] - data['Y_Minimum']) + 1)
        data['_Area_Perimeter_Ratio_X'] = data['Pixels_Areas'] / data['X_Perimeter']
        data['_Area_Perimeter_Ratio_Y'] = data['Pixels_Areas'] / data['Y_Perimeter']
        data['_Compactness_X'] = data['Pixels_Areas'] / (data['X_Perimeter'] ** 2)
        data['_Compactness_Y'] = data['Pixels_Areas'] / (data['Y_Perimeter'] ** 2)
        data['_Color_Range'] = data['Maximum_of_Luminosity'] - data['Minimum_of_Luminosity']
        data['_Spatial_Distribution_Index'] = (
            data['Edges_Index'] + data['Empty_Index'] + data['Square_Index'] + data['Outside_X_Index'] + data['Edges_X_Index'] +
            data['Edges_Y_Index'] + data['Outside_Global_Index']
        )
        data['_Log_Area_Perimeter_Ratio'] = data['LogOfAreas'] / (data['Log_X_Index'] + data['Log_Y_Index'])
        data['_Normalized_Luminosity_Index'] = data['Luminosity_Index'] / data['Pixels_Areas']
        data['_Thickness_Steel_Type'] = data['Steel_Plate_Thickness'] * (data['TypeOfSteel_A300'] + data['TypeOfSteel_A400'])
        data['_Edge_to_Area_Ratio'] = data['Edges_Index'] / data['Pixels_Areas']

        # taken from here: https://www.kaggle.com/competitions/playground-series-s4e3/discussion/481687
        epsilon = 1e-6  # A small constant to avoid division by zero or taking the logarithm of zero
        # Location Features
        data['X_Distance'] = data['X_Maximum'] - data['X_Minimum']
        data['Y_Distance'] = data['Y_Maximum'] - data['Y_Minimum']

        # Density Feature
        data['Density'] = data['Pixels_Areas'] / (data['X_Perimeter'] + data['Y_Perimeter'])

        # Relative Perimeter Feature
        data['Relative_Perimeter'] = data['X_Perimeter'] / (data['X_Perimeter'] + data['Y_Perimeter'] + epsilon)

        # Circularity Feature
        data['Circularity'] = data['Pixels_Areas'] / (data['X_Perimeter'] ** 2)

        # Symmetry Index Feature
        data['Symmetry_Index'] = np.abs(data['X_Distance'] - data['Y_Distance']) / (data['X_Distance'] + data['Y_Distance'] + epsilon)

        # Color Contrast Feature
        data['Color_Contrast'] = data['Maximum_of_Luminosity'] - data['Minimum_of_Luminosity']

        # Combined Geometric Index Feature
        data['Combined_Geometric_Index'] = data['Edges_Index'] * data['Square_Index']

        # Interaction Term Feature
        data['X_Distance*Pixels_Areas'] = data['X_Distance'] * data['Pixels_Areas']

        # Additional Features
        data['sin_orientation'] = np.sin(data['Orientation_Index'])
        data['Edges_Index2'] = np.exp(data['Edges_Index'] + epsilon)
        data['X_Maximum2'] = np.sin(data['X_Maximum'])
        data['Y_Minimum2'] = np.sin(data['Y_Minimum'])
        data['Aspect_Ratio_Pixels'] = np.where(data['Y_Perimeter'] == 0, 0, data['X_Perimeter'] / data['Y_Perimeter'])
        data['Aspect_Ratio'] = np.where(data['Y_Distance'] == 0, 0, data['X_Distance'] / data['Y_Distance'])

        # Average Luminosity Feature
        data['Average_Luminosity'] = (data['Sum_of_Luminosity'] + data['Minimum_of_Luminosity']) / 2

        # Normalized Steel Thickness Feature (min/max scaling with a shared range when provided)
        if thickness_range is None:
            thickness_min = data['Steel_Plate_Thickness'].min()
            thickness_max = data['Steel_Plate_Thickness'].max()
        else:
            thickness_min, thickness_max = thickness_range
        data['Normalized_Steel_Thickness'] = (data['Steel_Plate_Thickness'] - thickness_min) / (thickness_max - thickness_min)

        # Logarithmic Features
        data['Log_Perimeter'] = np.log(data['X_Perimeter'] + data['Y_Perimeter'] + epsilon)
        data['Log_Luminosity'] = np.log(data['Sum_of_Luminosity'] + epsilon)
        data['Log_Aspect_Ratio'] = np.log(data['Aspect_Ratio'] ** 2 + epsilon)

        # Statistical Features
        data['Combined_Index'] = data['Orientation_Index'] * data['Luminosity_Index']
        data['Sigmoid_Areas'] = 1 / (1 + np.exp(-data['LogOfAreas'] + epsilon))
        return data

    def get_k_most_important_features(self, k):
        """Placeholder: not implemented yet, does nothing and returns ``None``."""
        pass

    def fit(self, params, fit_not_evaluate=True, n_splits=2):
        """Cross-validate an ``XGBClassifier`` with multilabel-stratified folds.

        Args:
            params: Keyword arguments forwarded to ``XGBClassifier``. Must not
                contain ``n_jobs`` or ``random_state``, which are set here.
            fit_not_evaluate: If True, return the out-of-fold predictions;
                otherwise return the mean ROC AUC over the folds.
            n_splits: Number of cross-validation folds.

        Returns:
            Either an array of shape ``(n_train_rows, n_targets)`` holding the
            out-of-fold predictions, or the mean fold ROC AUC.
        """
        features = self.df_train.drop(columns=self.target)
        labels = self.df_train[self.target]
        roc_scores = []
        mskf = MultilabelStratifiedKFold(n_splits=n_splits, random_state=0, shuffle=True)
        # Every row is in the validation part of exactly one fold, so this
        # matrix is completely filled once the loop has finished.
        result = np.zeros((self.df_train.shape[0], len(self.target)))
        for fold, (train_index, valid_index) in enumerate(mskf.split(self.df_train, labels)):
            # The splitter yields positional indices, hence ``iloc`` (``loc``
            # would only be correct for a default RangeIndex).
            X_train, y_train = features.iloc[train_index], labels.iloc[train_index]
            X_valid, y_valid = features.iloc[valid_index], labels.iloc[valid_index]
            model = XGBClassifier(**params, n_jobs=3, random_state=fold)
            model.fit(X_train, y_train)
            y_pred = model.predict_proba(X_valid)
            result[valid_index] = y_pred
            roc_scores.append(roc_auc_score(y_valid, y_pred))
        if fit_not_evaluate:
            return result
        return np.mean(roc_scores)

    def predict(self, params):
        """Fit on the full training frame and predict the test frame.

        Args:
            params: Keyword arguments forwarded to ``XGBClassifier``.

        Returns:
            The output of ``predict_proba`` for the test frame.
        """
        X_train = self.df_train.drop(columns=self.target)
        model = XGBClassifier(**params)
        model.fit(X_train, self.df_train[self.target])
        # Select the test columns in training order so both frames present
        # exactly the same features to the model.
        prediction = model.predict_proba(self.df_test[X_train.columns])
        return prediction

    def objective(self, trial):
        """Optuna objective: mean cross-validated ROC AUC for sampled parameters.

        Args:
            trial: Optuna trial used to sample the hyperparameters.

        Returns:
            The mean fold ROC AUC returned by ``fit``.
        """
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
            'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
            'use_label_encoder': False,
            'eval_metric': 'logloss'
        }
        # The study maximizes a scalar, so request the score rather than the
        # out-of-fold prediction matrix.
        score = self.fit(params, fit_not_evaluate=False)
        return score

    def find_params(self, n_trials=400, n_jobs=7):
        """Run an Optuna study that maximizes ``objective``.

        Args:
            n_trials: Number of trials to run.
            n_jobs: Number of parallel Optuna workers. Each trial additionally
                trains with ``n_jobs=3`` inside ``fit``.

        Returns:
            The best parameter dictionary found by the study.
        """
        study = optuna.create_study(direction='maximize')
        study.optimize(self.objective, n_trials=n_trials, n_jobs=n_jobs)
        return study.best_params
    

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Cross-validate a fixed parameter set on the feature-engineered data.
target_columns = [
    'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
    'Dirtiness', 'Bumps', 'Other_Faults',
]
drop_columns = ['id']

# These parameters probably apply to the dataset without feature engineering.
# GPU options that were tried but are disabled: 'device_type': 'cuda', 'tree_method': 'gpu_hist'
params2 = {
    'n_estimators': 1800,
    'learning_rate': 0.006,
    'gamma': 0.44,
    'subsample': 0.7,
    'colsample_bytree': 0.38,
    'max_depth': 5,
    'min_child_weight': 4,
    'reg_lambda': 1.8e-06,
    'reg_alpha': 0.54,
    'booster': 'gbtree',
    'grow_policy': 'depthwise',
    'verbosity': 0,
}

# fit_not_evaluate=False makes fit() return the mean ROC AUC over the folds.
model = model_class(df_train, df_test, target_columns, drop_columns)
roc = model.fit(params2, fit_not_evaluate=False)
print(roc)


0.8823579743906221


-> The score does not seem to be much better.

In [None]:
# Train on the full training set and write the submission file.
target_columns = ['Pastry','Z_Scratch','K_Scatch','Stains','Dirtiness','Bumps','Other_Faults']
drop_columns = ['id']
# These parameters probably apply to the dataset without feature engineering.
params2 = { 'n_estimators':1800,
            'learning_rate': 0.006,
            'gamma': 0.44,
            'subsample': 0.7,
            'colsample_bytree': 0.38,
            'max_depth': 5,
            'min_child_weight': 4,
            'reg_lambda': 1.8e-06,
            'reg_alpha': 0.54,
            'booster':'gbtree',
            'grow_policy': 'depthwise',
            'verbosity': 0 ,#'device_type': 'cuda','tree_method': 'gpu_hist',}
          }

model = model_class(df_train, df_test, target_columns, drop_columns)
# NOTE: despite its name, `roc_curve` holds the output of model.predict(),
# i.e. the predict_proba result for the test frame.
roc_curve = model.predict(params = params2)

# Build the frame from the prediction matrix and insert the ids as their own
# column. Stacking ids and probabilities into one array would force a single
# dtype on all columns, so the ids could end up written as floats.
predictions = pd.DataFrame(roc_curve, columns = target_columns)
predictions.insert(0, 'id', df_test['id'].to_numpy())
# save predictions
predictions.to_csv('/home/tomruge/Schreibtisch/Data/Kaggle/playground-series-s4e3/predictions_simpledexgboost_1800_ca10_More Features.csv', index = False)
