In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the competition data.
# DATA_DIR is the single place to change when the notebook is run on another
# machine; both CSV files are expected to live directly inside it.
DATA_DIR = Path('/home/tomruge/Schreibtisch/Data/Kaggle/playground-series-s4e3')

df_train = pd.read_csv(DATA_DIR / 'train.csv')
df_test = pd.read_csv(DATA_DIR / 'test.csv')


In [41]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from xgboost import XGBClassifier
import optuna
from sklearn.metrics import roc_auc_score

class model_class:
    """XGBoost wrapper for a multi-label task using repeated stratified K-fold.

    The instance keeps a training frame (feature columns plus the `target`
    columns) and a test frame (feature columns only), both with the `drop`
    columns removed. It offers simple feature engineering, cross-validated
    test-set prediction (`fit`), a single full-data fit (`predict`) and an
    Optuna hyper-parameter search (`find_params` / `objective`).
    """

    def __init__(self, df_train, df_test, target=None, drop=None):
        """Store both frames without the `drop` columns.

        Args:
            df_train: training DataFrame holding feature and target columns.
            df_test: test DataFrame holding the feature columns.
            target: list of target column names in `df_train`; defaults to
                an empty list.
            drop: list of column names removed from both frames; defaults to
                an empty list.
        """
        # None sentinels replace the former mutable `[]` defaults, which are
        # shared between every call that relies on the default.
        self.target = target if target is not None else []
        self.drop = drop if drop is not None else []
        # DataFrame.drop returns new frames, so later column additions do not
        # touch the frames owned by the caller.
        self.df_train = df_train.drop(columns=self.drop)
        self.df_test = df_test.drop(columns=self.drop)

    def feature_engineering(self):
        """Add `X_Range` and `Y_Range` (maximum minus minimum) to both frames."""
        for frame in (self.df_train, self.df_test):
            for axis in ('X', 'Y'):
                frame[f'{axis}_Range'] = frame[f'{axis}_Maximum'] - frame[f'{axis}_Minimum']

    def _iter_fold_models(self, params, n_repeats=5, n_splits=10, verbose=True):
        """Yield one fitted model per fold of every stratified K-fold repeat.

        Args:
            params: keyword arguments forwarded to `XGBClassifier`.
            n_repeats: number of K-fold repetitions; the repeat number is used
                as the splitter's `random_state`.
            n_splits: number of folds per repetition.
            verbose: when true, print the repeat counter as each repeat starts.

        Yields:
            `(model, valid_pos, X_valid)`: the model fitted on the training
            part of the fold, the positional row indices of the held-out part,
            and the held-out feature rows.
        """
        features = self.df_train.drop(columns=self.target)
        labels = self.df_train[self.target]
        for repeat in range(n_repeats):
            if verbose:
                print('i: ', repeat)
            mskf = MultilabelStratifiedKFold(n_splits=n_splits, random_state=repeat, shuffle=True)
            for fold, (train_pos, valid_pos) in enumerate(mskf.split(features, labels)):
                model = XGBClassifier(**params, n_jobs=3, random_state=repeat + fold)
                # The splitter yields positions, not index labels, so use
                # .iloc; .loc is only equivalent for a default RangeIndex.
                model.fit(features.iloc[train_pos], labels.iloc[train_pos])
                yield model, valid_pos, features.iloc[valid_pos]

    def fit(self, params, n_repeats=5, n_splits=10):
        """Average the test-set probabilities of all cross-validation models.

        Args:
            params: keyword arguments forwarded to `XGBClassifier`.
            n_repeats: number of K-fold repetitions (default 5).
            n_splits: number of folds per repetition (default 10).

        Returns:
            The element-wise mean of `predict_proba(self.df_test)` over the
            `n_repeats * n_splits` fitted models.
        """
        test_probs = [
            model.predict_proba(self.df_test)
            for model, _, _ in self._iter_fold_models(params, n_repeats, n_splits)
        ]
        return np.mean(test_probs, axis=0)

    def predict(self, params):
        """Fit one model on all training rows and predict the test set.

        Prints the feature importances (largest first) and draws them as a
        bar chart.

        Args:
            params: keyword arguments forwarded to `XGBClassifier`.

        Returns:
            `predict_proba(self.df_test)` of the fitted model.
        """
        features = self.df_train.drop(columns=self.target)
        model = XGBClassifier(**params)
        model.fit(features, self.df_train[self.target])
        prediction = model.predict_proba(self.df_test)
        importances = pd.Series(model.feature_importances_, index=features.columns).sort_values(ascending=False)
        # Print the values themselves; printing the return value of .plot()
        # only shows the repr of the matplotlib Axes object.
        print('IMportances:', importances, sep='\n')
        importances.plot(kind='bar')
        return prediction

    def objective(self, trial):
        """Optuna objective: out-of-fold ROC AUC of one sampled configuration.

        Args:
            trial: the Optuna trial used to sample the hyper-parameters.

        Returns:
            A float: `roc_auc_score` of the training targets against the
            out-of-fold probabilities averaged over the K-fold repeats.
        """
        # suggest_float replaces the deprecated suggest_uniform /
        # suggest_loguniform; names and ranges are unchanged.
        # 'use_label_encoder' is no longer passed — NOTE(review): it is
        # documented as deprecated in current XGBoost releases; confirm
        # against the installed version.
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
            'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
            'eval_metric': 'logloss'
        }
        # The study maximises a scalar, so score the held-out rows instead of
        # returning test-set probabilities. Everything is kept in locals
        # because find_params runs several trials of this method in parallel.
        n_repeats = 5
        oof_sum = np.zeros((len(self.df_train), len(self.target)))
        for model, valid_pos, X_valid in self._iter_fold_models(params, n_repeats=n_repeats, verbose=False):
            oof_sum[valid_pos] += model.predict_proba(X_valid)
        # Each row is held out once per repeat, so dividing by the number of
        # repeats gives its mean out-of-fold probability.
        return float(roc_auc_score(self.df_train[self.target], oof_sum / n_repeats))

    def find_params(self, n_trials=400, n_jobs=7):
        """Run an Optuna study that maximises `objective`.

        Args:
            n_trials: number of trials to run (default 400).
            n_jobs: number of trials Optuna runs in parallel (default 7).

        Returns:
            The `best_params` dictionary of the finished study.
        """
        study = optuna.create_study(direction='maximize')
        study.optimize(self.objective, n_trials=n_trials, n_jobs=n_jobs)
        return study.best_params
    

In [7]:
# Hyper-parameter search: run the Optuna study on the training data and
# show the best configuration it found.
import warnings

# Silence every Python warning for the rest of the session so the study
# log stays readable.
warnings.filterwarnings("ignore")

drop_columns = ['id']
target_columns = [
    'Pastry',
    'Z_Scratch',
    'K_Scatch',
    'Stains',
    'Dirtiness',
    'Bumps',
    'Other_Faults',
]

model = model_class(
    df_train=df_train,
    df_test=df_test,
    target=target_columns,
    drop=drop_columns,
)
params = model.find_params()
print(params)

[I 2024-03-13 09:27:28,491] A new study created in memory with name: no-name-b8d55ecc-2789-47a0-9b31-12cac292a6a5
[I 2024-03-13 09:28:04,287] Trial 2 finished with value: 0.8749010954469507 and parameters: {'n_estimators': 126, 'max_depth': 4, 'learning_rate': 0.018911834741224733, 'subsample': 0.987437205812265, 'colsample_bytree': 0.6943491367140943, 'gamma': 0.19768168442190806, 'reg_alpha': 1.9432096498206576e-07, 'reg_lambda': 0.3828200978121032, 'min_child_weight': 25}. Best is trial 2 with value: 0.8749010954469507.
[I 2024-03-13 09:28:10,813] Trial 5 finished with value: 0.8631752314172025 and parameters: {'n_estimators': 232, 'max_depth': 10, 'learning_rate': 0.049483264864556155, 'subsample': 0.5377109997073765, 'colsample_bytree': 0.8108876469084856, 'gamma': 0.0019023888701963953, 'reg_alpha': 0.0069500352486122075, 'reg_lambda': 2.7214616885501374e-06, 'min_child_weight': 126}. Best is trial 2 with value: 0.8749010954469507.
[I 2024-03-13 09:28:41,005] Trial 1 finished wit

KeyboardInterrupt: 

[I 2024-03-13 09:46:35,239] Trial 60 finished with value: 0.8769933645696881 and parameters: {'n_estimators': 265, 'max_depth': 10, 'learning_rate': 0.05830189411713246, 'subsample': 0.8382112380761241, 'colsample_bytree': 0.7852733541050583, 'gamma': 0.00017790602488131806, 'reg_alpha': 0.0003907792899496163, 'reg_lambda': 0.0033604360009796526, 'min_child_weight': 15}. Best is trial 57 with value: 0.8829533242256186.
[I 2024-03-13 09:47:58,795] Trial 64 finished with value: 0.8777360207853822 and parameters: {'n_estimators': 405, 'max_depth': 5, 'learning_rate': 0.06687748888432073, 'subsample': 0.8361105190873815, 'colsample_bytree': 0.57913543850559, 'gamma': 0.00020235948196095952, 'reg_alpha': 0.00029718316026717504, 'reg_lambda': 0.17404262771754223, 'min_child_weight': 16}. Best is trial 57 with value: 0.8829533242256186.
[I 2024-03-13 09:48:27,795] Trial 61 finished with value: 0.8728974562726824 and parameters: {'n_estimators': 488, 'max_depth': 10, 'learning_rate': 0.0582743

In [42]:
# Predict: average the cross-validation models' test-set probabilities and
# write them out as a submission file.
target_columns = ['Pastry','Z_Scratch','K_Scatch','Stains','Dirtiness','Bumps','Other_Faults']
drop_columns = ['id']
model = model_class(df_train, df_test, target_columns, drop_columns)
# NOTE(review): the output file name below mentions "coordinate_range", but
# model.feature_engineering() is never called in this cell — confirm whether
# the X_Range / Y_Range features were meant to be part of this run.

# Alternative parameter sets kept for experiments; only params2 is used below.
# n_estimators was 445 in the tuned set and was reduced to 50 by hand.
best_params = {'n_estimators': 50, 'max_depth': 5, 'learning_rate': 0.02150486464722118, 'subsample': 0.8765988725917477, 'colsample_bytree': 0.8804665026341646, 'gamma': 8.671435338499592e-05, 'reg_alpha': 0.7641529501668356, 'reg_lambda': 0.036667745207266844, 'min_child_weight': 16}
dummy_params = {'n_estimators': 20}
params2 = {
    'n_estimators': 1800,
    'learning_rate': 0.006,
    'gamma': 0.44,
    'subsample': 0.7,
    'colsample_bytree': 0.38,
    'max_depth': 5,
    'min_child_weight': 4,
    'reg_lambda': 1.8e-06,
    'reg_alpha': 0.54,
    'booster': 'gbtree',
    'grow_policy': 'depthwise',
    'verbosity': 0,
}

# model.fit returns the mean predicted probability per test row and target.
test_probabilities = model.fit(params=params2)

# Build the submission column by column so every column keeps its own dtype
# (integer ids, float probabilities) instead of going through one stacked
# NumPy array. to_numpy() avoids aligning on df_test's index.
predictions = pd.DataFrame(test_probabilities, columns=target_columns)
predictions.insert(0, 'id', df_test['id'].to_numpy())

# save predictions
predictions.to_csv('/home/tomruge/Schreibtisch/Data/Kaggle/playground-series-s4e3/predictions_simpledexgboost_1800_coordinate_range_estimators_5iterations_10MSKF.csv', index = False)


i:  0
i:  1
i:  2
i:  3
i:  4


### This performed a little worse than my simple XGBoost model. So iterating over subsets and combining their predictions does not improve the area under the ROC curve (ROC AUC).