In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import early_stopping,log_evaluation, Dataset
import lightgbm as lgb
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn import set_config
import warnings
import optuna
from sklearn.preprocessing import PolynomialFeatures,OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from scipy.stats import mode
from sklearn.preprocessing import LabelEncoder
from pprint import pprint

warnings.filterwarnings('ignore')

sns.set_theme(style = 'white', palette = 'viridis')
pal = sns.color_palette('viridis')

pd.set_option('display.max_rows', 100)
set_config(transform_output = 'pandas')
pd.options.mode.chained_assignment = None

In [27]:
# load in data
data = pd.read_csv('/home/tomruge/Schreibtisch/Data/Kaggle/playground-series-s4e3/train.csv')
test = pd.read_csv('/home/tomruge/Schreibtisch/Data/Kaggle/playground-series-s4e3/test.csv')

In [28]:

class Model:
    def __init__(self, train, test):
        # init the class with the train and test data
        self.train = train
        self.test = test
        self.model_dict = dict()
        self.test_predict_list = list()
        
    def fit(self):
        target_col = ['Pastry','Z_Scratch','K_Scatch','Stains','Dirtiness','Bumps','Other_Faults']
        # drop ths column because it is completely useless for classification
        drop_col = ['id']
        
        # get the columns for training
        train_cols = [col for col in self.train.columns.to_list() if col not in target_col + drop_col]
        print('train_cols:',train_cols)
        scores = list()
        # set the parameters for the model
        params = {'grow_policy': 'lossguide',
                  'n_estimators': 1713, 'learning_rate': 0.00676793896727872, 'gamma': 0.4425433619561816, 'subsample': 0.6782713902375049, 'colsample_bytree': 0.38371870139739117, 'max_depth': 5, 'min_child_weight': 4, 'reg_lambda': 1.7864788262454325e-06, 'reg_alpha': 0.5400111178318557,
                       'booster':'gbtree',
                       #'objective':'multi:softmax',
                       'verbosity':0,#'device_type': 'cuda','tree_method': 'gpu_hist'
                        }
    
        for i in range(4):
            # use multilabel stratified kfold to split the data, to keep the distribution of the target classes
            mskf = MultilabelStratifiedKFold(n_splits=17, shuffle=True)
            # create an array to store the predictions
            oof_valid_preds = np.zeros((self.train[train_cols].shape[0], len(target_col)))
            
            # iterate over the folds, so over each split of the data, train_idx: the indices of the training data, valid_idx: the indices of the validation data, fold: the fold number
            for fold, (train_idx, valid_idx) in enumerate(mskf.split(self.train[train_cols], self.train[target_col])):
                X_train, y_train = self.train[train_cols].iloc[train_idx], self.train[target_col].iloc[train_idx]
                X_valid, y_valid = self.train[train_cols].iloc[valid_idx], self.train[target_col].iloc[valid_idx]

                model = XGBClassifier(random_state=i+fold,**params)
                
                model.fit(X_train, y_train, eval_set = [(X_valid, y_valid)], early_stopping_rounds=100,verbose=False)
                
                print('predict')
                valid_preds = model.predict_proba(X_valid)
                # store the predictions in the array
                oof_valid_preds[valid_idx] = valid_preds
                print('fold:',fold+1, 'iteration:',i+1)
                # das macht keinen SInn, weil ich die Testdaten nicht habe
                print('shape and properties of the test data:', self.test[train_cols].shape, self.test[train_cols].columns, self.test[train_cols].dtypes)
                print('test data:', self.test[train_cols])
                test_predict = model.predict_proba(self.test[train_cols])
                self.test_predict_list.append(test_predict)
                score = roc_auc_score(y_valid, valid_preds, multi_class="ovr")
                # store the model in a dictionary
                self.model_dict[f'fold_{fold}'] = model
                    
            oof_score = roc_auc_score(self.train[target_col], oof_valid_preds, multi_class="ovr")
            print(f"The OOF auc score for iteration {i+1} is {oof_score}")
            scores.append(oof_score)
        return scores,self.test_predict_list
    
    def objective(self,trial):
        target_col = ['Pastry','Z_Scratch','K_Scatch','Stains','Dirtiness','Bumps','Other_Faults']
        drop_col = ['id']
        test_predict_list = list()
        model_dict = dict()
        
        train_cols = [col for col in self.train.columns.to_list() if col not in target_col + drop_col]
        scores = list()
        params = {'grow_policy': 'depthwise',
                  #'num_class':7,
                      'n_estimators': trial.suggest_int('n_estimators', 500, 2000), 
                       'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1), 
                       'gamma': trial.suggest_uniform('gamma', 0.1, 1), 
                       'subsample': trial.suggest_uniform('subsample', 0.5, 1),
                       'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.3, 1), 
                       'max_depth': trial.suggest_int('max_depth', 5, 30), 
                       'min_child_weight': trial.suggest_int('min_child_weight', 1, 25), 
                       'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 10),
                       'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 10),
                       'booster':'gbtree',
                       'verbosity':0,
                       #'device_type': 'cuda','tree_method': 'gpu_hist'
                        }
        
        for i in range(1):
            mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True)
            oof_valid_preds = np.zeros((self.train[train_cols].shape[0], len(target_col)))
                
            for fold, (train_idx, valid_idx) in enumerate(mskf.split(self.train[train_cols], self.train[target_col])):
                X_train, y_train = self.train[train_cols].iloc[train_idx], self.train[target_col].iloc[train_idx]
                X_valid, y_valid = self.train[train_cols].iloc[valid_idx], self.train[target_col].iloc[valid_idx]
                
                model = XGBClassifier(random_state=i+fold,**params)
                    
                model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=100,verbose=False)
                
                valid_preds = model.predict_proba(X_valid)
                oof_valid_preds[valid_idx] = valid_preds

                # there must be an error, beucause im getting an error which states, that columns are missing
                test_predict = model.predict_proba(self.test[train_cols])
                test_predict_list.append(test_predict)
                score = roc_auc_score(y_valid, valid_preds, multi_class="ovr")
                model_dict[f'fold_{fold}'] = model
                    
            oof_score = roc_auc_score(self.train[target_col], oof_valid_preds, multi_class="ovr")
            #print(f"The OOF auc score for iteration {i+1} is {oof_score}")
            scores.append(oof_score)
        return np.mean(np.array(scores))
    
    def find_params(self):
        study = optuna.create_study(direction='maximize')
        study.optimize(self.objective,250)

        best_params = study.best_params
        return best_params


In [31]:
model = Model(data,test)
best_params = model.find_params()
print(best_params)
#scores,preds = model.fit()
#print(f'The average roc-auc score is {np.mean(scores)}')

[I 2024-03-07 11:27:08,631] A new study created in memory with name: no-name-f2ffadc3-8932-49d8-bfee-378f630efaa5


In dem Objective drinnen!!!!!


[I 2024-03-07 11:32:34,001] Trial 0 finished with value: 0.8812640308557725 and parameters: {'n_estimators': 1299, 'learning_rate': 0.02153045958850214, 'gamma': 0.47742132283406324, 'subsample': 0.5487577541670697, 'colsample_bytree': 0.9413923657179666, 'max_depth': 29, 'min_child_weight': 4, 'reg_lambda': 0.15067877890771939, 'reg_alpha': 0.19685513181186007}. Best is trial 0 with value: 0.8812640308557725.


In dem Objective drinnen!!!!!


[W 2024-03-07 11:33:01,599] Trial 1 failed with parameters: {'n_estimators': 1539, 'learning_rate': 0.9949369651726591, 'gamma': 0.8393160237521843, 'subsample': 0.9390502694871767, 'colsample_bytree': 0.6617703823926788, 'max_depth': 30, 'min_child_weight': 11, 'reg_lambda': 7.249475230049934, 'reg_alpha': 2.244431242523958} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/tomruge/anaconda3/envs/env3.12.2/lib/python3.12/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_185937/1444000724.py", line 95, in objective
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=100,verbose=False)
  File "/home/tomruge/anaconda3/envs/env3.12.2/lib/python3.12/site-packages/xgboost/core.py", line 620, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/home/tomruge/anaconda3/envs/env3.12.2/li

KeyboardInterrupt: 