In [40]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Folder that holds the competition CSV files. Set the KAGGLE_S4E3_DATA_DIR
# environment variable to run the notebook on another machine; the default is
# the original local path, so existing runs behave exactly as before.
DATA_DIR = Path(os.environ.get(
    'KAGGLE_S4E3_DATA_DIR',
    '/home/tomruge/Schreibtisch/Data/Kaggle/playground-series-s4e3',
))

# load in data
df_train = pd.read_csv(DATA_DIR / 'train.csv')
df_test = pd.read_csv(DATA_DIR / 'test.csv')


In [102]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
import optuna
from sklearn.metrics import roc_auc_score

class model_class:
    """Multi-label decision-tree workflow: cross-validation, prediction, tuning.

    The training frame holds the feature columns together with the columns
    named in ``target``; the test frame holds the feature columns only. Columns
    named in ``drop`` are removed from both frames on construction.

    Attributes:
        df_train: training frame without the ``drop`` columns.
        df_test: test frame without the ``drop`` columns.
        target: names of the label columns inside ``df_train``.
        drop: names of the columns that were removed from both frames.
        model: estimator fitted on the last cross-validation fold (set by ``fit``).
    """

    def __init__(self, df_train, df_test, target=None, drop=None):
        """Store the frames (minus ``drop`` columns) and the label column names.

        Args:
            df_train: training DataFrame containing features and ``target`` columns.
            df_test: test DataFrame containing the feature columns.
            target: list of label column names; defaults to an empty list.
            drop: list of column names to remove from both frames; defaults
                to an empty list.
        """
        # ``None`` defaults: a literal ``[]`` default would be one list object
        # shared (and mutable) across every instance.
        self.target = [] if target is None else target
        self.drop = [] if drop is None else drop
        # DataFrame.drop returns new frames, so the caller's frames are untouched.
        self.df_train = df_train.drop(columns=self.drop)
        self.df_test = df_test.drop(columns=self.drop)

    @staticmethod
    def _positive_class_proba(per_target_proba):
        """Stack column 1 of each per-target probability array into (n_samples, n_targets).

        Assumes every target has two classes so that column 1 exists; for a
        multi-output tree ``predict_proba`` yields one array per target.
        """
        return np.array([proba[:, 1] for proba in per_target_proba]).T

    def fit(self, params, n_splits=2, verbose=True):
        """Cross-validate a ``DecisionTreeClassifier`` built from ``params``.

        Args:
            params: keyword arguments forwarded to ``DecisionTreeClassifier``.
            n_splits: number of multilabel-stratified folds.
            verbose: when true, print the ROC AUC of every fold.

        Returns:
            A ``(score, model)`` tuple: the mean ROC AUC over the folds and the
            estimator fitted on the last fold (also stored as ``self.model``).
        """
        features = self.df_train.drop(columns=self.target)
        labels = self.df_train[self.target]
        mskf = MultilabelStratifiedKFold(n_splits=n_splits, random_state=0, shuffle=True)
        fold_scores = []
        for train_index, test_index in mskf.split(features, labels):
            # The splitter yields positional indices, so select with iloc;
            # loc would pick wrong rows (or fail) on a non-default index.
            X_train, X_test = features.iloc[train_index], features.iloc[test_index]
            y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]
            self.model = DecisionTreeClassifier(**params)
            self.model.fit(X_train, y_train)
            # extract the probability of the positive class
            y_pred = self._positive_class_proba(self.model.predict_proba(X_test))
            fold_score = roc_auc_score(y_test, y_pred)
            if verbose:
                print(fold_score)
            fold_scores.append(fold_score)
        return float(np.mean(fold_scores)), self.model

    def predict(self, params):
        """Fit a fresh tree on all training rows and score the test frame.

        Args:
            params: keyword arguments forwarded to ``DecisionTreeClassifier``.

        Returns:
            Array of positive-class probabilities, one row per test sample and
            one column per entry of ``self.target``.
        """
        # fit to all the data
        model = DecisionTreeClassifier(**params)
        model.fit(self.df_train.drop(columns=self.target), self.df_train[self.target])
        return self._positive_class_proba(model.predict_proba(self.df_test))

    def objective(self, trial):
        """Optuna objective: sample tree hyper-parameters, return the mean CV ROC AUC."""
        # A max_features value above the number of feature columns cannot widen
        # the search (and may be rejected by some scikit-learn versions - TODO
        # confirm), so cap the upper bound at the feature count.
        n_features = self.df_train.shape[1] - len(self.target)
        max_features_cap = max(1, min(32, n_features))
        # params for optimizing the decision tree; they need to be compatible with each other
        params = {
            'max_depth': trial.suggest_int('max_depth', 1, 32),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 32),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 32),
            'max_features': trial.suggest_int('max_features', 1, max_features_cap),
            'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
            'splitter': trial.suggest_categorical('splitter', ['best', 'random'])
        }
        # verbose=False keeps the per-fold prints out of the trial log.
        score, _ = self.fit(params, verbose=False)
        return score

    def find_params(self, n_trials=100):
        """Run an Optuna study that maximizes ``objective``; return the best params dict."""
        study = optuna.create_study(direction='maximize')
        study.optimize(self.objective, n_trials=n_trials)
        return study.best_params


In [103]:
# Cross-validate with fixed hyper-parameters, then predict the test set and save.
target_columns = ['Pastry','Z_Scratch','K_Scatch','Stains','Dirtiness','Bumps','Other_Faults']
drop_columns = ['id']
model = model_class(df_train, df_test, target_columns, drop_columns)
# Hard-coded hyper-parameters (presumably taken from an earlier find_params() run - verify).
best_params = {'max_depth': 15,
 'min_samples_split': 21,
 'min_samples_leaf': 11,
 'max_features': 21,
 'criterion': 'entropy',
 'splitter': 'best'}
model.fit(params=best_params)

# make predictions and save them
predicted_values = model.predict(params=best_params)
# Build the frame column-wise: stacking the ids and the float probabilities into
# one NumPy array forces a single common dtype, which discards the integer id
# dtype. Reusing target_columns keeps the header in sync with the model's targets.
predictions = pd.DataFrame(predicted_values, columns=target_columns)
# to_numpy() sidesteps index alignment between df_test and the fresh frame.
predictions.insert(0, 'id', df_test['id'].to_numpy())

# save predictions
predictions.to_csv('/home/tomruge/Schreibtisch/Data/Kaggle/playground-series-s4e3/predictions_simpledeecisiontree.csv', index = False)


0.7924164204220878
0.7928286530414298


In [23]:
# find good params
target_columns = ['Pastry','Z_Scratch','K_Scatch','Stains','Dirtiness','Bumps','Other_Faults']
drop_columns = ['id']
# Instantiate the class by its name, model_class. Calling `model(...)` only
# worked while a class of that name was still alive in the kernel; on a fresh
# run `model` is undefined or an instance, so the call fails.
model = model_class(df_train, df_test, target_columns, drop_columns)
best_params = model.find_params()

[I 2024-03-12 17:49:59,690] A new study created in memory with name: no-name-63b4d013-bcca-4587-8e8f-f60ffa6a4ea1
[I 2024-03-12 17:50:00,585] Trial 0 finished with value: 0.6733550461589655 and parameters: {'max_depth': 25, 'min_samples_split': 26, 'min_samples_leaf': 9, 'max_features': 16, 'criterion': 'gini', 'splitter': 'best'}. Best is trial 0 with value: 0.6733550461589655.
[I 2024-03-12 17:50:01,336] Trial 1 finished with value: 0.6663140872415705 and parameters: {'max_depth': 17, 'min_samples_split': 5, 'min_samples_leaf': 32, 'max_features': 10, 'criterion': 'gini', 'splitter': 'best'}. Best is trial 0 with value: 0.6733550461589655.
[I 2024-03-12 17:50:02,012] Trial 2 finished with value: 0.6573009355186963 and parameters: {'max_depth': 21, 'min_samples_split': 20, 'min_samples_leaf': 13, 'max_features': 28, 'criterion': 'gini', 'splitter': 'random'}. Best is trial 0 with value: 0.6733550461589655.
[I 2024-03-12 17:50:02,643] Trial 3 finished with value: 0.6295247027954473 and