In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# load in data
df_train = pd.read_csv('/home/tomruge/Schreibtisch/Data/Kaggle/playground-series-s4e3/train.csv')
df_test = pd.read_csv('/home/tomruge/Schreibtisch/Data/Kaggle/playground-series-s4e3/test.csv')


In [6]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import optuna
from sklearn.metrics import roc_auc_score
class model_decision_tree:
    def __init__(self, train, test, target, drop = []):
        self.train = train
        self.test = test
        self.target = target
        self.drop = drop
    
    def fit(self, params = {}):
        # running fit multiple times for different validation and train sets to get a better estimate for the prediction
        predictions = []
        for i in range(5):
            print('i: ', i)
            # MultilabelStratifiedKFold splits 5 times, but the splits are not overlapping
            mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
            predictions_1_iteration = np.zeros((len(self.train),len(self.target)))
            #print('i: ', i)
            for fold, (train_index, val_index) in enumerate(mskf.split(self.train.drop(self.target, axis=1), self.train[self.target])):
                X_train, X_val = self.train.drop(self.target, axis=1).iloc[train_index], self.train.drop(self.target, axis=1).iloc[val_index]
                y_train, y_val = self.train[self.target].iloc[train_index], self.train[self.target].iloc[val_index]

                model = RandomForestClassifier(random_state = i+fold,n_jobs=-1,**params)
                model.fit(X_train, y_train)
                predictions_1_iteration[val_index] = model.predict(X_val)   
            predictions.append(predictions_1_iteration)
        # get the mean of the predictions
        mean_predictions = np.mean(predictions, axis = 0)
        '''print('shape of mean_predictions: ', mean_predictions.shape)
        print('mean unique values: ', np.unique(mean_predictions))
        print('mean_predictions: ', mean_predictions)'''
        # get roc auc score
        integral_score = roc_auc_score(self.train[self.target], mean_predictions)
        print('roc auc score: ', integral_score)
        return integral_score, mean_predictions   
     
    def prediction(self,params):
        predictions = []
        for i in range(10):
            mskf = MultilabelStratifiedKFold(n_splits=3, shuffle=True, random_state=42)
            for fold, (train_index, val_index) in enumerate(mskf.split(self.train.drop(self.target, axis=1), self.train[self.target])):
                X_train, X_val = self.train.drop(self.target, axis=1).iloc[train_index], self.train.drop(self.target, axis=1).iloc[val_index]
                y_train, y_val = self.train[self.target].iloc[train_index], self.train[self.target].iloc[val_index]
                model = RandomForestClassifier(random_state = i+fold,**params)
                model.fit(X_train, y_train)
                prediction = model.predict(self.test)
                predictions.append(prediction) 
        # predictions shape
        print('predictions shape: ', np.array(predictions).shape)           
        mean_predictions = np.mean(predictions, axis = 0)
        return mean_predictions
    
    def objective(self, trial) :
        # parameter search space
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 1, 20),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
            #'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2']),
            'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
            'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy'])
        }
        score, _ = self.fit(params)
        return score
    
    def find_params(self):
        print('start finding best params')
        study = optuna.create_study(direction='maximize')
        study.optimize(self.objective, n_trials=20, n_jobs = -1)
        return study.best_params


       

In [3]:
# find best params
target_columns = ['Pastry','Z_Scratch','K_Scatch','Stains','Dirtiness','Bumps','Other_Faults']
drop_columns = ['id']
model = model_decision_tree(df_train, df_test, target_columns, drop_columns)
best_params = model.find_params()

[I 2024-03-10 20:25:25,446] A new study created in memory with name: no-name-5a543416-7d59-465d-b0cd-da534280c497


start finding best params


[I 2024-03-10 20:26:46,993] Trial 0 finished with value: 0.6155516420606773 and parameters: {'n_estimators': 2, 'max_depth': 3, 'min_samples_split': 3, 'min_samples_leaf': 8, 'bootstrap': False, 'criterion': 'entropy'}. Best is trial 0 with value: 0.6155516420606773.
[I 2024-03-10 20:26:51,921] Trial 3 finished with value: 0.6674313035736413 and parameters: {'n_estimators': 6, 'max_depth': 8, 'min_samples_split': 5, 'min_samples_leaf': 8, 'bootstrap': False, 'criterion': 'gini'}. Best is trial 3 with value: 0.6674313035736413.
[I 2024-03-10 20:26:53,128] Trial 5 finished with value: 0.6125216333457579 and parameters: {'n_estimators': 8, 'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 5, 'bootstrap': True, 'criterion': 'entropy'}. Best is trial 3 with value: 0.6674313035736413.
[I 2024-03-10 20:26:57,125] Trial 2 finished with value: 0.6148162425616766 and parameters: {'n_estimators': 8, 'max_depth': 5, 'min_samples_split': 4, 'min_samples_leaf': 9, 'bootstrap': False, 'crit

In [9]:
# make predictions
target_columns = ['Pastry','Z_Scratch','K_Scatch','Stains','Dirtiness','Bumps','Other_Faults']
drop_columns = ['id']
best_params = {'n_estimators': 3, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 2, 'bootstrap': False, 'criterion': 'gini'}
# predict and save the predictions
model = model_decision_tree(df_train, df_test, target_columns, drop_columns)
predicted_values = model.fit(best_params)


i:  0
i:  1
i:  2
i:  3
i:  4
roc auc score:  0.6624443621663183


In [4]:
target_columns = ['Pastry','Z_Scratch','K_Scatch','Stains','Dirtiness','Bumps','Other_Faults']
drop_columns = ['id']
best_params = {'n_estimators': 3, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 2, 'bootstrap': False, 'criterion': 'gini'}
# predict and save the predictions
model = model_decision_tree(df_train, df_test, target_columns, drop_columns)
predicted_values = model.prediction(best_params)

# column 1 must be the id column and the following columns must be the predictions, id column should be integer
predictions = pd.DataFrame(np.column_stack((df_test['id'].astype('Int32'), predicted_values)), columns = ['id', 'Pastry','Z_Scratch','K_Scatch','Stains','Dirtiness','Bumps','Other_Faults'])
predictions.to_csv('/home/tomruge/Schreibtisch/Data/Kaggle/playground-series-s4e3/predictions_forest.csv', index = False)


predictions shape:  (30, 12814, 7)
