In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Directory that holds the competition CSV files. It is the only
# machine-specific value in this cell: change it here to run elsewhere.
DATA_DIR = '/home/tomruge/Schreibtisch/Data/Kaggle/playground-series-s4e3'

# Load the training and test tables from DATA_DIR.
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')


In [32]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import optuna
from sklearn.metrics import roc_auc_score

class model_class:
    """Multi-label random-forest wrapper with cross-validated Optuna tuning.

    The instance keeps a training frame (features plus the ``target`` columns)
    and a test frame, both with the ``drop`` columns removed. ``fit`` scores a
    parameter set by stratified cross-validation, ``find_params`` searches for
    good parameters with Optuna, and ``predict`` trains on all training rows
    and returns positive-class probabilities for the test frame.
    """

    def __init__(self, df_train, df_test, target=None, drop=None, random_state=None):
        """Store both frames with the ``drop`` columns removed.

        Parameters
        ----------
        df_train : pandas.DataFrame
            Training rows; must contain the ``target`` and ``drop`` columns.
        df_test : pandas.DataFrame
            Rows to predict on; must contain the ``drop`` columns.
        target : list of str, optional
            Label column names in ``df_train``. Defaults to an empty list.
        drop : list of str, optional
            Columns removed from both frames. Defaults to an empty list.
        random_state : int, optional
            Seed passed to every forest built by this instance and to the
            Optuna sampler. ``None`` (the default) leaves both unseeded.
        """
        # None sentinels instead of literal `[]` defaults: a list default is
        # created once and would be shared by every instance that omits it.
        self.target = [] if target is None else target
        self.drop = [] if drop is None else drop
        self.random_state = random_state
        self.df_train = df_train.drop(columns=self.drop)
        self.df_test = df_test.drop(columns=self.drop)

    def _new_forest(self, params):
        """Build an unfitted forest; a ``random_state`` in ``params`` wins over the instance seed."""
        return RandomForestClassifier(**{"random_state": self.random_state, **params})

    @staticmethod
    def _positive_class_proba(model, X):
        """Return ``model.predict_proba(X)`` reduced to column 1 of every target.

        The result has shape ``(n_samples, n_targets)``.
        """
        proba = model.predict_proba(X)
        if isinstance(proba, np.ndarray):
            # With a single output scikit-learn returns one array instead of a
            # list with one array per target; normalise to the list form.
            proba = [proba]
        return np.array([per_target[:, 1] for per_target in proba]).T

    def fit(self, params, n_splits=2):
        """Cross-validate a random forest and return its mean ROC AUC.

        Parameters
        ----------
        params : dict
            Keyword arguments for ``RandomForestClassifier``.
        n_splits : int, default 2
            Number of stratified folds.

        Returns
        -------
        float
            Mean of ``roc_auc_score`` over the folds.

        Notes
        -----
        ``self.model`` is left holding the forest fitted on the last fold.
        """
        features = self.df_train.drop(columns=self.target)
        labels = self.df_train[self.target]
        mskf = MultilabelStratifiedKFold(n_splits=n_splits, random_state=0, shuffle=True)
        roc_score = []
        for train_index, test_index in mskf.split(self.df_train, labels):
            # The splitter yields row positions, not index labels, so the rows
            # must be taken with .iloc; .loc is only equivalent while the frame
            # still carries its default 0..n-1 index.
            X_train, y_train = features.iloc[train_index], labels.iloc[train_index]
            X_test, y_test = features.iloc[test_index], labels.iloc[test_index]
            self.model = self._new_forest(params)
            self.model.fit(X_train, y_train)
            y_pred = self._positive_class_proba(self.model, X_test)
            roc_score.append(roc_auc_score(y_test, y_pred))
        return np.mean(roc_score)

    def predict(self, params):
        """Train on every training row and predict the test frame.

        Parameters
        ----------
        params : dict
            Keyword arguments for ``RandomForestClassifier``.

        Returns
        -------
        numpy.ndarray
            Column-1 probabilities, shape ``(len(df_test), n_targets)``.
        """
        # fit to all the data
        model = self._new_forest(params)
        model.fit(self.df_train.drop(columns=self.target), self.df_train[self.target])
        return self._positive_class_proba(model, self.df_test)

    def objective(self, trial):
        """Optuna objective: sample forest parameters and return their CV score."""
        # Older scikit-learn releases reject max_features larger than the
        # number of features, so the search range is capped at that count.
        n_features = self.df_train.drop(columns=self.target).shape[1]
        max_features_upper = max(1, min(32, n_features))
        # Search space for the random forest.
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 10, 100),
            "max_depth": trial.suggest_int("max_depth", 2, 32),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 32),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 32),
            "max_features": trial.suggest_int("max_features", 1, max_features_upper),
            "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
            "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        }
        return self.fit(params)

    def find_params(self, n_trials=100):
        """Run an Optuna study that maximises ``objective``.

        Parameters
        ----------
        n_trials : int, default 100
            Number of parameter sets to evaluate.

        Returns
        -------
        dict
            ``study.best_params`` of the finished study.
        """
        # TPESampler is what Optuna picks by default for a single objective;
        # it is created explicitly only so the seed can be passed through.
        sampler = optuna.samplers.TPESampler(seed=self.random_state)
        study = optuna.create_study(direction='maximize', sampler=sampler)
        study.optimize(self.objective, n_trials=n_trials, n_jobs=1)
        return study.best_params


In [None]:
# Hyper-parameter search: build the wrapper and let it run its Optuna study.
# NOTE(review): the label names (including 'K_Scatch') presumably mirror the
# CSV header exactly — verify against the data before "correcting" a spelling.
target_columns = [
    'Pastry',
    'Z_Scratch',
    'K_Scatch',
    'Stains',
    'Dirtiness',
    'Bumps',
    'Other_Faults',
]
# Columns removed from both frames before modelling.
drop_columns = ['id']

model = model_class(df_train, df_test, target=target_columns, drop=drop_columns)
params = model.find_params()
print(params)

In [None]:
# Predict with a fixed parameter set and write the result to CSV.
# NOTE(review): the label names (including 'K_Scatch') presumably mirror the
# CSV header exactly — verify against the data before "correcting" a spelling.
target_columns = ['Pastry','Z_Scratch','K_Scatch','Stains','Dirtiness','Bumps','Other_Faults']
drop_columns = ['id']
model = model_class(df_train, df_test, target_columns, drop_columns)
best_params = {'n_estimators': 94, 'max_depth': 17, 'min_samples_split': 21, 'min_samples_leaf': 28, 'max_features': 9, 'criterion': 'entropy', 'bootstrap': False}
# make predictions and save them
predicted_values = model.predict(params=best_params)
# Build the frame from the probabilities alone and add `id` as a separate
# column. Stacking the ids into the same NumPy array as the float
# probabilities forces one common dtype on every column, so the ids would not
# stay an integer column. Reusing `target_columns` also keeps the header in
# step with the labels the model was trained on.
predictions = pd.DataFrame(predicted_values, columns=target_columns)
# .to_numpy() makes the assignment positional instead of index-aligned.
predictions.insert(0, 'id', df_test['id'].to_numpy())

# save predictions
predictions.to_csv('/home/tomruge/Schreibtisch/Data/Kaggle/playground-series-s4e3/predictions_simplederandom_forest.csv', index = False)
