# Poisonous Mushrooms: Experimentos

<!-- TODO
[x] Preprocess dataset
[x] Divide dataset into 10-fold
[x] Create train/val/test
[x] Select model
[x] Train k-fold cross validation
[x] Train hold-out
[x] Submission on Kaggle
-->

**Universidade Federal de Ouro Preto**

**PCC142 - Mineração de Dados - 2025/1**

**Prof. Anderson Almeida Ferreira**

**Victor G. Lima**

**Conteúdo:**    

- Importações e ambiente   
 
- Dataset    

- Particionamento do dataset em 10 partes (folds)    

- Criação de partições de treino, validação e teste com os 10 folds    

- Pré-processamento    

-  Função objetivo para busca de melhor combinação de hiper-parâmetros   

- Experimento em hold-out    

- Experimento em validação cruzada    

- Predição da partição de teste para submissão no Kaggle   

  - Treinando com todos os dados da partição original de treino  

  - Criando um classificador por votação com os modelos da validação cruzada   

    - Hard    
	
    - Soft  

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=false
	flat=false
	minLevel=2
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

---

**Este notebook apenas sumariza e executa os experimentos que são descritos e comentados no <a href="../docs/article.pdf">artigo</a>.**

---

## Importações e ambiente

In [None]:
import random
import warnings
import os
import time
import pickle
from collections import Counter

warnings.simplefilter(action="ignore", category=Warning)

%pip install numpy pandas optuna tqdm catboost scikit-learn

import numpy as np
import pandas as pd
import optuna
from optuna.visualization import plot_optimization_history
from tqdm import tqdm
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, matthews_corrcoef
from IPython.display import clear_output

from src.preprocessing import MushroomPreprocessor


# Hardware backend handed to CatBoost as `task_type` in the cells below.
DEVICE = "GPU"
# Seed recorded next to the results so a run can be traced back to it.
SEED = 27

# The run metadata is written into ./results, so make sure the folder exists
# instead of failing with FileNotFoundError on a fresh checkout.
os.makedirs("./results", exist_ok=True)

# Context managers close (and flush) the files deterministically; the bare
# `open(...).write(...)` form leaves that to the garbage collector.
with open("./results/device.txt", "w") as f:
    f.write(str(DEVICE))
with open("./results/seed.txt", "w") as f:
    f.write(str(SEED))

# Seed both global generators: `random.seed` only covers the stdlib module,
# whereas pandas falls back to NumPy's global state when no `random_state`
# is given.
random.seed(SEED)
np.random.seed(SEED)

clear_output()

## Dataset

In [None]:
# Kaggle training file; its row count is the reference for the percentages
# reported in the partition tables below.
data = pd.read_csv("./data/kaggle_mushroom/train.csv")
n_samples = data.shape[0]

# Secondary mushroom data (semicolon-separated), handed to the preprocessor later.
secondary_mushroom = pd.read_csv("./data/secondary_mushroom/secondary_data.csv", sep=";")

print(f"Number of samples: {n_samples}")
data.head()

Number of samples: 3116945


Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a


## Particionamento do dataset em 10 partes (folds)

In [None]:
# Shuffle the dataset. `random_state` pins the permutation so that the folds
# are identical on every fresh run; without it the cross-validation cell could
# resume from saved results/models that were built on a different shuffle.
data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)

# 10 folds: fold i receives rows i, i + 10, i + 20, ... of the shuffled data
folds = [data.iloc[i::10].reset_index(drop=True) for i in range(10)]

# Summary table: size of each fold, absolute and relative to the full dataset
pd.DataFrame(
    {
        "Fold": list(range(10)),
        "Instances": [len(fold) for fold in folds],
        "Percentage (%)": [len(fold) / n_samples * 100 for fold in folds],
    }
)

Unnamed: 0,Fold,Instances,Percentage (%)
0,0,311695,10.000016
1,1,311695,10.000016
2,2,311695,10.000016
3,3,311695,10.000016
4,4,311695,10.000016
5,5,311694,9.999984
6,6,311694,9.999984
7,7,311694,9.999984
8,8,311694,9.999984
9,9,311694,9.999984


## Criação de partições de treino, validação e teste com os 10 folds

In [None]:
# Hold-out split by fold: 0-6 -> train, 7 -> validation, 8-9 -> test
train_data = pd.concat(folds[0:7], ignore_index=True)
val_data = pd.concat(folds[7:8], ignore_index=True)
test_data = pd.concat(folds[8:10], ignore_index=True)

# Summary table: size of each partition, absolute and relative to the dataset
partitions = {"Train": train_data, "Validation": val_data, "Test": test_data}
pd.DataFrame(
    {
        "Partition": list(partitions),
        "Instances": [len(part) for part in partitions.values()],
        "Percentage (%)": [len(part) / n_samples * 100 for part in partitions.values()],
    }
)

Unnamed: 0,Partition,Instances,Percentage (%)
0,Train,2181863,70.000048
1,Validation,311694,9.999984
2,Test,623388,19.999968


## Pré-processamento

O mesmo pré-processamento realizado anteriormente.

In [None]:
preprocessor = MushroomPreprocessor(secondary_mushroom)

# Fit on the training partition only, then apply the same transformation to
# the validation and test partitions.
X_train, y_train = preprocessor.fit_transform(train_data)
X_val, y_val = preprocessor.transform(val_data)
X_test, y_test = preprocessor.transform(test_data)

# Persist the preprocessor only AFTER fitting; dumping it right after
# construction would store an unfitted object.
# NOTE(review): assumes `fit_transform` keeps the fitted state on the instance
# (scikit-learn convention) - confirm in src.preprocessing.
with open("./results/hold_out_preprocessor.pkl", "wb") as f:
    pickle.dump(preprocessor, f)

clear_output()
print("train", X_train.shape, y_train.shape)
print("val", X_val.shape, y_val.shape)
print("test", X_test.shape, y_test.shape)

train (2181863, 15) (2181863,)
val (311694, 15) (311694,)
test (623388, 15) (623388,)


##  Função objetivo para busca de melhor combinação de hiper-parâmetros

In [None]:
def create_objective(X_train, y_train, X_val, y_val, device="CPU"):
    """Build an Optuna objective that tunes a CatBoost classifier.

    Args:
        X_train, y_train: data the classifier is fitted on in every trial.
        X_val, y_val: data used as CatBoost `eval_set` and to score the trial.
        device: value forwarded to CatBoost as `task_type`.

    Returns:
        A function taking an `optuna.Trial` and returning the Matthews
        correlation coefficient of the fitted model on the validation data.
    """
    # Settings shared by every trial (not part of the search space).
    fixed_params = {
        "loss_function": "Logloss",
        "feature_border_type": "GreedyLogSum",
        "early_stopping_rounds": 50,
    }

    def objective(trial: optuna.Trial):
        # Hyper-parameters sampled for this trial.
        sampled_params = {
            "iterations": trial.suggest_categorical("iterations", [250, 500, 750, 1000]),
            "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
            "depth": int(trial.suggest_int("depth", low=4, high=12)),
            "l2_leaf_reg": int(trial.suggest_int("l2_leaf_reg", low=2, high=10)),
            "border_count": trial.suggest_categorical("border_count", [64, 128, 254]),
        }

        classifier = CatBoostClassifier(
            verbose=0,
            use_best_model=True,
            task_type=device,
            **sampled_params,
            **fixed_params,
        )
        classifier.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False)

        # The study maximises this score.
        return matthews_corrcoef(y_val, classifier.predict(X_val))

    return objective

## Experimento em hold-out

In [None]:
# Total trial budget of the hold-out study
n_trials = 100

# The study lives in a SQLite file, so re-running this cell resumes it
# (load_if_exists=True) instead of starting over.
study = optuna.create_study(
    study_name="train_val_test",
    storage="sqlite:///results/optuna.sqlite3",
    direction="maximize",
    load_if_exists=True,
)

objective = create_objective(X_train, y_train, X_val, y_val, device=DEVICE)

# Only run the trials still missing from the budget
remaining_trials = n_trials - len(study.get_trials())
study.optimize(objective, n_trials=remaining_trials, show_progress_bar=True)

clear_output()
print(f"Best trial: {study.best_trial.number}")
print(f"- MCC: {study.best_value}")
print(f"- Params: {study.best_params}")

Best trial: 76
- MCC: 0.9823813757931498
- Params: {'iterations': 1000, 'learning_rate': 0.0731971947212699, 'depth': 12, 'l2_leaf_reg': 7, 'border_count': 64}


In [None]:
# Reload the hold-out study from the SQLite storage and plot how the
# objective value evolved over the trials.
study = optuna.load_study(
    storage="sqlite:///results/optuna.sqlite3",
    study_name="train_val_test",
)
fig = plot_optimization_history(study).update_layout(width=1000, height=600)
fig.show()

In [None]:
# Train the final hold-out model with the best hyper-parameters of the study,
# timing the fit.
training_time = time.time()
model = CatBoostClassifier(verbose=100, task_type=DEVICE, **study.best_params)
model.fit(X_train, y_train)
training_time = time.time() - training_time

# `with` guarantees the pickle is flushed and the handle closed; passing a
# bare `open(...)` to `pickle.dump` leaks the file object.
with open("./results/hold_out_model.pkl", "wb") as f:
    pickle.dump(model, f)

# Evaluate on the held-out test partition, timing the prediction.
testing_time = time.time()
y_pred = model.predict(X_test)
testing_time = time.time() - testing_time

# Precision, recall and F1 are macro-averaged over the two classes.
acc = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="macro")
recall = recall_score(y_test, y_pred, average="macro")
f1 = f1_score(y_test, y_pred, average="macro")
mcc = matthews_corrcoef(y_test, y_pred)

# Single-row results table, also persisted as CSV.
hold_out_results = pd.DataFrame(
    {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "mcc": mcc,
        "training_time": training_time,
        "testing_time": testing_time,
        "hyperparameters": str(study.best_params),
    },
    index=["Hold-Out Experiment"],
)
hold_out_results.to_csv(
    "./results/hold_out_results.csv",
)

clear_output()
hold_out_results

Unnamed: 0,accuracy,precision,recall,f1,mcc,training_time,testing_time,hyperparameters
Hold-Out Experiment,0.991073,0.990906,0.991087,0.990996,0.981993,219.858307,1.46149,"{'iterations': 1000, 'learning_rate': 0.073197..."


## Experimento em validação cruzada

In [None]:
# Trial budget of the per-fold hyper-parameter search
n_trials = 25
os.makedirs("./results/10_fold_models/", exist_ok=True)

# Columns summarised as (mean, std) and as a sum in the "10-Fold" row.
score_columns = ["accuracy", "precision", "recall", "f1", "mcc"]
time_columns = ["training_time", "testing_time"]

# Position 0 of every column is a placeholder for the "10-Fold" summary row;
# it is filled in after the loop. Per-fold values follow from position 1 on.
results = {
    **{col: [(-1, -1)] for col in score_columns},
    **{col: [-1] for col in time_columns},
    "hyperparameters": ["-"],
}

# Resume support: reload the per-fold rows of a previous (partial) run.
# NOTE: precision/recall/F1 are macro-averaged below; delete a results file
# written with a different averaging before resuming, or the rows will mix
# two metric definitions.
if os.path.exists("./results/10_fold_results.csv"):
    df = pd.read_csv("./results/10_fold_results.csv", index_col=0)
    for col in score_columns + time_columns:
        results[col] += list(df[col].iloc[1:].astype(float))
    results["hyperparameters"] += list(df["hyperparameters"].iloc[1:].astype(str))

for i in tqdm(range(10)):

    # Skip folds already present in the reloaded results
    if i < len(results["accuracy"]) - 1:
        continue

    # Fold i is the test set, the next fold (cyclically) the validation set,
    # and the remaining eight folds the training set.
    j = (i + 1) % 10
    k_test = folds[i]
    k_val = folds[j]
    k_train = pd.concat([folds[k] for k in range(10) if k not in (i, j)], ignore_index=True)

    # A fresh preprocessor per fold, fitted on that fold's training data only
    k_preprocessor = MushroomPreprocessor(secondary_mushroom)
    k_X_train, k_y_train = k_preprocessor.fit_transform(k_train)
    k_X_val, k_y_val = k_preprocessor.transform(k_val)
    k_X_test, k_y_test = k_preprocessor.transform(k_test)

    # Start every fold from an empty study; a leftover study of an
    # interrupted run is dropped first.
    try:
        optuna.delete_study(study_name=f"10_fold-{i}", storage="sqlite:///results/optuna.sqlite3")
    except KeyError:
        # No previous study for this fold - nothing to delete
        pass

    k_study = optuna.create_study(
        direction="maximize",
        study_name=f"10_fold-{i}",
        storage="sqlite:///results/optuna.sqlite3",
        load_if_exists=False,
    )

    objective = create_objective(k_X_train, k_y_train, k_X_val, k_y_val, device=DEVICE)
    k_study.optimize(objective, n_trials=n_trials - len(k_study.get_trials()), show_progress_bar=True)

    # Refit with the best hyper-parameters, timing the fit
    training_time = time.time()
    k_model = CatBoostClassifier(verbose=0, task_type=DEVICE, **k_study.best_params)
    k_model.fit(k_X_train, k_y_train)
    training_time = time.time() - training_time

    # `with` closes the handle deterministically (a bare `open` leaks it)
    with open(f"./results/10_fold_models/{i}.pkl", "wb") as f:
        pickle.dump(k_model, f)

    testing_time = time.time()
    y_pred = k_model.predict(k_X_test)
    testing_time = time.time() - testing_time

    # Macro averaging, matching the hold-out experiment so that both result
    # tables report the same metric definitions.
    results["accuracy"].append(accuracy_score(k_y_test, y_pred))
    results["precision"].append(precision_score(k_y_test, y_pred, average="macro"))
    results["recall"].append(recall_score(k_y_test, y_pred, average="macro"))
    results["f1"].append(f1_score(k_y_test, y_pred, average="macro"))
    results["mcc"].append(matthews_corrcoef(k_y_test, y_pred))
    results["training_time"].append(training_time)
    results["testing_time"].append(testing_time)
    results["hyperparameters"].append(str(k_study.best_params))

    # Checkpoint after every fold so an interrupted run can be resumed
    pd.DataFrame(results, index=["10-Fold"] + list(range(i + 1))).to_csv("./results/10_fold_results.csv")
    clear_output()

# Fill the summary row: (mean, std) for the scores, total for the timings
for col in score_columns:
    fold_scores = [float(x) for x in results[col][1:]]
    results[col][0] = (float(np.mean(fold_scores)), float(np.std(fold_scores)))
for col in time_columns:
    results[col][0] = float(np.sum([float(x) for x in results[col][1:]]))

results = pd.DataFrame(results, index=["10-Fold"] + list(range(10)))
results.to_csv("./results/10_fold_results.csv")

clear_output()
results

Unnamed: 0,accuracy,precision,recall,f1,mcc,training_time,testing_time,hyperparameters
10-Fold,"(0.990923484367636, 0.00017775370332674957)","(0.9926272239768709, 0.0002466145347177362)","(0.9907697090487352, 0.0002587625741835391)","(0.991697560739215, 0.00016804781688462914)","(0.98168992179833, 0.0003563422180865093)",1860.814378,6.825309,-
0,0.990815,0.992118,0.991089,0.991603,0.981467,243.413909,0.791862,"{'iterations': 1000, 'learning_rate': 0.098606..."
1,0.99085,0.992861,0.990391,0.991624,0.981546,182.513112,0.596778,"{'iterations': 750, 'learning_rate': 0.0555347..."
2,0.990658,0.992315,0.990541,0.991427,0.981165,166.033108,0.598139,"{'iterations': 1000, 'learning_rate': 0.096886..."
3,0.991145,0.992679,0.99117,0.991924,0.982126,198.919532,0.723571,"{'iterations': 1000, 'learning_rate': 0.066999..."
4,0.991168,0.992917,0.990919,0.991917,0.982184,198.676279,0.756084,"{'iterations': 1000, 'learning_rate': 0.099684..."
5,0.99077,0.992707,0.990408,0.991556,0.981381,145.307872,0.615124,"{'iterations': 1000, 'learning_rate': 0.094257..."
6,0.990715,0.99241,0.990619,0.991514,0.981267,166.056314,0.673215,"{'iterations': 1000, 'learning_rate': 0.095079..."
7,0.990946,0.992695,0.990765,0.991729,0.981731,181.432734,0.662414,"{'iterations': 750, 'learning_rate': 0.0971849..."
8,0.991091,0.992822,0.990857,0.991839,0.982033,196.253646,0.707156,"{'iterations': 1000, 'learning_rate': 0.073602..."


## Predição da partição de teste para submissão no Kaggle

### Treinando com todos os dados da partição original de treino

In [None]:
# Final model for the submission: fit on all three hold-out partitions
# together, using the best hyper-parameters of the hold-out study.
kaggle_X_train = np.concatenate((X_train, X_val, X_test))
kaggle_y_train = np.concatenate((y_train, y_val, y_test))

study = optuna.load_study(
    storage="sqlite:///results/optuna.sqlite3",
    study_name="train_val_test",
)
kaggle_model = CatBoostClassifier(verbose=50, task_type=DEVICE, **study.best_params)
kaggle_model.fit(kaggle_X_train, kaggle_y_train, verbose=False)

# NOTE: from here on `test_data`, `X_test` and `y_test` are rebound to the
# Kaggle test file, so re-running this cell without re-running the
# pre-processing cell would train on the wrong arrays.
test_data = pd.read_csv("data/kaggle_mushroom/test.csv")
X_test, y_test, ids_test = preprocessor.transform(test_data, return_ids=True)

# Order the rows by id before predicting
sorted_indices = np.argsort(ids_test)
X_test, ids_test = X_test[sorted_indices], ids_test[sorted_indices]
y_pred = kaggle_model.predict(X_test)

# Map the numeric predictions back to the class letters of the submission file
class_letters = {0: "e", 1: "p"}
solution = pd.DataFrame({"id": ids_test, "class": y_pred})
solution["class"] = solution["class"].map(class_letters)
solution.to_csv("results/submission.csv", index=False)

clear_output()
solution.head()

Unnamed: 0,id,class
0,3116945,e
1,3116946,p
2,3116947,p
3,3116948,p
4,3116949,e


### Criando um classificador por votação com os modelos da validação cruzada

#### Hard

In [None]:
# Load the models saved by the cross-validation cell
models = []
for i in range(10):
    with open(f"results/10_fold_models/{i}.pkl", "rb") as f:
        models.append(pickle.load(f))

# Predictions of every model, arranged as (n_samples, n_models)
all_predictions = np.array([model.predict(X_test) for model in models]).T

# Hard voting, vectorised: count the votes per label for all samples at once
# instead of building a Counter per row in a Python loop.
labels = np.unique(all_predictions)  # sorted ascending
vote_counts = np.stack(
    [(all_predictions == label).sum(axis=1) for label in labels],
    axis=1,
)  # (n_samples, n_labels)

# `argmax` returns the first maximum, so scan the labels from highest to
# lowest: a tie is then resolved in favour of the highest class (1: poisonous).
winner = len(labels) - 1 - np.argmax(vote_counts[:, ::-1], axis=1)
y_pred_voting_hard = labels[winner]

solution_voting_hard = pd.DataFrame({"id": ids_test, "class": y_pred_voting_hard})
solution_voting_hard["class"] = solution_voting_hard["class"].map({0: "e", 1: "p"})
solution_voting_hard.to_csv("results/solution_voting_hard.csv", index=False)

clear_output()
solution_voting_hard.head()

Unnamed: 0,id,class
0,3116945,e
1,3116946,p
2,3116947,p
3,3116948,p
4,3116949,e


#### Soft

In [None]:
# Load the models saved by the cross-validation cell
models = []
for fold_idx in range(10):
    with open(f"results/10_fold_models/{fold_idx}.pkl", "rb") as model_file:
        models.append(pickle.load(model_file))

# Class probabilities predicted by every model
all_probabilities = [fold_model.predict_proba(X_test) for fold_model in models]

# Soft voting: average the probabilities over the models and pick the class
# with the highest mean probability.
avg_probs = np.stack(all_probabilities).mean(axis=0)
y_pred_voting_soft = avg_probs.argmax(axis=1)

class_letters = {0: "e", 1: "p"}
solution_voting_soft = pd.DataFrame({"id": ids_test, "class": y_pred_voting_soft})
solution_voting_soft["class"] = solution_voting_soft["class"].map(class_letters)
solution_voting_soft.to_csv("results/solution_voting_soft.csv", index=False)

clear_output()
solution_voting_soft.head()

Unnamed: 0,id,class
0,3116945,e
1,3116946,p
2,3116947,p
3,3116948,p
4,3116949,e
