In [26]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore")

# Load the three encoded datasets; paths are relative to the working directory.
spambase_df, mushroom_df, segment_df = map(
    pd.read_csv,
    ("spambase_encoded.csv", "mushroom_encoded.csv", "segment_encoded.csv"),
)

# Display name -> DataFrame; every experiment below iterates over this mapping.
datasets = dict(
    Spambase=spambase_df,
    Mushroom=mushroom_df,
    Segment=segment_df,
)

# Sweep configuration shared by the experiments.
N_REPEATS = 10   # forests fitted per setting, seeded 0 .. N_REPEATS - 1
N_TREES = 100    # passed as n_estimators
# Candidate max_depth values (None is forwarded to the estimator unchanged).
DEPTHS = [1, 2, 4, 8, 12, 16, 20, 32, None]
# Candidate max_features values (ints, named strategies, or None).
FEATURES = [1, 2, 4, 10, 14, "sqrt", "log2", None]

records = []

# Experiment 1: sweep max_depth while max_features stays fixed at "sqrt".
# For every (dataset, depth) pair, N_REPEATS forests are fitted with seeds
# 0 .. N_REPEATS - 1 and their training / out-of-bag scores are summarised.
for name, df in datasets.items():
    y = df["class"]
    X = df.drop(columns=["class"])

    for d in DEPTHS:
        train_scores, oob_scores = [], []

        for seed in range(N_REPEATS):
            rf = RandomForestClassifier(
                n_estimators=N_TREES,
                max_depth=d,
                max_features="sqrt",
                bootstrap=True,
                oob_score=True,
                n_jobs=-1,
                random_state=seed,
            ).fit(X, y)
            # Score on the same data the forest was fitted on, plus the OOB score.
            train_scores.append(rf.score(X, y))
            oob_scores.append(rf.oob_score_)

        # One summary row per (dataset, depth); key order fixes the column order.
        row = {
            "Dataset": name,
            "Exp": "Depth",
            "max_depth": d,
            "max_features": "sqrt",
        }
        row["Train_mean"] = np.mean(train_scores)
        row["Train_std"] = np.std(train_scores)
        row["OOB_mean"] = np.mean(oob_scores)
        row["OOB_std"] = np.std(oob_scores)
        records.append(row)

# Pick, for each dataset, the max_depth whose mean OOB score is highest.
df_results = pd.DataFrame(records)
depth_rows = df_results[df_results["Exp"] == "Depth"]
# Row labels (into df_results) of the best-scoring depth row per dataset.
best_depths = depth_rows.groupby("Dataset")["OOB_mean"].idxmax()

best_depth_map = {}
for i in best_depths:
    dataset_name = df_results.loc[i, "Dataset"]
    val = df_results.loc[i, "max_depth"]
    # A missing value stands for the None setting; float depths are cast back
    # to int; anything else is kept as stored.
    best_depth_map[dataset_name] = (
        None if pd.isna(val)
        else int(val) if isinstance(val, float)
        else val
    )

In [48]:
# Show the Experiment 1 summary table (built from `records`).
df_results
# Optional CSV export, currently disabled:
#df_results.to_csv("df_results.csv", index=False)

Unnamed: 0,Dataset,Exp,max_depth,max_features,Train_mean,Train_std,OOB_mean,OOB_std
0,Spambase,Depth,1.0,sqrt,0.840448,0.007924836,0.835188,0.006977
1,Spambase,Depth,2.0,sqrt,0.893806,0.004686978,0.887959,0.004889
2,Spambase,Depth,4.0,sqrt,0.923734,0.001865744,0.917518,0.002263
3,Spambase,Depth,8.0,sqrt,0.954771,0.0006257848,0.937122,0.001329
4,Spambase,Depth,12.0,sqrt,0.976831,0.0007541557,0.945686,0.001433
5,Spambase,Depth,16.0,sqrt,0.989807,0.0006552842,0.950228,0.001994
6,Spambase,Depth,20.0,sqrt,0.99374,0.0003477505,0.952423,0.00081
7,Spambase,Depth,32.0,sqrt,0.999044,0.0001064764,0.954771,0.001334
8,Spambase,Depth,,sqrt,0.999304,8.693762e-05,0.95451,0.001385
9,Mushroom,Depth,1.0,sqrt,0.875037,0.01802774,0.873769,0.013884


In [30]:
# Experiment 2: sweep max_features, with max_depth fixed per dataset to the
# value stored in best_depth_map.
records2 = []

# Estimator settings that do not change inside this experiment.
forest_kwargs = dict(n_estimators=N_TREES, bootstrap=True, oob_score=True, n_jobs=-1)

for name, df in datasets.items():
    y = df["class"]
    X = df.drop(columns=["class"])
    best_d = best_depth_map[name]

    for f in FEATURES:
        oob_scores, train_scores = [], []

        for seed in range(N_REPEATS):
            rf = RandomForestClassifier(
                max_depth=best_d,
                max_features=f,
                random_state=seed,
                **forest_kwargs,
            )
            rf.fit(X, y)
            train_scores.append(rf.score(X, y))
            oob_scores.append(rf.oob_score_)

        # One summary row per (dataset, max_features) setting.
        row = {
            "Dataset": name,
            "Exp": "Features",
            "max_depth": best_d,
            "max_features": f,
        }
        row["Train_mean"] = np.mean(train_scores)
        row["Train_std"] = np.std(train_scores)
        row["OOB_mean"] = np.mean(oob_scores)
        row["OOB_std"] = np.std(oob_scores)
        records2.append(row)

In [50]:
# Collect the Experiment 2 rows into a table and display it.
df_results2 = pd.DataFrame(records2)
df_results2
# Optional CSV export, currently disabled:
#df_results2.to_csv("df_results2.csv", index=False)

Unnamed: 0,Dataset,Exp,max_depth,max_features,Train_mean,Train_std,OOB_mean,OOB_std
0,Spambase,Features,32,1,0.993023,0.000382056,0.952793,0.001032
1,Spambase,Features,32,2,0.996175,0.0003662747,0.954945,0.001322
2,Spambase,Features,32,4,0.998326,0.0003081384,0.955162,0.000542
3,Spambase,Features,32,10,0.999065,0.0002184281,0.953967,0.001174
4,Spambase,Features,32,14,0.999152,0.0001521408,0.951837,0.001163
5,Spambase,Features,32,sqrt,0.999044,0.0001064764,0.954771,0.001334
6,Spambase,Features,32,log2,0.998826,0.0002216483,0.95577,0.001519
7,Spambase,Features,32,,0.999022,0.000242998,0.948316,0.000884
8,Mushroom,Features,12,1,1.0,0.0,1.0,0.0
9,Mushroom,Features,12,2,1.0,0.0,1.0,0.0


In [38]:
# Experiment 3: full grid over max_depth x max_features for every dataset.
records3 = []

# Grid values used only by this experiment.
grid_depths = [1, 2, 4, 8, 16, 32, None]
grid_features = [1, 2, 4, 14, "sqrt", None]

for name, df in datasets.items():
    y = df["class"]
    X = df.drop(columns=["class"])

    for d in grid_depths:
        for f in grid_features:
            oob_scores, train_scores = [], []

            # Repeat the fit with seeds 0 .. N_REPEATS - 1 for this grid cell.
            for seed in range(N_REPEATS):
                rf = RandomForestClassifier(
                    n_estimators=N_TREES,
                    max_depth=d,
                    max_features=f,
                    bootstrap=True,
                    oob_score=True,
                    n_jobs=-1,
                    random_state=seed,
                ).fit(X, y)
                train_scores.append(rf.score(X, y))
                oob_scores.append(rf.oob_score_)

            # One summary row per (dataset, depth, max_features) cell.
            row = {
                "Dataset": name,
                "Exp": "Grid",
                "max_depth": d,
                "max_features": f,
            }
            row["Train_mean"] = np.mean(train_scores)
            row["Train_std"] = np.std(train_scores)
            row["OOB_mean"] = np.mean(oob_scores)
            row["OOB_std"] = np.std(oob_scores)
            records3.append(row)


In [39]:
# Collect the Experiment 3 grid rows into a table and display it.
df_results3 = pd.DataFrame(records3)
df_results3

Unnamed: 0,Dataset,Exp,max_depth,max_features,Train_mean,Train_std,OOB_mean,OOB_std
0,Spambase,Grid,1.0,1,0.682026,1.326796e-02,0.687090,0.013771
1,Spambase,Grid,1.0,2,0.761813,1.646988e-02,0.757509,0.015757
2,Spambase,Grid,1.0,4,0.814779,1.130139e-02,0.808911,0.011350
3,Spambase,Grid,1.0,14,0.857727,6.267805e-03,0.852184,0.007206
4,Spambase,Grid,1.0,sqrt,0.840448,7.924836e-03,0.835188,0.006977
...,...,...,...,...,...,...,...,...
121,Segment,Grid,,2,0.999567,1.110223e-16,0.970476,0.001092
122,Segment,Grid,,4,0.999567,1.110223e-16,0.973723,0.001410
123,Segment,Grid,,14,0.999567,1.110223e-16,0.975455,0.001043
124,Segment,Grid,,sqrt,0.999567,1.110223e-16,0.973723,0.001410


In [42]:
# Write the Experiment 3 table to df_results3.csv in the working directory,
# omitting the DataFrame index column.
df_results3.to_csv("df_results3.csv", index=False)