## Ensemble learning modeling
### Voting, Bagging, Boosting, Stacking


***Imports***

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.base import clone

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    BaggingClassifier,
    VotingClassifier,
    StackingClassifier
)

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Seed reused as `random_state` by the estimators and the CV splitter defined below.
RSEED = 42
# Also seed NumPy's legacy global random generator.
np.random.seed(RSEED)

**Leitura dos Datasets**

In [2]:

# Both prepared files are parsed with the same options. With
# keep_default_na=False, only the markers listed in na_values ("NULL" and the
# empty string) are read as NaN; pandas' other default NA tokens stay as text.
CSV_READ_OPTIONS = {
    "encoding": "latin-1",
    "keep_default_na": False,
    "na_values": ["NULL", ""],
}

train_df = pd.read_csv("train_data_prepared.csv", **CSV_READ_OPTIONS)
test_df = pd.read_csv("test_data_prepared.csv", **CSV_READ_OPTIONS)


***Inspeção dos Dados***

In [3]:
# Report the (rows, columns) shape of the training frame.
print("Train shape:", train_df.shape)

Train shape: (6812, 33)


In [4]:
# Report the (rows, columns) shape of the test frame.
print("Test shape:", test_df.shape)

Test shape: (1500, 32)


In [5]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,AVERAGE_FREE_FLOW_SPEED,AVERAGE_TIME_DIFF,AVERAGE_FREE_FLOW_TIME,LUMINOSITY,AVERAGE_TEMPERATURE,AVERAGE_ATMOSP_PRESSURE,AVERAGE_HUMIDITY,AVERAGE_WIND_SPEED,AVERAGE_CLOUDINESS,RAIN_INTENSITY,...,supermarket_peak_level,periodo_aulas,fim_de_semana,hour_sin,hour_cos,dow_sin,dow_cos,month_sin,month_cos,congestion_ratio
0,41.5,11.5,71.4,2,15.0,1019.0,100.0,3.0,0,0,...,0,0,0,0.9659258,-0.258819,0.433884,-0.900969,-0.8660254,-0.5,0.161064
1,41.7,48.3,87.4,2,21.0,1021.0,53.0,5.0,1,0,...,1,0,0,-0.5,-0.866025,-0.433884,-0.900969,-0.8660254,-0.5,0.552632
2,38.6,38.4,85.2,2,26.0,1014.0,61.0,4.0,0,0,...,1,0,1,-0.8660254,-0.5,-0.781831,0.62349,-1.0,-1.83697e-16,0.450704
3,37.4,61.0,94.1,2,18.0,1025.0,48.0,4.0,1,0,...,0,1,0,0.258819,-0.965926,0.781831,0.62349,0.8660254,0.5,0.648247
4,41.6,50.4,77.0,2,15.0,1008.0,82.0,10.0,0,0,...,1,1,0,1.224647e-16,-1.0,0.433884,-0.900969,1.224647e-16,-1.0,0.654545


***Separar features / target***


In [6]:
# Name of the label column; it is present in the training frame and removed
# from the feature matrix below.
TARGET_COL = "AVERAGE_SPEED_DIFF"

# Features are every training column except the target.
X = train_df.drop(columns=[TARGET_COL])
y = train_df[TARGET_COL]

# Fail early, with a readable message, if the test file lacks any training
# feature, instead of surfacing the problem only at predict time.
missing_in_test = X.columns.difference(test_df.columns)
if len(missing_in_test) > 0:
    raise ValueError(
        f"test_df is missing feature columns: {list(missing_in_test)}"
    )

# Select the test features by name so they are in exactly the same order as
# the columns the models are fitted on.
X_test = test_df[X.columns].copy()

In [7]:
# Weight per encoded target label (0-4); larger values make errors on that
# class cost more during fitting.
# NOTE(review): how these values were derived is not shown in this notebook.
class_weight = {0: 3.04, 1: 5.06, 2: 4.41, 3: 5.49, 4: 15.52}

# One weight per training row, looked up from the row's class label.
sample_weights = y.map(class_weight).to_numpy()

# A label that is absent from `class_weight` maps to NaN; stop here rather
# than passing NaN weights on to the estimators.
if pd.isna(sample_weights).any():
    unmapped_labels = y[pd.isna(sample_weights)].unique().tolist()
    raise ValueError(
        f"Target labels without an entry in class_weight: {unmapped_labels}"
    )

# Shuffled, stratified 5-fold splitter used by the evaluation helper.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=RSEED
)

#### Definir modelos


**Decision Tree**

In [8]:
# Shallow, class-weighted decision tree (entropy criterion, depth capped at 5,
# at least 10 samples per leaf).
dt_params = {
    "criterion": "entropy",
    "max_depth": 5,
    "min_samples_split": 2,
    "min_samples_leaf": 10,
    "class_weight": class_weight,
    "random_state": RSEED,
}
dt = DecisionTreeClassifier(**dt_params)


**Random Forest**

In [9]:
# Class-weighted random forest: 300 unbounded-depth trees, sqrt(n_features)
# candidate features per split, at least 3 samples per leaf, all cores.
rf_params = {
    "n_estimators": 300,
    "max_depth": None,
    "max_features": "sqrt",
    "min_samples_split": 2,
    "min_samples_leaf": 3,
    "class_weight": class_weight,
    "n_jobs": -1,
    "random_state": RSEED,
}
rf = RandomForestClassifier(**rf_params)


**XGBoost com peso**

In [10]:
# Gradient-boosted trees for the 5-class target. No class weighting is set on
# the estimator itself; any weighting has to come from `sample_weight` at fit
# time.
xgb_params = {
    # Problem definition: num_class matches the five labels in `class_weight`.
    "objective": "multi:softprob",
    "num_class": 5,
    "eval_metric": "mlogloss",
    # Boosting schedule: many shallow trees with a small learning rate.
    "n_estimators": 700,
    "learning_rate": 0.03,
    "max_depth": 3,
    "min_child_weight": 3,
    # Row / column subsampling per tree.
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    # Regularisation terms.
    "gamma": 0,
    "reg_alpha": 0,
    "reg_lambda": 1,
    # Runtime settings.
    "tree_method": "hist",
    "n_jobs": -1,
    "random_state": RSEED,
}
xgb = XGBClassifier(**xgb_params)


***Bagging***

In [11]:
# Base learner for the bag: a class-weighted decision tree with no depth limit
# set here.
bagging_base_tree = DecisionTreeClassifier(
    random_state=RSEED,
    class_weight=class_weight,
)

# 300 bootstrap-resampled copies of the base tree, trained on all cores.
bagging = BaggingClassifier(
    estimator=bagging_base_tree,
    n_estimators=300,
    bootstrap=True,
    n_jobs=-1,
    random_state=RSEED,
)


#### Comparação dos diferentes modelos

***Voting Ensemble***

In [12]:
# Majority-vote ensemble over the decision tree and the random forest.
# NOTE(review): with only two hard voters, every disagreement is a tie; per the
# scikit-learn docs ties are resolved by ascending class label. Consider a
# third voter or soft voting - confirm the intent.
voting_members = [
    ("dt", dt),
    ("rf", rf),
]
voting = VotingClassifier(estimators=voting_members, voting="hard")

***Stacking Ensemble***

In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
# Meta-learner that combines the base models' outputs.
stacking_meta_learner = LogisticRegression(
    random_state=RSEED,
    class_weight="balanced",
)

# Stack of the tree, the forest and XGBoost.
# - cv=5 is passed as an integer, so the shared `cv` splitter object is not
#   what produces the internal out-of-fold predictions here.
# - NOTE(review): no sample weights reach the base models through this
#   ensemble, so `xgb` is trained unweighted inside the stack - confirm that
#   is intended.
stacking = StackingClassifier(
    estimators=[
        ("dt", dt),
        ("rf", rf),
        ("xgb", xgb),
    ],
    final_estimator=stacking_meta_learner,
    cv=5,
    n_jobs=-1,
)


***Escolher o melhor***

In [15]:
def evaluate_cv(
    model,
    name,
    X,
    y,
    cv,
    sample_weights=None,
    use_sample_weight=False
):
    """Cross-validate ``model`` and report the weighted F1 score per fold.

    A fresh clone of ``model`` is fitted on each training split, so the
    estimator passed in is never fitted by this function.

    Parameters
    ----------
    model : estimator
        Unfitted estimator exposing ``fit`` and ``predict``.
    name : str
        Label printed in the report header.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target labels aligned row-by-row with ``X``.
    cv : splitter
        Object whose ``split(X, y)`` yields ``(train_idx, val_idx)`` arrays of
        row positions.
    sample_weights : array-like, optional
        One weight per row of ``X``, in row order. A NumPy array, list or
        pandas Series (with any index) is accepted.
    use_sample_weight : bool, default False
        When True and ``sample_weights`` is given, the training-fold weights
        are passed to ``fit`` as ``sample_weight``. Otherwise ``fit`` is called
        without weights.

    Returns
    -------
    tuple of float
        ``(mean, std)`` of the per-fold weighted F1 scores.

    Raises
    ------
    ValueError
        If weights are requested but their length differs from ``len(y)``.
    """
    # Convert once to a plain array so fold indices always select by position.
    # Indexing a pandas Series with the fold indices would be label-based and
    # could pick the wrong rows or raise a KeyError.
    weights = None
    if use_sample_weight and sample_weights is not None:
        weights = np.asarray(sample_weights)
        if len(weights) != len(y):
            raise ValueError(
                f"sample_weights has {len(weights)} entries but y has {len(y)} rows"
            )

    scores = []

    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Unfitted copy per fold: nothing learned on one split leaks into the next.
        model_fold = clone(model)

        if weights is not None:
            model_fold.fit(X_train, y_train, sample_weight=weights[train_idx])
        else:
            model_fold.fit(X_train, y_train)

        preds = model_fold.predict(X_val)
        scores.append(f1_score(y_val, preds, average="weighted"))

    scores = np.array(scores)

    print(f"== {name} ==")
    print("F1 por fold:", np.round(scores, 4))
    print("Média:", scores.mean())
    print("Std:", scores.std())
    print()

    return scores.mean(), scores.std()


#### Avaliar os modelos e treinar o melhor

In [16]:
# Evaluation plan: (results key, printed label, estimator, fit with sample weights?).
# NOTE(review): dt, rf and the bagging base tree already carry `class_weight`,
# and scikit-learn multiplies class_weight by any `sample_weight` passed to
# fit, so the weighted runs below apply the class weights twice. The same
# dt/rf inside Voting/Stacking are fitted without sample weights. Confirm this
# is intended before comparing the scores.
evaluation_plan = [
    ("DecisionTree", "Decision Tree", dt, True),
    ("RandomForest", "Random Forest", rf, True),
    ("XGBoost", "XGBoost", xgb, True),
    ("Bagging", "Bagging", bagging, True),
    ("Voting", "Voting", voting, False),
    ("Stacking", "Stacking", stacking, False),
]

# Maps each results key to its (mean F1, std F1) pair.
results = {}

for result_key, display_name, estimator, fit_weighted in evaluation_plan:
    if fit_weighted:
        fold_summary = evaluate_cv(
            estimator, display_name, X, y, cv,
            sample_weights, use_sample_weight=True
        )
    else:
        fold_summary = evaluate_cv(estimator, display_name, X, y, cv)
    results[result_key] = fold_summary


== Decision Tree ==
F1 por fold: [0.7481 0.7246 0.7416 0.7179 0.75  ]
Média: 0.7364313311565478
Std: 0.012858836848479795

== Random Forest ==
F1 por fold: [0.8146 0.7962 0.8013 0.7885 0.8048]
Média: 0.8010914877317779
Std: 0.008708000267300231

== XGBoost ==
F1 por fold: [0.8194 0.7928 0.7988 0.8106 0.8147]
Média: 0.8072497475090172
Std: 0.009936553629320184

== Bagging ==
F1 por fold: [0.8084 0.7904 0.7876 0.7828 0.8012]
Média: 0.7940956069236331
Std: 0.00936270525989885

== Voting ==
F1 por fold: [0.8016 0.7683 0.7903 0.7778 0.8047]
Média: 0.7885383134617869
Std: 0.013840758452950129

== Stacking ==
F1 por fold: [0.8248 0.7989 0.8088 0.7942 0.8161]
Média: 0.808567579739335
Std: 0.011112371315128728



In [17]:
# One row per model, best mean F1 first.
summary_rows = [
    {"Model": model_key, "F1_mean": f1_mean, "F1_std": f1_std}
    for model_key, (f1_mean, f1_std) in results.items()
]
results_df = pd.DataFrame(summary_rows, columns=["Model", "F1_mean", "F1_std"])
results_df = results_df.sort_values(by="F1_mean", ascending=False)

print("\nRESULTADOS FINAIS:\n")
print(results_df)



RESULTADOS FINAIS:

          Model   F1_mean    F1_std
5      Stacking  0.808568  0.011112
2       XGBoost  0.807250  0.009937
1  RandomForest  0.801091  0.008708
3       Bagging  0.794096  0.009363
4        Voting  0.788538  0.013841
0  DecisionTree  0.736431  0.012859


In [18]:
# The table is sorted by mean F1 descending, so the first row is the winner.
best_model_name = results_df["Model"].iloc[0]
print("Melhor modelo:", best_model_name)

# Lookup from the results key back to the (unfitted) estimator object.
models_map = {
    "DecisionTree": dt,
    "RandomForest": rf,
    "XGBoost": xgb,
    "Bagging": bagging,
    "Voting": voting,
    "Stacking": stacking
}

# Models that were cross-validated with per-row sample weights are refitted
# the same way on the full training set; the ensembles are fitted unweighted.
weighted_fit_models = ("DecisionTree", "RandomForest", "Bagging", "XGBoost")

best_model = models_map[best_model_name]

fit_kwargs = {}
if best_model_name in weighted_fit_models:
    fit_kwargs["sample_weight"] = sample_weights

best_model.fit(X, y, **fit_kwargs)


Melhor modelo: Stacking


***Ficheiro de submissão***


In [19]:
# Decode the integer class labels back to the category names written to the
# submission file.
reverse_mapping = {
    0: "None",
    1: "Low",
    2: "Medium",
    3: "High",
    4: "Very_High"
}

test_pred = best_model.predict(X_test)
test_labels = pd.Series(test_pred).map(reverse_mapping)

# A prediction that is not a key of `reverse_mapping` maps to NaN and would be
# written as an empty cell; stop instead of producing a broken file.
unmapped_mask = test_labels.isna()
if unmapped_mask.any():
    unknown_labels = pd.Series(test_pred)[unmapped_mask].unique().tolist()
    raise ValueError(
        f"Predicted labels without an entry in reverse_mapping: {unknown_labels}"
    )

# Row ids are 1-based and follow the order of the test rows.
submission = pd.DataFrame({
    "RowId": range(1, len(test_labels) + 1),
    "Speed_Diff": test_labels
})

output_name = "submission.csv"
submission.to_csv(output_name, index=False)
print("Guardado em:", output_name)


Guardado em: submission.csv


Accuracy Kaggle: 0.82222