# Especialização em Inteligência Artificial

**Aprendizado de Máquina - Aula 4.5 (extra): Implementação das abordagens de comitês**

Código de exemplo desenvolvido pelo docente [Adriano Rivolli](mailto:rivolli@utpfr.edu.br)

*O código apresenta uma implementação das estratégias de comitês*

In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Carregando o dataset e separando o conjunto de treinamento/teste
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Bagging

In [2]:
dtmodel = DecisionTreeClassifier(random_state=0)
dtmodel.fit(X_train, y_train)

y_pred = dtmodel.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Decision Tree Accuracy:", accuracy)

Decision Tree Accuracy: 0.9385964912280702


In [3]:
# Numero de modelos usados pelo comitê
n_estimators = 10

# Lista dos modelos
base_models = []

# Treinamento dos modelos bases
for i in range(n_estimators):
    # Fazendo a amostragem com repetição (Boosting)
    indices = np.random.choice(len(X_train), size=len(X_train), replace=True)
    X_train_sampled, y_train_sampled = X_train[indices], y_train[indices]

    # Criando o modelo base (decision tree)
    base_model = DecisionTreeClassifier(random_state=i)
    base_model.fit(X_train_sampled, y_train_sampled)

    # Adicionando o modelo a lista de modelos
    base_models.append(base_model)

# Fazendo as predições
def bagging_predict(base_models, X):
    predictions = np.zeros((X.shape[0], len(base_models)), dtype=int)
    for i, base_model in enumerate(base_models):
        predictions[:, i] = base_model.predict(X)

    # Fazendo as predições usando o voto majoritário (classe mais recorrente)
    aggregated_predictions = pd.DataFrame(predictions).mode(axis=1).iloc[:, 0]

    return aggregated_predictions

# Usando um conjunto de teste
bagging_predictions = bagging_predict(base_models, X_test)

# Calculando a acurácia
bagging_accuracy = accuracy_score(y_test, bagging_predictions)
print("Bagging Classifier Accuracy:", bagging_accuracy)

Bagging Classifier Accuracy: 0.9385964912280702


## Boosting

In [4]:
# Numero de modelos usados pelo comitê
n_estimators = 10

# Inicializando os pesos de cada instancia
weights = np.ones(len(X_train)) / len(X_train)

# Lista dos modelos e dos seus pesos
base_models = []
base_models_weights = []

# Treinamento dos modelos bases
for i in range(n_estimators):
    # Treinamento dos modelos
    base_model = DecisionTreeClassifier(max_depth=1, random_state=42)
    base_model.fit(X_train, y_train, sample_weight=weights)

    # Fazendo as predições do próprio conjunto de treinamento
    predictions = base_model.predict(X_train)

    # Calcula o erro de acordo com os pesos das instancias
    error = np.sum(weights * (predictions != y_train)) / np.sum(weights)

    # Calculando o peso atribuido ao modelo (quanto mais erros, menor o peso)
    base_model_weight = 0.5 * np.log((1 - error) / error)

    # Atualizando o peso das instancias
    weights *= np.exp(-base_model_weight * predictions * y_train)
    weights /= np.sum(weights)

    #Adicionando o modelo e o peso na lista de modelos e pesos
    base_models.append(base_model)
    base_models_weights.append(base_model_weight)

# Fazendo a predição com o conjunto de teste
def adaboost_predict(base_models, base_models_weights, X):
    predictions = np.zeros((X.shape[0], len(base_models)))
    for i, base_model in enumerate(base_models):
        predictions[:, i] = base_model.predict(X)

    weighted_predictions = np.dot(predictions, base_models_weights)
    aggregated_predictions = np.sign(weighted_predictions)
    return np.array([1 if pred > 0 else 0 for pred in aggregated_predictions])

adaboost_predictions = adaboost_predict(base_models, base_models_weights, X_test)

# Calculando a acurácia
adaboost_accuracy = accuracy_score(y_test, adaboost_predictions)
print("AdaBoost Classifier Accuracy:", adaboost_accuracy)

AdaBoost Classifier Accuracy: 0.9210526315789473


## Stacking

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

X_train_train, X_train_val, y_train_train, y_train_val = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

# Treinando os modelos
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_train, y_train_train)

lg = LogisticRegression(random_state=42, C=100, max_iter=5000)
lg.fit(X_train_train, y_train_train)

sv = SVC(random_state=42, C=0.1, probability=True)
sv.fit(X_train_train, y_train_train)

# Make predictions on the validation set
dt_score = dt.predict_proba(X_train_val)
lg_score = lg.predict_proba(X_train_val)
sv_score = sv.predict_proba(X_train_val)

dt_pred = dt.classes_[dt_score.argmax(axis=1)]
lg_pred = lg.classes_[lg_score.argmax(axis=1)]
sv_pred = sv.classes_[sv_score.argmax(axis=1)]

print("Decision Tree Performance")
print(accuracy_score(y_train_val, dt_pred))
print()
print("Regressão Logistica Performance")
print(accuracy_score(y_train_val, lg_pred))
print()
print("SVM Performance")
print(accuracy_score(y_train_val, sv_pred))

Decision Tree Performance
0.9343065693430657

Regressão Logistica Performance
0.948905109489051

SVM Performance
0.9197080291970803


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=5000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
# Combinando as predições do modelo base em uma única matriz
X_meta = pd.concat([pd.DataFrame(dt_score, columns=['DT_0', 'DT_1']),
                    pd.DataFrame(lg_score, columns=['LR_0', 'LR_1']),
                    pd.DataFrame(sv_score, columns=['SVM_0', 'SVM_1'])], axis=1)

X_meta.round(2)

Unnamed: 0,DT_0,DT_1,LR_0,LR_1,SVM_0,SVM_1
0,1.0,0.0,1.00,0.00,1.00,0.00
1,0.0,1.0,0.00,1.00,0.17,0.83
2,0.0,1.0,0.00,1.00,0.08,0.92
3,0.0,1.0,0.00,1.00,0.03,0.97
4,1.0,0.0,0.98,0.02,0.36,0.64
...,...,...,...,...,...,...
132,0.0,1.0,0.00,1.00,0.04,0.96
133,1.0,0.0,1.00,0.00,1.00,0.00
134,0.0,1.0,0.00,1.00,0.04,0.96
135,0.0,1.0,0.00,1.00,0.04,0.96


In [7]:
# Treinando o meta-modelo com o resultado das predições
meta_model = DecisionTreeClassifier()
meta_model.fit(X_meta, y_train_val)

# Fazendo predições dos modelos bases
dt_score_new = dt.predict_proba(X_test)
lg_score_new = lg.predict_proba(X_test)
sv_score_new = sv.predict_proba(X_test)

dt_pred_new = dt.classes_[dt_score_new.argmax(axis=1)]
lg_pred_new = lg.classes_[lg_score_new.argmax(axis=1)]
sv_pred_new = sv.classes_[sv_score_new.argmax(axis=1)]

# Combinando as predições em um único vetor - gerando o meta-exemplo
X_new_meta = pd.concat([pd.DataFrame(dt_score_new, columns=['DT_0', 'DT_1']),
                        pd.DataFrame(lg_score_new, columns=['LR_0', 'LR_1']),
                        pd.DataFrame(sv_score_new, columns=['SVM_0', 'SVM_1'])], axis=1)

# Fazendo a predição do meta-modelo
y_new_pred = meta_model.predict(X_new_meta)

print("Stacking Classifier Accuracy:", accuracy_score(y_test, y_new_pred))

Stacking Classifier Accuracy: 0.9298245614035088
