In [None]:
import pandas as pd

# Column labels: 'id' and 'diagnosis' first, then ten measurements, each
# repeated with the suffixes 'mean', 'se' and 'worst' (all *_mean columns
# first, then all *_se, then all *_worst).
column_names = ['id', 'diagnosis'] + [
    f'{measure}_{stat}'
    for stat in ('mean', 'se', 'worst')
    for measure in (
        'radius', 'texture', 'perimeter', 'area', 'smoothness',
        'compactness', 'concavity', 'concave_points', 'symmetry',
        'fractal_dimension',
    )
]

# header=None: the first line of the file is read as data, so the column
# labels are supplied explicitly.
data = pd.read_csv('wdbc.data', names=column_names, header=None)

In [None]:
from pgmpy.estimators import HillClimbSearch, K2Score
from pgmpy.models import BayesianNetwork
from pgmpy.inference import VariableElimination
from sklearn.preprocessing import KBinsDiscretizer

# Predictors are every column except the record id and the class label.
features = data.drop(['id', 'diagnosis'], axis=1)
target = data.loc[:, 'diagnosis']

# Bin each predictor into 3 equal-width intervals, encoded as integer bin codes.
discretizer = KBinsDiscretizer(
    n_bins=3,
    encode='ordinal',
    strategy='uniform',
    subsample=None,
)
features_disc = pd.DataFrame(
    discretizer.fit_transform(features),
    index=data.index,
    columns=features.columns,
).astype(int)

# Discretized predictors and the untouched class label side by side.
data_disc = pd.concat([features_disc, target], axis=1)

In [None]:
# Learn the network structure with hill climbing, scoring candidates with K2.
scoring_method = K2Score(data_disc)
hc = HillClimbSearch(data_disc)
best_model = hc.estimate(scoring_method=scoring_method, max_indegree=5, max_iter=int(1e4))
print("Archi appresi:", best_model.edges())

# Build the parametric model from the learned structure.
# Constructing it from the edge list alone would drop any variable the search
# left without edges; re-adding the node set keeps `bn` consistent with
# `best_model`, so later node lookups (e.g. a Markov blanket query) do not fail
# on a missing node.
bn = BayesianNetwork(best_model.edges())
bn.add_nodes_from(best_model.nodes())
bn.fit(data_disc)

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

plt.figure(figsize=(40, 40))

# Directed graph holding the learned edges.
G = nx.DiGraph(best_model.edges())

# Highlight the target node in red; every other variable is light blue.
node_colors = ['red' if node == 'diagnosis' else 'skyblue' for node in G.nodes()]

# spring_layout starts from random positions; a fixed seed makes the figure
# identical on every run (Restart & Run All reproduces the same picture).
pos = nx.spring_layout(G, k=2.5, iterations=150, seed=42)
nx.draw(
    G,
    pos,
    with_labels=True,
    node_size=5000,
    node_color=node_colors,
    font_size=25,
    font_weight='bold',
    arrows=True,
    arrowsize=23,
)
plt.title("Struttura della Rete Bayesiana Appresa", size=23)
plt.show()

In [None]:
# Markov blanket of the target in the learned network. Sorted so the printed
# list and the column order of `data_ridotto` are the same on every run.
markov_blanket = sorted(bn.get_markov_blanket('diagnosis'))

# Number of candidate predictors. The raw file has 32 columns, but 'id' and
# 'diagnosis' are not features, so the count of eliminated features must be
# taken from the predictor frame rather than from the raw column count.
n_candidate_features = len(features.columns)

print(f"La Markov Blanket di 'diagnosis' è: {markov_blanket}")
print(f"\nNumero di features nella Markov Blanket: {len(markov_blanket)}")
print(f"\nSono state eliminate {n_candidate_features - len(markov_blanket)} features.")
print("\nQueste sono le uniche variabili necessarie per predire la diagnosi secondo il modello appreso.")
print("Le altre variabili possono essere considerate ridondanti per questo scopo.")

# Columns kept for the reduced model: the blanket plus the target itself.
features_ridotte = markov_blanket.copy()
if 'diagnosis' not in features_ridotte:
    features_ridotte.append('diagnosis')

data_ridotto = data_disc[features_ridotte]

In [None]:
# Subgraph induced by the target and its Markov blanket.
G_ridotto = G.subgraph(features_ridotte)

plt.figure(figsize=(20, 20))

# Highlight the target node in red; every other variable is light blue.
node_colors_ridotto = ['red' if node == 'diagnosis' else 'skyblue' for node in G_ridotto.nodes()]

# spring_layout starts from random positions; a fixed seed makes the figure
# identical on every run.
pos_ridotto = nx.spring_layout(G_ridotto, k=3.5, iterations=150, seed=42)
nx.draw(
    G_ridotto,
    pos_ridotto,
    with_labels=True,
    node_size=6000,
    node_color=node_colors_ridotto,
    font_size=20,
    font_weight='bold',
    arrows=True,
    arrowsize=20,
)
plt.title("Sottografo della Rete Bayesiana: (Markov Blanket)", size=25)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Keep only the part of the learned structure that involves the target and
# its Markov blanket.
best_model_ridotto = bn.subgraph(features_ridotte)
print("Archi del modello ridotto:", best_model_ridotto.edges())

X_ridotto = data_ridotto.drop('diagnosis', axis=1)
# Integer-encode the class labels for the sklearn metrics; the network itself
# is trained on, and predicts, the original labels.
le = LabelEncoder()
y_ridotto = pd.Series(le.fit_transform(data_ridotto['diagnosis']), index=data_ridotto.index, name='diagnosis')

X_train, X_test, y_train, y_test = train_test_split(X_ridotto, y_ridotto, test_size=0.3, random_state=42)

# Estimate the CPDs from the training rows only, so the held-out rows scored
# below never influence the parameters. Fitting on the full frame before the
# split would leak the test set into the model and inflate the metrics.
train_ridotto = data_ridotto.loc[X_train.index]
# Declare every state observed in the full frame: a bin that happens to be
# absent from the training rows must still be a known state at prediction time.
state_names = {col: sorted(data_ridotto[col].unique()) for col in data_ridotto.columns}
bn_ridotto = BayesianNetwork(best_model_ridotto.edges())
bn_ridotto.fit(train_ridotto, state_names=state_names)

# Predict the missing 'diagnosis' column for the held-out rows and map the
# predicted labels to the same integer codes as y_test.
y_pred_labels = bn_ridotto.predict(X_test)
y_pred = le.transform(y_pred_labels['diagnosis'])

accuracy = accuracy_score(y_test, y_pred)
# Text form for printing, dict form for the per-class lookups used when plotting.
report_str = classification_report(y_test, y_pred, target_names=le.classes_)
report = classification_report(y_test, y_pred, target_names=le.classes_, output_dict=True)

print(f"Accuratezza del modello ridotto: {accuracy:.4f}\n")
print("Report del modello ridotto:\n", report_str)


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Class labels in the encoder's order; they are the keys of the dict-form report.
labels = le.classes_


def _scores_for(metric):
    """Return the value of `metric` for every class, in `labels` order."""
    return [report[cls][metric] for cls in labels]


precision = _scores_for('precision')
recall = _scores_for('recall')
f1 = _scores_for('f1-score')

# One group of three side-by-side bars per class.
x = np.arange(len(labels))
width = 0.25

fig, ax = plt.subplots(figsize=(12, 8))
rects1 = ax.bar(x - width, precision, width, label='Precision')
rects2 = ax.bar(x, recall, width, label='Recall')
rects3 = ax.bar(x + width, f1, width, label='F1-Score')

ax.set(ylabel='Punteggio', title='Metriche di Valutazione per Classe del Modello Ridotto')
ax.set_xticks(x)
ax.set_xticklabels(['Benigno (B)', 'Maligno (M)'])

# Overall accuracy as a horizontal reference line, included in the legend.
ax.axhline(y=accuracy, color='r', linestyle='--', label=f'Accuracy: {accuracy:.2f}')
ax.legend()

# Print each bar's value just above it.
for bars in (rects1, rects2, rects3):
    ax.bar_label(bars, padding=3, fmt='%.2f')

fig.tight_layout()
# Upper limit above 1.0 leaves room for the value labels on the tallest bars.
ax.set_ylim(0, 1.1)
plt.show()