In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import numpy as np
import matplotlib.pyplot as plt

# Column layout of the headerless data file: an identifier, the diagnosis
# label, then the same ten measurements repeated once per statistic suffix.
_measurements = (
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave_points', 'symmetry',
    'fractal_dimension',
)
_statistics = ('mean', 'se', 'worst')
column_names = ['id', 'diagnosis'] + [
    f'{measurement}_{statistic}'
    for statistic in _statistics
    for measurement in _measurements
]

# The file carries no header row, so the names are supplied explicitly.
# The identifier column is dropped right after loading.
data = pd.read_csv('wdbc.data', names=column_names, header=None).drop(columns='id')

# Replace the textual diagnosis with the integer codes from LabelEncoder.
le = LabelEncoder()
le.fit(data['diagnosis'])
data['diagnosis'] = le.transform(data['diagnosis'])  # 0=B, 1=M


In [None]:
# Separate the feature matrix from the encoded diagnosis target.
X = data.drop('diagnosis', axis=1)
y = data['diagnosis']

# Stratified 70/30 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the scaler on the real training rows only and scale BEFORE oversampling.
# SMOTE chooses the neighbours it interpolates between by distance, so on raw
# features the large-magnitude columns would dominate neighbour selection.
# X_train itself is left as the unscaled DataFrame.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Oversample the minority class in the scaled feature space.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train_scaled, y_train)

print(f"Forma train dopo SMOTE: {X_train_sm.shape}, Classi bilanciate: {np.bincount(y_train_sm)}")

In [None]:
import seaborn as sns

# --- Bar chart of the class counts in the resampled training labels ---
bar_fig, bar_ax = plt.subplots(figsize=(8, 6))
ax = sns.countplot(x=y_train_sm, ax=bar_ax)
ax.set_title('Distribuzione delle Classi nel Training Set dopo SMOTE')
ax.set_xlabel('Diagnosi (0: Benigno, 1: Maligno)')
ax.set_ylabel('Conteggio')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Benigno (0)', 'Maligno (1)'])

# Write each bar's count just above its top edge.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{int(bar_height)}',
        (bar_center, bar_height),
        xytext=(0, 5),
        textcoords='offset points',
        ha='center',
        va='center',
        fontsize=12,
        color='black',
    )

plt.show()

# --- Pie chart of the same class counts ---
counts = np.bincount(y_train_sm)
n_benign, n_malignant = counts[0], counts[1]
labels = [f'Benigno ({n_benign})', f'Maligno ({n_malignant})']

pie_fig, pie_ax = plt.subplots(figsize=(8, 8))
pie_ax.pie(
    counts,
    labels=labels,
    autopct='%1.1f%%',
    startangle=140,
    colors=['skyblue', 'lightcoral'],
)
pie_ax.set_title('Distribuzione a Torta delle Classi nel Training Set dopo SMOTE')
pie_ax.set_ylabel('')
plt.show()

In [None]:
# Feature names come from the un-resampled training frame; the same frame is
# pushed through the already-fitted scaler so both histograms share one scale.
cols = X_train.columns
X_train_scaled = scaler.transform(X_train)

# One panel per feature, laid out on a fixed 6 x 5 grid.
n_rows, n_cols = 6, 5
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, 18))
axes = axes.flatten()

# Options shared by the "before" and "after" histograms.
hist_options = dict(bins=30, density=True)

for idx in range(len(cols)):
    panel = axes[idx]
    panel.hist(X_train_scaled[:, idx], alpha=0.5, color='red', label='Prima', **hist_options)
    panel.hist(X_train_sm[:, idx], alpha=0.25, color='blue', label='Dopo SMOTE', **hist_options)
    panel.set_title(cols[idx], fontsize=9)
    panel.tick_params(axis='both', labelsize=8)

# A single legend on the first panel is enough: every panel uses the same colours.
axes[0].legend(fontsize=9)

fig.suptitle('Distribuzione delle features - Prima vs Dopo SMOTE', fontsize=16)
# Reserve the top 4% of the figure for the suptitle.
fig.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()


In [None]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from imblearn.pipeline import Pipeline as ImbPipeline

# Candidate classifiers, keyed by the short name used in printed output.
models = {
    'RF': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'GB': GradientBoostingClassifier(random_state=42),
    'LR': LogisticRegression(max_iter=100000, random_state=42)
}

# Hyper-parameter search space per model. 'LR' is a list of two spaces
# because the 'l1' penalty is only paired with the 'liblinear' solver here.
param_dists = {
    'RF': {
        'n_estimators': [10, 50, 100,],
        'max_depth': [10, 15, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'criterion': ['gini', 'entropy']
    },
    'KNN': {
        'n_neighbors': [10, 15, 20],
        'weights': ['uniform', 'distance'],
        'p': [1, 2]
    },
    'GB': {
        'n_estimators': [10, 50, 100],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [5, 10, 15],
        'subsample': [0.8, 1.0]
    },
    'LR': [
        {
            'penalty': ['l1', 'l2'],
            'C': [0.1, 1, 10, 100],
            'solver': ['liblinear']
        },
        {
            'penalty': ['l2'],
            'C': [0.1, 1, 10, 100],
            'solver': ['lbfgs']
        }
    ]
}

# Name of the classifier step inside the resampling pipeline.
_CLF_STEP = 'clf'


def _prefix_param_space(space, step=_CLF_STEP):
    """Return `space` with every parameter name routed to pipeline step `step`.

    Args:
        space: A dict mapping parameter names to candidate values, or a list
            of such dicts.
        step: Name of the pipeline step the parameters belong to.

    Returns:
        The same structure with each key rewritten as ``'<step>__<name>'``.
    """
    if isinstance(space, list):
        return [_prefix_param_space(sub_space, step) for sub_space in space]
    return {f'{step}__{param}': values for param, values in space.items()}


# Cross-validate on the REAL training rows (scaled, not oversampled). SMOTE is
# applied inside the pipeline, i.e. only to the training part of every fold.
# Searching on pre-oversampled data would put synthetic points interpolated
# from validation rows into the training folds and inflate the CV scores.
X_train_cv = scaler.transform(X_train)

best_models = {}
for name, model in models.items():
    print(f"--- {name} ---")
    pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        (_CLF_STEP, model),
    ])
    search = RandomizedSearchCV(
        pipeline,
        param_distributions=_prefix_param_space(param_dists[name]),
        n_iter=10,
        cv=10,
        scoring='accuracy',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    search.fit(X_train_cv, y_train)

    # The refit pipeline trained its classifier on the oversampled full
    # training set. Only that fitted classifier is stored, so `best_models`
    # still holds plain estimators that predict on scaled features.
    best_models[name] = search.best_estimator_.named_steps[_CLF_STEP]

    # Strip the pipeline prefix so the report shows the plain parameter names.
    best_params = {
        key.split('__', 1)[1]: value for key, value in search.best_params_.items()
    }
    print(f"Migliori parametri: {best_params}")

    # Evaluation on the held-out test set
    y_pred_test = best_models[name].predict(X_test)
    print("\nReport di classificazione sul test set:")
    print(classification_report(y_test, y_pred_test, target_names=['Benigno (0)', 'Maligno (1)']))
    print("\n" + "="*60 + "\n")

In [None]:
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import matplotlib.pyplot as plt

# Macro-averaged metrics pulled from each model's classification report.
metrics_to_plot = ['precision', 'recall', 'f1-score']
metrics_per_model = {}

for name, model in best_models.items():
    y_pred = model.predict(X_test)
    macro_avg = classification_report(
        y_test,
        y_pred,
        target_names=['Benigno (0)', 'Maligno (1)'],
        output_dict=True,
    )['macro avg']

    summary = {metric: macro_avg[metric] for metric in metrics_to_plot}
    summary['accuracy'] = accuracy_score(y_test, y_pred)
    metrics_per_model[name] = summary

# One list of scores per metric, ordered like `labels`.
labels = list(metrics_per_model)
precision_scores, recall_scores, f1_scores, accuracy_scores = (
    [metrics_per_model[label][key] for label in labels]
    for key in ('precision', 'recall', 'f1-score', 'accuracy')
)

x = np.arange(len(labels))
width = 0.2

# Grouped bars: four metrics side by side, centred on each model's tick.
fig, ax = plt.subplots(figsize=(16, 8))
bar_specs = [
    (-1.5, precision_scores, 'Precision', 'cornflowerblue'),
    (-0.5, recall_scores, 'Recall', 'lightcoral'),
    (0.5, f1_scores, 'F1-Score', 'lightgreen'),
    (1.5, accuracy_scores, 'Accuracy', 'gold'),
]
rects1, rects2, rects3, rects4 = [
    ax.bar(x + shift * width, scores, width, label=legend_name, color=colour)
    for shift, scores, legend_name, colour in bar_specs
]

ax.set(
    ylabel='Punteggio',
    title='Confronto delle Metriche per Modello sul Test Set',
)
ax.set_xticks(x)
ax.set_xticklabels(labels)
# Legend sits outside the plot area, to the right of the axes.
ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
# Headroom above 1.0 so the value labels are not clipped.
ax.set_ylim(0, 1.1)

for rects in (rects1, rects2, rects3, rects4):
    ax.bar_label(rects, padding=3, fmt='%.4f')

fig.tight_layout()
plt.show()