In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


def StackedHist(df, centers=None):
    """Show a stacked bar chart of row counts per center, split by morphology code.

    Rows of ``df`` are counted for every (``luogoTC_codificato``,
    ``morf_codificata``) pair. Each center becomes one bar on the x axis and
    each morphology code becomes one coloured segment of that bar.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``luogoTC_codificato`` and ``morf_codificata``.
        Non-missing ``morf_codificata`` values must be convertible with ``int``.
    centers : iterable of int, optional
        Center codes to place on the x axis, in order. Defaults to 1..14.
        Centers absent from ``df`` are drawn as empty (zero-height) bars.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    all_centers = list(range(1, 15)) if centers is None else list(centers)

    # Count rows per (center, morphology code); codes become columns.
    counts = df.groupby(['luogoTC_codificato', 'morf_codificata']).size().unstack(fill_value=0)

    # groupby already ignores rows with a missing code, so drop them here too:
    # int(NaN) would otherwise raise before anything is drawn.
    all_labels = sorted(set(map(int, df['morf_codificata'].dropna())))

    # Make sure every requested center and every code is present, padding with 0.
    counts = counts.reindex(index=all_centers, columns=all_labels, fill_value=0)

    fig, ax = plt.subplots(figsize=(12, 10))

    # Running top of each bar; the next segment is drawn starting from here.
    bottom = np.zeros(len(all_centers))

    colors = plt.cm.tab20.colors

    for i, label in enumerate(all_labels):
        heights = counts[label].to_numpy()
        # The palette has a fixed length: wrap around instead of raising
        # IndexError when there are more codes than colours.
        ax.bar(all_centers, heights, bottom=bottom, label=f'{label}', color=colors[i % len(colors)])
        bottom = bottom + heights

    ax.set_xticks(all_centers)
    ax.set_xticklabels(all_centers, rotation=45, ha='right')

    ax.set_xlabel('Center')
    ax.set_ylabel('Count')
    ax.set_title('Stacked Histogram by Center and Morf_Codificata')
    # Legend is anchored outside the axes so it does not cover the bars.
    ax.legend(title='Morf_Codificata', bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()
    plt.show()


In [58]:
from Components import Ingestion
def extract(PATH):
    """Read the Excel file at ``PATH`` and pass it through both Ingestion filters.

    The raw sheet goes to ``Ingestion.FirstFiltering``; that result goes to
    ``Ingestion.FilteringColumns_withLuogoTc``, whose output is returned.
    """
    raw_sheet = pd.read_excel(PATH)
    return Ingestion.FilteringColumns_withLuogoTc(Ingestion.FirstFiltering(raw_sheet))

# Run the ingestion pipeline on the source workbook; the result is kept in the
# notebook-level variable `db`.
db = extract("Dataset/db_29-07-2024_2_Ing_Petti.xlsx")

[2024-11-07 12:20:00,199: INFO: Ingestion: from first fitering using the tenere_finale attribute,  I obtained (1108, 184) patients]
[2024-11-07 12:20:00,201: INFO: Ingestion: from removing columns with too many Nans,I obtained (1046, 128) patients]


In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV,StratifiedKFold

# Load the cleaned dataset. Rows with luogoTC_codificato == 1 are used for the
# cross-validation below; all remaining rows are kept aside in `val_set`.
or_db = pd.read_excel("Dataset/Cleaned_dataset.xlsx")
# Explicit copy: the label is popped in place below and must not touch `or_db`.
db = or_db[or_db["luogoTC_codificato"] == 1].copy()
val_set = or_db[or_db["luogoTC_codificato"] != 1]

# Extract the label; `X` is the same object as `db`, now without "maligno".
y = db.pop("maligno")
X = db

X_train_folds = []
X_test_folds = []

# The per-fold CSV files are written here; create the directory up front so a
# fresh checkout does not fail on the first to_csv call.
os.makedirs("nested_CV", exist_ok=True)

# Configure the outer cross-validation (stratified on y, fixed seed).
cv_outer = StratifiedKFold(n_splits=5,shuffle=True, random_state=42)
# Containers that are initialised here and not filled in this cell.
outer_results = list()
best_params_per_fold = []
scores = pd.DataFrame(columns=['Outer Fold',"Grid-Search",'F1-score', 'Recall', 'Precision', 'Accuracy', 'Auc-Score', '#Features'])

for index, (train_ix, test_ix) in enumerate(cv_outer.split(X, y)):
    # Positional split of features and label for this fold.
    X_train, X_test = X.iloc[train_ix, :], X.iloc[test_ix, :]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

    # Label counts in the test fold.
    unique, counts = np.unique(y_test, return_counts=True)
    hist = dict(zip(unique, counts))
    print(f"Fold {index} - Test set class distribution: {hist}")
    X_test_outer = X_test
    X_test_folds.append(X_test)
    # Same breakdown for the `morf_codificata` column of the test fold
    # (the printed message reuses the class-distribution wording).
    morf = X_test_outer["morf_codificata"]
    unique, counts = np.unique(morf, return_counts=True)
    hist = dict(zip(unique, counts))
    print(f"Fold {index} - Test set class distribution: {hist}")

    # `assign` always builds a new frame, so adding the label can never leak
    # back into the `X_test` object already stored in X_test_folds.
    fold_df = X_test.assign(label=y_test)
    fold_df.to_csv(f"nested_CV/fold_{index}_test_set.csv", index=False)

    # NOTE(review): the frame appended to X_train_folds carries the "label"
    # column, while the frames in X_test_folds do not. Kept as-is because
    # later cells may rely on it - confirm whether this asymmetry is intended.
    fold_df = X_train.assign(label=y_train)
    X_train_folds.append(fold_df)
    fold_df.to_csv(f"nested_CV/fold_{index}_train_set.csv", index=False)


Fold 0 - Test set class distribution: {0: 85, 1: 23}
Fold 0 - Test set class distribution: {0: 53, 1: 29, 2: 5, 3: 8, 4: 10, 5: 3}
Fold 1 - Test set class distribution: {0: 85, 1: 23}
Fold 1 - Test set class distribution: {0: 53, 1: 26, 2: 5, 3: 7, 4: 11, 5: 6}
Fold 2 - Test set class distribution: {0: 84, 1: 23}
Fold 2 - Test set class distribution: {0: 57, 1: 25, 2: 5, 3: 9, 4: 9, 5: 2}
Fold 3 - Test set class distribution: {0: 84, 1: 23}
Fold 3 - Test set class distribution: {0: 58, 1: 21, 2: 4, 3: 12, 4: 7, 5: 5}
Fold 4 - Test set class distribution: {0: 84, 1: 23}
Fold 4 - Test set class distribution: {0: 60, 1: 21, 2: 3, 3: 9, 4: 11, 5: 3}
