# ML for metastasis

ML classifiers for metastatic vs non-metastatic protein sequences.

1.   Get fasta sequences and convert them into TXT (metastatic and non-metastatic proteins)
2.   Calculate molecular descriptors (dataset for ML)
3.   Build ML classifiers for metastatic and non-metastatic protein sequences (split dataset, build models, feature selection, model comparison)
4. Statistical analysis



In [None]:
import os
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive at /content/drive (this cell relies on the google.colab package)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# NOTE: later cells build file names as dsPath+'<file name>', so the value must end with a path separator
dsPath =  # set path to dataset subfolder

In [None]:
# Sanity check: show every entry of the dataset folder, one per line
for entry in os.listdir(dsPath):
    print(entry)

In [None]:
# NOTE: result file names are built as resPath+'ML_...', so the value must end with a path separator
resPath =  # set path to results subfolder

## Convert FASTA to CSV format

Read all the sequences for the metastatic and non-metastatic proteins and convert them to CSV files in `ProteinDescription,Sequence` format.

In [None]:
!pip install biopython

In [None]:
from Bio import SeqIO

In [None]:
# Lazily read a FASTA file, handing back one record at a time
def fasta_generator(input_file):
    """Yield every record that SeqIO parses from *input_file* in 'fasta' format."""
    with open(input_file, 'r') as handle:
        yield from SeqIO.parse(handle, 'fasta')

In [None]:
# Two independent lists that will receive one entry per protein:
# metastasis_POS -> metastatic proteins, metastasis_NEG -> non-metastatic proteins
metastasis_POS, metastasis_NEG = [], []

In [None]:
# Convert the metastasis_POS FASTA file into a two-column CSV file and keep
# every "name,sequence" line in the metastasis_POS list
inFile = dsPath + 'MODELO_1_POSITIVE_metastasis.fasta'  # FASTA file to read
outFile = dsPath + 'metastasis_POS.csv'                 # CSV file to write

# The with-block closes the output file on exit
with open(outFile, 'w') as out_file:
    out_file.write('ProteinDescription,Sequence\n')  # header line
    for fasta in fasta_generator(inFile):
        name, sequence = fasta.id, str(fasta.seq)  # record id and sequence as plain text
        csv_line = f'{name},{sequence}'
        out_file.write(csv_line + '\n')            # one "name,sequence" line per record
        metastasis_POS.append(csv_line)            # same text, without the line break

In [None]:
# Sanity check of the metastasis_POS list: number of entries and the first entry
print("No of metastasis_POS sequences:", len(metastasis_POS))
print(f"First metastasis_POS sequence:\n{metastasis_POS[0]}")

In [None]:
# Convert the metastasis_NEG FASTA file into a two-column CSV file and keep
# every "name,sequence" line in the metastasis_NEG list
inFile = dsPath + 'MODELO_1_NEGATIVE_metastasis.fasta'  # FASTA file to read
outFile = dsPath + 'metastasis_NEG.csv'                 # CSV file to write

# The with-block closes the output file on exit
with open(outFile, 'w') as out_file:
    out_file.write('ProteinDescription,Sequence\n')  # header line
    for fasta in fasta_generator(inFile):
        name, sequence = fasta.id, str(fasta.seq)  # record id and sequence as plain text
        csv_line = f'{name},{sequence}'
        out_file.write(csv_line + '\n')            # one "name,sequence" line per record
        metastasis_NEG.append(csv_line)            # same text, without the line break

In [None]:
# Sanity check of the metastasis_NEG list: number of entries and the first entry
print("No of metastasis_NEG sequences:", len(metastasis_NEG))
print(f"First metastasis_NEG sequence:\n{metastasis_NEG[0]}")

## Check for common sequences in metastasis_POS and metastasis_NEG

In [None]:
# Entries of metastasis_NEG whose protein name also occurs in metastasis_POS
# NOTE(review): the section title speaks of common *sequences*, but the
# comparison below looks at the names only - confirm this is the intent
errors = []

# Every POS/NEG pair with the same name is printed, written to Errors.csv and
# stored in `errors`; the with-block closes the file on exit
with open(dsPath+"Errors.csv", 'w') as out_file:
    out_file.write('ErrNo,metastasis_POS,metastasis_NEG,metastasis_POS_seq, metastasis_NEG_seq\n')
    n = 0  # running number of matching pairs
    for pos_entry in metastasis_POS:
        name_metastasis_POS, seq_metastasis_POS = pos_entry.split(',')
        for neg_entry in metastasis_NEG:
            name_metastasis_NEG, seq_metastasis_NEG = neg_entry.split(',')
            if name_metastasis_POS != name_metastasis_NEG:
                continue  # different proteins, nothing to report
            n += 1
            print(n, name_metastasis_POS, name_metastasis_NEG, seq_metastasis_POS, seq_metastasis_NEG)
            out_file.write(f'{n},{name_metastasis_POS},{name_metastasis_NEG},{seq_metastasis_POS},{seq_metastasis_NEG}\n')
            errors.append(f'{name_metastasis_NEG},{seq_metastasis_NEG}')
    if n == 0:
        print("No errors!")
    else:
        print(errors)

## Get the list of sequences to use for descriptors calculation

In [None]:
# Keep only the sequence part of every "name,sequence" entry.
# Each entry must split into exactly two comma-separated fields.
listmPOS = [
    seq_seqPOS
    for name_seqPOS, seq_seqPOS in (entry.split(',') for entry in metastasis_POS)
]

listmNEG = [
    seq_seqNEG
    for name_seqNEG, seq_seqNEG in (entry.split(',') for entry in metastasis_NEG)
]

In [None]:
# Report how many sequences of each class enter the study
print(f"The study will use {len(listmPOS)} metastasic sequences vs. {len(listmNEG)} non metastasic sequences.")

## Molecular descriptors

In [None]:
# Install package for protein molecular descriptors
!pip install propy3

In [None]:
from propy import PyPro
import pandas as pd

### Descriptors from list (metastasis_POS and metastasis_NEG) to dataframes (AAC, DPC, Mix)

In [None]:
def to_dataframe(list_sequences, is_POS):
    """Compute the AAC, DPC and Mix (AAC + DPC) descriptor tables of a list of sequences.

    Parameters
    ----------
    list_sequences : iterable of str
        Protein sequences; each one is handed to ``PyPro.GetProDes``.
    is_POS : bool
        Truthy -> every row gets Class = 1, otherwise Class = 0.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_AAC, df_DPC, df_Mix)``: one row per sequence, one column per
        descriptor (named after the keys of the descriptor dictionaries) and
        'Class' as the last column. An empty input gives three empty frames
        that contain only the 'Class' column.
    """
    rows_AAC = []  # one AAC descriptor dictionary per sequence
    rows_DPC = []  # one DPC descriptor dictionary per sequence
    rows_Mix = []  # AAC entries followed by DPC entries, per sequence

    for sequence in list_sequences:
        DesObject = PyPro.GetProDes(sequence)             # descriptor calculator for this sequence
        amino_acid_composition = DesObject.GetAAComp()    # AAC descriptors (dictionary)
        dipeptide_composition = DesObject.GetDPComp()     # DPC descriptors (dictionary)
        rows_AAC.append(amino_acid_composition)
        rows_DPC.append(dipeptide_composition)
        rows_Mix.append({**amino_acid_composition, **dipeptide_composition})

    # Building the frames from dictionaries names the columns after the keys
    # and aligns every row by key, so it does not depend on the dictionaries
    # of the last sequence and also works when no sequence was given
    df_AAC = pd.DataFrame(rows_AAC)
    df_DPC = pd.DataFrame(rows_DPC)
    df_Mix = pd.DataFrame(rows_Mix)

    # class label as the last column (metastasis_POS = 1; metastasis_NEG = 0)
    class_label = 1 if is_POS else 0
    for frame in (df_AAC, df_DPC, df_Mix):
        frame['Class'] = class_label

    return df_AAC, df_DPC, df_Mix

# listmPOS
df_AAC_mPOS, df_DPC_mPOS, df_Mix_mPOS = to_dataframe(listmPOS, is_POS=True)

# listmNEG
df_AAC_mNEG, df_DPC_mNEG, df_Mix_mNEG = to_dataframe(listmNEG, is_POS=False)


In [None]:
# Checking dataframes

# Metastasis_POS
#df_AAC_mPOS
#df_DPC_mPOS
#df_Mix_mPOS

# Metastasis_NEG
#df_AAC_mNEG
#df_DPC_mNEG
#df_Mix_mNEG

## Datasets

We will create different datasets using the previous dataframes with descriptors for metastasis_POS and metastasis_NEG.

### Datasets with all descriptors (no feature selection, without normalization)

We have 6 dataframes with descriptors for metastasis_POS and metastasis_NEG:

*   for metastasis_POS: df_AAC_mPOS, df_DPC_mPOS, df_Mix_mPOS
*   for metastasis_NEG: df_AAC_mNEG, df_DPC_mNEG, df_Mix_mNEG

We will combine the descriptors of metastasis_POS and metastasis_NEG for each subset of descriptors: AAC, DPC and Mix (by concatenating the corresponding dataframes). We will obtain the datasets ds_AAC, ds_DPC and ds_Mix, both as dataframes and as CSV files.



In [None]:
# Stack the metastasis_POS rows on top of the metastasis_NEG rows, once per
# descriptor set (AAC, DPC, Mix)
ds_AAC, ds_DPC, ds_Mix = (
    pd.concat([positives, negatives], axis=0)
    for positives, negatives in (
        (df_AAC_mPOS, df_AAC_mNEG),
        (df_DPC_mPOS, df_DPC_mNEG),
        (df_Mix_mPOS, df_Mix_mNEG),
    )
)

# Save the raw datasets (all descriptors, no feature selection, no
# normalization) as dsAAC.csv, dsDPC.csv and dsMix.csv in the dataset folder
for descriptor_set, dataset in (('AAC', ds_AAC), ('DPC', ds_DPC), ('Mix', ds_Mix)):
    dataset.to_csv(dsPath + 'ds' + descriptor_set + '.csv', index=False)

In [None]:
# Checking dataframes
#ds_AAC
#ds_DPC
#ds_Mix

The datasets from dsAAC.csv, dsDPC.csv and dsMix.csv will be used in feature selections and directly to build ML models.

### Dataset description

In [None]:
# Bar chart with the number of proteins of each class in the DPC dataset
data = ds_DPC

class_column = data.columns[-1]  # the class label is stored in the last column
# value_counts() orders its result by frequency, so the bars are put into the
# fixed order class 0, class 1 to match the hard-coded tick labels below
class_count = data[class_column].value_counts().reindex([0, 1], fill_value=0)

plt.figure(figsize=(4, 5))

class_count.plot(kind='bar', color=['#87CEEB', '#98FB98'], width=0.35)

plt.xlabel('Class')
plt.ylabel('Amount of proteins')
plt.title('Data Distribution')

# position 0 -> class 0 (non-metastatic), position 1 -> class 1 (metastatic)
plt.xticks(ticks=[0, 1], labels=['Non-metastatic proteins', 'Metastatic proteins'], rotation=0, wrap=True)

plt.show()


### Normalized datasets of AAC, DPC and Mix

In [None]:
from sklearn.preprocessing import MinMaxScaler
import joblib

In [None]:
# Normalises the datasets between 0 and 1

def normalize(ds, ds_name, ds_path):
    """Min-max scale every column except the last one and store the outcome.

    Parameters
    ----------
    ds : pandas.DataFrame
        Raw dataset; it is not modified. The last column is left unscaled.
    ds_name : str
        Tag inserted into the output file names.
    ds_path : str
        Prefix of the output file names.

    Returns
    -------
    pandas.DataFrame
        Scaled copy of *ds*.

    Two files are written: the fitted scaler as
    ``ds_path + 'ds<ds_name>_norm_scalerMinMax.pkl'`` and the scaled dataset
    as ``ds_path + 'ds<ds_name>_norm.csv'``.

    NOTE(review): the scaler is fitted on every row of *ds*; if the result is
    cross-validated afterwards, the held-out folds have already contributed
    to the scaling - confirm this is acceptable.
    """
    feature_cols = ds.columns[:-1]  # all columns except the last one
    scaler = MinMaxScaler()

    ds_norm = ds.copy()
    ds_norm[feature_cols] = scaler.fit_transform(ds_norm[feature_cols])

    # keep the fitted scaler so that new data can be scaled the same way
    joblib.dump(scaler, ds_path + f'ds{ds_name}_norm_scalerMinMax.pkl')

    ds_norm.to_csv(ds_path + f'ds{ds_name}_norm.csv', index=False)
    return ds_norm

In [None]:
# Normalize the AAC, DPC and Mix datasets (in this order); every call also
# writes the scaler and the normalized CSV file into the dataset folder
ds_AAC_norm, ds_DPC_norm, ds_Mix_norm = (
    normalize(raw_dataset, tag, dsPath)
    for raw_dataset, tag in ((ds_AAC, 'AAC'), (ds_DPC, 'DPC'), (ds_Mix, 'Mix'))
)

In [None]:
# Checking the normalized datasets
#ds_AAC_norm
#ds_DPC_norm
#ds_Mix_norm

## Univariate feature selection

The normalized datasets for AAC, DPC and Mix will be used both to build ML models and for feature selection. At this point, we have 3 datasets (the normalized datasets with the full pool of features for AAC, DPC and Mix).

We will create 3 new datasets with the best features for AAC, DPC and Mix descriptors.

In [None]:
# Function to obtain input descriptors and output class as arrays, and the list with the names of the descriptors

def getDataFromDataFrame(df, OutVar='Class'):
    """Split *df* into feature values, output values and feature names.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains the column *OutVar*.
    OutVar : str
        Name of the output column.

    Returns
    -------
    tuple
        ``(Xdata, Ydata, names)``: the values of all columns except *OutVar*,
        the values of *OutVar*, and the list of the remaining column names.
        Progress and shape information is printed along the way.
    """
    print('\n-> Get X & Y data, Features list')
    print('Shape', df.shape)

    target = df[OutVar]
    predictors = df.drop(OutVar, axis=1)
    Xdata, Ydata = predictors.values, target.values

    print('Shape X data:', Xdata.shape)
    print('Shape Y data:', Ydata.shape)
    print('Done!')

    return (Xdata, Ydata, list(predictors.columns))

In [None]:
# Univariate feature selection
from sklearn.feature_selection import f_classif, SelectKBest

def FeatureSelection(df,label,nFeats=1):
    """Keep the *nFeats* best features of *df* according to SelectKBest(f_classif).

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset with a 'Class' column.
    label : str
        Tag inserted into the name of the saved file.
    nFeats : int
        Number of features to keep; 0 switches the selection off and *df* is
        returned untouched.

    Returns
    -------
    pandas.DataFrame
        The selected feature columns followed by 'Class'. The same table is
        saved as ``dsPath + 'ds<label>.normFS(<nFeats>).csv'``.

    NOTE(review): the selector is fitted on every row of *df* together with
    its class labels; if the result is cross-validated afterwards, the
    held-out folds have already influenced the selection - confirm this is
    acceptable.
    """
    if nFeats == 0:
        print("\n NO feature selection")
        return df

    # inputs, outputs and feature names (output variable = Class)
    Xdata, Ydata, Features = getDataFromDataFrame(df)

    print('\n-> Univariate Feature selection')
    print('Initial columns:', list(df.columns))
    selector = SelectKBest(f_classif, k=nFeats)
    selected_values = selector.fit_transform(Xdata, Ydata)

    # names of the columns that the selector kept
    SelFeatures = [Features[position] for position in selector.get_support(indices=True)]

    selected_df = pd.DataFrame(selected_values, columns=SelFeatures)
    selected_df['Class'] = Ydata  # class label as the last column
    print('Final columns:', list(selected_df.columns))

    selectFile = dsPath+'ds'+label+'.normFS('+str(nFeats)+').csv'
    print('* Save selected features dataset:', selectFile)
    selected_df.to_csv(selectFile, index=False)

    print('Done!')
    return selected_df

### 50% of descriptors

In [None]:
# AAC, normalized dataset: keep the 10 best features (10/20)
ds_AAC_normFS50 = FeatureSelection(ds_AAC_norm, label="AAC", nFeats=10)
ds_AAC_normFS50  # display the selected-feature dataset for AAC

In [None]:
# DPC, normalized dataset: keep the 200 best features (200/400)
ds_DPC_normFS50 = FeatureSelection(ds_DPC_norm, label="DPC", nFeats=200)
ds_DPC_normFS50  # display the selected-feature dataset for DPC

In [None]:
# Mix, normalized dataset: keep the 210 best features (210/420)
ds_Mix_normFS50 = FeatureSelection(ds_Mix_norm, label="Mix", nFeats=210)
ds_Mix_normFS50  # display the selected-feature dataset for Mix

### 25% of descriptors

In [None]:
# AAC, normalized dataset: keep the 5 best features (5/20)
ds_AAC_normFS25 = FeatureSelection(ds_AAC_norm, label="AAC", nFeats=5)
ds_AAC_normFS25  # display the selected-feature dataset for AAC

In [None]:
# DPC, normalized dataset: keep the 100 best features (100/400)
ds_DPC_normFS25 = FeatureSelection(ds_DPC_norm, label="DPC", nFeats=100)
ds_DPC_normFS25  # display the selected-feature dataset for DPC

In [None]:
# Mix, normalized dataset: keep the 105 best features (105/420)
ds_Mix_normFS25 = FeatureSelection(ds_Mix_norm, label="Mix", nFeats=105)
ds_Mix_normFS25  # display the selected-feature dataset for Mix

### ML classifiers for metastatic vs non-metastatic protein sequences

We have 9 datasets to use with different ML classifiers:
*   3 normalized datasets with all descriptors: ds_AAC_norm, ds_DPC_norm, ds_Mix_norm
*   3 datasets with 50% of the best features selected from the previous datasets: ds_AAC_normFS50 (only 10 features from 20), ds_DPC_normFS50 (only 200 features from 400), ds_Mix_normFS50 (only 210 features from 420).
*   3 datasets with 25% of the best features selected from the previous datasets: ds_AAC_normFS25 (only 5 features from 20), ds_DPC_normFS25 (only 100 features from 400), ds_Mix_normFS25 (only 105 features from 420).



### Functions for ML

In [None]:
# Passed as the number of folds to MLOuterCV and MLmodels in the cells below
nfold = 10 # number of fold-CV

In [None]:
# Create a function that will build ML models for one dataset

import numpy as np
import time

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, f1_score, cohen_kappa_score, recall_score, precision_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.svm import LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

def MLOuterCV(Xdata, Ydata, folds, seed=2024):
    """Cross-validate a fixed panel of classifiers and collect their per-fold scores.

    Parameters
    ----------
    Xdata : array-like
        Feature values, one row per sample.
    Ydata : array-like
        Class labels, one per sample.
    folds : int
        Number of splits of the stratified, shuffled k-fold cross-validation.
    seed : int
        Random state of the splitter and of every classifier that accepts one.

    Returns
    -------
    list of pandas.DataFrame
        ``[auroc, f1, kappa, recall, precision]``; each table has one column
        per classifier and one row per fold.

    One summary line per classifier (mean and standard deviation of every
    metric plus the elapsed minutes) is printed while running.
    """
    # needed only here, hence imported locally
    from sklearn.metrics import make_scorer
    from sklearn.model_selection import cross_validate

    # classifier labels used in the results
    names = ['NB', 'KNN', 'LDA', 'SVM', 'SVMrbf', 'LR', 'MLP', 'DT', 'RF', 'XGB']

    classifiers = [GaussianNB(),
                   KNeighborsClassifier(5),
                   LinearDiscriminantAnalysis(solver='svd'),  # no random_state
                   SVC(kernel="linear", random_state=seed, gamma='scale'),
                   SVC(kernel='rbf', random_state=seed, gamma='scale'),
                   LogisticRegression(solver='lbfgs', random_state=seed),
                   MLPClassifier(random_state=seed, max_iter=50000, shuffle=False),
                   DecisionTreeClassifier(random_state=seed),
                   RandomForestClassifier(n_jobs=-1, random_state=seed),
                   XGBClassifier(n_jobs=-1, seed=seed)
                   ]

    # metric -> scorer, in the order of the printed columns and of the
    # returned tables; the kappa scorer is built from cohen_kappa_score so
    # that the kappa columns really hold Cohen's kappa
    scoring = {
        'auroc': 'roc_auc',
        'f1': 'f1',
        'kappa': make_scorer(cohen_kappa_score),
        'recall': 'recall',
        'precision': 'precision',
    }

    # per metric: classifier label -> array with one score per fold
    fold_scores = {metric: {} for metric in scoring}

    print('* Building ' + str(folds) + '-fold CV for ' + str(len(names)) + ' classifiers:', str(names))
    total = time.time()

    # the same splits are used for every classifier
    outer_cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

    print('ML method, AUROC Mean, AUROC SD, F1 Mean, F1 SD, Kappa Mean, Kappa SD, Recall Mean, Recall SD, Precision Mean, Precision SD, Time (min)')

    for name, clf in zip(names, classifiers):
        start = time.time()

        # a single cross-validation run evaluates all metrics on the same
        # fitted models instead of refitting the classifier once per metric
        cv_out = cross_validate(clf, Xdata, Ydata, cv=outer_cv, scoring=scoring, n_jobs=-1)

        summary = []
        for metric in scoring:
            scores = cv_out['test_' + metric]
            fold_scores[metric][name] = scores
            summary.extend((scores.mean(), scores.std()))

        results_string = ('%s, %0.3f, %0.4f, %0.3f, %0.4f, %0.3f, %0.4f, %0.3f, %0.4f, %0.3f, %0.4f, %0.1f' %
                          (name, *summary, (time.time() - start) / 60))
        print(results_string)

    print('Total time:', (time.time() - total) / 60, ' mins')
    return [pd.DataFrame(fold_scores[metric], columns=names) for metric in scoring]


In [None]:
# Function to build ML models, write the results plot box plots for a dataframe

def MLmodels(df, df_fold, nfold, label="X", label_y="Y"):
    """Save the per-fold scores of one metric and draw their box plot.

    Parameters
    ----------
    df : any
        Accepted but not used.
    df_fold : pandas.DataFrame
        Scores with one column per classifier. It is modified in place: the
        columns 'Dataset' (= label) and 'folds' (= nfold) are appended.
    nfold : int
        Number of cross-validation folds; stored in 'folds' and shown on the plot.
    label : str
        Dataset tag used in the file names and in the x-axis text.
    label_y : str
        Metric tag used in the file names and in the y-axis text.

    Returns
    -------
    None
        The scores are written to ``resPath + 'ML_<label>_<label_y>.csv'``
        and the box plot to the file with the same stem and a '.png' suffix.
    """
    # tag the scores in place
    df_fold['Dataset'] = label
    df_fold['folds'] = nfold

    # the summary is a separate copy of the tagged scores
    df_results = pd.concat([df_fold])

    file_stem = resPath + 'ML_' + label + '_' + label_y
    summaryFile = file_stem + '.csv'  # ML metrics results
    boxplotFile = file_stem + '.png'  # box plot of the metrics

    print('\n==>> Saving summary', summaryFile)
    df_results.to_csv(summaryFile, index=False)

    # NOTE(review): the grouped box plot below appears to draw on a figure of
    # its own, which would leave the figure created here empty - confirm
    plt.figure()
    plt.clf()
    print('==> Fold =', nfold)
    scores_for_fold = df_results.loc[df_results['folds'] == nfold].drop(columns=['folds'])
    scores_for_fold.groupby('Dataset').boxplot(return_type='axes')
    plt.title("")
    plt.xlabel("Machine Learning methods for " + label)
    plt.ylabel(label_y + "(" + str(nfold) + "-fold CV)")
    plt.ylim(0, 1.0)
    plt.savefig(boxplotFile)
    plt.show()

    return

### ML models and box plots

In [None]:
# AAC, normalized dataset with all descriptors (no feature selection)

Xdata, Ydata, Features = getDataFromDataFrame(ds_AAC_norm)
df_res_auroc, df_res_f1, df_res_kappa_AAC, df_res_recall, df_res_precision = MLOuterCV(Xdata, Ydata, nfold)

# One summary file and one box plot per metric
for metric_scores, metric_label in (
    (df_res_auroc, "AUROC"),
    (df_res_f1, "F1_Score"),
    (df_res_kappa_AAC, "Kappa_Score"),
    (df_res_recall, "Recall"),
    (df_res_precision, "Precision"),
):
    MLmodels(ds_AAC_norm, metric_scores, nfold, label="AAC_norm", label_y=metric_label)


In [None]:
# DPC, normalized dataset with all descriptors (no feature selection)

Xdata, Ydata, Features = getDataFromDataFrame(ds_DPC_norm)
df_res_auroc, df_res_f1, df_res_kappa_DPC, df_res_recall, df_res_precision = MLOuterCV(Xdata, Ydata, nfold)

# One summary file and one box plot per metric
for metric_scores, metric_label in (
    (df_res_auroc, "AUROC"),
    (df_res_f1, "F1_Score"),
    (df_res_kappa_DPC, "Kappa_Score"),
    (df_res_recall, "Recall"),
    (df_res_precision, "Precision"),
):
    MLmodels(ds_DPC_norm, metric_scores, nfold, label="DPC_norm", label_y=metric_label)

In [None]:
# Mix, normalized dataset with all descriptors (no feature selection)

Xdata, Ydata, Features = getDataFromDataFrame(ds_Mix_norm)
df_res_auroc, df_res_f1, df_res_kappa_Mix, df_res_recall, df_res_precision = MLOuterCV(Xdata, Ydata, nfold)

# One summary file and one box plot per metric
for metric_scores, metric_label in (
    (df_res_auroc, "AUROC"),
    (df_res_f1, "F1_Score"),
    (df_res_kappa_Mix, "Kappa_Score"),
    (df_res_recall, "Recall"),
    (df_res_precision, "Precision"),
):
    MLmodels(ds_Mix_norm, metric_scores, nfold, label="Mix_norm", label_y=metric_label)

### 50% of descriptors

In [None]:
# AAC, normalized dataset with 50% of the features

Xdata, Ydata, Features = getDataFromDataFrame(ds_AAC_normFS50)
df_res_auroc, df_res_f1, df_res_kappa_AAC_FS50, df_res_recall, df_res_precision = MLOuterCV(Xdata, Ydata, nfold)

# One summary file and one box plot per metric
for metric_scores, metric_label in (
    (df_res_auroc, "AUROC"),
    (df_res_f1, "F1_Score"),
    (df_res_kappa_AAC_FS50, "Kappa_Score"),
    (df_res_recall, "Recall"),
    (df_res_precision, "Precision"),
):
    MLmodels(ds_AAC_normFS50, metric_scores, nfold, label="AAC_normFS50", label_y=metric_label)

In [None]:
# DPC, normalized dataset with 50% of the features

Xdata, Ydata, Features = getDataFromDataFrame(ds_DPC_normFS50)
df_res_auroc, df_res_f1, df_res_kappa_DPC_FS50, df_res_recall, df_res_precision = MLOuterCV(Xdata, Ydata, nfold)

# One summary file and one box plot per metric
for metric_scores, metric_label in (
    (df_res_auroc, "AUROC"),
    (df_res_f1, "F1_Score"),
    (df_res_kappa_DPC_FS50, "Kappa_Score"),
    (df_res_recall, "Recall"),
    (df_res_precision, "Precision"),
):
    MLmodels(ds_DPC_normFS50, metric_scores, nfold, label="DPC_normFS50", label_y=metric_label)

In [None]:
# Mix, normalized dataset with 50% of the features

Xdata, Ydata, Features = getDataFromDataFrame(ds_Mix_normFS50)
df_res_auroc, df_res_f1, df_res_kappa_Mix_FS50, df_res_recall, df_res_precision = MLOuterCV(Xdata, Ydata, nfold)

# One summary file and one box plot per metric
for metric_scores, metric_label in (
    (df_res_auroc, "AUROC"),
    (df_res_f1, "F1_Score"),
    (df_res_kappa_Mix_FS50, "Kappa_Score"),
    (df_res_recall, "Recall"),
    (df_res_precision, "Precision"),
):
    MLmodels(ds_Mix_normFS50, metric_scores, nfold, label="Mix_normFS50", label_y=metric_label)

### 25% of descriptors

In [None]:
# AAC, normalized dataset with 25% of the features

Xdata, Ydata, Features = getDataFromDataFrame(ds_AAC_normFS25)
df_res_auroc, df_res_f1, df_res_kappa_AAC_FS25, df_res_recall, df_res_precision = MLOuterCV(Xdata, Ydata, nfold)

# One summary file and one box plot per metric
for metric_scores, metric_label in (
    (df_res_auroc, "AUROC"),
    (df_res_f1, "F1_Score"),
    (df_res_kappa_AAC_FS25, "Kappa_Score"),
    (df_res_recall, "Recall"),
    (df_res_precision, "Precision"),
):
    MLmodels(ds_AAC_normFS25, metric_scores, nfold, label="AAC_normFS25", label_y=metric_label)

In [None]:
# DPC, normalized dataset with 25% of the features

Xdata, Ydata, Features = getDataFromDataFrame(ds_DPC_normFS25)
df_res_auroc, df_res_f1, df_res_kappa_DPC_FS25, df_res_recall, df_res_precision = MLOuterCV(Xdata, Ydata, nfold)

# One summary file and one box plot per metric
for metric_scores, metric_label in (
    (df_res_auroc, "AUROC"),
    (df_res_f1, "F1_Score"),
    (df_res_kappa_DPC_FS25, "Kappa_Score"),
    (df_res_recall, "Recall"),
    (df_res_precision, "Precision"),
):
    MLmodels(ds_DPC_normFS25, metric_scores, nfold, label="DPC_normFS25", label_y=metric_label)

In [None]:
# Mix, normalized dataset with 25% of the features

Xdata, Ydata, Features = getDataFromDataFrame(ds_Mix_normFS25)
df_res_auroc, df_res_f1, df_res_kappa_Mix_FS25, df_res_recall, df_res_precision = MLOuterCV(Xdata, Ydata, nfold)

# One summary file and one box plot per metric
for metric_scores, metric_label in (
    (df_res_auroc, "AUROC"),
    (df_res_f1, "F1_Score"),
    (df_res_kappa_Mix_FS25, "Kappa_Score"),
    (df_res_recall, "Recall"),
    (df_res_precision, "Precision"),
):
    MLmodels(ds_Mix_normFS25, metric_scores, nfold, label="Mix_normFS25", label_y=metric_label)

### Statistical Analysis

### For each dataset:

#### Normality: Shapiro-Wilk and QQ Plot:

#### Shapiro-Wilk test

In [None]:
# Kappa score tables of the nine datasets, keyed by the name of the variable
# that holds them

dicc = dict(
    df_res_kappa_AAC=df_res_kappa_AAC,
    df_res_kappa_DPC=df_res_kappa_DPC,
    df_res_kappa_Mix=df_res_kappa_Mix,
    df_res_kappa_AAC_FS50=df_res_kappa_AAC_FS50,
    df_res_kappa_DPC_FS50=df_res_kappa_DPC_FS50,
    df_res_kappa_Mix_FS50=df_res_kappa_Mix_FS50,
    df_res_kappa_AAC_FS25=df_res_kappa_AAC_FS25,
    df_res_kappa_DPC_FS25=df_res_kappa_DPC_FS25,
    df_res_kappa_Mix_FS25=df_res_kappa_Mix_FS25,
)

# Receives the same tables without their last two columns
datasets = {}

def remove(df):
    """Return *df* without its last two columns (selected by position)."""
    all_but_last_two = slice(None, -2)
    return df.iloc[:, all_but_last_two]

# Store every table of dicc, minus its last two columns, under the same key
datasets.update((table_name, remove(table)) for table_name, table in dicc.items())

In [None]:
from scipy.stats import shapiro

def shapiro_test(df_res_kappa, dataset_name):
  ''' Run the Shapiro-Wilk test on the given scores and print the verdict at the 0.05 level.

      H0: The null hypothesis states that the data come from a population with a normal distribution.
      H1: The alternative hypothesis suggests that the data do not come from a population with a normal distribution.

      df_res_kappa is handed to scipy.stats.shapiro as it is; dataset_name is
      only used in the printed text (shown in bold). Nothing is returned.'''

  _, p_value = shapiro(df_res_kappa)

  # H0 is kept unless the p-value lies below 0.05
  keeps_h0 = not (p_value < 0.05)
  if keeps_h0:
    print("Shapiro-Wilk test does not reject the null hypothesis.")
    print(f"The dataset \033[1m{dataset_name}\033[0m FOLLOWS a normal distribution.")
  else:
    print("Shapiro-Wilk test rejects the null hypothesis.")
    print(f"The dataset \033[1m{dataset_name}\033[0m DOES NOT FOLLOWS a normal distribution.")

  print()

In [None]:
# Shapiro-Wilk test for every table stored in datasets

for table_name, table in datasets.items():
    shapiro_test(table, table_name)

#### QQ Plot

In [None]:
import statsmodels.api as sm

def assess_normality(df, dataset_name):
  """Show a QQ plot of the residuals of the scores in *df*.

  All values of *df* are flattened into one vector, a straight line is fitted
  to them (ordinary least squares, position in the vector as regressor) and
  the residuals of that fit are drawn in a QQ plot. *dataset_name* appears in
  the plot title. Nothing is returned.
  """
  # all values of the table as one 1-D array
  y = df.values.flatten()

  # 0, 1, 2, ... as regressor; add_constant supplies the intercept column
  x = range(len(y))
  residuals = sm.OLS(y, sm.add_constant(x)).fit().resid

  # QQ plot of the residuals
  sm.qqplot(residuals, line='s')
  plt.title(f'Normality Assessment of Residuals for {dataset_name}', fontsize=18)
  plt.xlabel('Theoretical Quantiles', fontsize=15)
  plt.ylabel('Sample Residual Quantiles', fontsize=15)
  plt.grid(True, which='both', linestyle='--', linewidth=0.5)
  plt.show()

In [None]:
# QQ plot for every table stored in datasets
for table_name, table in datasets.items():
    assess_normality(table, table_name)

#### ANOVA
For those datasets following a normal distribution

In [None]:
from scipy.stats import f_oneway

def anova(df_res_kappa, dataset_name):

  '''One-way ANOVA with one group per column of df_res_kappa; prints the F
     statistic, the p-value and the verdict at the 0.05 level.

     H0: The means of the classifiers are equal.
     H1: The means of the classifiers are not equal.

     dataset_name is only used in the printed text. Nothing is returned.'''

  # one group of scores per column
  groups = [df_res_kappa[classifier] for classifier in df_res_kappa.columns]
  f_statistic, p_value = f_oneway(*groups)

  print("Statistical F:", f_statistic)
  print("P value:", p_value)

  # H0 is kept unless the p-value lies below 0.05
  if not (p_value < 0.05):
    print("Does not reject the null hypothesis.")
    print(f"For the dataset \033[1m{dataset_name}\033[0m THERE ARE NO statistically significant differences between the classifiers's means.")
  else:
    print("Rejects the null hypothesis.")
    print(f"For the dataset \033[1m{dataset_name}\033[0m THERE ARE statistically significant differences between the classifiers's means.")

  print()

In [None]:
# Hand-picked subset of the kappa tables that goes through ANOVA
# NOTE(review): presumably the tables the Shapiro-Wilk cell reported as
# normal - re-check this list whenever the models are re-run
norm_dicc = dict(
    df_res_kappa_AAC=df_res_kappa_AAC,
    df_res_kappa_DPC=df_res_kappa_DPC,
    df_res_kappa_AAC_FS50=df_res_kappa_AAC_FS50,
    df_res_kappa_DPC_FS50=df_res_kappa_DPC_FS50,
    df_res_kappa_Mix_FS50=df_res_kappa_Mix_FS50,
    df_res_kappa_AAC_FS25=df_res_kappa_AAC_FS25,
)

# Same tables without their last two columns
norm_datasets = {table_name: remove(table) for table_name, table in norm_dicc.items()}

In [None]:
# ANOVA for every table stored in norm_datasets
for table_name, table in norm_datasets.items():
    anova(table, table_name)

#### Kruskal-Wallis
For those datasets that do not follow a normal distribution

In [None]:
# Hand-picked subset of the kappa tables that goes through Kruskal-Wallis
# NOTE(review): presumably the tables the Shapiro-Wilk cell reported as not
# normal - re-check this list whenever the models are re-run
not_norm_dicc = dict(
    df_res_kappa_Mix=df_res_kappa_Mix,
    df_res_kappa_DPC_FS25=df_res_kappa_DPC_FS25,
    df_res_kappa_Mix_FS25=df_res_kappa_Mix_FS25,
)

# Same tables without their last two columns
not_norm_datasets = {table_name: remove(table) for table_name, table in not_norm_dicc.items()}

In [None]:
from scipy.stats import kruskal

def kruskal_w(df_res_kappa, dataset_name, alpha=0.05):
    """Run a Kruskal-Wallis H-test across the columns of a results frame and print the verdict.

    H0: The performance distributions of the classifiers are equal.
    H1: At least one classifier's performance distribution is different from the others.

    Args:
        df_res_kappa: DataFrame whose columns are each passed to
            ``scipy.stats.kruskal`` as one group.
        dataset_name: Label printed (in bold) in the conclusion line.
        alpha: Significance level; H0 is rejected when ``p_value < alpha``.
            Defaults to 0.05, the threshold that used to be hard-coded.

    Returns:
        None. The statistic, the p-value and the conclusion are printed.
    """
    # One sample per column, in column order
    samples = [df_res_kappa[column] for column in df_res_kappa.columns]
    h_statistic, p_value = kruskal(*samples)

    # Results. The value is the Kruskal-Wallis H statistic, so it is labelled
    # "H" here; it used to be printed under the ANOVA label "F".
    print("Statistical H:", h_statistic)
    print("P value:", p_value)

    if p_value < alpha:
        print("Rejects the null hypothesis.")
        print("For the dataset "f"\033[1m{dataset_name}\033[0m THERE IS at least one classifier whose performance is statistically different from the rest.")

    else:
        # Also reached when p_value is NaN, since NaN < alpha is False
        print("Does not reject the null hypothesis.")
        print("For the dataset "f"\033[1m{dataset_name}\033[0m THERE ARE NO statistically significant differences in the performance of the classifiers.")

    print()

In [None]:
# One Kruskal-Wallis report per entry of not_norm_datasets
for name, frame in not_norm_datasets.items():
    kruskal_w(frame, name)

#### Levene's test
Variance of the classifiers

In [None]:
from scipy.stats import levene

def levene_test(dataset, dataset_name, alpha=0.05):
    """Run Levene's test across the columns of a results frame and print the verdict.

    H0: The variances of the classifiers are homogeneous.
    H1: At least one of the variances of the classifiers differs from the others.

    Args:
        dataset: DataFrame whose columns are each passed to
            ``scipy.stats.levene`` as one group.
        dataset_name: Label printed (in bold) in the conclusion line.
        alpha: Significance level; H0 is rejected when ``p_value < alpha``.
            Defaults to 0.05, the threshold that used to be hard-coded.

    Returns:
        None. The statistic, the p-value and the conclusion are printed.
    """
    # One sample per column, in column order
    samples = [dataset[column] for column in dataset.columns]
    statistic, p_value = levene(*samples)

    # Results
    print("Levene's Test Statistic:", statistic)
    print("p-value:", p_value)

    if p_value < alpha:
        print("Reject the null hypothesis of equal variances.")
        print("For the dataset "f"\033[1m{dataset_name}\033[0m THERE IS at least one classifier whose variance is statistically different from the others.")

    else:
        # Message corrected from "Does no reject null hypothesis ..."
        print("Does not reject the null hypothesis of equal variances.")
        print("For the dataset "f"\033[1m{dataset_name}\033[0m THERE ARE NO statistically significant differences between the variance of the classifiers.")

    print()


In [None]:
# One Levene report per entry of `datasets`
for name, frame in datasets.items():
    levene_test(frame, name)

#### Tukey's Test
Assumes that the data are approximately normally distributed and that the variances of the groups are homogeneous.

Determine whether there are significant differences between the means of classifier performance.

In [None]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

def tukey_test(dataset, dataset_name):
    """Run Tukey's HSD over the columns of `dataset`, print the table and plot it.

    Args:
        dataset: DataFrame; each column is treated as one group.
        dataset_name: Label used in the printed header and in the plot title.

    Returns:
        None. The result table is printed and a figure is shown.
    """
    column_names = dataset.columns.tolist()
    n_rows = len(dataset)

    # Stack the columns end to end into a single 1-D array of observations
    stacked_values = np.concatenate([dataset[name].values for name in column_names])

    # Matching group labels: each column name repeated once per row, in the
    # same column-by-column order as the stacked values
    group_labels = []
    for name in column_names:
        group_labels.extend([name] * n_rows)

    hsd = pairwise_tukeyhsd(stacked_values, group_labels)

    # Text report
    print("\033[1mResults for the dataset "f"{dataset_name}\033[0m")
    print()
    print(hsd)
    print()

    # Figure drawn by the result object, then decorated and shown
    hsd.plot_simultaneous()
    plt.title(f"Tukey Test for {dataset_name}. Comparisons Among Classifiers")
    plt.xlabel("Metric Value")
    plt.ylabel("Classifiers")
    plt.show()
    print()

In [None]:
# One Tukey report (table + figure) per entry of `datasets`
for name, frame in datasets.items():
    tukey_test(frame, name)

### Statistical Analysis

### For all datasets:

### NB classifier
For the comparison of all datasets, the NB classifier of each dataset is selected as there are no significant differences between classifiers.

In [None]:
def NB_columns(dataframes, dataframe_names):
    """Collect the 'NB' column of every frame into one side-by-side DataFrame.

    Args:
        dataframes: Iterable of DataFrames, each containing an 'NB' column.
        dataframe_names: Names paired positionally with `dataframes`; each
            output column is called ``<name>_NB``.

    Returns:
        A DataFrame with one column per input frame. Values are taken through
        ``.values``, so the original indexes are discarded and rows are
        aligned by position.
    """
    renamed_columns = [
        # Wrap the raw values (default column label 0), then give them the final name
        pd.DataFrame(frame['NB'].values).rename(columns={0: frame_name + '_NB'})
        for frame, frame_name in zip(dataframes, dataframe_names)
    ]

    return pd.concat(renamed_columns, axis=1)

In [None]:
# Pair every kappa result frame with its name once, then derive the two
# parallel lists that NB_columns expects (dict order = insertion order).
named_kappa_frames = {
    'df_res_kappa_AAC': df_res_kappa_AAC,
    'df_res_kappa_DPC': df_res_kappa_DPC,
    'df_res_kappa_Mix': df_res_kappa_Mix,
    'df_res_kappa_AAC_FS50': df_res_kappa_AAC_FS50,
    'df_res_kappa_DPC_FS50': df_res_kappa_DPC_FS50,
    'df_res_kappa_Mix_FS50': df_res_kappa_Mix_FS50,
    'df_res_kappa_AAC_FS25': df_res_kappa_AAC_FS25,
    'df_res_kappa_DPC_FS25': df_res_kappa_DPC_FS25,
    'df_res_kappa_Mix_FS25': df_res_kappa_Mix_FS25,
}
kappa_all_datasets = list(named_kappa_frames.values())
dataframe_names = list(named_kappa_frames)

final_df = NB_columns(kappa_all_datasets, dataframe_names)
print(final_df)

#### Normality: Shapiro-Wilk and QQ Plot:

#### Shapiro-Wilk

In [None]:
# Pool every value of final_df into a single one-dimensional array
kappa_data = final_df.to_numpy().flatten()

# Shapiro-Wilk on the pooled values
statistic, p_value = shapiro(kappa_data)

# Results
print("Statistical:", statistic)
print("P value:", p_value)
print()

# Pick both report lines at once, then emit them one per line
if p_value < 0.05:
    report = (
        "Shapiro-Wilk test rejects the null hypothesis.",
        "The NB Dataset DOES NOT FOLLOWS a normal distribution.",
    )
else:
    report = (
        "Shapiro-Wilk test does not reject the null hypothesis.",
        "The NB Dataset FOLLOWS a normal distribution.",
    )
print(*report, sep="\n")

#### QQ-plot

In [None]:
# Call assess_normality on the combined NB frame (final_df) built above

assess_normality(final_df, 'NB Dataset')

#### ANOVA

In [None]:
# ANOVA across the NB columns gathered in final_df

anova(df_res_kappa=final_df, dataset_name='NB Dataset')

#### Levene's Test

In [None]:
# Levene's test across the NB columns gathered in final_df

levene_test(dataset=final_df, dataset_name='NB Dataset')

#### Tukey's Test

In [None]:
# Call tukey_test function; the label is 'NB Dataset' (it used to read
# 'DB Dataset'), matching the other tests run on final_df so the printed
# header and the plot title name the right data.
tukey_test(final_df, 'NB Dataset')

## Multivariate feature selection

In [None]:
# Multivariate feature selection

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

# Feature selection function using RFE with Random Forest
def FeatureSelectionWithRFE(df, label, nFeats=1):
    """Select `nFeats` features with RFE driven by a Random Forest and save the result.

    Args:
        df: Input DataFrame; it is split by getDataFromDataFrame
            (NOTE(review): presumably into feature matrix, class labels and
            feature names - confirm against that helper).
        label: Text inserted into the output file name.
        nFeats: Number of features RFE should keep. 0 disables selection
            and returns `df` unchanged.

    Returns:
        `df` itself when nFeats == 0; otherwise a new DataFrame holding the
        selected feature columns plus a 'Class' column. That frame is also
        written as CSV under `dsPath`.
    """
    # Guard clause: nothing to select
    if nFeats == 0:
        print("\n NO feature selection")
        return df

    Xdata, Ydata, Features = getDataFromDataFrame(df)

    print('\n-> Multivariate Feature selection with RFE and Random Forest')
    print('Initial columns:', list(df.columns))

    # Fixed seed for the forest; RFE drops one feature per round (step=1).
    # fit() hands back the fitted selector, so build and fit in one expression.
    rfe = RFE(
        estimator=RandomForestClassifier(n_estimators=100, random_state=42),
        n_features_to_select=nFeats,
        step=1,
    ).fit(Xdata, Ydata)

    # Translate the kept column positions into feature names
    kept_names = [Features[pos] for pos in rfe.get_support(indices=True)]

    # Reduced matrix -> DataFrame, with the class column appended last
    reduced = pd.DataFrame(rfe.transform(Xdata), columns=kept_names)
    reduced['Class'] = Ydata

    print('Final columns:', list(reduced.columns))

    # Persist the reduced dataset next to the other dataset files
    out_path = "".join([dsPath, 'ds', label, '.normFS_RFE(', str(nFeats), ').csv'])
    print('* Save selected features dataset:', out_path)
    reduced.to_csv(out_path, index=False)

    print('Done!')
    return reduced

In [None]:
# RFE runs asked to keep every descriptor (the "ALL" setting)

ds_AAC_norm_RFE = FeatureSelectionWithRFE(df=ds_AAC_norm, label="AAC", nFeats=20)   # ALL
ds_DPC_norm_RFE = FeatureSelectionWithRFE(df=ds_DPC_norm, label="DPC", nFeats=400)  # ALL
ds_Mix_norm_RFE = FeatureSelectionWithRFE(df=ds_Mix_norm, label="Mix", nFeats=420)  # ALL

# Trailing expression: the notebook displays the AAC result
ds_AAC_norm_RFE

In [None]:
# RFE runs keeping half of the descriptors
ds_AAC_normFS50_RFE = FeatureSelectionWithRFE(df=ds_AAC_norm, label="AAC", nFeats=10)   # 50%
ds_DPC_normFS50_RFE = FeatureSelectionWithRFE(df=ds_DPC_norm, label="DPC", nFeats=200)  # 50%
ds_Mix_normFS50_RFE = FeatureSelectionWithRFE(df=ds_Mix_norm, label="Mix", nFeats=210)  # 50%

In [None]:
# RFE runs keeping a quarter of the descriptors
ds_AAC_normFS25_RFE = FeatureSelectionWithRFE(df=ds_AAC_norm, label="AAC", nFeats=5)    # 25%
ds_DPC_normFS25_RFE = FeatureSelectionWithRFE(df=ds_DPC_norm, label="DPC", nFeats=100)  # 25%
ds_Mix_normFS25_RFE = FeatureSelectionWithRFE(df=ds_Mix_norm, label="Mix", nFeats=105)  # 25%

In [None]:
# ML for the AAC normalized dataset returned by FeatureSelectionWithRFE with
# nFeats=20 (marked "# ALL" where it is created, i.e. no descriptor is dropped).

Xdata, Ydata, Features = getDataFromDataFrame(ds_AAC_norm_RFE)
# NOTE(review): the five results are assumed to arrive in the order AUROC, F1,
# kappa, recall, precision, as the target names suggest - confirm against MLOuterCV.
# Only the kappa result gets a dataset-specific name, because the statistical
# analysis cells further down read it; the other four names are reused by every ML cell.
df_res_auroc, df_res_f1, df_res_kappa_AAC_RFE, df_res_recall, df_res_precision = MLOuterCV(Xdata, Ydata, nfold)

# AUROC
MLmodels(ds_AAC_norm_RFE, df_res_auroc, nfold, label="AAC_norm_RFE", label_y="AUROC")

# F1_Score
MLmodels(ds_AAC_norm_RFE, df_res_f1, nfold, label="AAC_norm_RFE", label_y="F1_Score")

# Kappa_Score
MLmodels(ds_AAC_norm_RFE, df_res_kappa_AAC_RFE, nfold, label="AAC_norm_RFE", label_y="Kappa_Score")

# Recall
MLmodels(ds_AAC_norm_RFE, df_res_recall, nfold, label="AAC_norm_RFE", label_y="Recall")

# Precision
MLmodels(ds_AAC_norm_RFE, df_res_precision, nfold, label="AAC_norm_RFE", label_y="Precision")


In [None]:
# ML for the DPC normalized dataset returned by FeatureSelectionWithRFE with
# nFeats=400 (marked "# ALL" where it is created, i.e. no descriptor is dropped).

Xdata, Ydata, Features = getDataFromDataFrame(ds_DPC_norm_RFE)
# NOTE(review): the five results are assumed to arrive in the order AUROC, F1,
# kappa, recall, precision, as the target names suggest - confirm against MLOuterCV.
# Only the kappa result gets a dataset-specific name, because the statistical
# analysis cells further down read it; the other four names are reused by every ML cell.
df_res_auroc, df_res_f1, df_res_kappa_DPC_RFE, df_res_recall, df_res_precision = MLOuterCV(Xdata, Ydata, nfold)

# AUROC
MLmodels(ds_DPC_norm_RFE, df_res_auroc, nfold, label="DPC_norm_RFE", label_y="AUROC")

# F1_Score
MLmodels(ds_DPC_norm_RFE, df_res_f1, nfold, label="DPC_norm_RFE", label_y="F1_Score")

# Kappa_Score
MLmodels(ds_DPC_norm_RFE, df_res_kappa_DPC_RFE, nfold, label="DPC_norm_RFE", label_y="Kappa_Score")

# Recall
MLmodels(ds_DPC_norm_RFE, df_res_recall, nfold, label="DPC_norm_RFE", label_y="Recall")

# Precision
MLmodels(ds_DPC_norm_RFE, df_res_precision, nfold, label="DPC_norm_RFE", label_y="Precision")

In [None]:
# ML for the Mix normalized dataset returned by FeatureSelectionWithRFE with
# nFeats=420 (marked "# ALL" where it is created, i.e. no descriptor is dropped).

Xdata, Ydata, Features = getDataFromDataFrame(ds_Mix_norm_RFE)
# NOTE(review): the five results are assumed to arrive in the order AUROC, F1,
# kappa, recall, precision, as the target names suggest - confirm against MLOuterCV.
# Only the kappa result gets a dataset-specific name, because the statistical
# analysis cells further down read it; the other four names are reused by every ML cell.
df_res_auroc, df_res_f1, df_res_kappa_Mix_RFE, df_res_recall, df_res_precision = MLOuterCV(Xdata, Ydata, nfold)

# AUROC
MLmodels(ds_Mix_norm_RFE, df_res_auroc, nfold, label="Mix_norm_RFE", label_y="AUROC")

# F1_Score
MLmodels(ds_Mix_norm_RFE, df_res_f1, nfold, label="Mix_norm_RFE", label_y="F1_Score")

# Kappa_Score
MLmodels(ds_Mix_norm_RFE, df_res_kappa_Mix_RFE, nfold, label="Mix_norm_RFE", label_y="Kappa_Score")

# Recall
MLmodels(ds_Mix_norm_RFE, df_res_recall, nfold, label="Mix_norm_RFE", label_y="Recall")

# Precision
MLmodels(ds_Mix_norm_RFE, df_res_precision, nfold, label="Mix_norm_RFE", label_y="Precision")

In [None]:
# ML for the AAC normalized dataset reduced by FeatureSelectionWithRFE to
# nFeats=10 features (the "50%" setting).

Xdata, Ydata, Features = getDataFromDataFrame(ds_AAC_normFS50_RFE)
# NOTE(review): the five results are assumed to arrive in the order AUROC, F1,
# kappa, recall, precision, as the target names suggest - confirm against MLOuterCV.
# Only the kappa result gets a dataset-specific name, because the statistical
# analysis cells further down read it; the other four names are reused by every ML cell.
df_res_auroc, df_res_f1, df_res_kappa_AAC_FS50_RFE, df_res_recall, df_res_precision = MLOuterCV(Xdata, Ydata, nfold)

# AUROC
MLmodels(ds_AAC_normFS50_RFE, df_res_auroc, nfold, label="AAC_norm_FS50_RFE", label_y="AUROC")

# F1_Score
MLmodels(ds_AAC_normFS50_RFE, df_res_f1, nfold, label="AAC_norm_FS50_RFE", label_y="F1_Score")

# Kappa_Score
MLmodels(ds_AAC_normFS50_RFE, df_res_kappa_AAC_FS50_RFE, nfold, label="AAC_norm_FS50_RFE", label_y="Kappa_Score")

# Recall
MLmodels(ds_AAC_normFS50_RFE, df_res_recall, nfold, label="AAC_norm_FS50_RFE", label_y="Recall")

# Precision
MLmodels(ds_AAC_normFS50_RFE, df_res_precision, nfold, label="AAC_norm_FS50_RFE", label_y="Precision")

In [None]:
# ML for the DPC normalized dataset reduced by FeatureSelectionWithRFE to
# nFeats=200 features (the "50%" setting).

Xdata, Ydata, Features = getDataFromDataFrame(ds_DPC_normFS50_RFE)
# NOTE(review): the five results are assumed to arrive in the order AUROC, F1,
# kappa, recall, precision, as the target names suggest - confirm against MLOuterCV.
# Only the kappa result gets a dataset-specific name, because the statistical
# analysis cells further down read it; the other four names are reused by every ML cell.
df_res_auroc, df_res_f1, df_res_kappa_DPC_FS50_RFE, df_res_recall, df_res_precision = MLOuterCV(Xdata, Ydata, nfold)

# AUROC
MLmodels(ds_DPC_normFS50_RFE, df_res_auroc, nfold, label="DPC_norm_FS50_RFE", label_y="AUROC")

# F1_Score
MLmodels(ds_DPC_normFS50_RFE, df_res_f1, nfold, label="DPC_norm_FS50_RFE", label_y="F1_Score")

# Kappa_Score
MLmodels(ds_DPC_normFS50_RFE, df_res_kappa_DPC_FS50_RFE, nfold, label="DPC_norm_FS50_RFE", label_y="Kappa_Score")

# Recall
MLmodels(ds_DPC_normFS50_RFE, df_res_recall, nfold, label="DPC_norm_FS50_RFE", label_y="Recall")

# Precision
MLmodels(ds_DPC_normFS50_RFE, df_res_precision, nfold, label="DPC_norm_FS50_RFE", label_y="Precision")

In [None]:
# ML for the Mix normalized dataset reduced by FeatureSelectionWithRFE to
# nFeats=210 features (the "50%" setting).

Xdata, Ydata, Features = getDataFromDataFrame(ds_Mix_normFS50_RFE)
# NOTE(review): the five results are assumed to arrive in the order AUROC, F1,
# kappa, recall, precision, as the target names suggest - confirm against MLOuterCV.
# Only the kappa result gets a dataset-specific name, because the statistical
# analysis cells further down read it; the other four names are reused by every ML cell.
df_res_auroc, df_res_f1, df_res_kappa_Mix_FS50_RFE, df_res_recall, df_res_precision = MLOuterCV(Xdata, Ydata, nfold)

# AUROC
MLmodels(ds_Mix_normFS50_RFE, df_res_auroc, nfold, label="Mix_norm_FS50_RFE", label_y="AUROC")

# F1_Score
MLmodels(ds_Mix_normFS50_RFE, df_res_f1, nfold, label="Mix_norm_FS50_RFE", label_y="F1_Score")

# Kappa_Score
MLmodels(ds_Mix_normFS50_RFE, df_res_kappa_Mix_FS50_RFE, nfold, label="Mix_norm_FS50_RFE", label_y="Kappa_Score")

# Recall
MLmodels(ds_Mix_normFS50_RFE, df_res_recall, nfold, label="Mix_norm_FS50_RFE", label_y="Recall")

# Precision
MLmodels(ds_Mix_normFS50_RFE, df_res_precision, nfold, label="Mix_norm_FS50_RFE", label_y="Precision")

In [None]:
# ML for the AAC normalized dataset reduced by FeatureSelectionWithRFE to
# nFeats=5 features (the "25%" setting).

Xdata, Ydata, Features = getDataFromDataFrame(ds_AAC_normFS25_RFE)
# NOTE(review): the five results are assumed to arrive in the order AUROC, F1,
# kappa, recall, precision, as the target names suggest - confirm against MLOuterCV.
# Only the kappa result gets a dataset-specific name, because the statistical
# analysis cells further down read it; the other four names are reused by every ML cell.
df_res_auroc, df_res_f1, df_res_kappa_AAC_FS25_RFE, df_res_recall, df_res_precision = MLOuterCV(Xdata, Ydata, nfold)

# AUROC
MLmodels(ds_AAC_normFS25_RFE, df_res_auroc, nfold, label="AAC_norm_FS25_RFE", label_y="AUROC")

# F1_Score
MLmodels(ds_AAC_normFS25_RFE, df_res_f1, nfold, label="AAC_norm_FS25_RFE", label_y="F1_Score")

# Kappa_Score
MLmodels(ds_AAC_normFS25_RFE, df_res_kappa_AAC_FS25_RFE, nfold, label="AAC_norm_FS25_RFE", label_y="Kappa_Score")

# Recall
MLmodels(ds_AAC_normFS25_RFE, df_res_recall, nfold, label="AAC_norm_FS25_RFE", label_y="Recall")

# Precision
MLmodels(ds_AAC_normFS25_RFE, df_res_precision, nfold, label="AAC_norm_FS25_RFE", label_y="Precision")

In [None]:
# ML for the DPC normalized dataset reduced by FeatureSelectionWithRFE to
# nFeats=100 features (the "25%" setting).

Xdata, Ydata, Features = getDataFromDataFrame(ds_DPC_normFS25_RFE)
# NOTE(review): the five results are assumed to arrive in the order AUROC, F1,
# kappa, recall, precision, as the target names suggest - confirm against MLOuterCV.
# Only the kappa result gets a dataset-specific name, because the statistical
# analysis cells further down read it; the other four names are reused by every ML cell.
df_res_auroc, df_res_f1, df_res_kappa_DPC_FS25_RFE, df_res_recall, df_res_precision = MLOuterCV(Xdata, Ydata, nfold)

# AUROC
MLmodels(ds_DPC_normFS25_RFE, df_res_auroc, nfold, label="DPC_norm_FS25_RFE", label_y="AUROC")

# F1_Score
MLmodels(ds_DPC_normFS25_RFE, df_res_f1, nfold, label="DPC_norm_FS25_RFE", label_y="F1_Score")

# Kappa_Score
MLmodels(ds_DPC_normFS25_RFE, df_res_kappa_DPC_FS25_RFE, nfold, label="DPC_norm_FS25_RFE", label_y="Kappa_Score")

# Recall
MLmodels(ds_DPC_normFS25_RFE, df_res_recall, nfold, label="DPC_norm_FS25_RFE", label_y="Recall")

# Precision
MLmodels(ds_DPC_normFS25_RFE, df_res_precision, nfold, label="DPC_norm_FS25_RFE", label_y="Precision")

In [None]:
# ML for the Mix normalized dataset reduced by FeatureSelectionWithRFE to
# nFeats=105 features (the "25%" setting).

Xdata, Ydata, Features = getDataFromDataFrame(ds_Mix_normFS25_RFE)
# NOTE(review): the five results are assumed to arrive in the order AUROC, F1,
# kappa, recall, precision, as the target names suggest - confirm against MLOuterCV.
# Only the kappa result gets a dataset-specific name, because the statistical
# analysis cells further down read it; the other four names are reused by every ML cell.
df_res_auroc, df_res_f1, df_res_kappa_Mix_FS25_RFE, df_res_recall, df_res_precision = MLOuterCV(Xdata, Ydata, nfold)

# AUROC
MLmodels(ds_Mix_normFS25_RFE, df_res_auroc, nfold, label="Mix_norm_FS25_RFE", label_y="AUROC")

# F1_Score
MLmodels(ds_Mix_normFS25_RFE, df_res_f1, nfold, label="Mix_norm_FS25_RFE", label_y="F1_Score")

# Kappa_Score
MLmodels(ds_Mix_normFS25_RFE, df_res_kappa_Mix_FS25_RFE, nfold, label="Mix_norm_FS25_RFE", label_y="Kappa_Score")

# Recall
MLmodels(ds_Mix_normFS25_RFE, df_res_recall, nfold, label="Mix_norm_FS25_RFE", label_y="Recall")

# Precision
MLmodels(ds_Mix_normFS25_RFE, df_res_precision, nfold, label="Mix_norm_FS25_RFE", label_y="Precision")

### Statistical Analysis
### For each dataset

In [None]:
# Kappa results of the nine RFE experiments, keyed by name. Each frame is
# passed through remove() (defined earlier in the notebook; per the original
# note it drops the last two columns).

dicc_RFE = {
    "df_res_kappa_AAC_RFE": df_res_kappa_AAC_RFE,
    "df_res_kappa_DPC_RFE": df_res_kappa_DPC_RFE,
    "df_res_kappa_Mix_RFE": df_res_kappa_Mix_RFE,
    "df_res_kappa_AAC_FS50_RFE": df_res_kappa_AAC_FS50_RFE,
    "df_res_kappa_DPC_FS50_RFE": df_res_kappa_DPC_FS50_RFE,
    "df_res_kappa_Mix_FS50_RFE": df_res_kappa_Mix_FS50_RFE,
    "df_res_kappa_AAC_FS25_RFE": df_res_kappa_AAC_FS25_RFE,
    "df_res_kappa_DPC_FS25_RFE": df_res_kappa_DPC_FS25_RFE,
    "df_res_kappa_Mix_FS25_RFE": df_res_kappa_Mix_FS25_RFE
}

datasets_RFE = {name: remove(frame) for name, frame in dicc_RFE.items()}

In [None]:
# Shapiro-Wilk: one shapiro_test call per RFE dataset

for name, frame in datasets_RFE.items():
    shapiro_test(frame, name)

In [None]:
# QQ-plot: one assess_normality call per RFE dataset

for name, frame in datasets_RFE.items():
    assess_normality(frame, name)

In [None]:
# ANOVA inputs: the RFE kappa frames routed to the ANOVA cell below

norm_dicc_RFE = {
    "df_res_kappa_AAC_RFE": df_res_kappa_AAC_RFE,
    "df_res_kappa_DPC_RFE": df_res_kappa_DPC_RFE,
    "df_res_kappa_AAC_FS50_RFE": df_res_kappa_AAC_FS50_RFE,
    "df_res_kappa_Mix_FS50_RFE": df_res_kappa_Mix_FS50_RFE,
    "df_res_kappa_AAC_FS25_RFE": df_res_kappa_AAC_FS25_RFE,
    "df_res_kappa_DPC_FS25_RFE": df_res_kappa_DPC_FS25_RFE,
}

# Same keys, each frame passed through remove()
norm_datasets_RFE = {name: remove(frame) for name, frame in norm_dicc_RFE.items()}

In [None]:
# One ANOVA report per entry of norm_datasets_RFE
for name, frame in norm_datasets_RFE.items():
    anova(frame, name)

In [None]:
# Kruskal-Wallis inputs: the RFE kappa frames routed to the Kruskal-Wallis cell below

not_norm_dicc_RFE = {
    "df_res_kappa_Mix_RFE": df_res_kappa_Mix_RFE,
    "df_res_kappa_DPC_FS50_RFE": df_res_kappa_DPC_FS50_RFE,
    "df_res_kappa_Mix_FS25_RFE": df_res_kappa_Mix_FS25_RFE
}

not_norm_datasets_RFE = {}

# Iterate the RFE dictionary defined just above. The loop used to read
# not_norm_dicc (the non-RFE dictionary), so the RFE Kruskal-Wallis test was
# silently run on the univariate-selection results instead.
for dataset_name, dataset in not_norm_dicc_RFE.items():
    not_norm_datasets_RFE[dataset_name] = remove(dataset)

In [None]:
# One Kruskal-Wallis report per entry of not_norm_datasets_RFE
for name, frame in not_norm_datasets_RFE.items():
    kruskal_w(frame, name)

In [None]:
# Tukey: one report (table + figure) per RFE dataset

for name, frame in datasets_RFE.items():
    tukey_test(frame, name)

### Statistical analysis
### For all datasets

In [None]:
# Pair every RFE kappa result frame with its name once, then derive the two
# parallel lists that NB_columns expects (dict order = insertion order).
named_kappa_frames_RFE = {
    'df_res_kappa_AAC_RFE': df_res_kappa_AAC_RFE,
    'df_res_kappa_DPC_RFE': df_res_kappa_DPC_RFE,
    'df_res_kappa_Mix_RFE': df_res_kappa_Mix_RFE,
    'df_res_kappa_AAC_FS50_RFE': df_res_kappa_AAC_FS50_RFE,
    'df_res_kappa_DPC_FS50_RFE': df_res_kappa_DPC_FS50_RFE,
    'df_res_kappa_Mix_FS50_RFE': df_res_kappa_Mix_FS50_RFE,
    'df_res_kappa_AAC_FS25_RFE': df_res_kappa_AAC_FS25_RFE,
    'df_res_kappa_DPC_FS25_RFE': df_res_kappa_DPC_FS25_RFE,
    'df_res_kappa_Mix_FS25_RFE': df_res_kappa_Mix_FS25_RFE,
}
kappa_all_datasets = list(named_kappa_frames_RFE.values())
dataframe_names = list(named_kappa_frames_RFE)

final_df_RFE = NB_columns(kappa_all_datasets, dataframe_names)
print(final_df_RFE)

In [None]:
# Pool every value of final_df_RFE into a single one-dimensional array
kappa_data = final_df_RFE.to_numpy().flatten()

# Shapiro-Wilk on the pooled values
statistic, p_value = shapiro(kappa_data)

# Results
print("Statistical:", statistic)
print("P value:", p_value)
print()

# Pick both report lines at once, then emit them one per line
if p_value < 0.05:
    report = (
        "Shapiro-Wilk test rejects the null hypothesis.",
        "The NB Dataset DOES NOT FOLLOWS a normal distribution.",
    )
else:
    report = (
        "Shapiro-Wilk test does not reject the null hypothesis.",
        "The NB Dataset FOLLOWS a normal distribution.",
    )
print(*report, sep="\n")

In [None]:
# ANOVA across the NB columns gathered in final_df_RFE
anova(df_res_kappa=final_df_RFE, dataset_name='NB Dataset')

In [None]:
# Levene's test across the NB columns gathered in final_df_RFE
levene_test(dataset=final_df_RFE, dataset_name='NB Dataset')

In [None]:
# Tukey's test; the label is 'NB Dataset' (it used to read 'DB Dataset'),
# matching the other tests run on final_df_RFE so the printed header and the
# plot title name the right data.
tukey_test(final_df_RFE, 'NB Dataset')