## Imports

In [None]:
%%capture
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from statistics import pstdev, mean
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score, accuracy_score,precision_score, recall_score,roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix, log_loss
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.utils import class_weight
from sklearn.manifold import MDS, TSNE
!pip install deslib
from deslib.util.diversity import double_fault
import scipy.stats as stats
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import average, fcluster, dendrogram, ward, single, linkage
!pip install umap
!pip install umap-learn
!pip install 'umap-learn==0.3.10'
from umap.umap_ import UMAP

## Funções

In [None]:
def filter_df_train_test(train_df, test_df, name, filter_first=True):
    """Select the columns whose label matches ``name`` in both frames.

    ``name`` is handed to ``DataFrame.filter(regex=...)``, so it is treated
    as a regular expression searched within each column label.
    ``filter_first`` is accepted but not used.

    Returns the selected train values and test values as NumPy arrays,
    followed by the labels of the columns selected from ``train_df``.
    """
    train_part, test_part = (
        frame.filter(regex=name, axis=1) for frame in (train_df, test_df)
    )
    return train_part.to_numpy(), test_part.to_numpy(), train_part.columns

def filter_collinearity(X_train, X_test):
    """Keep every second column (indices 0, 2, 4, ...) of both matrices."""
    even_columns = slice(None, None, 2)
    return X_train[:, even_columns], X_test[:, even_columns]

def create_coefficient_plot(coefs, names, group, savename):
    """Plot exponentiated coefficients as a sorted bar chart and save it as PDF.

    Parameters
    ----------
    coefs : array whose transpose has one row per entry of ``names`` and a
        single column (it is wrapped in a one-column DataFrame).
    names : labels used as the DataFrame index, one per bar.
    group : text used as the x-axis label.
    savename : output path without extension; ``'.pdf'`` is appended.

    The values plotted are ``exp(coef)``, sorted in descending order and
    rounded to three decimals. The original code called ``round(3)`` without
    keeping the result, so the rounding never took effect.
    """
    exp_coefs = pd.DataFrame(np.exp(coefs.T), names, ['Coef'])
    exp_coefs = exp_coefs.sort_values(by='Coef', ascending=False).round(3)
    exp_coefs.plot.bar()
    plt.ylabel('Coefficients')
    plt.xlabel(group)
    plt.tight_layout()
    plt.savefig(savename+'.pdf', dpi=300)
def organize_names_df(names):
    """Return every second entry of ``names`` with each "-0" removed."""
    cleaned = []
    for raw_name in names[::2]:
        cleaned.append(raw_name.replace("-0", ""))
    return cleaned

# Meta-classifiers used by the stacking experiments, stored as
# [short name, estimator] pairs; the functions below pass element [1] around
# and compare against it to recover the short name.
stackingLR = ['LR',LogisticRegression(class_weight='balanced',multi_class='multinomial')]
stackingRF = ['RF',RandomForestClassifier(class_weight='balanced')]
stackingNB = ['NB',GaussianNB()]

def conditions_name_stacking(stacking):
    """Return the short name of a module-level meta-classifier.

    ``stacking`` is compared against the estimators stored in ``stackingLR``,
    ``stackingRF`` and ``stackingNB``; the result is 'LR', 'RF' or 'NB', or an
    empty string when none of them matches.
    """
    if stacking == stackingLR[1]:
        return 'LR'
    if stacking == stackingRF[1]:
        return 'RF'
    if stacking == stackingNB[1]:
        return 'NB'
    return ''

def conditions_name_metric(metric):
    """Return the directory/file name used for a scoring function.

    The supported metrics are ``accuracy_score``, ``recall_score``,
    ``precision_score``, ``f1_score`` and ``roc_auc_score``; any other object
    yields an empty string.

    ``roc_auc_score`` was missing from the original chain even though the
    driver loop iterates over it, so its results were written to a path
    ending in an empty component. Matching is done on the callable's
    ``__name__``, which for these functions equals the names listed above.
    """
    known_names = (
        'accuracy_score',
        'recall_score',
        'precision_score',
        'f1_score',
        'roc_auc_score',
    )
    name = getattr(metric, '__name__', '')
    return name if name in known_names else ''

def stackingA(stacking,metric):
    """Group A: one stacked model per base-algorithm name.

    For each name in ``algorithms_list`` the columns of the module-level
    ``probas_val`` / ``probas_test`` frames whose label matches that name are
    selected, every second column is kept, ``stacking`` is fitted on the
    validation part (``labels_val``) and scored on the test part
    (``labels_test``) with ``metric``.

    Returns a NumPy array of scores rounded to three decimals, in the order
    of ``algorithms_list``.
    """
    algorithms_list = ['SVM', 'LR','RF','NB','MLP', 'EXTRA','KNN','CNN']
    results_A = np.zeros(len(algorithms_list))
    for idx_alg, algorithm in enumerate(algorithms_list):
        X_val, X_test, _ = filter_df_train_test(probas_val, probas_test, algorithm)
        X_val, X_test = filter_collinearity(X_val, X_test)
        stacking.fit(X_val, labels_val)
        y_pred = stacking.predict(X_test)
        results_A[idx_alg] = round(metric(labels_test, y_pred),3)
    return results_A

def stackingB(stacking,metric):
    """Group B: one stacked model per feature-extraction name.

    For each name in ``fe_list`` the columns of the module-level
    ``probas_val`` / ``probas_test`` frames whose label matches that name are
    selected, every second column is kept, ``stacking`` is fitted on the
    validation part (``labels_val``) and scored on the test part
    (``labels_test``) with ``metric``.

    Returns a NumPy array of scores rounded to three decimals, in the order
    of ``fe_list``.
    """
    fe_list = ['CV', 'TFIDF', 'W2V', 'GLOVE', 'FAST']
    results_B = np.zeros(len(fe_list))
    for idx_alg, fe in enumerate(fe_list):
        X_val, X_test, _ = filter_df_train_test(probas_val, probas_test, fe)
        X_val, X_test = filter_collinearity(X_val, X_test)
        stacking.fit(X_val, labels_val)
        y_pred = stacking.predict(X_test)
        results_B[idx_alg] = round(metric(labels_test, y_pred),3)
    return results_B

def stackingC(stacking,metric):
    """Group C: a single stacked model over all base-model columns.

    Every second column of the module-level ``probas_val`` / ``probas_test``
    frames is kept, ``stacking`` is fitted on the validation part
    (``labels_val``) and scored on the test part (``labels_test``).

    Returns the ``metric`` score rounded to three decimals.
    """
    X_val, X_test = filter_collinearity(probas_val.to_numpy(), probas_test.to_numpy())
    stacking.fit(X_val, labels_val)
    y_pred = stacking.predict(X_test)
    return round(metric(labels_test, y_pred),3)

def compute_pairwise_diversity_matrix(targets, prediction_matrix, diversity_func):
    """Build the symmetric matrix of pairwise diversity values.

    ``prediction_matrix`` holds one column per classifier. For every pair of
    distinct columns, ``diversity_func(targets, col_a, col_b)`` is evaluated
    once and stored at both mirrored positions; the diagonal stays at zero.
    """
    n_classifiers = prediction_matrix.shape[1]
    diversity = np.zeros((n_classifiers, n_classifiers))
    # Indices strictly above the diagonal, in row-major order.
    upper_rows, upper_cols = np.triu_indices(n_classifiers, k=1)
    for first, second in zip(upper_rows, upper_cols):
        value = diversity_func(
            targets, prediction_matrix[:, first], prediction_matrix[:, second]
        )
        diversity[first, second] = value
        diversity[second, first] = value
    return diversity

def load_predictions(dataset_name,path):
    """Read a prediction table and split it into labels and model columns.

    Parameters
    ----------
    dataset_name : printed for logging only.
    path : anything ``pandas.read_csv`` accepts.

    Returns
    -------
    (label, methods) : the ``'label'`` column and the remaining columns with
        the CSV index leftovers ``'Unnamed: 0'`` / ``'Unnamed: 0.1'`` removed
        when present.

    Raises
    ------
    KeyError : if the table has no ``'label'`` column.

    The original only assigned ``label`` / ``methods`` when ``dataset_name``
    was ``'fakes'``, so every other dataset failed with UnboundLocalError.
    """
    table_pred = pd.read_csv(path)
    print(dataset_name)
    label = table_pred["label"]
    methods = table_pred.drop('label', axis=1)
    # The index leftovers are optional; do not fail when a file lacks them.
    methods = methods.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
    return label, methods

def Elbow_method_graph(z,total_clusters,fold):
    """Estimate the number of clusters with the elbow (acceleration) rule.

    Parameters
    ----------
    z : linkage matrix; only column 2 (the merge distances) is read.
    total_clusters : how many of the last merges to inspect.
    fold : unused; kept for interface compatibility.

    Returns
    -------
    int : the suggested number of clusters, which is also printed. The
        original returned the result of ``print`` (always ``None``).

    Raises
    ------
    ValueError : if fewer than three merge distances are available, because
        the second difference is then empty.
    """
    last = z[-total_clusters:, 2]
    # Second discrete difference of the merge distances.
    acceleration = np.diff(last, 2)
    if acceleration.size == 0:
        raise ValueError("at least three merge distances are required for the elbow method")
    # Reversed so that position 0 corresponds to the last merges; +2 turns
    # the position into a cluster count.
    k = int(acceleration[::-1].argmax()) + 2
    print("k clusters - elbow:", k)
    return k

def plot_diversity(D_changed, method_tmp, dataset_name, title):
    """Scatter-plot a 2-D embedding of the classifiers and save it as JPG.

    ``D_changed`` is unpacked as a 3-D array; positions 0 and 1 of its last
    axis are used as the x and y coordinates. The second axis selects the
    colour/marker (five are defined). Points are annotated with consecutive
    entries of ``method_tmp``. The figure is written to
    ``title + "_" + DATASET + "_dataset.jpg"``.
    """
    marker_size = 100
    colors = dict(enumerate(['#DAA520', '#FF0000', '#0000FF', '#228B22', '#000000']))
    markers = dict(enumerate(['X', 'd', '*', "^", 'o']))
    legend_labels = ['CV', 'TFIDF', 'Glove', 'Word2Vec', 'FastText']

    n_classifiers, n_features, _ = D_changed.shape
    plt.figure(figsize=(15,10))
    for row in range(n_classifiers):
        for col in range(n_features):
            x, y = D_changed[row, col, 0], D_changed[row, col, 1]
            plt.scatter(x, y, color=colors[col], s=marker_size, lw=0, marker=markers[col])
            # Labels are consumed in row-major order.
            plt.annotate(method_tmp[row * n_features + col], xy=(x, y),
                         textcoords='offset points', xytext=(5, 15), ha='right', va='top')

    handles = [
        mlines.Line2D([], [], color=colors[i], marker=markers[i], linestyle='None',
                      markersize=10, label=legend_labels[i])
        for i in range(5)
    ]
    plt.title('CPS ' + dataset_name.upper()+' dataset')
    plt.legend(handles=handles)
    plt.tight_layout()
    plt.savefig(title + "_" + dataset_name.upper() +'_dataset.jpg', dpi=450)

def compute_matrix_embedding(dataset_name, n_classifiers, n_features, method, n_neighbors=5, min_dist=0.7):
    """Embed the classifiers in 2-D from their pairwise double-fault values.

    Parameters
    ----------
    dataset_name : forwarded to ``load_predictions``.
    n_classifiers, n_features : the embedding is reshaped to
        ``(n_classifiers, n_features, 2)``.
    method : ``'mds'``, ``'tsne'`` or anything else (which selects UMAP).
    n_neighbors, min_dist : NOTE(review): currently unused -- the UMAP call
        below hard-codes ``n_neighbors=2`` and ``min_dist=0.7``.

    NOTE(review): ``path`` is not a parameter; it is read from the enclosing
    (module) scope and must be defined before this function is called.
    """
    label,methods=load_predictions(dataset_name,path)
    D = compute_pairwise_diversity_matrix(label.to_numpy(), methods.to_numpy(), double_fault)
    # Element-wise reciprocal; zero entries (including the diagonal, which the
    # pairwise helper leaves at 0) become inf and are reset to 0 afterwards.
    D = 1/D
    D[D==np.inf] = 0
    # ``method`` is rebound from its string name to the estimator instance.
    if method == 'mds':
        method = MDS(dissimilarity='precomputed', random_state=123456987, n_init=20, max_iter=100000)
    elif method == 'tsne':
        method = TSNE(perplexity=25, init='pca', random_state=42, early_exaggeration=50,
                      learning_rate=200, n_iter=2500, angle=0.5)
    else:
        method = UMAP(n_neighbors=2, metric='euclidean', random_state=123456987, min_dist=0.7, n_components=2,)
    D_tilde = method.fit_transform(D)
    D_tilde = D_tilde.reshape(n_classifiers, n_features, 2)
    return D_tilde

def plot_graph(table_pred,stacking,metric):
    """Collect the first result of ``stack_D_Hierarchical`` for k = 1..40.

    Each call must return exactly two values; only the first is kept. The
    kept values are put in a DataFrame, whose column 1 is returned as the
    x series and column 0 as the y series.
    """
    first_results = []
    for cut in range(1, 41):  # cuts 1 through 40 in the experiment
        head, _ = stack_D_Hierarchical(k=cut, table_pred=table_pred,
                                       stacking=stacking, metric=metric)
        first_results.append(head)
    frame = pd.DataFrame(first_results)
    return frame[1], frame[0]

# The 40 base-model labels, in the order that stackingD pairs positionally
# with the cluster ids returned by fcluster.
# NOTE(review): fcluster returns one id per original observation, in input
# order, and this order differs from Columns_table -- confirm that it matches
# the column order of the prediction files, otherwise models are assigned to
# the wrong clusters.
lista_1=['CNN-CV','KNN-W2V','KNN-FAST','CNN-GLOVE','CNN-FAST','CNN-TFIDF','CNN-W2V','MLP-FAST','EXTRA-CV',
         'EXTRA-TFIDF','EXTRA-GLOVE','KNN-TFIDF','KNN-GLOVE','KNN-CV','EXTRA-W2V','EXTRA-FAST','NB-FAST',
         'MLP-CV','MLP-TFIDF','MLP-GLOVE','MLP-W2V','NB-CV','RF-W2V','RF-FAST','NB-W2V','NB-TFIDF','NB-GLOVE',
         'SVM-W2V','SVM-GLOVE','SVM-CV','SVM-TFIDF','RF-TFIDF','RF-GLOVE','LR-FAST','RF-CV','SVM-FAST','LR-CV',
         'LR-W2V','LR-TFIDF','LR-GLOVE']


## Classificadores monolíticos e grupos A, B e C

In [None]:
# Create one output directory per (fold, metric) pair under Data/kaggle.
# os.makedirs(..., exist_ok=True) also creates missing parent folders and
# makes the cell safe to re-run, which a plain `mkdir` is not.
import os

for _fold_number in range(1, 11):
    for _metric_dir in ('accuracy_score', 'precision_score', 'recall_score',
                        'f1_score', 'roc_auc_score'):
        os.makedirs(
            os.path.join('Data', 'kaggle', 'F{}'.format(_fold_number), _metric_dir),
            exist_ok=True,
        )

In [None]:
# Labels of the 40 base models ("<algorithm>-<feature extraction>"), used as
# the columns of the results tables.
Columns_table=['SVM-CV', 'SVM-TFIDF', 'SVM-W2V', 'SVM-GLOVE','SVM-FAST', 'LR-CV', 'LR-TFIDF',
               'LR-GLOVE', 'LR-W2V', 'LR-FAST','RF-CV', 'RF-TFIDF', 'RF-GLOVE', 'RF-W2V','RF-FAST',
               'NB-CV','NB-TFIDF', 'NB-GLOVE', 'NB-W2V', 'NB-FAST', 'MLP-CV', 'MLP-TFIDF','MLP-GLOVE',
               'MLP-W2V', 'MLP-FAST', 'EXTRA-CV', 'EXTRA-TFIDF','EXTRA-GLOVE', 'EXTRA-W2V', 'EXTRA-FAST',
               'KNN-CV', 'KNN-TFIDF','KNN-GLOVE', 'KNN-W2V', 'KNN-FAST', 'CNN-CV', 'CNN-TFIDF', 'CNN-W2V','CNN-GLOVE', 'CNN-FAST']

# Write an empty one-row ('P') results table into every fold directory.
# After the loop, Results_table still refers to the table created for the
# last fold; later cells reuse that object.
# NOTE(review): dataset_name is not defined in this part of the notebook.
for fold in ["F1", "F2", "F3", "F4", "F5","F6", "F7", "F8", "F9", "F10"]:
  Results_table=pd.DataFrame(index=['P'],columns=Columns_table)
  Results_table.to_csv('/content/Data/'+ dataset_name + '/' + fold + '/results_table.csv')

# Save the results of stacking groups A/B/C
def saving_results_test_groups(Results_table,item,metric,fold):
    """Fill row ``item`` of ``Results_table`` with the group A/B/C scores.

    Columns are named ``A-<algorithm>-<meta>``, ``B-<extractor>-<meta>`` and
    ``C-<meta>``, where ``<meta>`` is LR, RF or NB. The table is modified in
    place and also returned. ``fold`` is unused and kept for interface
    compatibility.

    Each stacking run is executed once per meta-classifier and its scores are
    distributed over the columns. The original re-ran ``stackingA`` and
    ``stackingB`` once per column (24 and 15 calls instead of 3 each), which
    multiplied the run time and, for the random forest, filled sibling
    columns from different random fits.
    """
    # Orders must match the lists used inside stackingA and stackingB.
    algorithms = ('SVM', 'LR', 'RF', 'NB', 'MLP', 'EXTRA', 'KNN', 'CNN')
    extractors = ('CV', 'TFIDF', 'W2V', 'GLOVE', 'FAST')
    meta_classifiers = (stackingLR, stackingRF, stackingNB)

    for meta_name, meta_model in meta_classifiers:
        scores = stackingA(meta_model, metric)
        for algorithm, score in zip(algorithms, scores):
            Results_table.at[item, 'A-' + algorithm + '-' + meta_name] = score

    for meta_name, meta_model in meta_classifiers:
        scores = stackingB(meta_model, metric)
        for extractor, score in zip(extractors, scores):
            Results_table.at[item, 'B-' + extractor + '-' + meta_name] = score

    for meta_name, meta_model in meta_classifiers:
        Results_table.at[item, 'C-' + meta_name] = stackingC(meta_model, metric)

    return Results_table

# Path of the test-set prediction file of every fold, in fold order.
files=[]
for fold in ["F1", "F2", "F3", "F4", "F5","F6", "F7", "F8", "F9", "F10"]:
  files.append('/content/Data/'+ dataset_name + '/' + fold + '/pred_test.csv')

In [None]:
%%time
#classificadores monolíticos, grupo A, grupo B e grupo C
def saving_results_test(files,Results_table,metric,dataset_name,fold):
    """Score the base models of one fold and store the scores in row 'P'.

    Parameters
    ----------
    files : iterable of prediction-file paths; only the entry equal to
        ``'/content/Data/<dataset_name>/<fold>/pred_test.csv'`` is used.
    Results_table : DataFrame updated in place; every column that also exists
        in the prediction file receives ``metric(label, prediction)`` rounded
        to three decimals.
    metric : callable ``metric(y_true, y_pred)``.

    Returns the same ``Results_table`` object.

    The original read and scored every file in ``files`` on each call and
    then searched an ever-growing list for every column, although only the
    current fold's file could produce a value. A column missing from the
    current file could also pick up a stale score from another fold. Only the
    matching file is read now.
    """
    fold_file = '/content/Data/'+dataset_name+'/'+fold+'/pred_test.csv'
    for element in files:
        if element != fold_file:
            continue
        table = pd.read_csv(element)
        table = table.drop(columns=['Unnamed: 0','Unnamed: 0.1'])
        label_list = np.array(table['label'])
        for column in Results_table:
            if column in table.columns:
                score = metric(label_list, np.array(table[column]))
                Results_table.at['P', column] = round(score, 3)
    return Results_table

# Monolithic classifiers plus stacking groups A, B and C for every fold and
# metric. The frames and label series loaded here are module-level globals
# read by stackingA / stackingB / stackingC.
for fold in ["F1", "F2", "F3", "F4", "F5","F6", "F7", "F8", "F9", "F10"]:
    train_labels=pd.read_csv('/content/Data/'+dataset_name+'/'+fold+'/train_labels.csv')
    val_labels=pd.read_csv('/content/Data/'+dataset_name+'/'+fold+'/val_labels.csv')
    test_labels=pd.read_csv('/content/Data/'+dataset_name+'/'+fold+'/test_labels.csv')
    val_df = pd.read_csv('/content/Data/'+dataset_name+'/'+fold+'/prob_val.csv').dropna()
    test_df = pd.read_csv('/content/Data/'+dataset_name+'/'+fold+'/prob_test.csv').dropna()
    probas_val, labels_val = val_df.drop(columns=["Unnamed: 0","Unnamed: 0.1","label"]), val_labels["label"]
    probas_test, labels_test = test_df.drop(columns=["Unnamed: 0","Unnamed: 0.1","label"]), test_labels["label"]

    for metric in [accuracy_score,precision_score,recall_score,f1_score,roc_auc_score]:
        # The metric's own function name is the directory name, so that
        # roc_auc_score is written to its own folder as well.
        path='./Data/'+dataset_name+'/'+fold+'/'+ metric.__name__

        # Scores of the individual (monolithic) base models. The table is
        # filled in place, so one call is enough; the original repeated the
        # identical call a second time.
        results_test_monolithcs=saving_results_test(files,Results_table,metric,dataset_name,fold)
        results_test_monolithcs.to_csv('{}/results_test_monolithcs.csv'.format(path))
        Results_table=results_test_monolithcs

        # Same table extended with the group A/B/C stacking scores.
        Results_table2=Results_table.copy()
        Results_table2=saving_results_test_groups(Results_table=Results_table2,item='P',metric=metric,fold=fold)
        Results_table2.to_csv('{}/Results_table2.csv'.format(path))

## Grupo D (Agrupamento Hierárquico)

In [None]:
%%time
# For every fold: build the pairwise double-fault matrix of the validation
# predictions, cluster it hierarchically and print the suggested cut.
# NOTE(review): z, max_d, min_d and lista_distancias are overwritten on every
# iteration, so after the loop they describe the last fold only; stackingD
# reads them as globals for whichever fold it is given -- confirm intended.
for fold in ["F1", "F2", "F3", "F4", "F5","F6", "F7", "F8", "F9", "F10"]:
  train_labels = pd.read_csv('/content/Data/'+dataset_name+'/'+fold+'/train_labels.csv')
  val_labels = pd.read_csv('/content/Data/'+dataset_name+'/'+fold+'/val_labels.csv')
  test_labels = pd.read_csv('/content/Data/'+dataset_name+'/'+fold+'/test_labels.csv')
  val_df = pd.read_csv('/content/Data/'+dataset_name+'/'+fold+'/val.csv')
  test_df = pd.read_csv('/content/Data/'+dataset_name+'/'+fold+'/test.csv')
  probas_val, labels_val = val_df.drop(columns=["Unnamed: 0","Unnamed: 0.1","label"]), val_labels["label"]
  probas_test, labels_test = test_df.drop(columns=["Unnamed: 0","Unnamed: 0.1","label"]), test_labels["label"]
  # Hard predictions of every base model on the validation split.
  table_pred =  pd.read_csv('/content/Data/' + dataset_name + '/' + fold + '/pred_val.csv')
  labels = table_pred["label"]
  # This first binding is immediately replaced by the next line.
  methods = table_pred
  methods = table_pred.drop('label', axis=1)
  methods = methods.drop('Unnamed: 0', axis=1)
  methods=methods.drop('Unnamed: 0.1', axis=1)
  D = compute_pairwise_diversity_matrix(labels.to_numpy(), methods.to_numpy(), double_fault)
  # Upper triangle of D, zeros elsewhere.
  D_prime = np.triu(D)
  # NOTE(review): linkage receives a square 2-D array here; per the SciPy
  # docs a 2-D input is treated as observation vectors, not as a distance
  # matrix (which would have to be in condensed form) -- confirm intended.
  z = linkage(D_prime,'average')
  # Merge distances (column 2 of the linkage matrix), at most the last 40.
  distancias = z[-40:,2]
  # Thresholds used by stackingD: slightly above the largest merge distance,
  # zero, and the merge distances sorted in descending order.
  max_d = max(distancias)+0.1
  min_d = 0.0
  lista_distancias = np.sort(distancias)[::-1]
  print("threshould of dendrogram rounded" + fold +": ",round(0.7*max(z[:,2]),3))
  Elbow_method_graph(z,40,fold)

In [None]:
def stackingD(k,stacking,metric,fold):
    """Group D: stack the best base model of each of ``k`` clusters.

    Steps:
      1. Score every base model on the fold's validation predictions
         (``pred_val.csv``) with ``metric``.
      2. Cut the module-level dendrogram ``z`` with ``fcluster`` at a
         distance threshold chosen from ``k``.
      3. Keep, for every cluster, the model with the highest validation score.
      4. Fit ``stacking`` on the validation probabilities of the kept models
         and score it on the test probabilities.

    Returns ``(k, mean test score rounded to 3 decimals, kept model names)``.

    Reads the globals ``dataset_name``, ``z``, ``max_d``, ``min_d``,
    ``lista_distancias`` and ``lista_1``.
    NOTE(review): ``z`` and the thresholds are whatever the clustering cell
    left behind; they are not recomputed for ``fold`` -- confirm intended.

    Fixes relative to the original:
      * ``groupby(...).max()`` took the maximum of every column separately,
        so the selected name was the alphabetically largest one in the
        cluster rather than the best-scoring model.
      * ``prob_train.csv`` was read but never used.
      * dead locals (including one that shadowed the builtin ``sum``) removed
        and the loop-invariant column filtering hoisted out of the loop.
    """
    base_dir = '/content/content/Data/'+dataset_name+'/'+fold
    val_df = pd.read_csv(base_dir+'/prob_val.csv').dropna()
    test_df = pd.read_csv(base_dir+'/prob_test.csv').dropna()
    probas_val, labels_val = val_df.drop(columns=["Unnamed: 0","label"]), val_df["label"]
    probas_test, labels_test = test_df.drop(columns=["Unnamed: 0","label"]), test_df["label"]

    # 1. Validation score of every base model.
    table_pred = pd.read_csv(base_dir + '/pred_val.csv')
    table = table_pred.drop(columns=['Unnamed: 0','Unnamed: 0.1','label'])
    label_list = np.array(table_pred['label'])
    results = [[metric(label_list, np.array(table[column])), column] for column in table.columns]
    results.sort(reverse=True)
    df_results = pd.DataFrame(results, columns=['col0','col1'])

    # 2. Cluster ids: one cluster for k == 1, the zero threshold for k == 40,
    #    otherwise the k-th largest merge distance.
    if k == 1:
        threshold = max_d
    elif k == 40:
        threshold = min_d
    else:
        threshold = lista_distancias[k-1]
    cluster_ids = fcluster(z, threshold, criterion='distance')

    # After the merge, col0_x holds the cluster id and col0_y the score.
    df = pd.DataFrame(data={'col0': cluster_ids, 'col1': lista_1})
    df = df.merge(df_results, on='col1')

    # 3. Row with the highest score inside every cluster.
    best_rows = df.loc[df.groupby('col0_x')['col0_y'].idxmax()]
    result = best_rows.sort_values(by='col0_y', ascending=False)
    models_D = list(result['col1'])

    # 4. Stack the selected models; the names form a regex alternation.
    names = '|'.join(str(model) for model in models_D)
    X_val, X_test, _ = filter_df_train_test(probas_val, probas_test, names)
    X_val, X_test = filter_collinearity(X_val, X_test)
    results_D = np.zeros(len(models_D))
    # NOTE(review): the same data is fitted len(models_D) times and the
    # scores are averaged, as in the original; this only matters for
    # non-deterministic meta-classifiers -- confirm intended.
    for idx_alg in range(len(models_D)):
        stacking.fit(X_val, labels_val)
        y_pred = stacking.predict(X_test)
        results_D[idx_alg] = round(metric(labels_test, y_pred),3)
    return k,round(results_D.mean(),3),models_D

In [None]:
def saving_stacking_D(stacking, metric):
    """Run group D for k = 1..40 on every fold and save the results.

    For each fold a CSV with one row per ``k`` (columns ``k``, ``metric``,
    ``models``) is written to
    ``/content/Data/<dataset>/<fold>/<metric>/final_dataframe_<stacker>.csv``.
    The best row of every fold is then collected into
    ``/content/Data/<dataset>_<metric>_final_dataframe_all_<stacker>.csv``.

    Returns the result of the final ``to_csv`` call (``None`` when a path is
    given).

    Changes relative to the original: the per-fold CSV is written once per
    fold instead of after every ``k``; the no-op ``iloc[0:40]`` is gone; and
    the discarded ``reset_index`` is now applied, so the summary file has a
    clean 0..n-1 index.
    """
    folds = ["F1", "F2", "F3", "F4", "F5","F6", "F7", "F8", "F9", "F10"]
    metric_name = conditions_name_metric(metric)
    stacking_name = conditions_name_stacking(stacking)

    def fold_csv(fold):
        # Location of the per-fold results file.
        return ('/content/Data/' + dataset_name + '/'+ fold +'/'+ metric_name +
                '/final_dataframe_'+ stacking_name +'.csv')

    for fold in folds:
        final_dataframe = pd.DataFrame(columns=['k','metric','models'])
        for k in range(1,41):
            final_dataframe.loc[k-1] = list(stackingD(k,stacking,metric,fold))
        final_dataframe.to_csv(fold_csv(fold))

    best_rows = []
    for fold in folds:
        df = pd.read_csv(fold_csv(fold))
        best = df.sort_values(by=['metric'],ascending=False).head(1)
        best_rows.append(best.assign(fold=fold))
    data_final = pd.concat(best_rows).drop(columns=['Unnamed: 0'])
    data_final = data_final.reset_index(drop=True)
    return data_final.to_csv('/content/Data/' + dataset_name + '_' + metric_name +
                             '_final_dataframe_all_'+ stacking_name +'.csv')

In [None]:
# Run the group D experiment for every (metric, meta-classifier) pair.
# `stacking` and `metric` are bound at module level before each call.
for metrica in accuracy_score, precision_score, recall_score, f1_score:
  for stacking_algorithm in stackingLR[1], stackingRF[1], stackingNB[1]:
    stacking, metric = stacking_algorithm, metrica
    saving_stacking_D(stacking, metric)