In [None]:
import numpy as np
from numpy import array
import pandas as pd
import joblib
from sklearn import metrics
from sklearn import model_selection
from numpy import loadtxt
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline 
from mlxtend.classifier import StackingClassifier
from sklearn.model_selection import train_test_split,cross_val_score,KFold,cross_validate
from sklearn.metrics import precision_score,accuracy_score,roc_auc_score,recall_score,confusion_matrix,f1_score
from mlxtend.feature_selection import ColumnSelector
import warnings
warnings.filterwarnings("ignore")

In [None]:
def model_train(str,data_x,data_y):
    """Fit a classifier on a stratified hold-out split and report test metrics.

    The data is split 70/30 with a fixed ``random_state``, the estimator is
    fitted on the 70% part and scored on the remaining 30%. The confusion
    matrix and every score are printed, then the scores are returned.

    Parameters
    ----------
    str : estimator
        Classifier exposing ``fit`` and ``predict``. The name shadows the
        builtin ``str`` inside this function; it is kept so that existing
        keyword callers continue to work.
    data_x : array-like
        Feature matrix.
    data_y : array-like
        Class labels aligned with ``data_x``. Must be binary: the confusion
        matrix is unpacked into exactly four cells.

    Returns
    -------
    list
        ``[accuracy, precision, recall, f1, false_positive_rate]`` measured on
        the held-out split.
    """
    # Stratify on the labels that were actually passed in rather than on a
    # notebook-level variable, so the split is correct for any
    # (data_x, data_y) pair and does not depend on hidden kernel state.
    x_train,x_test,y_train,y_test = train_test_split(
        data_x,data_y,test_size=0.3,stratify=data_y,random_state=10)
    model = str.fit(x_train,y_train)
    y_predict = model.predict(x_test)

    # One confusion matrix serves both the printout and the FPR computation.
    conf = confusion_matrix(y_test,y_predict)
    tn, fp, fn, tp = conf.ravel()

    # Compute each score once and reuse it for printing and for the result.
    accuracy = accuracy_score(y_test, y_predict)
    recall = recall_score(y_test, y_predict)
    precision = precision_score(y_test, y_predict)
    f1 = f1_score(y_test, y_predict)
    fpr = fp/(fp+tn)  # false positive rate = FP / (FP + TN)

    print(conf)
    print('Accuracy score: ',accuracy)
    print('Recall score: ', recall)
    print('Precision score: ', precision)
    print('f1 score: ', f1)
    print('FPR:',fpr)
    return [accuracy,precision,recall,f1,fpr]

In [None]:
# Load the full feature matrix (X) and the label vector (y).
# np.loadtxt is called with its defaults, so despite the .csv suffix both
# files are read as whitespace-delimited numeric text.
# NOTE(review): the two file names carry different size tags (2w vs 4w) —
# confirm that X and y have the same number of rows.
X = np.loadtxt('M7_2w.csv')
y = np.loadtxt('y4w.csv')

In [None]:
# Stand-alone sub-classifiers, kept at notebook level so each one can be
# passed to model_train on its own and compared with the stacked model.
clf = RandomForestClassifier(
    oob_score=True,
    n_jobs=-1,
)
xgb = XGBClassifier(
    eval_metric=['logloss','auc','error'],
    max_depth=12,
    n_jobs=-1,
)
gnb = GaussianNB()

In [None]:
# define DMF classifier
def model(num):
    """Assemble the stacked (DMF) classifier over the first ``num`` columns.

    Three base learners — a random forest, an XGBoost classifier and a
    Gaussian naive Bayes model — are each wrapped in a pipeline behind their
    own ``ColumnSelector(cols=range(num))``. Their predictions are combined by
    a class-balanced logistic-regression meta-classifier.

    Parameters
    ----------
    num : int
        Number of leading feature columns handed to every base learner.

    Returns
    -------
    StackingClassifier
        The unfitted mlxtend stacking ensemble.
    """
    base_learners = (
        RandomForestClassifier(max_depth=12,n_estimators=100,oob_score=True,n_jobs=-1),
        XGBClassifier(eval_metric=['logloss','auc','error'],max_depth=12,n_estimators=120,n_jobs=-1),
        GaussianNB(),
    )
    # One pipeline per learner, each with its own selector instance.
    pipelines = [
        make_pipeline(ColumnSelector(cols=range(num)),learner)
        for learner in base_learners
    ]
    meta = LogisticRegression(solver="lbfgs",class_weight = 'balanced')
    return StackingClassifier(classifiers=pipelines, meta_classifier=meta)

# Before feature reduction:

In [None]:
%%time
# Baseline run: the stacked model is built over every column of X.
# stack1 receives the metric list returned by model_train, not a fitted model.
stack1 = model_train(model(X.shape[1]),X,y)

# After feature reduction:

# F10 results

In [None]:
# DMF performance
X10 = loadtxt('reduction10.csv')

%time stack10 = model_train(model(10),X10,y)

In [None]:
# Compare each stand-alone sub-classifier with the DMF stack on the F10 features.
# Every printed label is paired with the estimator it names, so c10, x10 and
# g10 hold the metrics of the random forest, XGBoost and Gaussian NB models
# respectively.
print('X10: clf')
c10 = model_train(clf,X10,y)
print('\n','xgb')
x10 = model_train(xgb,X10,y)
print('\n','gnb')
g10 = model_train(gnb,X10,y)

# F15 results

In [None]:
X15 = loadtxt('reduction15.csv')

%time stack15 = model_train(model(15),X15,y)

# F20 results

In [None]:
# Read the reduced feature matrix for the F20 setting.
X20 = np.loadtxt('reduction20.csv')

# Time one train/evaluate pass of the stacked model built over 20 columns;
# stack20 holds the metric list returned by model_train.
%time stack20 = model_train(model(20),X20,y)

# F25 results

In [None]:
# Read the reduced feature matrix for the F25 setting.
X25 = np.loadtxt('reduction25.csv')

# Time one train/evaluate pass of the stacked model built over 25 columns;
# stack25 holds the metric list returned by model_train.
%time stack25 = model_train(model(25),X25,y)

# F30 results

In [None]:
# Read the reduced feature matrix for the F30 setting.
X30 = np.loadtxt('reduction30.csv')

# Time one train/evaluate pass of the stacked model built over 30 columns;
# stack30 holds the metric list returned by model_train.
%time stack30 = model_train(model(30),X30,y)

# F35 results

In [None]:
# Read the reduced feature matrix for the F35 setting.
X35 = np.loadtxt('reduction35.csv')

# Time one train/evaluate pass of the stacked model built over 35 columns;
# stack35 holds the metric list returned by model_train.
%time stack35 = model_train(model(35),X35,y)

# F40 results

In [None]:
# Read the reduced feature matrix for the F40 setting.
X40 = np.loadtxt('reduction40.csv')

# Time one train/evaluate pass of the stacked model built over 40 columns;
# stack40 holds the metric list returned by model_train.
%time stack40 = model_train(model(40),X40,y)

# F45 results

In [None]:
# Read the reduced feature matrix for the F45 setting.
X45 = np.loadtxt('reduction45.csv')

# Time one train/evaluate pass of the stacked model built over 45 columns;
# stack45 holds the metric list returned by model_train.
%time stack45 = model_train(model(45),X45,y)

# F50 results

In [None]:
# Read the reduced feature matrix for the F50 setting.
X50 = np.loadtxt('reduction50.csv')

# Time one train/evaluate pass of the stacked model built over 50 columns;
# stack50 holds the metric list returned by model_train.
%time stack50 = model_train(model(50),X50,y)