In [None]:
#!/bin/python

import sys
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
from numpy import interp
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import plot_precision_recall_curve
from sklearn.utils import resample
from sklearn import preprocessing
from pathlib import Path
from joblib import dump, load

In [None]:
# ---- Bootstrap settings -------------------------------------------------
# Number of passes made by the bootstrap loop.
iteration_CI = 1000

# ---- Random forest hyper-parameters -------------------------------------
n_tree = 1000
n_max_depth = None
n_min_samples_split = 4
n_min_samples_leaf = 2

# Column used as the classification target.
prediction_variable = 'ASAS20'

# Threshold on the standard deviation of min-max scaled features.
std_cutoff = 0.05

# ---- Output location ----------------------------------------------------
# Create the output folders (parent first, then the RF sub-folder).
for _ci_dir in ("./CI", "./CI/RF"):
    Path(_ci_dir).mkdir(parents=True, exist_ok=True)
output_file = './CI/RF/performance_for_CI_of_RF_models.txt'

# ---- Input data ---------------------------------------------------------
# Read the tab-separated input table, then discard rows whose ASAS20 is 3.
df = pd.read_csv('./Input_data/AS_input.txt', sep='\t')
df = df.loc[df['ASAS20'] != 3]

colnames = df.columns
# Columns that are not predictors (the original note describes them as
# "results"); every remaining column is treated as an input feature.
result_column_list = ['newID', 'region', 'BSD', 'DFT1', 'DFT2', 'ASDAS1', 'ASDAS2', 'ASAS20', 'ASAS40']
x_colnames_1 = [col for col in colnames if col not in result_column_list]

In [None]:
# Rows from these regions form the independent cohort; all other rows are
# used for training. One membership mask drives both selections so the two
# subsets cannot drift apart.
independent_regions = [2, 11, 21, 3, 24]
is_independent_region = df.region.isin(independent_regions)

df_training = df[~is_independent_region]
df_independent = df[is_independent_region]

In [None]:
# Independent dataset: target column extracted once as a NumPy array.

independent_target = df_independent.loc[:, prediction_variable]
data_y_independent = independent_target.to_numpy()

In [None]:
# Bootstrap estimation of random-forest performance.
#
# Each iteration:
#   1. draws a bootstrap sample (with replacement, 2/3 of the training rows)
#      and uses the rows that were never drawn (out-of-bag) as the test set;
#   2. selects features on the bootstrap sample only (non-constant, and std of
#      the min-max scaled values >= std_cutoff);
#   3. fits a random forest and scores it on the out-of-bag rows and on the
#      independent cohort;
#   4. appends one tab-separated row to `output_file`:
#      i, seed, test_acc, roc_auc, f1, rp_auc,
#      test_acc_ind, roc_auc_ind, f1_ind, rp_auc_ind
#
# The file is handled by a `with` block so it is flushed and closed even if an
# iteration raises.
with open(output_file, 'w') as perf_file:

    n_training_rows = len(df_training)

    for i in range(iteration_CI):

        print(i)

        # Draw the seed from the full 32-bit range accepted by scikit-learn.
        # A 1..1000 range with ~1000 iterations repeats seeds many times,
        # which yields duplicated bootstrap samples.
        seed = random.randint(0, 2**32 - 1)

        train_bootstrap_index = resample(range(n_training_rows),
                                         n_samples=round(n_training_rows*2/3),
                                         random_state=seed)
        # Set membership keeps the out-of-bag computation linear in the number of rows.
        in_bag_positions = set(train_bootstrap_index)
        oob_index = [x for x in range(n_training_rows) if x not in in_bag_positions]
        train_bootstrap = df_training.iloc[train_bootstrap_index, :]
        test_bootstrap = df_training.iloc[oob_index, :]

        train_bootstrap_x_data = train_bootstrap[x_colnames_1]
        # Drop features that take a single value in this bootstrap sample.
        train_bootstrap_x_remov_novar = train_bootstrap_x_data.loc[:, train_bootstrap_x_data.std() != 0]

        # Drop features whose std after min-max scaling is below std_cutoff.
        pre_scaler = preprocessing.MinMaxScaler()
        df_training_pre_scaled = pre_scaler.fit_transform(train_bootstrap_x_remov_novar)
        remain_boolean = df_training_pre_scaled.std(axis=0) >= std_cutoff
        colnames_remain = train_bootstrap_x_remov_novar.columns[remain_boolean]
        # colnames_remain is reused below to select the same features from the
        # out-of-bag rows and from the independent cohort.

        training_scaler = preprocessing.MinMaxScaler()
        train_bootstrap_x_bf_scaling = train_bootstrap_x_remov_novar[colnames_remain].to_numpy()
        train_bootstrap_x = training_scaler.fit_transform(train_bootstrap_x_bf_scaling)
        train_bootstrap_y = train_bootstrap[prediction_variable].to_numpy()

        # Out-of-bag rows are scaled with the scaler fitted on the bootstrap sample.
        test_bootstrap_x_bf_scaling = test_bootstrap[colnames_remain].to_numpy()
        test_bootstrap_x = training_scaler.transform(test_bootstrap_x_bf_scaling)
        test_bootstrap_y = test_bootstrap[prediction_variable].to_numpy()

        # Seeding the forest with the logged seed makes every row of the
        # output file reproducible from its `seed` column.
        model = RandomForestClassifier(n_estimators=n_tree, max_depth=n_max_depth,
                                       min_samples_split=n_min_samples_split,
                                       min_samples_leaf=n_min_samples_leaf,
                                       random_state=seed)
        model.fit(train_bootstrap_x, train_bootstrap_y)

        # Result variable 1: accuracy (test_acc)
        test_acc = model.score(test_bootstrap_x, test_bootstrap_y)

        # Probability of the second class in model.classes_.
        # assumes a binary target whose positive label sorts last - TODO confirm
        predictions = model.predict_proba(test_bootstrap_x)[:, 1]
        fpr, tpr, threshold = metrics.roc_curve(test_bootstrap_y, predictions)
        # Result variable 2: roc_auc
        roc_auc = metrics.auc(fpr, tpr)

        precision, recall, thresholds = precision_recall_curve(test_bootstrap_y, predictions)
        # Result variable 3: f1 (probabilities rounded to the nearest integer label)
        f1 = f1_score(test_bootstrap_y, predictions.round())
        # Result variable 4: Precision-Recall AUC
        rp_auc = metrics.auc(recall, precision)

        # Independent dataset with independent scaler
        # NOTE(review): this scaler is fitted on the independent cohort itself,
        # not on the training data, so the features are not on the scale the
        # forest was trained on. Kept as in the original - confirm it is intended.
        ind_scaler = preprocessing.MinMaxScaler()
        data_x_independent_bf_scaling = df_independent[colnames_remain].to_numpy()
        data_x_independent = ind_scaler.fit_transform(data_x_independent_bf_scaling)

        # test_acc_ind
        test_acc_ind = model.score(data_x_independent, data_y_independent)

        predictions_ind = model.predict_proba(data_x_independent)[:, 1]
        fpr_ind, tpr_ind, threshold = metrics.roc_curve(data_y_independent, predictions_ind)
        # roc_auc_ind
        roc_auc_ind = metrics.auc(fpr_ind, tpr_ind)

        precision_ind, recall_ind, thresholds = precision_recall_curve(data_y_independent, predictions_ind)
        # f1_ind
        f1_ind = f1_score(data_y_independent, predictions_ind.round())
        # rp_auc_ind
        rp_auc_ind = metrics.auc(recall_ind, precision_ind)

        perf_file.write('\t'.join([str(i), str(seed), str(test_acc), str(roc_auc), str(f1), str(rp_auc),
                                   str(test_acc_ind), str(roc_auc_ind), str(f1_ind), str(rp_auc_ind)]) + '\n')