# Imports

In [3]:
import csv
import math
import os

import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn import metrics
from sklearn.model_selection import train_test_split

# sys.path.append('interface')

# Allows jupyter notebook to be imported
import jupyter_import

# Suppress Warnings
import warnings
warnings.filterwarnings('ignore')

# Preprocessing

In [5]:
from data_preproc.Preprocess import preprocess, Normalize

importing Jupyter notebook from D:\FYP i guess\I need to do Naive Bayes\Program\Algorithm\data_preproc\Preprocess.ipynb


# Feature Selection

In [6]:
from data_preproc.CFS import cfs_algo
from data_preproc.RFE import rfe_algo
from data_preproc.RR import ridge_algo

importing Jupyter notebook from D:\FYP i guess\I need to do Naive Bayes\Program\Algorithm\data_preproc\CFS.ipynb
importing Jupyter notebook from D:\FYP i guess\I need to do Naive Bayes\Program\Algorithm\data_preproc\RFE.ipynb
importing Jupyter notebook from D:\FYP i guess\I need to do Naive Bayes\Program\Algorithm\data_preproc\RR.ipynb


# Algorithms

## Base Predictors

In [7]:
from pred_mdls.base.Complement_Naive_Bayes import complement_naive_bayes_model
from pred_mdls.base.Decision_Tree import decision_tree_model
from pred_mdls.base.Logistic_Regression import logistic_regression_model
from pred_mdls.base.Multi_Layer_Perceptron import multi_layer_perceptron_model
from pred_mdls.base.Naive_Bayes import naive_bayes_model

importing Jupyter notebook from D:\FYP i guess\I need to do Naive Bayes\Program\Algorithm\pred_mdls\base\Complement_Naive_Bayes.ipynb
importing Jupyter notebook from D:\FYP i guess\I need to do Naive Bayes\Program\Algorithm\pred_mdls\base\Decision_Tree.ipynb
importing Jupyter notebook from D:\FYP i guess\I need to do Naive Bayes\Program\Algorithm\pred_mdls\base\Logistic_Regression.ipynb
importing Jupyter notebook from D:\FYP i guess\I need to do Naive Bayes\Program\Algorithm\pred_mdls\base\Multi_Layer_Perceptron.ipynb
importing Jupyter notebook from D:\FYP i guess\I need to do Naive Bayes\Program\Algorithm\pred_mdls\base\Naive_Bayes.ipynb


## Ensemble Predictors

In [8]:
from pred_mdls.ensemble.Random_Forest import random_forest_model
from pred_mdls.ensemble.Rotation_Forest import rotation_forest_model
from pred_mdls.ensemble.Voting import voting_model

importing Jupyter notebook from D:\FYP i guess\I need to do Naive Bayes\Program\Algorithm\pred_mdls\ensemble\Random_Forest.ipynb
importing Jupyter notebook from D:\FYP i guess\I need to do Naive Bayes\Program\Algorithm\pred_mdls\ensemble\Rotation_Forest.ipynb
importing Jupyter notebook from D:\FYP i guess\I need to do Naive Bayes\Program\Algorithm\pred_mdls\ensemble\Voting.ipynb


# Evaluation Metrics

In [9]:
from pf_eval.AUC_ROC import auc_roc_model
from pf_eval.F1_Score import f1_model
from pf_eval.CSV import write_results
from pf_eval.Confusion_Matrix import confusion_matrix_model

importing Jupyter notebook from D:\FYP i guess\I need to do Naive Bayes\Program\Algorithm\pf_eval\AUC_ROC.ipynb
importing Jupyter notebook from D:\FYP i guess\I need to do Naive Bayes\Program\Algorithm\pf_eval\F1_Score.ipynb
importing Jupyter notebook from D:\FYP i guess\I need to do Naive Bayes\Program\Algorithm\pf_eval\CSV.ipynb
importing Jupyter notebook from D:\FYP i guess\I need to do Naive Bayes\Program\Algorithm\pf_eval\Confusion_Matrix.ipynb


# Additional Functions

In [10]:
def data_conversion(data):
    """Binarise defect labels in place.

    Each element equal to ``b'N'``, ``b'false'`` or ``b'no'`` is overwritten
    with 0; every other element is overwritten with 1.

    Args:
        data: Mutable, integer-indexable sequence of labels.

    Returns:
        The same ``data`` object that was passed in, after modification.
    """
    negative_labels = (b'N', b'false', b'no')
    for position in range(len(data)):
        is_negative = data[position] in negative_labels
        data[position] = 0 if is_negative else 1
    return data

def read_data(filename):
    """Load an ARFF dataset and return its records as a DataFrame.

    Args:
        filename: Whatever ``scipy.io.arff.loadarff`` accepts as its source.

    Returns:
        pandas.DataFrame built from the record array; the ARFF metadata
        returned alongside it is discarded.
    """
    records, _metadata = arff.loadarff(filename)
    return pd.DataFrame(records)

def process_data(loaddata,features):
    """Split a loaded dataset into a metrics array and a label array.

    Args:
        loaddata: DataFrame that contains a ``'Defective'`` column.
        features: Column selector applied to ``loaddata`` to pick the software
            metrics (the original comment says these come from CFS).

    Returns:
        Tuple ``(software_metrics, labels)`` of NumPy arrays.
    """
    metric_columns = loaddata[features]
    defect_column = loaddata['Defective']
    return np.array(metric_columns), np.array(defect_column)

def train_data(software_metrics,labels,test_size=0.1,random_state=None):
    """Split metrics and labels into train and test partitions.

    Args:
        software_metrics: Feature matrix handed to ``train_test_split``.
        labels: Label array handed to ``train_test_split``.
        test_size: Value forwarded as ``test_size``. Defaults to 0.1, the value
            that used to be hard-coded here.
        random_state: Value forwarded as ``random_state``. The default ``None``
            keeps the previous unseeded behaviour; pass an int to make the
            split repeatable between runs.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``; both label arrays are
        converted with ``astype('str')``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        software_metrics, labels, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train.astype('str'), y_test.astype('str')

def evaluate_data(model,X_test,y_test):
    """Score one fitted model with the notebook's evaluation helpers.

    Calls ``auc_roc_model``, ``f1_model`` and ``confusion_matrix_model`` in
    that order, each with ``(model, X_test, y_test)``.

    Returns:
        Tuple ``(auc, f1, fpr, fnr)`` where the last two values are the pair
        returned by ``confusion_matrix_model``.
    """
    auc_value = auc_roc_model(model, X_test, y_test)
    f1_value = f1_model(model, X_test, y_test)
    false_pos_rate, false_neg_rate = confusion_matrix_model(model, X_test, y_test)
    return auc_value, f1_value, false_pos_rate, false_neg_rate

def translate(result):
    """Flatten a three-part result into ``[name, value, value]`` rows.

    Three passes (multiplier 1, 2, 3) are made over ``result[0]``. For the
    entry at position ``p`` and multiplier ``m`` the row is
    ``[result[0][p], result[1][(p+1)*m-1], result[2][(p+1)*m-1]]``.

    NOTE(review): for m > 1 this index strides through the arrays rather than
    reading consecutive blocks of ``len(result[0])`` values - confirm this is
    the intended layout before relying on the output.

    Returns:
        List of ``3 * len(result[0])`` three-element lists.
    """
    return [
        [result[0][pos], result[1][(pos + 1) * step - 1], result[2][(pos + 1) * step - 1]]
        for step in range(1, 4)
        for pos in range(len(result[0]))
    ]

# Result writers

In [11]:
def main_writer(header,result):
    """Write the first two metrics of a single dataset to ``pred_results.csv``.

    For each of the three filter labels the file receives: an empty row, the
    label, ``header``, then one row for ``result[0]`` and one for
    ``result[1]``. Every metric row holds the metric name followed by that
    filter's slice of eight consecutive values.

    Args:
        header: Row written above each filter's values.
        result: Sequence whose first two items are ``(metric_name, values)``
            with ``values`` a list.

    NOTE(review): the slice width is fixed at 8, so this presumably expects
    all eight models to have been evaluated - confirm against the caller.
    """
    filter_labels = ('No filter','CFS','RFE')
    with open('pred_results.csv','w',encoding='UTF8', newline='') as out_file:
        writer = csv.writer(out_file)
        for position, label in enumerate(filter_labels):
            start = position * 8
            stop = start + 8
            # An empty iterable produces a blank separator row
            writer.writerow('')
            writer.writerow([label])
            writer.writerow(header)
            for metric_index in (0, 1):
                metric = result[metric_index]
                writer.writerow([metric[0]] + metric[1][start:stop])
    
# def run(datasets, savename, repository):
def run(datasets, savename, results, model_name, pp_name):
    """Write the metric tables of several datasets to ``csv_results/<savename>.csv``.

    One section is written per metric (the metrics are taken from
    ``results[0]``): the metric name, a header row, then for every dataset
    one row per preprocessing variant. Sections are separated by a blank row.

    Args:
        datasets: Dataset labels; ``datasets[j]`` belongs to ``results[j]``.
        savename: File name without extension; ``'.csv'`` is appended.
        results: Per-dataset sequence of ``(metric_name, values)`` pairs.
            ``values`` is laid out as consecutive groups of ``len(model_name)``
            scores, one group per entry of ``pp_name[j]``.
        model_name: Model names used as header columns.
        pp_name: Per-dataset sequence of preprocessing labels.

    An empty ``results`` yields an empty file instead of an IndexError.
    """
    header = ['Model name'] + list(model_name)
    n = len(model_name)
    out_path = 'csv_results/' + savename + '.csv'
    # open() does not create missing directories, so make sure the target exists
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    metric_count = len(results[0]) if results else 0
    with open(out_path,'w',encoding='UTF8', newline='') as csv_file:
        res = csv.writer(csv_file)
        for k in range(metric_count):
            # Metric name (AUC, F1, FPR, FNR)
            res.writerow([results[0][k][0]])
            res.writerow(header)
            for j, dataset_result in enumerate(results):
                values = dataset_result[k][1]
                for i, pp in enumerate(pp_name[j]):
                    res.writerow([f'{datasets[j]} ({pp})'] + list(values[i*n:(i+1)*n]))
            # Blank separator between metric sections, but not after the last
            if k != metric_count - 1:
                res.writerow([])

# Main

In [12]:
def main_algo_run(filename,fs_res,pred_res,train_res):
    """Train and evaluate the selected models on one ARFF dataset.

    Args:
        filename: Path handed to ``read_data``.
        fs_res: Three truthy/falsy flags selecting the preprocessing variants,
            in the order ``['All', 'CFS', 'RFE']``.
        pred_res: Dict with keys ``'base'`` (five flags) and ``'ensemble'``
            (three flags); a model is selected when its flag equals 1.
        train_res: Dict with string entries ``'tt'`` (second argument of the
            feature-selection functions, default 10 when ``''``) and
            ``'kfold'`` (number of folds, default 5 when ``''``).

    Returns:
        Tuple ``(model_name, pp_name, result)``. ``result`` is a list of four
        ``(metric_name, values)`` tuples - AUC, F1 Score, False Positive Rate,
        False Negative Rate. ``values`` holds the per-fold average rounded to
        three decimals, laid out as one group of ``len(model_name)`` scores
        per entry of ``pp_name``.
    """
    lookup_model = ['Complement Naive Bayes','Decision Tree','Logistic regression','Multi Layer Perceptron','Naive Bayes',
            'Random Forest','Rotation Forest','Voting'] #Models used
    lookup_pp = ['All','CFS','RFE']

    # Read the file
    loaddata = read_data(filename)
    loaddata = Normalize(loaddata)
    SM = np.array(loaddata.iloc[:,:-1]) #Software metrics
    L = data_conversion(np.array(loaddata.iloc[:,-1])).astype(int) #Labels
    data = [SM,L]

    # ===== Feature Selection ====== #
    pp_name = [lookup_pp[i] for i in range(len(fs_res)) if fs_res[i]]

    def feature_selection(fs_res,loaddata,data,train_size,k_fold):
        """Return one preprocessed dataset per selected variant, in fs_res order."""
        pp_arr = []
        feature_funcs = [cfs_algo,rfe_algo]
        if fs_res[0]:
            pp_arr.append(preprocess(loaddata,k_fold))
        for i in range(1,len(fs_res)):
            if fs_res[i]:
                # Only the second return value (the selection) is used
                _,f_selection = feature_funcs[i-1](data,train_size)
                pp_arr.append(preprocess(loaddata,k_fold,f_selection))
        return pp_arr

    train_size = int(train_res['tt']) if train_res['tt'] != '' else 10
    k_fold = int(train_res['kfold']) if train_res['kfold'] != '' else 5

    pp_arr = feature_selection(fs_res,loaddata,data,train_size,k_fold)

    # ===== Model Selection ====== #
    base_preds = [i for i,pred in enumerate(pred_res['base']) if pred == 1]
    ensemble_preds = [i for i,pred in enumerate(pred_res['ensemble']) if pred == 1]

    # Ensemble names start at offset 5 in lookup_model, after the five base models
    model_name = [lookup_model[index] for index in base_preds] + [lookup_model[index+5] for index in ensemble_preds]
    n_models = len(model_name)

    def model_creation(base_preds,ensemble_preds,data):
        """Build the selected models from ``data``, base models first."""
        models = []
        args = [1000]
        base_funcs = [
            complement_naive_bayes_model,
            decision_tree_model,
            logistic_regression_model,
            multi_layer_perceptron_model,
            naive_bayes_model
        ]
        ensemble_funcs = [
            random_forest_model,
            rotation_forest_model,
            voting_model
        ]
        for index in base_preds:
            models.append(base_funcs[index](data))
        for index in ensemble_preds:
            models.append(ensemble_funcs[index](data,args))
        return models

    arr_size = n_models*len(pp_name) #Result array size
    auc_arr = [0]*arr_size
    f1_arr = [0]*arr_size
    fpr_arr = [0]*arr_size
    fnr_arr = [0]*arr_size

    # Use the configured fold count for both the loop and the average; it was
    # previously fixed at 5 regardless of train_res['kfold'].
    # NOTE(review): assumes preprocess() returns exactly k_fold folds - confirm.
    folds = k_fold
    for j,pp in enumerate(pp_arr):
        for i in range(folds):
            # Elements 0 and 2 of a fold build the models, 1 and 3 evaluate
            # them - presumably X_train/y_train and X_test/y_test; verify
            # against preprocess().
            fold_data = [pp[i][0],pp[i][2]]
            models = model_creation(base_preds,ensemble_preds,fold_data)

            for k,model in enumerate(models):
                auc_score,f1_score,fpr,fnr = evaluate_data(model,pp[i][1],pp[i][3])
                if math.isnan(auc_score):
                    # A NaN AUC counts as 0 so it cannot poison the average
                    auc_score = 0
                slot = (j*n_models)+k
                auc_arr[slot] += auc_score
                f1_arr[slot] += f1_score
                fpr_arr[slot] += fpr
                fnr_arr[slot] += fnr

    # Turn the per-fold sums into rounded averages
    for totals in (auc_arr,f1_arr,fpr_arr,fnr_arr):
        for i in range(len(totals)):
            totals[i] = round(totals[i]/folds,3)

    result = [
        ('AUC', auc_arr),
        ('F1 Score', f1_arr),
        ('False Positive Rate', fpr_arr),
        ('False Negative Rate', fnr_arr),
    ]
    return model_name,pp_name,result
      
if __name__=='__main__':
    N_filenames = ['CM1.arff','JM1.arff','KC1.arff','KC3.arff',
                   'KC4.arff','MC1.arff','MC2.arff','MW1.arff',
                   'PC1.arff','PC2.arff','PC3.arff','PC4.arff','PC5.arff']
    P_filenames = ['cm1.arff','jm1.arff','kc1.arff','kc2.arff','pc1.arff']

    # Default selections: every preprocessing variant, every model, and the
    # built-in defaults for the train size and fold count ('' means default).
    # NOTE(review): these defaults are a guess at the intended full run -
    # adjust them if only a subset should be evaluated.
    fs_res = [True, True, True]
    pred_res = {'base': [1, 1, 1, 1, 1], 'ensemble': [1, 1, 1]}
    train_res = {'tt': '', 'kfold': ''}

    # One combined CSV per repository. run() appends '.csv' itself, so only the
    # bare repository name is passed as the save name.
    for repository, filenames in (('NASA', N_filenames), ('PROMISE', P_filenames)):
        results = []
        pp_names = []
        model_name = []
        for ds in filenames:
            model_name, pp_name, result = main_algo_run(
                'datasets/' + repository + '/' + ds + '.txt', fs_res, pred_res, train_res)
            results.append(result)
            pp_names.append(pp_name)
        run(filenames, repository, results, model_name, pp_names)

    #========== Running main program =========#
    model_name, pp_name, result = main_algo_run('datasets/NASA/CM1.arff.txt', fs_res, pred_res, train_res)
    main_writer(['Model Name'] + model_name, result)

Please select which models you would like to use by inputting the numbers specified beside them.
To make multiple selections, seperate the numbers by spaces.
1. Complement Naive Bayes
2. Decision Tree
3. Logistic regression
4. Multi Layer Perceptron
5. Naive Bayes
6. Random Forest
7. Rotation Forest
8. Voting
1 3 2
[('Complement Naive Bayes', 0), ('Decision Tree', 1), ('Logistic regression', 2)]
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
datasets/NASA/CM1.arff.txt
