# Imports

In [10]:
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Preprocessing

In [11]:
#========== IMPORTS =============#
# Allows jupyter notebook to be imported
import jupyter_import
from Algorithm.data_preproc.Preprocess import preprocess
#================================#

# Feature Selection

In [8]:
from Algorithm.data_preproc.CFS import cfs_algo
from Algorithm.data_preproc.RFE import rfe_algo
from Algorithm.data_preproc.RR import ridge_algo

# Algorithms

## Base Predictors

In [9]:
from Algorithm.pred_mdls.base.Complement_Naive_Bayes import complement_naive_bayes_model
from Algorithm.pred_mdls.base.Decision_Tree import decision_tree_model
from Algorithm.pred_mdls.base.Logistic_Regression import logistic_regression_model
from Algorithm.pred_mdls.base.Multi_Layer_Perceptron import multi_layer_perceptron_model
from Algorithm.pred_mdls.base.Naive_Bayes import naive_bayes_model

## Ensemble Predictors

In [12]:
from Algorithm.pred_mdls.ensemble.Random_Forest import random_forest_model
from Algorithm.pred_mdls.ensemble.Rotation_Forest import rotation_forest_model
from Algorithm.pred_mdls.ensemble.Voting import voting_model

In [66]:
def data_conversion(data):
    """Encode defect labels in place as integers.

    Every position whose value equals ``b'N'`` is overwritten with ``0``;
    every other value is overwritten with ``1``.

    Parameters
    ----------
    data : mutable sequence
        Container supporting ``len()``, item read and item assignment by
        integer position. It is modified in place.

    Returns
    -------
    The same ``data`` object that was passed in, after modification.
    """
    for position in range(len(data)):
        # Anything other than the byte string b'N' is encoded as 1.
        data[position] = 0 if data[position] == b'N' else 1
    return data

def read_data(filename):
    """Load an ARFF file into a pandas DataFrame.

    Parameters
    ----------
    filename :
        Passed unchanged to ``scipy.io.arff.loadarff``.

    Returns
    -------
    pandas.DataFrame
        Built from the first element of the ``(data, meta)`` pair returned
        by ``loadarff``; the metadata element is discarded.
    """
    records, _metadata = arff.loadarff(filename)
    return pd.DataFrame(records)

def process_data(loaddata, features, label_column='Defective'):
    """Split a loaded dataset into a metrics array and a label array.

    Parameters
    ----------
    loaddata : pandas.DataFrame
        Dataset that contains both the metric columns and the label column.
    features :
        Column selection applied as ``loaddata[features]``. The original note
        on this function says the features are chosen by CFS; nothing here
        enforces that.
    label_column : str, default 'Defective'
        Name of the column holding the labels. The default keeps the
        previously hard-coded column name, so existing two-argument calls
        behave exactly as before.

    Returns
    -------
    tuple of numpy.ndarray
        ``(software_metrics, labels)`` as new NumPy arrays.

    Raises
    ------
    KeyError
        Raised by pandas when ``features`` or ``label_column`` is not present
        in ``loaddata``.
    """
    software_metrics = np.array(loaddata[features])
    labels = np.array(loaddata[label_column])
    return software_metrics, labels

def train_data(software_metrics, labels, test_size=0.1, random_state=None):
    """Split metrics and labels into train and test parts.

    Parameters
    ----------
    software_metrics :
        Feature matrix, forwarded to ``train_test_split``.
    labels :
        Target values, forwarded to ``train_test_split``. Must expose
        ``astype`` after splitting (NumPy arrays and pandas Series do).
    test_size : float or int, default 0.1
        Forwarded to ``train_test_split``. The default keeps the previously
        hard-coded 10 % hold-out.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``. ``None`` reproduces the original
        unseeded behaviour; pass an integer to make the split reproducible
        across runs.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. Both label parts are cast
        with ``astype('str')``, so callers receive string labels.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        software_metrics,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
    # Labels are handed back as strings, exactly as before this change.
    return X_train, X_test, y_train.astype('str'), y_test.astype('str')

def feature_selection(fs, X, n_features=10):
    """Rank features by selector score and return the best ones.

    Parameters
    ----------
    fs :
        Object exposing a ``scores_`` attribute with one score per column
        of ``X`` (presumably a fitted scikit-learn selector; only
        ``scores_`` is read here).
    X : pandas.DataFrame
        Frame whose ``columns`` supply the feature names; the scores are
        paired with them by position.
    n_features : int, default 10
        How many top-scoring features to return. The default keeps the
        previously hard-coded value of ten.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Feature'`` and ``'Score'``, sorted by descending score
        and limited to ``n_features`` rows. The index holds each feature's
        original column position.
    """
    dfscores = pd.DataFrame(fs.scores_)
    dfcolumns = pd.DataFrame(X.columns)
    # Missing scores are treated as 0 so they sort last instead of
    # propagating NaN into the ranking.
    feature_scores = pd.concat([dfcolumns, dfscores.fillna(0)], axis=1)
    feature_scores.columns = ['Feature', 'Score']
    return feature_scores.nlargest(n_features, 'Score')

# def feature_selection_algo(X,Y):
#     array = []
#     data = [X,Y]
#     cfs = cfs_algo(data)
#     ridge = ridge_algo(data)
#     print(f'Ridge score: {ridge.get_support()}')
    # rfe = rfe_algo(data)
    # print(f'RFE score: {rfe.score}')

def main(filename):
    """Run the feature-selection experiments and preprocessing for one file.

    Steps, in execution order:

    1. Load ``filename`` with ``read_data``. Every column but the last is
       treated as a software metric and the last column as the label.
    2. Split with ``train_data``, pass the training part to ``cfs_algo`` and
       print the best-scoring features reported by ``feature_selection``.
    3. Rebuild metrics and labels as NumPy arrays (labels encoded by
       ``data_conversion``), split again, pass the new training part to
       ``ridge_algo`` and ``rfe_algo``, and print the RFE ranking.
    4. Call ``preprocess(filename)``.

    Nothing is returned; results are only printed. The original draft left
    ``return preprocessed_data`` disabled, with a placeholder for a later
    "Algorithms" stage.
    """
    dataset = read_data(filename)

    # ---- CFS: uses the pandas objects so column names stay available ----
    metrics_frame = dataset.iloc[:, :-1]
    label_series = dataset.iloc[:, -1]
    cfs_X_train, _cfs_X_test, cfs_y_train, _cfs_y_test = train_data(
        metrics_frame, label_series
    )
    cfs_selector = cfs_algo([cfs_X_train, cfs_y_train])
    print(feature_selection(cfs_selector, metrics_frame))

    # ---- Ridge / RFE: plain arrays, labels mapped to 0/1 first ----
    metrics_array = np.array(dataset.iloc[:, :-1])
    encoded_labels = data_conversion(np.array(dataset.iloc[:, -1]))
    # Second, separate call to train_data: this is a new split, not the one
    # used for CFS above. train_data also casts the labels to str.
    X_train, _X_test, y_train, _y_test = train_data(metrics_array, encoded_labels)
    estimator_inputs = [X_train, y_train]

    # Return value is currently unused; the call is kept as in the original.
    ridge_algo(estimator_inputs)

    rfe_selector = rfe_algo(estimator_inputs)
    print(rfe_selector.ranking_)

    # Preprocessing result is neither used nor returned yet.
    preprocess(filename)

# Dataset to analyse. This is a relative path, so it is resolved against the
# kernel's current working directory.
filename = 'KC4.arff.txt'
# Runs the whole pipeline; main() prints its results and has no return value.
main(filename)

 37 38] are constant.
  f = msb / msw


                            Feature      Score
27             MAINTENANCE_SEVERITY  30.962073
2                        CALL_PAIRS  23.967034
30                       NODE_COUNT  14.646462
12                       EDGE_COUNT  14.167501
1                      BRANCH_COUNT  10.361136
6             CYCLOMATIC_COMPLEXITY  10.361136
31  NORMALIZED_CYLOMATIC_COMPLEXITY   8.996847
11                   DESIGN_DENSITY   8.764292
10                DESIGN_COMPLEXITY   6.430053
14                ESSENTIAL_DENSITY   1.074798


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

[38  4  3 26 24 23  9  1 20 14  6  1 10  2  8 28 30 32 34 36 37 35 33 31
 29 27 25  1 16 17  7  5 18 19 21 22 13 11 15 12]
