# In [4]:
import numpy as np
import pandas as pd
# Load the dataset and name the column that is used as the class label.
df = pd.read_csv("../dataset/poi-1.5.csv")
label = 'fault_prone'

# Positional split: every column but the last as inputs, the last as target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Names of all columns except the label column.
features = list(df.columns)
features.remove(label)
from scipy.stats import pointbiserialr
from math import sqrt

def getMerit(subset, label, data=None):
    """Return the CFS merit of a feature subset.

    The merit is ``k * rcf / sqrt(k + k * (k - 1) * rff)`` where ``k`` is the
    number of features in the subset, ``rcf`` is the mean absolute
    point-biserial correlation between each feature and the label, and
    ``rff`` is the mean absolute pairwise correlation between the features.

    Args:
        subset: List of column names forming the candidate subset.
        label: Name of the label column.
        data: DataFrame holding the feature and label columns. When omitted,
            the module-level ``df`` is used.

    Returns:
        The merit as a float. For a single feature this is simply its
        absolute correlation with the label.
    """
    frame = df if data is None else data
    subset = list(subset)
    k = len(subset)

    # average feature-class correlation
    rcf = np.mean(
        [abs(pointbiserialr(frame[label], frame[feature])[0]) for feature in subset]
    )

    # A single feature has no feature-feature pairs. Averaging the empty set
    # of pairs would give NaN and turn the whole merit into NaN, so the
    # formula is reduced by hand: k * rcf / sqrt(k) with k == 1.
    if k == 1:
        return float(rcf)

    # average feature-feature correlation over the strict upper triangle,
    # i.e. every unordered pair once and without the diagonal. Working on a
    # NumPy copy avoids writing into the DataFrame's underlying buffer.
    corr = frame[subset].corr().abs().to_numpy()
    # nanmean ignores undefined pair correlations instead of propagating them.
    rff = np.nanmean(corr[np.triu_indices(k, 1)])

    return (k * rcf) / sqrt(k + k * (k - 1) * rff)
# Seed for the subset search: the single feature whose absolute
# point-biserial correlation with the label is the largest.
best_feature = ''
best_value = -1
for feature in features:
    abs_coeff = abs(pointbiserialr(df[label], df[feature]).correlation)
    # Strict comparison: on ties the feature seen first is kept.
    if abs_coeff > best_value:
        best_feature, best_value = feature, abs_coeff

print("Feature %s with merit %.4f"%(best_feature, best_value))
class PriorityQueue:
    """List-backed max-priority queue whose items are compared as sets.

    Two items count as the same entry when ``set(a) == set(b)``, so the
    order of elements inside an item does not matter.
    """

    def __init__(self):
        # Entries are (item, priority) tuples in insertion order.
        self.queue = []

    def isEmpty(self):
        """Return True when the queue holds no entries."""
        return not self.queue

    def push(self, item, priority):
        """Insert ``item``, or raise the priority of its existing entry.

        - item not queued yet: append it.
        - item queued with a priority >= ``priority``: leave the queue as is.
        - item queued with a smaller priority: drop the old entry and append
          the new one at the end.
        """
        position = next(
            (
                idx
                for idx, (existing, _) in enumerate(self.queue)
                if set(existing) == set(item)
            ),
            None,
        )
        if position is None:
            self.queue.append((item, priority))
            return
        if self.queue[position][1] >= priority:
            return
        del self.queue[position]
        self.queue.append((item, priority))

    def pop(self):
        """Remove and return the ``(item, priority)`` entry with the highest priority.

        On ties the entry closest to the front of the list wins. Popping an
        empty queue raises IndexError.
        """
        top = 0
        for idx in range(1, len(self.queue)):
            if self.queue[top][1] < self.queue[idx][1]:
                top = idx
        return self.queue.pop(top)
# Best-first search over feature subsets, guided by the merit from getMerit.
queue = PriorityQueue()

# push first tuple (subset, merit)
queue.push([best_feature], best_value)

# Subsets that have already been evaluated. The list keeps them in evaluation
# order; the companion set of frozensets gives an order-independent
# membership test without rescanning the list.
visited = [[best_feature]]
visited_keys = {frozenset(visited[0])}

# Best subset found so far; starts as the seed so it is always defined.
best_subset = [best_feature]

# counter for backtracks
n_backtrack = 0

# limit of backtracks
max_backtrack = 5

# repeat until queue is empty
# or the maximum number of backtracks is reached
while not queue.isEmpty():
    # get element of queue with highest merit
    subset, priority = queue.pop()

    # check whether the priority of this subset
    # is higher than the current best subset
    if priority < best_value:
        n_backtrack += 1
    else:
        best_value = priority
        best_subset = subset

    # goal condition
    if n_backtrack == max_backtrack:
        break

    # iterate through all features and look if one can
    # increase the merit
    for feature in features:
        # A feature that is already part of the subset must not be added
        # again: the result would contain a duplicated column.
        if feature in subset:
            continue

        temp_subset = subset + [feature]
        key = frozenset(temp_subset)

        # check if this subset has already been evaluated
        if key in visited_keys:
            continue

        # if not, mark it as visited ...
        visited.append(temp_subset)
        visited_keys.add(key)
        # ... compute merit
        merit = getMerit(temp_subset, label)
        # An undefined merit cannot be ranked (every comparison with NaN is
        # False), so such a subset is not queued.
        # NOTE(review): presumably this arises for constant columns - confirm.
        if np.isnan(merit):
            continue
        # and push it to the queue
        queue.push(temp_subset, merit)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn import metrics

# Evaluate a 5-nearest-neighbours classifier on the selected feature subset
# with 10-fold cross-validation.
X = df[best_subset].to_numpy()
Y = df[label].to_numpy()

# X and Y are passed directly instead of being assigned to `features` and
# `label`: those names hold the feature-name list and the label column name
# used by the selection code above and must not be overwritten.
# NOTE(review): the inputs are not standardised although StandardScaler is
# imported - confirm whether scaling was intended for this distance-based model.
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_validate(
    estimator=knn,  # model to evaluate
    X=X,  # input features
    y=Y,  # output labels
    cv=10,  # how many folds
    # list of model evaluation metrics
    scoring=['accuracy', 'precision', 'recall'],
)
print('K-NN Model After Feature Selection  For Software Fault Prediction on poi-1.5 Dataset')
scores = pd.DataFrame(scores)
# Mean of every timing/metric column over the folds. Printed explicitly so
# the summary also appears when this runs outside a notebook.
print(scores.mean().round(4))

# Output:
# Feature npm with merit 0.3155
# K-NN Model After Feature Selection  For Software Fault Prediction on poi-1.5 Dataset
#
# fit_time          0.0014
# score_time        0.0105
# test_accuracy     0.7505
# test_precision    0.8072
# test_recall       0.7652
# dtype: float64