#Import libraries

In [None]:
import numpy as np
import pandas as pd
import random
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

#Import and process dataset

In [None]:
# U: pandas dataframe holding the dataset (space-separated file, first row = header)
U=pd.read_csv("/content/australian - Copy.dat", sep=' ')
# Only the first 30 rows are kept (presumably to keep the pairwise
# computations below fast -- TODO confirm).
U=U[:30]
# C: list of candidate feature names; features are removed from it as they are selected
C=list(U.columns)
# D: target variable column name (the last column of the file)
D=C.pop()
# g: integer coefficient multiplying the value difference inside the
# exponential in f(). It is drawn at random and not seeded, so results can
# differ from run to run.
g=random.randint(1,256)
# S: list of features in descending order of dependency degree (filled in later)
S=[]
# T: distinct values of the target column
T=list(set(U[D]))
# y: target variable used in cross-validation of classification accuracy.
# Taken from D rather than a hard-coded column name so it always matches the
# column that was popped off the feature list above.
y=U[D]

#Fuzzy dominance relation value

In [None]:
def f(x,y,a):
  """Return the fuzzy dominance degree of row x over row y on attribute a.

  The value is the logistic function 1/(1+exp(-g*(v1-v2))), where v1 and v2
  are the values of column `a` in the global dataframe U at positional rows
  `x` and `y`, and `g` is the global coefficient.

  x, y: positional (iloc) row numbers in U.
  a: column label in U.
  Returns a float in [0, 1].
  """
  v1=U.iloc[x][a]
  v2=U.iloc[y][a]
  z=g*(v1-v2)
  # Evaluate the logistic in the form whose exponent is never positive, so
  # math.exp cannot overflow. For a very negative z this yields the limit
  # value 0.0 instead of the previous math.inf sentinel, which was not a
  # valid membership degree.
  if z>=0:
    return 1/(1+math.exp(-z))
  e=math.exp(z)
  return e/(1+e)

#Upward dependence degree

In [None]:
def fp(P,x,y):
  """Return the smallest value of f(x, y, attr) over the attributes in P.

  P: iterable of column labels.
  x, y: row numbers forwarded unchanged to f.
  Raises ValueError when P is empty (min of an empty sequence).
  """
  return min(f(x,y,attr) for attr in P)

def gamma(P):
  """Return the upward dependency degree of the target on feature subset P.

  For every threshold i in 1..len(T)-1 the rows of the global dataframe U
  are split into an upper group (target >= i) and a lower group
  (target < i). Each upper row x contributes the minimum over all lower
  rows y of 1 - fp(P, x, y); the contributions of all upper rows of all
  thresholds are added up and divided by the total number of upper rows.

  P: list of column labels of U.
  Returns a float; 0.0 when there is no upper row at all (for example when
  T holds fewer than two values).

  Notes:
  - Index labels of U are handed to fp as row numbers, which matches
    positional access only while U keeps its default 0..n-1 index.
  - The thresholds assume the target values are the integers 0..len(T)-1.
  """
  total=0
  count=0
  target=U[D]
  for i in range(1,len(T)):
    upper=U[target>=i]
    # The lower group does not depend on x, so it is computed once per threshold.
    lower_idx=U[target<i].index
    count+=len(upper)
    for x in upper.index:
      degrees=[]
      for y in lower_idx:
        pos=fp(P,x,y)
        # An infinite value (overflow sentinel) is passed through unchanged
        # so that it never wins the min against a real degree.
        degrees.append(pos if pos==math.inf else 1-pos)
      # Accumulate the contribution of every upper row, not only the last
      # one. With no lower row to compare against, the row counts as fully
      # consistent (1.0).
      total+=min(degrees,default=1.0)
  if count==0:
    return 0.0
  # Normalise only after all thresholds have been processed.
  return total/count

#Feature Selection Algorithm

In [None]:
# Greedy forward selection: in every round, score each remaining candidate
# by the dependency degree of the already selected features plus that
# candidate, then move the best-scoring candidate from C to the end of S.
while C:
  scores={feature: gamma(S+[feature]) for feature in C}
  best=max(scores, key=scores.get)
  S=S+[best]
  C.remove(best)

print("List of features in descending order of dependency degree:\n", S)

List of features in descending order of dependency degree:
 ['F', 'A', 'B', 'C', 'D', 'E', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N']


#Cross-validating Classification Accuracy

In [None]:
# Evaluate growing prefixes of the ranked feature list S with 5-fold
# cross-validation of a random forest, and remember the mean accuracy of
# each prefix.
S_list=[]
accuracy_dict={}
# The splitter is seeded, so every feature subset is scored on the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for i in S:
  S_list.append(i)
  X = U[S_list]
  scores = []
  for train_index, test_index in kf.split(X):
    # Train and score on the rows of the current fold; the indices returned
    # by the splitter are positional, hence iloc.
    rf = RandomForestClassifier()
    rf.fit(X.iloc[train_index], y.iloc[train_index])
    y_pred = rf.predict(X.iloc[test_index])
    scores.append(accuracy_score(y.iloc[test_index], y_pred))
  mean_accuracy = round(np.mean(scores), 2)
  print(S_list)
  print("Mean accuracy: ", mean_accuracy)
  print()
  accuracy_dict[tuple(S_list)]=mean_accuracy
# Highest mean accuracy wins; ties are resolved by comparing the feature
# tuples themselves (the largest tuple is chosen).
S_max=max(zip(accuracy_dict.values(), accuracy_dict.keys()))[1]
print("\nThe best set of features with highest classification accuracy is:\n", S_max)

['F']
Mean accuracy:  0.67

['F', 'A']
Mean accuracy:  0.67

['F', 'A', 'B']
Mean accuracy:  0.6

['F', 'A', 'B', 'C']
Mean accuracy:  0.17

['F', 'A', 'B', 'C', 'D']
Mean accuracy:  0.5

['F', 'A', 'B', 'C', 'D', 'E']
Mean accuracy:  0.57

['F', 'A', 'B', 'C', 'D', 'E', 'G']
Mean accuracy:  0.47

['F', 'A', 'B', 'C', 'D', 'E', 'G', 'H']
Mean accuracy:  0.7

['F', 'A', 'B', 'C', 'D', 'E', 'G', 'H', 'I']
Mean accuracy:  1.0

['F', 'A', 'B', 'C', 'D', 'E', 'G', 'H', 'I', 'J']
Mean accuracy:  0.87

['F', 'A', 'B', 'C', 'D', 'E', 'G', 'H', 'I', 'J', 'K']
Mean accuracy:  1.0

['F', 'A', 'B', 'C', 'D', 'E', 'G', 'H', 'I', 'J', 'K', 'L']
Mean accuracy:  0.83

['F', 'A', 'B', 'C', 'D', 'E', 'G', 'H', 'I', 'J', 'K', 'L', 'M']
Mean accuracy:  1.0

['F', 'A', 'B', 'C', 'D', 'E', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N']
Mean accuracy:  1.0


The best set of features with highest classification accuracy is:
 ('F', 'A', 'B', 'C', 'D', 'E', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N')
