<a href="https://colab.research.google.com/github/XuyangAbert/EUFSFC/blob/master/example_eufsfc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import math
import pandas as pd
import numpy.matlib as b
from sklearn.preprocessing import normalize
import time
from entropy_estimators import *

In [None]:
class data_load(object):
  """Load a header-less CSV file and min-max normalise its feature columns.

  The last CSV column is taken as the label; every other column is a feature.

  Attributes set by ``__init__``:
    df: the raw ``pandas.DataFrame`` read from ``path + file``.
    dim: number of feature columns (total columns minus one).
    labels: values of the last column.
    data: raw feature values as a NumPy array.
    normalized_data: ``data`` with every column rescaled to ``[0, 1]``.

  An instance can also be unpacked as
  ``normalized_data, labels, data = data_load(path, file)``.
  """

  def __init__(self, path, file):
    # `path` and `file` are concatenated verbatim, so `path` must already
    # end with a path separator.
    self.df = pd.read_csv(path+file, header = None)
    [N, L] = np.shape(self.df)
    self.dim = L - 1
    self.labels = self.df.iloc[:, L-1].values
    self.data = self.df.iloc[:, 0:self.dim].values
    self.normalized_data = self.preprocess(self.data)

  def __iter__(self):
    """Yield ``normalized_data``, ``labels`` and ``data`` for tuple unpacking."""
    yield self.normalized_data
    yield self.labels
    yield self.data

  def preprocess(self, data=None):
    """Return a min-max scaled copy of ``data`` (column-wise, range ``[0, 1]``).

    Args:
      data: 2-D array-like of numeric values; defaults to ``self.data``.

    Returns:
      A float array with the same shape as ``data``. Columns whose values are
      all identical (including all-zero columns) are mapped to zeros, which
      avoids a division by zero.
    """
    if data is None:
        data = self.data
    values = np.asarray(data, dtype=float)
    scaled = np.zeros(values.shape)
    if values.size == 0:
        return scaled
    col_min = values.min(axis=0)
    col_range = values.max(axis=0) - col_min
    # Only columns that actually vary can be rescaled.
    varying = col_range > 0
    scaled[:, varying] = (values[:, varying] - col_min[varying]) / col_range[varying]
    return scaled

In [None]:
class feature_similarity(object):
  """Pairwise feature-to-feature distance computations.

  Two distance flavours are offered: one for continuous features, based on
  variances and correlation (``kld_cal``), and one for discrete features,
  based on mutual information and entropy (``sym_cal``).
  """

  def __init__(self):
    pass

  def distribution_est(self, data, dim):
    """Return the per-column mean and standard deviation of ``data``.

    Args:
      data: 2-D array indexed as ``data[:, i]``.
      dim: number of leading columns to summarise.

    Returns:
      ``(DC_mean, DC_std)``, two arrays of length ``dim``.
    """
    DC_mean = np.zeros(dim)
    DC_std = np.zeros(dim)
    for i in range(dim):
        TempClass = data[:,i]
        DC_mean[i] = np.mean(TempClass)
        DC_std[i] = np.std(TempClass)
    return DC_mean,DC_std

  def feature_dist_cont(self, DC_means, DC_std, data, Var, dim, Corr):
    """Build the symmetric distance matrix for continuous features.

    Args:
      DC_means, DC_std: unused; kept for interface compatibility.
      data: forwarded to ``kld_cal`` (which does not read it).
      Var: per-feature variances.
      dim: number of features.
      Corr: ``dim x dim`` correlation matrix.

    Returns:
      ``(DisC, Dist)`` where ``DisC`` is the ``dim x dim`` distance matrix and
      ``Dist`` lists the upper-triangle entries, diagonal included, row by row.
    """
    DisC = np.zeros((dim,dim))
    Dist = []
    for i in range(dim):
        for j in range(i,dim):
            DisC[i,j] = self.kld_cal(data,i,j,Var,Corr)
            DisC[j,i] = DisC[i,j]
            Dist.append(DisC[i,j])
    return DisC,Dist

  def feature_dist_disc(self, data, dim):
    """Build the symmetric distance matrix for discrete features.

    Args:
      data: 2-D array of discrete feature values.
      dim: number of features.

    Returns:
      ``(DisC, Dist)`` with the same layout as ``feature_dist_cont``.
    """
    DisC = np.zeros((dim,dim))
    Dist = []
    for i in range(dim):
        for j in range(i,dim):
            # sym_cal only needs the data and the two column indices.
            DisC[i,j] = self.sym_cal(data,i,j)
            DisC[j,i] = DisC[i,j]
            Dist.append(DisC[i,j])
    return DisC,Dist

  def kld_cal(self, data,i,j,Var,Corr):
    """Return the distance between continuous features ``i`` and ``j``.

    With variances ``V1``, ``V2`` and correlation ``P`` the result is
    ``(V1 + V2 - sqrt((V1 + V2)**2 - 4*V1*V2*(1 - P**2))) / (V1 + V2)``,
    i.e. twice the smallest eigenvalue of the 2x2 covariance matrix divided
    by its trace. It is 0 when ``|P| == 1``.

    ``data`` is not used. No guard is applied when ``V1 + V2`` is zero.
    """
    Var1 = Var[i]
    Var2 = Var[j]
    P = Corr[i,j]
    Sim = Var1 + Var2 - ((Var1 + Var2)**2 - 4 * Var1 * Var2 * (1 - P**2))**0.5
    D_KL = Sim / (Var1 + Var2)
    return D_KL

  @staticmethod
  def sym_cal(data,i,j):
    """Return ``1 - 2*I(i;j) / (H(i) + H(j))`` for discrete features.

    ``midd`` and ``entropyd`` come from the ``entropy_estimators`` star import
    (presumably discrete mutual information and entropy -- confirm against
    that package). No guard is applied when both entropies are zero.

    Declared static so it works both as ``self.sym_cal(data, i, j)`` and as
    ``feature_similarity.sym_cal(data, i, j)``.
    """
    I_ij = midd(data[:,i],data[:,j])
    H_I = entropyd(data[:,i])
    H_J = entropyd(data[:,j])
    D_KL = 1 - 2*(I_ij)/(H_I + H_J)
    return D_KL

In [None]:
def fitness_cal(DisC, DC_means, DC_std, data, StdF, gamma):
    """Score every feature by its kernel-weighted closeness to all others.

    For feature ``i`` the score is the sum over every ``j != i`` of
    ``exp(-DisC[i, j]**2 / StdF) ** gamma``.

    Args:
        DisC: square matrix of pairwise feature distances.
        DC_means: only its length (the feature count) is used.
        DC_std: unused.
        data: unused.
        StdF: kernel width dividing the squared distance.
        gamma: exponent applied to each kernel value.

    Returns:
        A 1-D array holding one score per feature.
    """
    count = len(DC_means)
    scores = np.zeros(count)
    for row in range(count):
        acc = 0
        for col in range(count):
            if col == row:
                # A feature does not contribute to its own score.
                continue
            kernel = math.exp(-(DisC[row, col]**2) / StdF)
            acc = acc + kernel**gamma
        scores[row] = acc
    return scores

In [None]:
class feature_clustering(object):
  """Density-peak style clustering of features from a pairwise distance matrix.

  Workflow: ``pseduo_peaks`` finds initial pseudo-cluster centres,
  ``pseduo_evolve`` repeatedly calls ``pseduo_merge`` until the number of
  centres stops shrinking. Features are always identified by their column
  index in the distance matrix.
  """

  def __init__(self):
    pass

  def pseduo_peaks(self, disc, dist, dc_mean, dc_std, data, fitness, stdf, gamma, var1):
    """Find the initial pseudo-cluster centres (fitness peaks).

    Repeatedly takes the feature with the highest current fitness as a peak,
    claims every not-yet-claimed feature within ``0.15 * max(dist)`` of it,
    and damps the fitness of the claimed features (``sharing``). Stops once
    the number of claimed features reaches the number of features.

    Args:
      disc: square matrix of pairwise feature distances.
      dist: flat collection of distances; only its maximum is used.
      dc_mean, dc_std: per-feature mean and standard deviation.
      data, var1: forwarded to ``neighborsearch`` (which ignores them).
      fitness: per-feature fitness values; not modified.
      stdf, gamma: unused.

    Returns:
      ``(peakindices, pfitness, c_indices)``: the list of peak feature
      indices, the fitness each peak had when selected, and for every feature
      the index of its closest peak.
    """
    # Work on a private copy so the caller's fitness array stays intact.
    fitn = np.array(fitness, dtype=float)
    total = len(fitn)
    peakindices = []
    pfitness = []
    if total == 0:
        return peakindices, pfitness, np.arange(0)
    # Temporary sample space made of (mean, std) pairs, one row per feature.
    sample = np.vstack((dc_mean, dc_std)).T
    neirad = 0.15 * max(dist)
    marked = set()  # features already claimed by some peak
    assigned = 0
    while True:
        peak = int(np.argmax(fitn))
        peakindices.append(peak)
        pfitness.append(np.max(fitn))
        indices = self.neighborsearch(disc, data, sample, peak,
                                      marked, neirad, var1)
        if len(indices) == 0:
            # Every neighbour is already claimed: the peak stands alone.
            indices = [peak]
        assigned = assigned + len(indices)
        marked.update(indices)
        # Fitness proportionate sharing
        fitn = self.sharing(fitn, indices)
        # Stop once every feature has been assigned a pseudo-cluster
        if assigned >= total:
            break
    c_indices = self.close_fcluster(peakindices, disc, np.shape(disc)[0])
    return peakindices, pfitness, c_indices

  def neighborsearch(self, disC, data, sample, p_indice, marked, radius, var1):
    """Return unclaimed features lying within ``radius`` of ``p_indice``.

    Args:
      disC: square matrix of pairwise feature distances.
      sample: only its row count (the feature count) is used.
      p_indice: index of the peak feature.
      marked: container of already-claimed feature indices.
      radius: inclusive distance threshold.
      data, var1: unused.

    Returns:
      List of feature indices in ascending order.
    """
    cluster = []
    for i in range(np.shape(sample)[0]):
        if i not in marked:
            if disC[i, p_indice] <= radius:
                cluster.append(i)
    return cluster

  def close_fcluster(self, fcluster, disc, dim):
    """Label each of the first ``dim`` features with its closest centre.

    Args:
      fcluster: non-empty sequence of centre feature indices.
      disc: square matrix of pairwise feature distances.
      dim: number of features to label.

    Returns:
      Integer array of length ``dim``; ties go to the earliest centre.

    Raises:
      ValueError: if ``fcluster`` is empty.
    """
    centres = np.asarray(fcluster, dtype=int)
    if centres.size == 0:
        raise ValueError("close_fcluster needs at least one cluster centre")
    f_indices = np.arange(dim)
    for i in range(dim):
        f_indices[i] = centres[np.argmin(disc[i, centres])]
    return f_indices

  def sharing(self, fitness, indices):
    """Return a copy of ``fitness`` with the given entries damped.

    Each listed entry is divided by ``1 + S`` where ``S`` is the sum of the
    listed entries' original values. The input array is left untouched.
    """
    newfitness = np.array(fitness, dtype=float)
    sum1 = 0
    for idx in indices:
        sum1 = sum1 + fitness[idx]
    for idx in indices:
        newfitness[idx] = fitness[idx] / (1+sum1)
    return newfitness

  def pseduo_evolve(self, disc, peakindices, pseduof, c_indices,
                    dc_mean, dc_std, data, fitness, stdf, gamma):
    """Merge pseudo-clusters repeatedly until their number stops changing.

    Args:
      disc: square matrix of pairwise feature distances.
      peakindices: initial centre feature indices.
      pseduof: fitness of each initial centre (aligned with ``peakindices``).
      c_indices: initial centre label of every feature.
      dc_mean, dc_std, data, stdf, gamma: forwarded to ``pseduo_merge``.
      fitness: per-feature fitness values (indexable by integer arrays).

    Returns:
      ``(fcluster, ffitness, c_indices)``: sorted unique final centres, their
      fitness values, and the final centre label of every feature.
    """
    # Historical pseudo-clusters and their fitness values
    histcluster = peakindices
    histclusterf = pseduof
    while True:
        cluster, cfitness, f_indices = self.pseduo_merge(disc, histcluster,
                                                         histclusterf, c_indices,
                                                         dc_mean, dc_std, data,
                                                         fitness, stdf, gamma)
        # Cluster evolution has stabilised once no centre disappeared.
        if len(np.unique(cluster)) == len(np.unique(histcluster)):
            break
        histcluster = cluster
        histclusterf = cfitness
        c_indices = f_indices
    return np.unique(cluster), cfitness, f_indices

  def pseduo_merge(self, disc, peakindices, pseduof, c_indices,
                   dc_mean, dc_std, data, fitness, stdf, gamma):
    """Run one merge pass over the current pseudo-cluster centres.

    Each centre not yet merged in this pass is paired with its nearest other
    centre. The pair is merged unless the fitness of the boundary feature
    between them (``boundary_points``) is below 85% of the smaller of the two
    centre fitness values; of a merged pair, the centre with the higher
    fitness survives (the second one on ties).

    Args:
      disc: square matrix of pairwise feature distances.
      peakindices: current centre feature indices.
      pseduof: fitness of each centre, aligned with ``peakindices``.
      c_indices: current centre label of every feature.
      fitness: per-feature fitness values.
      dc_mean, dc_std, stdf, gamma: unused.
      data: forwarded to ``boundary_points`` (which ignores it).

    Returns:
      ``(fcluster, ffitness, f_indices)``: sorted unique surviving centres,
      their fitness values, and the closest surviving centre of every feature.
    """
    fitness = np.asarray(fitness)
    f_indices = np.asarray(c_indices)
    ml = []        # merge list of [centre, nearest centre] pairs
    marked = set() # centres already placed in a merged pair
    unmarked = []  # centres kept as they are
    n_peaks = len(peakindices)
    for i in range(n_peaks):
        current = peakindices[i]
        if current in marked:
            continue
        # Locate the nearest other centre. `None` (rather than 0) flags
        # "no neighbour" because position 0 is a legitimate neighbour.
        mindist = math.inf
        minindice = None
        for j in range(n_peaks):
            if j != i:
                d = disc[current, peakindices[j]]
                if mindist > d:
                    mindist = d
                    minindice = j
        if minindice is None:
            # A lone centre has nothing to merge with and is kept.
            unmarked.append(current)
            continue
        neighbor = peakindices[minindice]
        # Boundary feature between the two neighbouring pseudo-clusters
        bp = self.boundary_points(disc, f_indices, data, current, neighbor)
        bpf = fitness[bp]
        if bpf < 0.85*min(pseduof[i], pseduof[minindice]):
            # A fitness valley separates the two clusters: do not merge.
            unmarked.append(current)
        else:
            ml.append([current, neighbor])
            marked.add(current)
            marked.add(neighbor)
    newpi = []
    # Of every merged pair keep the centre with the higher fitness.
    for first, second in ml:
        if fitness[first] > fitness[second]:
            newpi.append(first)
        else:
            newpi.append(second)
    # Keep the centres that were not merged.
    for peak in peakindices:
        if peak in unmarked:
            newpi.append(peak)
    fcluster = np.unique(np.asarray(newpi, dtype=int))
    ffitness = fitness[fcluster]
    # Feature labels are recomputed from scratch against the new centres.
    f_indices = self.close_fcluster(fcluster, disc, np.shape(disc)[0])
    return fcluster, ffitness, f_indices

  # Backward-compatible alias for the originally defined method name.
  Pseduo_Merge = pseduo_merge

  def boundary_points(self, disc, f_indices, data, current, neighbor):
    """Return the feature sitting on the boundary between two clusters.

    Among the features labelled ``current`` or ``neighbor``, picks the one
    whose distances to the two centres differ the least (the first such
    feature on ties, ``current``'s members being listed first).

    Args:
      disc: square matrix of pairwise feature distances.
      f_indices: 1-D centre label of every feature.
      data: unused.
      current, neighbor: the two centre feature indices.

    Returns:
      The boundary feature index, or ``current`` when neither cluster has
      any member.
    """
    f_indices = np.asarray(f_indices)
    members = np.concatenate((np.flatnonzero(f_indices == current),
                              np.flatnonzero(f_indices == neighbor)))
    if members.size == 0:
        return current
    gaps = [abs(disc[m, current] - disc[m, neighbor]) for m in members]
    return members[np.argmin(gaps)]

  def pseduogeneration(self, psep, n):
    """Draw ``n`` Gaussian samples for each (mean, std) row of ``psep``.

    Args:
      psep: 2-D array whose column 0 holds means and column 1 holds stds.
      n: number of samples per row.

    Returns:
      Array of shape ``(n, len(psep))``; column ``i`` is drawn with
      ``np.random.randn`` around ``psep[i, 0]`` with spread ``psep[i, 1]``.
    """
    pse_mean = psep[:,0]
    pse_std = psep[:,1]
    data = np.zeros((n, len(pse_mean)))
    for i in range(len(pse_mean)):
        data[:, i] = (np.repeat(pse_mean[i], n) + pse_std[i] * np.random.randn(n)).T
    return data

  @staticmethod
  def psefitness_cal(psep, sample, data, pseduodata, stdf, gamma):
    """Score each pseudo-feature against every original feature.

    For pseudo-feature ``i`` the score is the sum over original features
    ``j`` of ``exp(-d**2 / stdf) ** gamma`` where ``d`` is the
    variance/correlation distance between ``data[:, j]`` and
    ``pseduodata[:, i]``.

    Declared static so it can be called on the class or on an instance with
    the same six arguments.

    Args:
      psep: only its row count (number of pseudo-features) is used.
      sample: only its row count (number of original features) is used.
      data: original feature matrix.
      pseduodata: pseudo-feature matrix.
      stdf: kernel width dividing the squared distance.
      gamma: exponent applied to each kernel value.

    Returns:
      1-D array with one score per pseudo-feature.
    """
    orifn = np.shape(sample)[0]
    pn = np.shape(psep)[0]
    psepf = np.zeros(pn)
    for i in range(pn):
        tempsum = 0
        for j in range(orifn):
            var1 = np.var(data[:,j])
            var2 = np.var(pseduodata[:,i])
            p = np.corrcoef(data[:,j], pseduodata[:,i])[0,1]
            sim = var1 + var2 - ((var1 + var2)**2 - 4 * var1 * var2 * (1 - p**2))**0.5
            d_kl = sim / (var1 + var2)
            tempsum = tempsum + (math.exp(-(d_kl**2)/stdf))**gamma
        psepf[i] = tempsum
    return psepf

In [None]:
def main(path, file, feature_type):
  """Run the full feature-selection pipeline on one CSV data set.

  Steps: load the data, compute pairwise feature distances, score the
  features, find pseudo-cluster peaks, merge them until stable, and extract
  the selected feature columns together with the labels.

  Args:
    path: directory prefix, concatenated verbatim with ``file``.
    file: CSV file name.
    feature_type: ``'continuous'`` selects the variance/correlation distance;
      any other value selects the discrete (entropy based) distance.

  Returns:
    ``(SF, Extract_Data)``: the selected feature indices, and the raw data
    restricted to those features with the label appended as the last column.
  """
  start = time.time()
  loader = data_load(path, file)
  # NOTE(review): the normalised features are assumed to be the working
  # data set and the raw features the extraction source -- confirm.
  data = loader.normalized_data
  label = loader.labels
  OriData = loader.data
  [N, dim] = np.shape(data)
  fs_calculator = feature_similarity()
  [DC_means, DC_std] = fs_calculator.distribution_est(data,dim)
  Var = np.var(data,axis=0)
  Corr = np.corrcoef(data.T)
  if feature_type == 'continuous':
    DisC,Dist = fs_calculator.feature_dist_cont(DC_means,DC_std,data,Var,dim,Corr)
  else:
    DisC,Dist = fs_calculator.feature_dist_disc(data, dim)
  end1 = time.time()
  print('Distance Calculation Finished:',end1-start)
  StdF = (np.max(np.power(Dist,0.5)))**2
  gamma = 5
  fitness = fitness_cal(DisC, DC_means, DC_std, data, StdF, gamma)
  fc_model = feature_clustering()
  # The peak search gets its own copy so the scores used by the merge stage
  # below are guaranteed to be the undamped ones.
  [PeakIndices,Pfitness,C_Indices] = fc_model.pseduo_peaks(DisC, Dist, DC_means,
                                                           DC_std, data,
                                                           np.copy(fitness),
                                                           StdF,gamma, Var)
  #-------------Check for possible merges among pseudo clusters-----------#
  [FCluster,Ffitness,C_Indices] = fc_model.pseduo_evolve(DisC, PeakIndices, Pfitness,
                                                         C_Indices, DC_means, DC_std,
                                                         data, fitness, StdF, gamma)
  SF = FCluster
  label = label.reshape(N,1)
  Extract_Data = np.concatenate((OriData[:,SF],label),axis=1)
  end2 = time.time()
  print('The total time in seconds:',end2-start)
  return SF, Extract_Data
#--------------------------------------------------------------------------------------------------------------
# Script entry point: run the pipeline and keep the selected feature indices
# (`fsub`) and the reduced data set (`extract_data`).
# NOTE(review): `path`, `files` and `ftype` are not defined anywhere in the
# visible code -- presumably set in another notebook cell; confirm before running.
if __name__ == '__main__':
  fsub, extract_data = main(path, files, ftype)
