In [27]:
import numpy as np
import pandas as pd

from sklearn import datasets

#### Define the function for feature selection using similarity measure and fuzzy entroropy measures based on the article: 


In [28]:
def feature_selection_sim(data, measure = 'luca', p = 1):
    
    # OUTPUTS:
    # data_mod      data with removed feature
    # index_rem     index of removed feature in original data

    # INPUTS:
    # data          dataframe, contains class values
    # measure       fuzzy entropy measure, either 'luca' or 'park'              
    #               currently coded
    # p             parameter of Lukasiewicz similarity measure
    #               p in (0, \infty) as default p=1.
    
    # You need to import 'numpy' as 'np' before using this function

    l = int(max(data.iloc[:,-1]))   # -classes in the last column
    m = data.shape[0]               # -samples
    t = data.shape[1] -1           # -features
    
    dataold = data.copy()
    
    idealvec_s = np.zeros((l,t)) 
    for k in range(l):
        idx = data.iloc[:,-1] == k+1
        idealvec_s[k,:] = data[idx].iloc[:,:-1].mean(axis = 0)
    
    # scaling data between [0,1]
    data_v = data.iloc[:,:-1]
    data_c = data.iloc[:,-1] # labels
    mins_v = data_v.min(axis = 0)
    Ones   = np.ones((data_v.shape))
    data_v = data_v + np.dot(Ones,np.diag(abs(mins_v)))
    
    tmp =[]
    for k in range(l):
        tmp.append(abs(mins_v))
    
    idealvec_s = idealvec_s+tmp
    maxs_v     = data_v.max(axis = 0)
    data_v     = np.dot(data_v,np.diag(maxs_v**(-1)))
    tmp2 =[];
    for k in range(l):
        tmp2.append(abs(maxs_v))
        
    idealvec_s = idealvec_s/tmp2
    
    data_vv = pd.DataFrame(data_v) # Convert the array of feature to a dataframe
    data    = pd.concat([data_vv, data_c], axis=1, ignore_index=False)

    # sample data
    datalearn_s = data.iloc[:,:-1]
    
    # similarities
    sim = np.zeros((t,m,l))
    
    for j in range(m):
        for i in range(t):
            for k in range(l):
                sim[i,j,k] = (1-abs(idealvec_s[k,i]**p - datalearn_s.iloc[j,i])**p)**(1/p)
            
    sim = sim.reshape(t,m*l)
    
    # possibility for two different entropy measures
    if measure =='luca':
        # moodifying zero and one values of the similarity values to work with 
        # De Luca's entropy measure
        delta = 1e-10
        sim[sim == 1] = delta
        sim[sim == 0] = 1-delta
        H = (-sim*np.log(sim)-(1-sim)*np.log(1-sim)).sum(axis = 1)
    elif measure == 'park':
        H = (np.sin(np.pi/2*sim)+np.sin(np.pi/2*(1-sim))-1).sum(axis = 1) 
        
    # find maximum feature
    max_idx = np.argmax(H) # notice that index is starting from 0
    
    #removing feature from the data
    data_mod = dataold.drop(dataold.columns[max_idx], axis=1)
    
    return max_idx, data_mod


### Test the 'feature_selection_sim' function with an example dataset

In [29]:
# Initialization
p = 1
measure = 'luca'

data = [[0.4600,    0.3400,    0.1400,    0.0300,    0.9218,    1.0000],
        [0.5000,    0.3400,    0.1500,    0.0200,    0.7382,    1.0000],
        [0.4400,    0.2900,    0.1400,    0.0200,    0.1763,    1.0000],
        [0.7600,    0.3000,    0.6600,    0.2100,    0.4057,    2.0000],
        [0.4900,    0.2500,    0.4500,    0.1700,    0.9355,    2.0000],
        [0.7300,    0.2900,    0.6300,    0.1800,    0.9169,    2.0000]]

data = pd.DataFrame(data) # convert array data to a dataframe
    
idx, datanew = feature_selection_sim(data,measure,p) # function call

# idx: index of the removed feature
# datanew: data with removed feature

# data.columns[idx] # name of the removed feature


### Test the function with Iris dataset (you need to import datasets from from sklearn)

In [30]:
# Load the data
iris = datasets.load_iris()

# Load iris training dataset
X = iris.data

# Load iris target set
Y = iris.target

# Convert datasets' type into dataframe
df_f = pd.DataFrame(X, columns=iris.feature_names)
df_c = pd.DataFrame(Y)

data = pd.concat([df_f, df_c], axis=1, ignore_index=False)

idx, datanew = feature_selection_sim(data)

# data.columns[idx] : name of the removed feature


In [31]:
len(data.columns)

5

In [32]:
data = pd.read_csv('../../Dataset/school_district_breakdowns_1.csv')

data['jurisdiction_name'] = data['jurisdiction_name'].astype('category')
cat_columns = data.select_dtypes(['category']).columns
data[cat_columns] = data[cat_columns].apply(lambda x: x.cat.codes)

data.iloc[:,0:-1] = data.iloc[:,0:-1].apply(lambda x: (x-x.mean())/ x.std(), axis=0)

data = data.fillna(0)

data.head()

Unnamed: 0,jurisdiction_name,count_participants,count_female,percent_female,count_male,percent_male,count_gender_unknown,percent_gender_unknown,count_gender_total,percent_gender_total,...,count_citizen_status_total,percent_citizen_status_total,count_receives_public_assistance,percent_receives_public_assistance,count_nreceives_public_assistance,percent_nreceives_public_assistance,count_public_assistance_unknown,percent_public_assistance_unknown,count_public_assistance_total,percent_public_assistance_total
0,-1.652306,-0.513093,-0.560255,-0.492912,-0.407513,1.238394,0.0,0.0,-0.513093,0.423554,...,-0.513093,0.424554,-0.573592,-1.13977,-0.446934,1.173055,0.0,0.0,-0.513093,100
1,-1.545705,0.546499,0.422518,0.195873,0.712962,0.41021,0.0,0.0,0.546499,0.423554,...,0.546499,0.424554,0.483208,0.468554,0.536239,0.204943,0.0,0.0,0.546499,100
2,-1.439105,-0.259917,-0.232664,0.609145,-0.288314,-0.086701,0.0,0.0,-0.259917,0.423554,...,-0.259917,0.424554,-0.184245,0.779842,-0.276518,0.017567,0.0,0.0,-0.259917,100
3,-1.332504,-0.428701,-0.426241,0.505827,-0.407513,0.037527,0.0,0.0,-0.428701,0.423554,...,-0.428701,0.424554,-0.351108,0.935487,-0.433825,-0.076122,0.0,0.0,-0.428701,100
4,-1.225904,-0.550601,-0.560255,0.230313,-0.502873,0.368801,0.0,0.0,-0.550601,0.423554,...,-0.550601,0.424554,-0.545781,-0.41343,-0.512479,0.735843,0.0,0.0,-0.550601,100


In [33]:
idx, datanew = feature_selection_sim(data)


  idealvec_s = idealvec_s/tmp2
  H = (-sim*np.log(sim)-(1-sim)*np.log(1-sim)).sum(axis = 1)


In [6]:
datanew.to_csv("./processed_data.csv")

In [35]:
datanew.head()

Unnamed: 0,count_participants,count_female,percent_female,count_male,percent_male,count_gender_unknown,percent_gender_unknown,count_gender_total,percent_gender_total,count_pacific_islander,...,count_citizen_status_total,percent_citizen_status_total,count_receives_public_assistance,percent_receives_public_assistance,count_nreceives_public_assistance,percent_nreceives_public_assistance,count_public_assistance_unknown,percent_public_assistance_unknown,count_public_assistance_total,percent_public_assistance_total
0,-0.513093,-0.560255,-0.492912,-0.407513,1.238394,0.0,0.0,-0.513093,0.423554,-0.254133,...,-0.513093,0.424554,-0.573592,-1.13977,-0.446934,1.173055,0.0,0.0,-0.513093,100
1,0.546499,0.422518,0.195873,0.712962,0.41021,0.0,0.0,0.546499,0.423554,-0.254133,...,0.546499,0.424554,0.483208,0.468554,0.536239,0.204943,0.0,0.0,0.546499,100
2,-0.259917,-0.232664,0.609145,-0.288314,-0.086701,0.0,0.0,-0.259917,0.423554,-0.254133,...,-0.259917,0.424554,-0.184245,0.779842,-0.276518,0.017567,0.0,0.0,-0.259917,100
3,-0.428701,-0.426241,0.505827,-0.407513,0.037527,0.0,0.0,-0.428701,0.423554,-0.254133,...,-0.428701,0.424554,-0.351108,0.935487,-0.433825,-0.076122,0.0,0.0,-0.428701,100
4,-0.550601,-0.560255,0.230313,-0.502873,0.368801,0.0,0.0,-0.550601,0.423554,-0.254133,...,-0.550601,0.424554,-0.545781,-0.41343,-0.512479,0.735843,0.0,0.0,-0.550601,100


In [34]:
len(datanew.columns)

44

In [36]:
len(datanew)

32

In [37]:
Binavg = datanew.apply(lambda x: bin(hash(tuple(x))), axis = 1)
Binavg

0     0b11111110111000111111110111101110100000111100...
1     -0b1001011010001101111101000101110010101011110...
2     -0b1100001010000110011110011100111110111110001...
3     0b10111110011110101000000100111011000010100011...
4     -0b1111110001011000110101110101100111101100101...
5     -0b1011100110010101111010111100110111101110111...
6     -0b1100001110000001101110100101000110100100101...
7     0b10110100100001010011001001001010011110000110...
8     0b11101011011000001111011111011101010101111000...
9     -0b1000010011001110100000100100110010101100010...
10    -0b1111001101010011100001011101101100000100110...
11    -0b1110001000101010000010100101000000011101101...
12    0b10011101000010000000101101000001000111000111...
13    -0b1001101111001110001100000011000011001000100...
14    0b11010000101101111001000011100100111011111100...
15    -0b1010000000011001100000011000100001011010110...
16    0b11001001000110011011001110111001001110100100...
17    0b1101011110101011111101110010001111101101

In [38]:
len(Binavg[0])

63

In [42]:
bin_list = []
for el in Binavg:
    bin_list.append(el.replace('-','').replace('0b',''))
    while(len(bin_list[-1])!=63):
        bin_list[-1]='0'+bin_list[-1]
    print(len(bin_list[-1]))

63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63
63


In [43]:
bin_list

['001111111011100011111111011110111010000011110010011011010100100',
 '001001011010001101111101000101110010101011110111111000001000010',
 '110000101000011001111001110011111011111000101100101101111001110',
 '101111100111101010000001001110110000101000111010011010111101100',
 '011111100010110001101011101011001111011001010000010110110001001',
 '101110011001010111101011110011011110111011101001001100011001010',
 '110000111000000110111010010100011010010010101011001100111001001',
 '001011010010000101001100100100101001111000011000000000111011101',
 '111010110110000011110111110111010101011110000111011100110011101',
 '100001001100111010000010010011001010110001010100101111011100001',
 '111100110101001110000101110110110000010011011011011010111100111',
 '111000100010101000001010010100000001110110100101001010010011010',
 '100111010000100000001011010000010001110001110101101101010101111',
 '010011011110011100011000000110000110010001000100000110111000000',
 '1101000010110111100100001110010011101111110010

In [44]:
df = pd.DataFrame(bin_list)

In [46]:
df.to_csv('../../Dataset/entropy_hashed_SDBdata.csv')