In [1]:
import os
import time
import copy
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.svm import SVC as SVM
from sklearn.ensemble import RandomForestClassifier as RF

import warnings
warnings.filterwarnings('ignore')

In [2]:
# location of the CSV holding the previously extracted features
CSV_PATH = 'olunpu_protocol1.csv'

In [4]:
# loading the features extracted previously
# (read from CSV_PATH so the file location is configured in exactly one place)

ftrs_df = pd.read_csv(CSV_PATH)
ftrs_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,label
0,0.196656,0.131031,0.0855,0.444914,1.501654,-0.431384,-0.075061,-0.41397,-0.332213,-0.671525,...,-0.17376,-0.034144,0.296806,-0.337709,0.532591,-0.160591,1.799268,0.383572,-0.610647,0
1,-0.186134,-0.420871,0.057183,0.205021,2.235867,-0.751498,-0.284065,-0.255555,-0.438462,-0.472919,...,-0.174453,-0.472127,0.403736,-0.314814,0.159944,-0.120569,2.412055,0.041659,-0.548541,0
2,0.114465,0.056228,-0.092826,0.494534,1.481885,-0.417989,-0.036052,-0.326369,-0.296863,-0.595545,...,-0.142185,0.013301,0.409179,-0.313531,0.514522,-0.064553,1.652749,0.461697,-0.54546,0
3,0.227946,-0.150725,0.183412,0.326855,1.342035,-0.399909,-0.25333,-0.082883,-0.136401,-0.670254,...,0.018894,0.012146,0.29669,-0.340916,0.17746,-0.175406,1.56744,0.253403,-0.352246,0
4,-0.153753,-0.554442,0.075598,0.262754,2.21975,-0.874376,-0.287445,-0.261032,-0.194963,-0.591691,...,-0.117164,-0.331512,0.360172,-0.401951,0.157942,-0.039321,2.489741,-0.024143,-0.455594,0


In [5]:
# feature matrix = every column except the last one; target = the 'label' column
# NOTE(review): the preview of ftrs_df lists an 'Unnamed: 0' column; if that is a
# saved row index rather than a real feature it ends up inside X here - confirm
# against the CSV.
X = np.array(ftrs_df.iloc[:, :-1])
y = np.array(ftrs_df['label'])

print(X.shape,y.shape)

(37745, 512) (37745,)


In [6]:
# model class for each Agent i.e. chromosome in GA

class Agent:
    """Pairs one chromosome with the fitness value computed for it.

    Both values are stored exactly as passed in; nothing is copied or validated.
    """

    def __init__(self,agent,fitness):
        # agent   -> the chromosome (a 0/1 numpy vector in this notebook)
        # fitness -> the score assigned to that chromosome
        self.agent, self.fitness = agent, fitness

In [7]:
# utility functions for GA

def get_selected_features(X, agent, num_ftrs_sel, agent_idx, call=1):  # getting selected feature set from the FS vector
    """Return the columns of ``X`` whose entry in ``agent`` equals 1.

    Parameters
    ----------
    X : 2-D array-like
        Data matrix with one column per candidate feature.
    agent : 1-D array-like of 0/1
        Feature-selection mask; position ``i`` refers to column ``i`` of ``X``.
    num_ftrs_sel : int
        Kept only for backward compatibility. The number of output columns is
        now derived from ``agent`` itself, so a stale or wrong count can no
        longer raise an IndexError or leave zero-filled padding columns.
    agent_idx : int
        Zero-based chromosome index, used only in the log line.
    call : int, default 1
        When equal to 1 the number of selected features is printed.

    Returns
    -------
    numpy.ndarray
        A new float64 array of shape ``(X.shape[0], n_selected)``.
    """
    # integer positions (rather than a boolean mask) so that a mask shorter than
    # the number of columns still selects among the leading columns
    selected_cols = np.flatnonzero(np.asarray(agent) == 1)

    # fancy indexing copies; dtype=float keeps the float64 result callers received before
    X_1 = np.array(np.asarray(X)[:, selected_cols], dtype=float)

    if call == 1 : # print FS
        print(f'Nos of features selected for Chromosome {agent_idx+1} is: {X_1.shape[1]}')

    return X_1


def most_fit_agents(agents, gen):    # sorting an array of Agent objects by their fitness values and returning the top 50% of them
    """Return the fitter half of ``agents``, best first.

    Parameters
    ----------
    agents : list
        Objects exposing a ``fitness`` attribute. The list itself is left
        untouched; ranking is done on a sorted copy.
    gen : int
        Zero-based generation index, used only in the log line.

    Returns
    -------
    list
        The top ``len(agents)//2`` objects in descending fitness order
        (an empty list when ``agents`` is empty).
    """
    # nothing to rank (and no "most fit chromosome" to report)
    if len(agents) == 0:
        return []

    # sorted() is stable like list.sort() but does not reorder the caller's list
    ranked = sorted(agents, key=lambda obj: obj.fitness, reverse=True)
    print(f'\n==========> fitness of most fit chromosome of generation {gen+1} is {(ranked[0].fitness):.6f}')

    n_fit = len(ranked)//2
    return ranked[0:n_fit] # top 50%


def crossover_two_agents(agent1,agent2,crossover_point=0.5):
    """Single-point crossover of two parent vectors.

    Genes before ``int(len(agent1) * crossover_point)`` stay with their own
    parent; from that position to the end the two parents trade genes.
    The parents are not modified: work is done on ``.copy()`` of each.

    Returns the pair ``(child1, child2)``.
    """
    first_child, second_child = agent1.copy(), agent2.copy()
    total = len(agent1)

    # swap the tail section gene by gene
    for pos in range(int(total * crossover_point), total):
        first_child[pos], second_child[pos] = agent2[pos], agent1[pos]

    return first_child, second_child


def mutate_agent(agent, prob_mut=0.4):
    """Bit-flip mutation applied in place.

    One uniform draw from numpy's global RNG is made per gene, in order;
    whenever the draw falls below ``prob_mut`` that gene ``g`` becomes ``1 - g``.
    The same (now mutated) ``agent`` object is returned.
    """
    for pos, gene in enumerate(agent):
        if prob_mut > np.random.rand():
            agent[pos] = 1 - gene
    return agent


def initialize_population(num_agents,num_features):     # initialize matrix of 0/1 randomly
    """Create the starting population of feature-selection masks.

    Each row is a 0/1 vector of length ``num_features`` in which a random
    number of randomly chosen positions is set to 1. That number is drawn from
    ``[20% of num_features, 50% of num_features)``; the upper bound is
    exclusive because ``np.random.randint`` excludes ``high``.

    The bounds are clamped so that every chromosome selects at least one
    feature and ``randint`` always receives ``low < high``. This only changes
    behaviour for ``num_features < 5``; without it ``num_features < 2`` raised
    a ValueError and widths below 5 could yield all-zero chromosomes.

    Parameters
    ----------
    num_agents : int
        Number of chromosomes (rows).
    num_features : int
        Length of every chromosome (columns).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(num_agents, num_features)`` holding 0.0 / 1.0.
    """
    # define min and max number of features (clamped, see docstring)
    min_features = max(1, int(0.2 * num_features))
    max_features = max(min_features + 1, int(0.5 * num_features))

    # initialize the agents with zeros
    init_population = np.zeros((num_agents, num_features))

    # select random features for each agent
    for agent_no in range(num_agents):

        # how many features this agent switches on
        cur_count = np.random.randint(min_features, max_features)

        # argsort of uniform noise is a random permutation of the column indices;
        # its first cur_count entries are the chosen features
        temp_vec = np.random.rand(1, num_features)
        temp_idx = np.argsort(temp_vec)[0][0:cur_count]

        # select the features with the random indices
        init_population[agent_no][temp_idx] = 1

    return init_population
    

def get_fitness(X, y, original_ftrs=512, weight_acc=0.98):
    """Score a candidate feature subset for the wrapper-based search.

    The score is a weighted sum of two terms:

    * hold-out accuracy of an ``SVM()`` with default settings, trained on the
      first 80% of the rows and scored on the last 20%;
    * the fraction of features removed,
      ``(original_ftrs - X.shape[1]) / original_ftrs``.

    ``weight_acc`` weights the accuracy term and ``1 - weight_acc`` the
    reduction term.

    Parameters
    ----------
    X : 2-D array-like holding only the selected feature columns.
    y : labels aligned with the rows of ``X``.
    original_ftrs : size of the full feature set before selection.
    weight_acc : weight given to accuracy.

    Returns
    -------
    float fitness value (higher is better).
    """
    features = np.array(X)
    n_selected = features.shape[1]

    # NOTE(review): shuffle=False with stratify=None makes the hold-out part simply
    # the tail of the data; if the rows are ordered by label the two parts may have
    # very different class mixes - confirm this is intended.
    X_train, X_test, y_train, y_test = train_test_split(
        features, y, test_size=0.2, stratify=None, shuffle=False
    )

    # learning algorithm for wrapper-based FS
    classifier = SVM()
    classifier.fit(X_train, y_train)
    accuracy = classifier.score(X_test, y_test)

    reduction = (original_ftrs - n_selected) / original_ftrs

    return weight_acc * accuracy + (1 - weight_acc) * reduction

In [8]:
num_generations = 10 # number of GA generations to run
num_agents = 20 # size of population (chromosomes evaluated per generation)
num_features = X.shape[1] # original number of features, i.e. the length of every chromosome

In [10]:
# Seed numpy's global RNG BEFORE the first random draw, with a fixed value, so that
# the initial population and every mutation are identical after
# Restart Kernel -> Run All. Change GA_SEED to explore a different run.
GA_SEED = 42
np.random.seed(GA_SEED)

# keeps track of the most fit chromosome per generation: one row per generation,
# so the buffer is sized by num_generations (it is indexed by `gen` below)
best_agent_per_generation = np.zeros((num_generations,num_features))

init_population = initialize_population(num_agents,num_features) # initial population of chromosomes

for gen in range(num_generations):

    print(f'======== Generation {gen+1} =========')

    # array of Agent(agent,fitness)
    agent_fitness = []

    for agent_idx, agent in enumerate(init_population):

        # get selected feature set from agent vector
        X1 = get_selected_features(X, agent, int(np.sum(agent)), agent_idx)

        # get fitness of that agent
        fitness = get_fitness(X1, y, num_features)

        # put Agent(agent,fitness) into the array
        agent_fitness.append(Agent(agent, fitness))

    # get most fit agents of current generation (top half, best first)
    most_fit = most_fit_agents(agent_fitness, gen)

    # convert the most fit agents to numpy array
    most_fit_np = np.zeros((len(most_fit), num_features))

    for i,obj in enumerate(most_fit):
        most_fit_np[i] = np.array(obj.agent)

    # get offsprings of the most fit parents
    offspring = np.zeros((num_agents//2,num_features))

    # parents are paired (0,1), (2,3), ...; with an odd number of parents the
    # last offspring row is left as all zeros
    for i in range(0,(num_agents//2 - 1), 2):

        # perform crossover between pair of parents
        child1,child2 = crossover_two_agents(most_fit_np[i],most_fit_np[i+1])

        # put them into array
        offspring[i] = child1
        offspring[i+1] = child2

    # concatenate the parents and offsprings
    most_fit_np = np.append(most_fit_np,offspring,axis=0)

    # store most fit agent (taken before mutation, so it is the chromosome that was scored)
    best_agent_per_generation[gen] = most_fit_np[0].copy()

    # mutation on every agent (parents included)
    for i in range(most_fit_np.shape[0]):
        most_fit_np[i] = mutate_agent(most_fit_np[i])

    # set population to the most_fit_np array
    assert init_population.shape == most_fit_np.shape, 'population size changed between generations (num_agents must be even)'

    init_population = most_fit_np.copy()

    print('---------------------------------------\n\n')

Nos of features selected for Chromosome 1 is: 176
Nos of features selected for Chromosome 2 is: 132
Nos of features selected for Chromosome 3 is: 228
Nos of features selected for Chromosome 4 is: 164
Nos of features selected for Chromosome 5 is: 154
Nos of features selected for Chromosome 6 is: 235
Nos of features selected for Chromosome 7 is: 147
Nos of features selected for Chromosome 8 is: 120
Nos of features selected for Chromosome 9 is: 240
Nos of features selected for Chromosome 10 is: 245
Nos of features selected for Chromosome 11 is: 126
Nos of features selected for Chromosome 12 is: 229
Nos of features selected for Chromosome 13 is: 211
Nos of features selected for Chromosome 14 is: 231
Nos of features selected for Chromosome 15 is: 146
Nos of features selected for Chromosome 16 is: 184
Nos of features selected for Chromosome 17 is: 157
Nos of features selected for Chromosome 18 is: 242
Nos of features selected for Chromosome 19 is: 240
Nos of features selected for Chromosome 

In [12]:
# among the per-generation champions, keep the mask with the highest hold-out accuracy

max_acc_agent = np.zeros(best_agent_per_generation[0].shape)
max_acc = 0

model = SVM()

for gen,best_agent in enumerate(best_agent_per_generation):

    X1 = get_selected_features(X, best_agent, int(np.sum(best_agent)), agent_idx=0, call=0)

    # a mask that selects no feature leaves nothing to train on - skip it
    if X1.shape[1] <= 0:
        continue

    X_train,X_test,y_train,y_test = train_test_split(
        X1, y, test_size=0.2, stratify=None, shuffle=False
    )

    # classification test
    model.fit(X_train,y_train)
    acc = model.score(X_test,y_test)

    # '>=' means that on a tie the later entry replaces the earlier one
    if acc >= max_acc:
        max_acc, max_acc_agent = acc, best_agent.copy()

print(f'most accurate FS has {int(np.sum(max_acc_agent))} features')

most accurate FS has 269 features


In [13]:
def validate_FS(best_agent, clf='SVM'):
    """Train and score one classifier on the features selected by ``best_agent``.

    The notebook-level ``X`` and ``y`` are reduced to the selected columns and
    split 80/20 without shuffling; the classifier is fitted on the first part
    and its accuracy on the second part is printed.

    Parameters
    ----------
    best_agent : 1-D 0/1 array
        Feature-selection mask over the columns of ``X``.
    clf : str, default 'SVM'
        ``'KNN'`` or ``'RF'`` (case-insensitive) pick those classifiers; any
        other value uses the SVM.

    Raises
    ------
    ValueError
        If the mask selects no feature. Without this check the function
        failed later with an unbound ``X_train``.
    """
    print(f'The FS has {int(np.sum(best_agent))} features')

    # build only the classifier that was asked for; SVM is the default/fallback
    classifiers = {'KNN': KNN, 'RF': RF}
    model = classifiers.get(clf.upper(), SVM)()

    X1 = get_selected_features(X, best_agent, int(np.sum(best_agent)), agent_idx=0, call=0)

    if X1.shape[1] == 0:
        raise ValueError('best_agent selects no features; nothing to validate')

    X_train,X_test,y_train,y_test = train_test_split(X1, y, test_size=0.2, stratify=None, shuffle=False)

    # classification test
    model.fit(X_train,y_train)
    acc = model.score(X_test,y_test)

    print(f'accuracy of this FS by {clf} is {(100.0*acc):.4f}')

In [16]:
# FS validation using SVM classifier (the same learner the GA fitness function uses)

best_agent = max_acc_agent.copy()
validate_FS(best_agent,clf='SVM')

The FS has 269 features
accuracy of this FS by SVM is 86.9254


In [18]:
# FS validation using KNN classifier (default KNeighborsClassifier settings)

best_agent = max_acc_agent.copy()
validate_FS(best_agent,clf='KNN')

The FS has 269 features
accuracy of this FS by KNN is 86.5545


In [19]:
# FS validation using RandomForestClassifier (default settings)

best_agent = max_acc_agent.copy()
validate_FS(best_agent,clf='RF')

The FS has 269 features
accuracy of this FS by RF is 84.3688
