#### Imports

In [1]:
import numpy as np
import pandas as pd
import random,os,time
from sklearn.metrics import roc_curve, auc, roc_auc_score,classification_report

from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier

import itertools as it
import warnings

In [2]:
warnings.filterwarnings("ignore")

In [3]:
all_files=os.listdir()
all_files

['abalone.csv',
 'abalone19.csv',
 'abalone9-18.csv',
 'bupa.csv',
 'cancer_classification.csv',
 'continuous_main.ipynb',
 'contin_main_all_together.ipynb',
 'hayes-roth.csv',
 'kddcup-guess_passwd_vs_satan.csv',
 'led7digit-0-2-4-5-6-7-8-9_vs_1.csv',
 'main.ipynb',
 'new-thyroid.csv',
 'page-blocks_csv.csv',
 'poker-9_vs_7.csv',
 'results',
 'segment0.csv',
 'SPECTF.csv',
 'yeast5.csv']

##### FILE DOWNLOAD

In [4]:
urlpath='https://raw.githubusercontent.com/ahmed-shameem/Class_imbalance/master/CI_Datasets/bupa.data'
file_name=urlpath.split('/')[-1]
# Test the variable, not the literal string 'file_name'. With the quoted name
# the check only looked for a file literally called "file_name", so the
# dataset was fetched again on every run.
if not os.path.exists(file_name):
  exit_status=os.system(f"curl --url {urlpath} -o {file_name}")
  # os.system does not raise on failure; surface a non-zero status explicitly.
  if exit_status!=0:
    print(f"download of {urlpath} failed (curl exit status {exit_status})")

In [5]:
os.listdir()

['abalone.csv',
 'abalone19.csv',
 'abalone9-18.csv',
 'bupa.csv',
 'bupa.data',
 'cancer_classification.csv',
 'continuous_main.ipynb',
 'contin_main_all_together.ipynb',
 'hayes-roth.csv',
 'kddcup-guess_passwd_vs_satan.csv',
 'led7digit-0-2-4-5-6-7-8-9_vs_1.csv',
 'main.ipynb',
 'new-thyroid.csv',
 'page-blocks_csv.csv',
 'poker-9_vs_7.csv',
 'results',
 'segment0.csv',
 'SPECTF.csv',
 'yeast5.csv']

In [6]:
file_name='page-blocks_csv.csv'

#### Preprocessing

In [7]:
df=pd.read_csv(file_name)
df

Unnamed: 0,height,lenght,area,eccen,p_black,p_and,mean_tr,blackpix,blackand,wb_trans,class
0,5,7,35,1.400,0.400,0.657,2.33,14,23,6,1
1,6,7,42,1.167,0.429,0.881,3.60,18,37,5,1
2,6,18,108,3.000,0.287,0.741,4.43,31,80,7,1
3,5,7,35,1.400,0.371,0.743,4.33,13,26,3,1
4,6,3,18,0.500,0.500,0.944,2.25,9,17,4,1
...,...,...,...,...,...,...,...,...,...,...,...
5468,4,524,2096,131.000,0.542,0.603,40.57,1136,1264,28,2
5469,7,4,28,0.571,0.714,0.929,10.00,20,26,2,1
5470,6,95,570,15.833,0.300,0.911,1.64,171,519,104,1
5471,7,41,287,5.857,0.213,0.801,1.36,61,230,45,1


In [8]:
# Remove leading/trailing whitespace from every column name, then show how
# many rows carry each value of the last (label) column.
df.columns=[name.strip() for name in df.columns]
df.iloc[:,-1].value_counts()

1    4913
2     329
5     115
4      88
3      28
Name: class, dtype: int64

In [9]:
# Collapse the multi-class label (last column) into a binary one:
# 1 = majority group, 0 = minority group.
#
# The previous form, `df[df.iloc[:,-1]==j]=value`, assigned `value` to EVERY
# column of the matching rows, so all feature values were overwritten with the
# label itself. Only the label column may be rewritten.
if file_name=="hayes-roth.csv":
  label_map={1:0,2:0,3:1}
elif file_name=='page-blocks_csv.csv':
  maj_index=[1,2,3,4]
  min_index=[5]
  label_map={j:1 for j in maj_index}
  label_map.update({j:0 for j in min_index})
else:
  label_map=None

if label_map is not None:
  label_col=df.columns[-1]
  # Map from the original labels in one pass so that a value produced by one
  # rule can never be re-matched by a later rule; labels without a rule are
  # left unchanged.
  df[label_col]=df[label_col].map(lambda v:label_map.get(v,v))

In [10]:
df.iloc[:,-1].value_counts()

1    5358
0     115
Name: class, dtype: int64

In [10]:
# Features are every column except the last; the last column is the label.
# NOTE(review): the preview of df shows an 'Unnamed: 0' column, which ends up
# in X here - presumably a saved row index; confirm it should be a feature.
X=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [11]:
# Hold out 20% of the data as the final test set (final_test_x / final_test_y).
# NOTE(review): random_state is taken from the wall clock, so this split - and
# every result derived from it - changes from run to run.
first_train_x,final_test_x,first_train_y,final_test_y=train_test_split(X,y,test_size=0.2,random_state=int(time.time()))

# NumPy copies of the remaining 80%; these are what gets split again for PSO.
first_train_x_array=np.array(first_train_x)
first_train_y_array=np.array(first_train_y)

# first_train_y_array=np.reshape(first_train_y_array,newshape=(len(first_train_y_array),1))

# first_=np.concatenate((first_train_x_array,first_train_y_array),axis=1)
# The final_test_* data is used only to score the outcome at the very end.
# All other training and testing done during PSO uses the first_train_* data.

#### The objective function

In [12]:
def obj_func(population:np.ndarray,model=None):
  """Score a 0/1 row-selection mask by training a classifier on the selected rows.

  Parameters
  ----------
  population : 1-D array-like of 0/1 flags, one per row of the global ``df``;
      rows whose flag equals 1 are used.
  model : estimator with ``fit`` and ``score``. When omitted, a fresh
      ``AdaBoostClassifier(n_estimators=50,learning_rate=1)`` is created for
      this call. The previous default was a single estimator instance built
      at definition time and shared by every call.

  Returns
  -------
  The value of ``model.score`` on a 20% split of the selected rows.

  Notes
  -----
  Reads the module-level ``df``. The train/test split is seeded from the
  wall clock, so repeated calls with the same mask can return different scores.
  """
  if model is None:
    model=AdaBoostClassifier(n_estimators=50,learning_rate=1)
  # The original looped over `range(population)`, which raises TypeError for an
  # array argument; select the flagged positions directly instead.
  indices=np.flatnonzero(np.asarray(population)==1)
  total_data=df.iloc[indices,:]
  train_x,test_x,train_y,test_y=train_test_split(total_data.iloc[:,:-1],total_data.iloc[:,-1],test_size=0.2,random_state=int(time.time()))
  model.fit(train_x,train_y)
  return model.score(test_x,test_y)

#### PSO

In [13]:
class PSO:
  """Particle-swarm search for a subset of majority-class training rows.

  An agent is a vector with one entry per majority-class row of ``train_x``;
  an entry above the threshold (0.5 by default) means "keep this row".
  Minority-class rows are always kept. An agent's fitness is the ROC AUC on
  ``test_x``/``test_y`` of an AdaBoost classifier trained on the kept rows.

  ``train_x`` and ``train_y`` must support NumPy-style indexing with a list of
  row positions (``train_x[rows]``).
  """

  def __init__(self,agent_num,max_iter,obj_func,train_x,test_x,train_y,test_y):
    """Store the swarm size, iteration count and private copies of the data.

    ``obj_func`` is accepted for interface compatibility but is not used:
    the objective is always ``self.fitness``.
    """
    self.agent_num=agent_num
    self.max_iter=max_iter

    # self.obj_func=obj_func
    self.obj_func=self.fitness

    self.train_x=train_x.copy()
    self.test_x=test_x.copy()
    self.train_y=train_y.copy()
    self.test_y=test_y.copy()

    # Row positions (into train_x/train_y) of each class; filled in by
    # get_majority_minority_indices().
    self.majority_index=None
    self.minority_index=None

    # worst_cases[k] holds the position of the worst agent after iteration k.
    self.worst_cases=None

    # Amount added to a row's penalty each time penalize() flags it.
    self.p_inc=1

  def fitness(self,agent,thresold=0.5):
    """Return the ROC AUC obtained when training on the rows ``agent`` keeps.

    Majority rows whose agent entry is above ``thresold`` plus all minority
    rows form the training set; the AUC is measured on ``self.test_x``.
    """
    kept_majority=[self.majority_index[i] for i in range(len(agent)) if agent[i]>thresold]
    rows=kept_majority+list(self.minority_index)

    abc=AdaBoostClassifier(n_estimators=50,learning_rate=1)
    model=abc.fit(self.train_x[rows],self.train_y[rows])

    # The last predict_proba column is used as the score passed to roc_curve.
    scores=model.predict_proba(self.test_x)[:,-1]
    false_positive_rate,true_positive_rate,_=roc_curve(self.test_y,scores)
    return auc(false_positive_rate,true_positive_rate)

  def initialize(self,n,select_percent:float):
    """Create ``agent_num`` random 0/1 agents of length ``n``.

    Each agent has exactly ``round(n*select_percent)`` entries set to 1.
    """
    select_n=round(n*select_percent)
    ans=np.zeros(shape=(self.agent_num,n))
    for i in range(self.agent_num):
      ans[i,random.sample(range(n),k=select_n)]=1
    return ans

  def get_all_fitness(self,population):
    """Return an array with ``self.obj_func`` evaluated on every agent."""
    return np.array([self.obj_func(vec) for vec in population])

  def sigmoid(self,x:float)->int:
    """Return 1 when the logistic sigmoid of ``x`` exceeds 0.5, otherwise 0."""
    threshold=0.5
    if 1/(1+np.e**(-x))>threshold:
      return 1
    else:
      return 0

  def sigmoid_transform(self,population):
    """Apply the 0/1 sigmoid rule of ``sigmoid`` to every element.

    Returns an array with the shape and dtype of ``population``. The
    element-wise form also accepts 2-D input, which the earlier per-index loop
    could not handle.
    """
    values=np.asarray(population)
    new_population=np.zeros_like(values)
    new_population[1/(1+np.exp(-values.astype(float)))>0.5]=1
    return new_population

  def thresolding_transform(self,agent,thresold=0.5):
    """Return a list of ints: 1 where ``agent[i] > thresold``, otherwise 0."""
    return [int(agent[i]>thresold) for i in range(len(agent))]

  def penalize(self,population,best_fitness_all,penalty,iter_no):
    """Record this iteration's worst agent and penalise persistently bad rows.

    The position of the agent with the lowest value in ``best_fitness_all`` is
    stored in ``self.worst_cases[iter_no]``. From iteration ``consider_no``
    onward, every index that was selected (value > 0.5) in ALL of the last
    ``consider_no`` worst agents gets ``self.p_inc`` added to ``penalty``.
    ``penalty`` is modified in place; nothing is returned.
    """
    consider_no=5
    self.worst_cases[iter_no]=population[np.argmin(best_fitness_all)].copy()

    if iter_no<consider_no:
      return

    recent_worst=np.asarray(self.worst_cases[iter_no-consider_no+1:iter_no+1])
    # The previous check, `if j==iter_no` after a loop with `break`, was also
    # true when the break happened on the newest iteration, so an index that
    # was NOT selected in the latest worst agent was penalised anyway.
    # Require selection in every one of the considered iterations.
    in_every_worst=np.all(recent_worst>0.5,axis=0)
    for i in np.flatnonzero(in_every_worst):
      penalty[i]+=self.p_inc

  def get_majority_minority_indices(self):
    """Split training row positions by label and store them on the instance.

    Rows with label 0 form one group and all other rows the second; the larger
    group becomes ``majority_index`` (the label-0 group wins ties).
    """
    labels=np.asarray(self.train_y)
    type_0=np.flatnonzero(labels==0).tolist()
    type_1=np.flatnonzero(labels!=0).tolist()
    if len(type_1)>len(type_0):
      self.majority_index,self.minority_index=type_1,type_0
    else:
      self.majority_index,self.minority_index=type_0,type_1

  def optimize(self,select_percent:float=0.5,p_inc:float=1,bounds=None):
    """Run the swarm and return the training-row positions of the best agent.

    Parameters
    ----------
    select_percent : fraction of majority rows set to 1 in each initial agent.
    p_inc : penalty increment used by ``penalize``.
    bounds : accepted for interface compatibility only. Positions are always
        clipped to [0, 1]; the argument was never used, and the old
        ``bounds==None`` test was ambiguous for array input, so it was removed.

    Returns
    -------
    list of row positions into ``train_x`` (selected majority rows followed by
    all minority rows), as produced by ``dataset_indices``.
    """
    self.p_inc=p_inc

    self.get_majority_minority_indices()

    n=len(self.majority_index)

    population=self.initialize(n,select_percent)
    penalty=np.zeros(n)

    best_fitness_all=self.get_all_fitness(population)
    best_fitness_all_position=population.copy()
    best_fitness_global=np.max(best_fitness_all)
    best_fitness_global_position=population[np.argmax(best_fitness_all)].copy()

    velocity=np.zeros_like(population)

    penalty_constant=1

    def compound_obj_func(x):
      # Raw fitness minus the summed penalties of the rows that x selects.
      return self.obj_func(x)-penalty_constant*np.sum(penalty*self.thresolding_transform(x))

    # TODO
    #apply sigmoid on penalty part of the compound function to reduce dominance of the penalty

    # One slot per iteration; penalize() fills slot k during iteration k.
    self.worst_cases=[0]*self.max_iter

    for iteration in range(self.max_iter):
      # Inertia decreases linearly from 0.9 towards 0.4; c1 and c2 are
      # redrawn each iteration from [1.5, 2) and [2, 2.5).
      w=0.9-(iteration/self.max_iter)*(0.9-0.4)
      c1=1.5+np.random.random()*(2-1.5)
      c2=2+np.random.random()*(2.5-2)

      for i in range(self.agent_num):
        # Random 0/1 masks decide which dimensions each pull acts on.
        personal_mask=self.thresolding_transform(np.random.rand(n))
        global_mask=self.thresolding_transform(np.random.rand(n))

        # NOTE(review): population[i] is only replaced when the candidate
        # improves, so it always equals best_fitness_all_position[i] and the
        # personal-best term below is always zero - confirm this is intended.
        temp_velocity=(
          w*velocity[i]
          +np.multiply(personal_mask,(best_fitness_all_position[i]-population[i]))*c1
          +np.multiply(global_mask,(best_fitness_global_position-population[i]))*c2
        )
        velocity[i]=temp_velocity

        # Move, then bring the position back into [0, 1].
        temp_population=np.clip(population[i]+temp_velocity,0,1)

        temp_fitness=compound_obj_func(temp_population)
        if temp_fitness>best_fitness_all[i]:
          population[i]=temp_population.copy()
          best_fitness_all[i]=temp_fitness
          best_fitness_all_position[i]=temp_population.copy()
          if temp_fitness>best_fitness_global:
            best_fitness_global=temp_fitness
            best_fitness_global_position=temp_population.copy()

      self.penalize(population,best_fitness_all,penalty,iteration)

    return self.dataset_indices(best_fitness_global_position)

  def dataset_indices(self,agent,thresold=0.5):
    """Translate an agent into training-row positions.

    Returns the majority rows whose agent entry is above ``thresold`` followed
    by every minority row. The same ``> thresold`` rule as ``fitness`` is used.
    The earlier ``== 1`` test dropped rows with values strictly between the
    threshold and 1, so the returned rows could differ from the rows that were
    actually scored.
    """
    rows=[self.majority_index[i] for i in range(len(agent)) if agent[i]>thresold]
    rows+=self.minority_index
    return rows

      

In [14]:
  def calc_fitness(train_x,final_test_x,train_y,final_test_y):
    """Train AdaBoost on the given training data and return its ROC AUC on the test data.

    The score passed to ``roc_curve`` is the last column of ``predict_proba``.
    """
    classifier=AdaBoostClassifier(n_estimators=50,learning_rate=1).fit(train_x,train_y)
    scores=classifier.predict_proba(final_test_x)[:,-1]
    fpr,tpr,_=roc_curve(final_test_y,scores)
    return auc(fpr,tpr)

In [15]:
# Candidate hyper-parameter pairs - presumably (select_percent, p_inc); confirm.
# NOTE(review): (0.7,1) and (0.5,3) each appear twice in this list.
hyper_parameter_list=[
  (0.3,0.01),
  (0.4,0.01),
  (0.4,0.5),
  (0.5,1),
  (0.5,0.01),
  (0.5,0.05),
  (0.5,3),
  (0.7,1),
  (0.7,3),
  (0.7,0.01),
  (0.7,0.1),
  (0.3,1),
  (0.5,2),
  (0.5,0.5),
  (0.7,1),
  (0.7,0.05),
  (0.4,0.1),
  (0.4,1),
  (0.5,3),
  (0.5,0.001),
]

In [16]:
# hyper_parameter_list=[
#   (0.3,1),
#   (0.5,2),
#   (0.5,0.5),
#   (0.7,1),
#   (0.7,0.05),
#   (0.4,0.1),
#   (0.4,1),
#   (0.5,3),
#   (0.5,0.001),
# ]
# Converting to a set drops repeated pairs; the result has no defined order.
hyper_parameter_list=set(hyper_parameter_list)

In [17]:
# Tuple layout of every grid entry: (num_agent, max_iter, select_percent, p_inc)
# Full Cartesian grid: 3 agent counts x 4 iteration counts x 2 select_percent
# values x 3 p_inc values = 72 combinations.
all_agent_nums=range(10,40,10)
all_max_iternums=range(10,30,5)
# all_select_percent_s=[0.3,0.5,0.7]
all_select_percent_s=[0.5,0.7]
# all_p_incs=[0.01,0.05,0.1,3]
all_p_incs=[0.01,0.1,0.5]
new_hyperparameter_list=list(it.product(all_agent_nums,all_max_iternums,all_select_percent_s,all_p_incs))
len(new_hyperparameter_list)

72

In [18]:
# Replaces the grid built in the previous cell with five runs that vary only
# the agent count (10..50) at max_iter=50, select_percent=0.5, p_inc=0.01.
new_hyperparameter_list=list(it.product([10,20,30,40,50],[50],[0.5],[0.01]))

In [19]:
# Run one PSO search per hyper-parameter combination and collect one CSV-style
# result line per run.
all_results=[]
for agent_num,max_iter,select_percent,p_inc in new_hyperparameter_list :
  # Inner split of the first-stage training data; the test part drives the PSO
  # fitness. NOTE(review): seeded from the wall clock, so runs are not repeatable.
  train_x,test_x,train_y,test_y=train_test_split(first_train_x_array,first_train_y_array,test_size=0.2,random_state=round(time.time()))
  model=PSO(agent_num,max_iter,None,train_x,test_x,train_y,test_y)
  # rows are positions into train_x/train_y chosen by the search.
  rows=model.optimize(select_percent,p_inc)
  
  # result_entry=[file_name,len(df[df.iloc[:,-1]==1]),np.count_nonzero(train_y[rows]),np.count_nonzero(final_test_y),len(train_x),len(rows),select_percent,p_inc,calc_fitness(train_x[rows],final_test_x,train_y[rows],final_test_y)]
  # all_results.append(result_entry)
  
  # The final score retrains on the selected rows and evaluates on the held-out
  # final_test_x / final_test_y, which the PSO search never sees.
  new_result=f"{file_name},{agent_num},{max_iter},{select_percent},{p_inc},{calc_fitness(train_x[rows],final_test_x,train_y[rows],final_test_y)}"
  all_results.append(new_result)
  # print(*result_entry,sep=",")
  # print(*result_entry,sep=",")




In [20]:
# for i in all_results:
#   print(*i,sep=",")
all_results

# Append one line per run to new_output.csv. Mode "a" keeps whatever the file
# already contains, and no header row is written here.
with open("new_output.csv","a") as f:
  f.writelines(i+"\n" for i in all_results)

['cancer_classification.csv,10,50,0.5,0.01,0.9935042735042735',
 'cancer_classification.csv,20,50,0.5,0.01,0.9911111111111112',
 'cancer_classification.csv,30,50,0.5,0.01,0.9907692307692308',
 'cancer_classification.csv,40,50,0.5,0.01,0.9876923076923076',
 'cancer_classification.csv,50,50,0.5,0.01,0.983931623931624']

# train_x,test_x,train_y,test_y=train_test_split(first_train_x_array,first_train_y_array,test_size=0.2,random_state=round(time.time()))
# select_percent=0.45
# p_inc=0.1 #100
# model=PSO(10,15,None,train_x,test_x,train_y,test_y)
# rows=model.optimize(select_percent,p_inc)

# 0..1+100

result_header=['dataset_name','total_strong_class','strong_class_in_training','strong_class_in_testing','total_rows','selected_rows','select_percent','p_inc','score ']
print(*result_header,sep=",")

# result_entry=[file_name,len(df[df.iloc[:,-1]==1]),np.count_nonzero(train_y[rows]),np.count_nonzero(final_test_y),len(train_x),len(rows),select_percent,p_inc,calc_fitness(train_x[rows],final_test_x,train_y[rows],final_test_y)]

# print(*result_entry,sep=",")

# Load previously saved results.
# NOTE(review): 'continuous_results.csv' is not in the directory listing shown
# near the top of the notebook - confirm where it is produced.
new_df=pd.read_csv('continuous_results.csv')
new_df

grp=new_df.groupby("dataset_name")


# Print, for each dataset, the maximum of the last column of its rows.
for i,j in grp:
  print(i,np.max(j.iloc[:,-1]))

# NOTE(review): 'comp_table.csv' is likewise absent from the directory listing
# shown near the top of the notebook - confirm its origin.
comp_table_df=pd.read_csv("comp_table.csv")
comp_table_df