[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/12kuxmyxTMPpQb10qF24shv0QC7KbUZqe?authuser=2#scrollTo=JeMARWUsd3X3)

##Import

In [None]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.neighbors import NearestNeighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import normalize
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, accuracy_score
pd.set_option('display.max_columns', 500)

import warnings
warnings.filterwarnings("ignore")

##Class SMOTE & SMOTEBoost

In [None]:
from numpy.ma.core import count
class SMOTE():

    def __init__(self, n_samples, k_neighbors=5, random_state=None):
        self.n_samples = n_samples
        self.k = k_neighbors
        self.random_state = random_state

    def sample(self):
      np.random.seed(seed=self.random_state)
      S = np.zeros(shape=(self.n_samples, self.n_features))
      
      for i in range(self.n_samples):
        j = np.random.randint(0, self.X.shape[0])
        X_reshape = self.X[j].reshape(1, -1)
        new_neighbor = self.neigh.kneighbors(X_reshape, return_distance=False)
        new_neighbor = new_neighbor[:, 1:]
        nn_index = np.random.choice(new_neighbor[0])
        distance = self.X[nn_index] - self.X[j]
        fraction = np.random.random() 
        S[i, :] = self.X[j, :] + fraction * distance[:]       
      return S

    def fit(self, X):
        self.X = X
        self.n_minority, self.n_features = self.X.shape
        self.neigh = NearestNeighbors(n_neighbors=self.k + 1) 
        self.neigh.fit(self.X)
        return self

#----------------------------------------------------------------------------------------------------------#

class SMOTEBoost():
    
    def __init__(self, base_classifier = None, n_samples = 100, n_estimators = 50, k_neighbors = 5, random_state = 42):
        self.base_classifier = base_classifier
        self.n_samples = n_samples
        self.n_estimators = n_estimators
        self.k_neighbors = k_neighbors
        self.random_state = random_state
        
    def fit(self, X, y):     
        distribution = np.ones(X.shape[0], dtype=float) / X.shape[0]
        self.classifiers = []
        self.beta = []

        for i in range(self.n_estimators):
            minority_class = min(Counter(y), key=Counter(y).get)
            X_min = X[np.where(y == minority_class)]

            self.classifiers.append(self.base_classifier())
            self.smote = SMOTE(n_samples=self.n_samples, k_neighbors=self.k_neighbors, random_state=self.random_state)
            self.smote.fit(X_min)
            X_syn = self.smote.sample()
            y_syn = np.full(X_syn.shape[0], fill_value=minority_class, dtype=np.int64)

            distribution_syn = np.empty(X_syn.shape[0], dtype=np.float64)
            distribution_syn[:] = 1. / X.shape[0]
            mod_distribution = np.append(distribution, distribution_syn).reshape(1, -1)
            mod_distribution = np.squeeze(normalize(mod_distribution, axis=1, norm='l1'))

            mod_X = np.vstack((X, X_syn))
            mod_y = np.append(y, y_syn) 

            self.classifiers[-1].fit(mod_X, mod_y, sample_weight=mod_distribution)

            y_pred_t = self.classifiers[-1].predict(X)
            
            eps_t = np.sum((1 - (y == y_pred_t) +(np.logical_not(y) == y_pred_t)) * distribution)
            beta_t = eps_t/(1-eps_t)
            w_t = 0.5 * (1 + (y == y_pred_t) - (np.logical_not(y) == y_pred_t))
            
            self.beta.append(beta_t)
            
            distribution = distribution * beta_t ** w_t / np.sum(distribution)
        
    def predict(self, X):
        final_pred = np.zeros((X.shape[0], 2))
        for beta, clf in zip(self.beta, self.classifiers):
            yp = clf.predict(X)
            final_pred[range(len(X)), yp] += np.log(1/beta)
        final_pred = np.argmax(final_pred,axis=1)
        return final_pred

##Calculate Score

In [None]:
def score_model(y_true, y_pred, protect):

  y_true_p, y_true_np = [], []
  y_pred_p, y_pred_np = [], []
  
  for y_true_i, y_pred_i, protect_i in zip(y_true, y_pred, protect):
    if protect_i == 1:
      y_true_p.append(y_true_i)
      y_pred_p.append(y_pred_i)
    elif protect_i == 0:
      y_true_np.append(y_true_i)
      y_pred_np.append(y_pred_i)
    else:
      return 0,0,0,0

  y_true_p = np.array(y_true_p)
  y_true_np = np.array(y_true_np)
  y_pred_p = np.array(y_pred_p)
  y_pred_np = np.array(y_pred_np)

  tp_p = confusion_matrix(y_true_p, y_pred_p)[1][1]
  tn_p = confusion_matrix(y_true_p, y_pred_p)[0][0]
  fp_p = confusion_matrix(y_true_p, y_pred_p)[0][1]
  fn_p = confusion_matrix(y_true_p, y_pred_p)[1][0]

  fpr_p = fp_p / (fp_p + tn_p)
  fnr_p = fn_p / (fn_p + tp_p)

        
  tp_np = confusion_matrix(y_true_np, y_pred_np)[1][1]
  tn_np = confusion_matrix(y_true_np, y_pred_np)[0][0]
  fp_np = confusion_matrix(y_true_np, y_pred_np)[0][1]
  fn_np = confusion_matrix(y_true_np, y_pred_np)[1][0]

  fpr_np = fp_np / (fp_np + tn_np)
  fnr_np = fn_np / (fn_np + tp_np)

  diff_fnr = fnr_np - fnr_p
  diff_fpr = fpr_np - fpr_p

  # Scores
  EqOdds = abs(diff_fpr) + abs(diff_fnr)
  TPR_protect = 1 - fnr_p
  TPR_non_protect = 1 - fnr_np
  TNR_protect = 1 - fpr_p
  TNR_non_protect = 1 - fpr_np

  return (EqOdds, TPR_protect, TPR_non_protect, TNR_protect, TNR_non_protect)

##DataSet: KDD

In [None]:
df_kdd = pd.read_csv('KDD.txt', header = None)
header_list = ['age', 'class_worker', 'ind_code', 'occ_code', 'education', 
               'wage_per_hour', 'enrolled_last_wk', 'marital_status', 'major_ind_code', 'major_occ_code',
               'mace', 'hispanic_Origin', 'sex', 'member_labor_union', 'reason_unemploy',
               'full_or_parttime', 'capital_gains', 'capital_losses', 'divdends_stocks', 'tax_filer_status',
               'region_pv_residence', 'state_pv_residence', 'detailed_family_stat', 'detailed_summary_household', 'instance_weight', 
               'migration_code_change_msa', 'migration_code_change_reg', 'migration_code_move_within_reg', 'live_in_house_1year_ago', 'migration_pv_res_in_sunbelt',
               'num_persons_worked_for_employer', 'family_members_under_18', 'country_of_birth_father', 'country_of_birth_mother', 'country_of_birth_self',
               'citizenship', 'total_person_income', 'own_bus_or_self_employed', 'total_person_earnings', 'weeks_worked_in_year',
               'veterans_benefits', 'income']

df_kdd.columns = header_list

df_kdd.replace(' ?', np.nan, inplace=True)
df_kdd.iloc[:, 21].replace(np.nan, 'NoData', inplace=True)
df_kdd.iloc[:, 25].replace(np.nan, 'NoData', inplace=True)
df_kdd.iloc[:, 26].replace(np.nan, 'NoData', inplace=True)
df_kdd.iloc[:, 27].replace(np.nan, 'NoData', inplace=True)
df_kdd.iloc[:, 29].replace(np.nan, 'NoData', inplace=True)

df_kdd.dropna(axis=0, inplace=True)

df_kdd['income'].replace(' - 50000.', 0, inplace=True)
df_kdd['income'].replace(' 50000+.', 1, inplace=True)

df_kdd['sex'].replace(' Male', 0, inplace=True)
df_kdd['sex'].replace(' Female', 1, inplace=True)

df_kdd['ind_code'] = df_kdd['ind_code'].astype(str)
df_kdd['occ_code'] = df_kdd['ind_code'].astype(str)
df_kdd['veterans_benefits'] = df_kdd['veterans_benefits'].astype(str)

y_kdd = df_kdd['income']
X_kdd = df_kdd.drop('income', axis=1)

# Get Dummy
X_kdd = pd.get_dummies(X_kdd)

#Scale
col_num_kdd = ['age',	'wage_per_hour',	'capital_gains',	'capital_losses',	'divdends_stocks',	'instance_weight',	
                  'num_persons_worked_for_employer',	'total_person_income',	'total_person_earnings',	'weeks_worked_in_year']

scaler = StandardScaler()
scaler.fit(X_kdd[col_num_kdd])
X_kdd[col_num_kdd] = scaler.transform(X_kdd[col_num_kdd])


In [None]:
Counter(X_kdd['sex'])

Counter({0: 91618, 1: 99621})

##Train_Test_Split

In [None]:
X_train_kdd, X_test_kdd, y_train_kdd, y_test_kdd = train_test_split(X_kdd, y_kdd, test_size=0.3, random_state=42)

##Protect

In [None]:
# Protected female
train_p_kdd = []
sum=0

for i in range(len(X_train_kdd)):
  if X_train_kdd.iloc[i]['sex'] == 1:
    train_p_kdd.append(1)
  else:
    train_p_kdd.append(0)

train_p_kdd = np.array(train_p_kdd)

test_p_kdd = []
for i in range(len(X_test_kdd)):
  if X_test_kdd.iloc[i]['sex'] == 1:
    test_p_kdd.append(1)
  else:
    test_p_kdd.append(0)

test_p_kdd = np.array(test_p_kdd)

X_train_kdd = X_train_kdd.values
X_test_kdd = X_test_kdd.values
y_train_kdd = y_train_kdd.values
y_test_kdd = y_test_kdd.values

##Model SMOTEBoost

In [None]:
test_base_clf = lambda: DecisionTreeClassifier(max_depth=5)
smoteboost_test = SMOTEBoost(base_classifier = test_base_clf,n_samples=500, n_estimators=10, k_neighbors=5, random_state=42) 
smoteboost_test.fit(X_train_kdd, y_train_kdd)
y_pred_kdd_sb = smoteboost_test.predict(X_test_kdd)

##Model Adaboost

In [None]:
adaboost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5), algorithm="SAMME", n_estimators=10, random_state=42)
adaboost.fit(X_train_kdd, y_train_kdd)
y_pred_kdd_ad = adaboost.predict(X_test_kdd)

##SMOTEBoost Score

In [None]:
EqOdd, TPR_P, TPR_NP, TNR_P, TNR_NP = [], [], [], [], []
Bal_acc, Acc = [], []

bal_score = balanced_accuracy_score(y_test_kdd, y_pred_kdd_sb)
accu_score = accuracy_score(y_test_kdd, y_pred_kdd_sb)
EqOdds, TPR_protect, TPR_non_protect, TNR_protect, TNR_non_protect = score_model(y_test_kdd, y_pred_kdd_sb, test_p_kdd)

Bal_acc.append(bal_score)
Acc.append(accu_score)
EqOdd.append(EqOdds)
TPR_P.append(TPR_protect)
TPR_NP.append(TPR_non_protect)
TNR_P.append(TNR_protect)
TNR_NP.append(TNR_non_protect)

##Adaboost Score

In [None]:
bal_score = balanced_accuracy_score(y_test_kdd, y_pred_kdd_ad)
accu_score = accuracy_score(y_test_kdd, y_pred_kdd_ad)
EqOdds, TPR_protect, TPR_non_protect, TNR_protect, TNR_non_protect = score_model(y_test_kdd, y_pred_kdd_ad, test_p_kdd)

Bal_acc.append(bal_score)
Acc.append(accu_score)
EqOdd.append(EqOdds)
TPR_P.append(TPR_protect)
TPR_NP.append(TPR_non_protect)
TNR_P.append(TNR_protect)
TNR_NP.append(TNR_non_protect)

##Table Scores

In [None]:
table = pd.DataFrame({'Bal': Bal_acc, 'Accuracy':Acc,
                      'EqOdds': EqOdd,'TPR_P':TPR_P ,
                      'TPR_NP': TPR_NP,'TNR_P':TNR_P,
                      'TNR_NP': TNR_NP},
                     
                     index = ['SMOTEBoost', 'Adaboost'])

table

Unnamed: 0,Bal,Accuracy,EqOdds,TPR_P,TPR_NP,TNR_P,TNR_NP
SMOTEBoost,0.685344,0.898487,0.157256,0.365229,0.46471,0.953971,0.896196
Adaboost,0.675385,0.949488,0.243334,0.185984,0.413076,0.994212,0.97797
