[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1ylGbiSrmOgRxCI4WWAGJuYYdZnDV40Em?authuser=2#scrollTo=wE_9Gkc7dr2F)

## Imports

In [None]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.neighbors import NearestNeighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import normalize
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, accuracy_score
pd.set_option('display.max_columns', 500)

import warnings
warnings.filterwarnings("ignore")

## Classes: SMOTE & SMOTEBoost

In [None]:
from numpy.ma.core import count
class SMOTE():
    """Synthetic Minority Over-sampling Technique (SMOTE) sampler.

    Usage: ``SMOTE(...).fit(X_minority).sample()``. ``fit`` indexes the given
    rows with a nearest-neighbour search; ``sample`` then draws
    ``n_samples`` synthetic rows, each placed at a random position on the
    segment between a randomly chosen row and one of its nearest neighbours.

    Parameters
    ----------
    n_samples : int
        Number of synthetic rows returned by ``sample``.
    k_neighbors : int, default 5
        Number of nearest neighbours (excluding the row itself) considered
        when picking the second end of the interpolation segment.
    random_state : int or None, default None
        Seed passed to ``np.random.seed`` at the start of every ``sample``
        call.
    """

    def __init__(self, n_samples, k_neighbors=5, random_state=None):
        self.n_samples = n_samples
        self.k = k_neighbors
        self.random_state = random_state

    def sample(self):
        """Return an ``(n_samples, n_features)`` array of synthetic rows.

        Must be called after ``fit``. Note that this re-seeds NumPy's
        *global* random generator with ``random_state``, so two calls with
        the same non-None seed return the same rows.
        """
        np.random.seed(seed=self.random_state)
        S = np.zeros(shape=(self.n_samples, self.n_features))

        for i in range(self.n_samples):
            # Pick a random base row, then look up its neighbours.
            j = np.random.randint(0, self.X.shape[0])
            base = self.X[j]
            neighbors = self.neigh.kneighbors(
                base.reshape(1, -1), return_distance=False
            )[0, 1:]  # column 0 is dropped as the query row itself

            if neighbors.size == 0:
                # No other row to interpolate towards (single-row minority
                # class or k_neighbors == 0): fall back to a plain copy.
                S[i, :] = base
                continue

            nn_index = np.random.choice(neighbors)
            fraction = np.random.random()
            # Synthetic point = base + fraction * (neighbour - base).
            S[i, :] = base + fraction * (self.X[nn_index] - base)
        return S

    def fit(self, X):
        """Index the minority-class rows ``X`` (2-D array) and return ``self``.

        Raises
        ------
        ValueError
            If ``X`` contains no rows.
        """
        self.X = X
        self.n_minority, self.n_features = self.X.shape
        if self.n_minority == 0:
            raise ValueError("SMOTE.fit needs at least one minority-class sample")

        # k + 1 because each row is returned as its own nearest neighbour;
        # capped so small minority classes do not make kneighbors() fail.
        n_neighbors = min(self.k + 1, self.n_minority)
        self.neigh = NearestNeighbors(n_neighbors=n_neighbors)
        self.neigh.fit(self.X)
        return self

#----------------------------------------------------------------------------------------------------------#

class SMOTEBoost():
    """Binary SMOTEBoost ensemble (boosting with per-round SMOTE oversampling).

    Labels must be encoded as 0/1: the update rule uses ``np.logical_not(y)``
    as "the other class" and ``predict`` indexes a two-column vote table with
    the predicted labels.

    Parameters
    ----------
    base_classifier : callable
        Zero-argument factory returning a new, unfitted estimator exposing
        ``fit(X, y, sample_weight=...)`` and ``predict(X)``.
    n_samples : int, default 100
        Number of synthetic minority rows generated in every round.
    n_estimators : int, default 50
        Number of boosting rounds.
    k_neighbors : int, default 5
        Neighbour count forwarded to ``SMOTE``.
    random_state : int or None, default 42
        Seed forwarded to ``SMOTE``.
    """

    def __init__(self, base_classifier = None, n_samples = 100, n_estimators = 50, k_neighbors = 5, random_state = 42):
        self.base_classifier = base_classifier
        self.n_samples = n_samples
        self.n_estimators = n_estimators
        self.k_neighbors = k_neighbors
        self.random_state = random_state

    def fit(self, X, y):
        """Fit the ensemble on NumPy arrays ``X`` (2-D) and ``y`` (0/1 labels).

        Populates ``self.classifiers`` and ``self.beta`` (one entry per
        round) and returns ``self``.

        Raises
        ------
        TypeError
            If ``base_classifier`` is not callable.
        """
        if not callable(self.base_classifier):
            raise TypeError(
                "base_classifier must be a zero-argument callable returning "
                "an unfitted estimator"
            )

        n_obs = X.shape[0]
        distribution = np.ones(n_obs, dtype=float) / n_obs
        self.classifiers = []
        self.beta = []

        # The training data never changes, so neither does the minority class.
        class_counts = Counter(y)
        minority_class = min(class_counts, key=class_counts.get)
        X_min = X[np.where(y == minority_class)]

        # Keeps eps_t strictly inside (0, 1) so beta_t is positive and finite.
        tiny = 1e-10

        for _ in range(self.n_estimators):
            # NOTE(review): SMOTE re-seeds with the same random_state each
            # round, so a non-None seed yields identical synthetic rows in
            # every round. Left as is to keep results reproducible.
            self.smote = SMOTE(n_samples=self.n_samples, k_neighbors=self.k_neighbors, random_state=self.random_state)
            self.smote.fit(X_min)
            X_syn = self.smote.sample()
            y_syn = np.full(X_syn.shape[0], fill_value=minority_class, dtype=np.int64)

            # Synthetic rows get the uniform weight 1/n_obs; the combined
            # weights are then L1-normalised before training.
            distribution_syn = np.full(X_syn.shape[0], 1. / n_obs, dtype=np.float64)
            mod_distribution = np.append(distribution, distribution_syn).reshape(1, -1)
            mod_distribution = np.squeeze(normalize(mod_distribution, axis=1, norm='l1'))

            mod_X = np.vstack((X, X_syn))
            mod_y = np.append(y, y_syn)

            clf = self.base_classifier()
            clf.fit(mod_X, mod_y, sample_weight=mod_distribution)
            self.classifiers.append(clf)

            # Error and weight update are computed on the ORIGINAL rows only.
            y_pred_t = clf.predict(X)
            hit = (y == y_pred_t)
            miss = (np.logical_not(y) == y_pred_t)

            # Pseudo-loss: 0 for a correct row, 2 * weight for a wrong one.
            eps_t = np.sum((1 - hit + miss) * distribution)
            # Guard the degenerate cases: eps_t == 0 gives beta_t == 0
            # (infinite vote, all-zero weights) and eps_t >= 1 gives a
            # non-positive or infinite beta_t (NaN votes).
            eps_t = float(np.clip(eps_t, tiny, 1 - tiny))
            beta_t = eps_t/(1-eps_t)
            # w_t is 1 for correctly classified rows and 0 otherwise, so only
            # correct rows are scaled by beta_t.
            w_t = 0.5 * (1 + hit - miss)

            self.beta.append(beta_t)

            # Normalise by the sum of the UPDATED weights so the distribution
            # keeps summing to 1 in every round.
            distribution = distribution * beta_t ** w_t
            distribution = distribution / np.sum(distribution)

        return self

    def predict(self, X):
        """Return the 0/1 label with the largest ``log(1 / beta)``-weighted vote.

        ``X`` is a 2-D NumPy array; the result has one label per row.
        """
        final_pred = np.zeros((X.shape[0], 2))
        for beta, clf in zip(self.beta, self.classifiers):
            yp = clf.predict(X)
            # Each learner adds its vote weight to the class it predicts.
            final_pred[range(len(X)), yp] += np.log(1/beta)
        final_pred = np.argmax(final_pred,axis=1)
        return final_pred

## Calculate Scores

In [None]:
def _error_rates(y_true, y_pred):
    """Return ``(false_positive_rate, false_negative_rate)`` for 0/1 arrays.

    A rate whose denominator is zero (no negatives / no positives in
    ``y_true``) is reported as NaN.
    """
    tp = int(np.count_nonzero((y_true == 1) & (y_pred == 1)))
    tn = int(np.count_nonzero((y_true == 0) & (y_pred == 0)))
    fp = int(np.count_nonzero((y_true == 0) & (y_pred == 1)))
    fn = int(np.count_nonzero((y_true == 1) & (y_pred == 0)))

    fpr = fp / (fp + tn) if (fp + tn) else float("nan")
    fnr = fn / (fn + tp) if (fn + tp) else float("nan")
    return fpr, fnr


def score_model(y_true, y_pred, protect):
    """Compute equalized-odds and per-group TPR/TNR for binary predictions.

    Parameters
    ----------
    y_true, y_pred : array-like of 0/1
        True and predicted labels, aligned element-wise.
    protect : array-like of 0/1
        Group membership per sample: 1 = protected, 0 = non-protected.

    Returns
    -------
    tuple
        ``(EqOdds, TPR_protect, TPR_non_protect, TNR_protect,
        TNR_non_protect)`` where ``EqOdds`` is
        ``|FPR_np - FPR_p| + |FNR_np - FNR_p|``. If ``protect`` contains any
        value other than 0 or 1, five zeros are returned instead (same arity
        as the normal result, so callers can always unpack five values).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    protect = np.asarray(protect)

    in_protected = protect == 1
    in_non_protected = protect == 0
    if not np.all(in_protected | in_non_protected):
        return (0, 0, 0, 0, 0)

    # Counting with boolean masks keeps the four cells well defined even when
    # a group contains only one of the two classes.
    fpr_p, fnr_p = _error_rates(y_true[in_protected], y_pred[in_protected])
    fpr_np, fnr_np = _error_rates(y_true[in_non_protected], y_pred[in_non_protected])

    diff_fnr = fnr_np - fnr_p
    diff_fpr = fpr_np - fpr_p

    # Scores
    EqOdds = abs(diff_fpr) + abs(diff_fnr)
    TPR_protect = 1 - fnr_p
    TPR_non_protect = 1 - fnr_np
    TNR_protect = 1 - fpr_p
    TNR_non_protect = 1 - fpr_np

    return (EqOdds, TPR_protect, TPR_non_protect, TNR_protect, TNR_non_protect)

## Dataset: COMPAS

In [None]:
# Load the ProPublica COMPAS two-year recidivism data.
url = 'https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv'
df_compas = pd.read_csv(url)

# Identifier, free-text, date and violent-recidivism columns that are not
# used as features.
drop_col_compas = [
    'id', 'name', 'first', 'last', 'compas_screening_date', 'dob',
    'c_case_number', 'c_charge_desc', 'violent_recid', 'is_violent_recid',
    'c_jail_in', 'c_jail_out', 'c_offense_date', 'c_arrest_date',
    'days_b_screening_arrest',
    'r_case_number', 'r_charge_degree', 'r_days_from_arrest',
    'r_offense_date', 'r_charge_desc', 'r_jail_in', 'r_jail_out',
    'vr_case_number', 'vr_charge_degree', 'vr_offense_date',
    'vr_charge_desc', 'screening_date', 'v_screening_date',
    'in_custody', 'out_custody',
]

df_compas = df_compas.drop(columns=drop_col_compas)
# Assign back instead of a chained inplace replace, which does not update the
# frame under pandas Copy-on-Write.
df_compas['c_days_from_compas'] = df_compas['c_days_from_compas'].fillna(0)

y_compas = df_compas['two_year_recid']
X_compas = df_compas.drop('two_year_recid', axis=1)

# Encode the protected attribute numerically: Male -> 0, Female -> 1.
X_compas['sex'] = X_compas['sex'].map({'Male': 0, 'Female': 1})

# Get Dummy
X_compas = pd.get_dummies(X_compas)

# Scale
# NOTE(review): 'is_recid' looks closely related to the 'two_year_recid'
# target and may leak the label into the features -- confirm it is meant to
# stay in the feature set.
col_num_compas = ['age', 'juv_fel_count', 'decile_score', 'juv_misd_count', 'juv_other_count', 'priors_count', 'c_days_from_compas',
                    'is_recid', 'decile_score.1', 'v_decile_score', 'priors_count.1', 'start', 'end']

# Standardise exactly once; transforming a second time with the same fitted
# scaler would shift and shrink the already standardised columns again.
scaler = StandardScaler()
X_compas[col_num_compas] = scaler.fit_transform(X_compas[col_num_compas])

In [None]:
# Display how many rows carry each encoded value of the 'sex' column.
Counter(X_compas['sex'])

Counter({0: 5819, 1: 1395})

## Train/Test Split

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
(
    X_train_compas,
    X_test_compas,
    y_train_compas,
    y_test_compas,
) = train_test_split(
    X_compas,
    y_compas,
    test_size=0.3,
    random_state=42,
)

## Protected Attribute

In [None]:
# Protected female
# Protected female
# 1 where the 'sex' column equals 1, 0 otherwise. Computed column-wise rather
# than row by row with .iloc, which is very slow on a DataFrame.
train_p_compas = (X_train_compas['sex'] == 1).astype(int).values
test_p_compas = (X_test_compas['sex'] == 1).astype(int).values

# The flags above need the column names, so convert to plain NumPy arrays
# only afterwards.
X_train_compas = X_train_compas.values
X_test_compas = X_test_compas.values
y_train_compas = y_train_compas.values
y_test_compas = y_test_compas.values

## Model: SMOTEBoost

In [None]:
def test_base_clf():
    """Return a new, unfitted depth-5 decision tree."""
    return DecisionTreeClassifier(max_depth=5)


# Ten rounds, each trained with 500 synthetic minority rows.
smoteboost_test = SMOTEBoost(
    base_classifier=test_base_clf,
    n_samples=500,
    n_estimators=10,
    k_neighbors=5,
    random_state=42,
)
smoteboost_test.fit(X_train_compas, y_train_compas)
y_pred_compas_sb = smoteboost_test.predict(X_test_compas)

## Model: AdaBoost

In [None]:
# Baseline without oversampling: AdaBoost (SAMME) with ten depth-5 trees.
adaboost = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=5),
    algorithm="SAMME",
    n_estimators=10,
    random_state=42,
)
adaboost.fit(X_train_compas, y_train_compas)

# Predictions on the held-out split, scored further below.
y_pred_compas_ad = adaboost.predict(X_test_compas)

## SMOTEBoost Scores

In [None]:
# One list per metric; every scored model adds one entry, in scoring order.
Bal_acc, Acc = [], []
EqOdd, TPR_P, TPR_NP, TNR_P, TNR_NP = [], [], [], [], []

# Overall accuracy figures for the SMOTEBoost predictions.
accu_score = accuracy_score(y_test_compas, y_pred_compas_sb)
bal_score = balanced_accuracy_score(y_test_compas, y_pred_compas_sb)

# Per-group rates and the equalized-odds gap for the same predictions.
(EqOdds, TPR_protect, TPR_non_protect,
 TNR_protect, TNR_non_protect) = score_model(y_test_compas, y_pred_compas_sb, test_p_compas)

Acc += [accu_score]
Bal_acc += [bal_score]
EqOdd += [EqOdds]
TPR_P += [TPR_protect]
TPR_NP += [TPR_non_protect]
TNR_P += [TNR_protect]
TNR_NP += [TNR_non_protect]

## AdaBoost Scores

In [None]:
# Same metrics as for SMOTEBoost, now for the AdaBoost predictions; the
# results become the second entry of every metric list.
accu_score = accuracy_score(y_test_compas, y_pred_compas_ad)
bal_score = balanced_accuracy_score(y_test_compas, y_pred_compas_ad)
(EqOdds, TPR_protect, TPR_non_protect,
 TNR_protect, TNR_non_protect) = score_model(y_test_compas, y_pred_compas_ad, test_p_compas)

Acc.extend((accu_score,))
Bal_acc.extend((bal_score,))
EqOdd.extend((EqOdds,))
TPR_P.extend((TPR_protect,))
TPR_NP.extend((TPR_non_protect,))
TNR_P.extend((TNR_protect,))
TNR_NP.extend((TNR_non_protect,))

## Score Table

In [None]:
# Column name -> metric list; each list holds one value per scored model.
score_columns = {
    'Bal': Bal_acc,
    'Accuracy': Acc,
    'EqOdds': EqOdd,
    'TPR_P': TPR_P,
    'TPR_NP': TPR_NP,
    'TNR_P': TNR_P,
    'TNR_NP': TNR_NP,
}

# Rows follow the order in which the models were scored.
table = pd.DataFrame(score_columns, index=['SMOTEBoost', 'Adaboost'])

table

Unnamed: 0,Bal,Accuracy,EqOdds,TPR_P,TPR_NP,TNR_P,TNR_NP
SMOTEBoost,0.976165,0.976905,0.013625,0.970803,0.969549,0.992308,0.979937
Adaboost,0.990042,0.989376,0.015131,1.0,0.995128,0.992308,0.982049
