In [267]:
import os
import sys
import copy
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


import warnings
warnings.filterwarnings("ignore")

In [268]:
data = pd.read_csv("./../../data/fraud_detection_bank_dataset.csv")
col_names = [f"col_{i}" for i in range(111)]
target = "targets"

train_data, test_data = train_test_split(data, train_size=0.8, random_state=123)
X_train, y_train = train_data[col_names].values, train_data[target].values
X_test, y_test = test_data[col_names].values, test_data[target].values

## Decision Stump

### Pseudo-Code

  Input: Feature Matrix X and  label vector y

  ``` 
    for each feature j ('Column' of X)
        for each threshold t
            set `y_yes` to most commom label of obejects i satisfying rule (xij > t)
            set `y_no` to most commom label of obejects i not satisfying rule (xij <= t)
            set `y_hat` to be prediction
            compute error
            store the rule (j, t, y_yes, y_no), if it has the lowest error
  ```

### Cost of Decision Stumps

Assume we have:

   - ‘n’ examples (days that we measured).

   - ‘d’ features (foods that we measured).

   - ‘k’ thresholds (>0, >1, >2, …) for each feature

   - final cost is $O(ndk)$, assume k=n, then it is $O(n^2d)$


### Improvement

  - Accuracy is not a good way to split the feature.

  - $O(n^2d)$ can be improved to $O(ndlog(n))$


In [311]:
def accuracy(y, y_hat):
    
    return (y == y_hat).sum() / len(y)


class Decision_stump():

    def fit(self, X, y):
    
        accu_max = -np.inf
        model = []

        for idx, feature in enumerate(X.T):
            
            threshs = set(feature)
            for thresh in threshs:
                
                y_yes = y[feature > thresh]
                y_no = y[feature <= thresh]
                
                if y_yes.sum() < int(0.5 * len(y_yes)):
                    y_hat_yes = np.zeros_like(y_yes)
                else:
                    y_hat_yes = np.ones_like(y_yes)
                    
                if y_no.sum() < int(0.5 * len(y_no)):
                    y_hat_no = np.zeros_like(y_no)
                else:
                    y_hat_no = np.ones_like(y_no)

                y_hat_con = np.concatenate([y_hat_yes, y_hat_no])
                y_con = np.concatenate([y_yes, y_no])

                accu = accuracy(y_hat_con, y_con)
                if accu > accu_max:
                    model = [idx, thresh, accu, y_hat_yes[0], y_hat_no[0]]
                    accu_max = accu
        
        self.model = model        
    
    def predict(self, X):
        
        idx, thresh, _, y_yes_fill, y_no_fill = self.model
        prediction = np.where(X[:, idx]>thresh, y_yes_fill, y_no_fill)

        return prediction
        
    

In [312]:
decision_stump = Decision_stump()
decision_stump.fit(X_train, y_train)

y_train_hat = decision_stump.predict(X_train)
y_test_hat = decision_stump.predict(X_test)

print("Train accuracy: ", accuracy(y_train, y_train_hat))
print("Test accuracy: ", accuracy(y_test, y_test_hat))

Train accuracy:  0.8229510199096128
Test accuracy:  0.8346360527601367


In [313]:
decision_stump.model

[83, 0.0, 0.8229510199096128, 1, 0]

### $O(n^2d)$ can be improved to $O(ndlog(n))$

<img src="./decision_tree_1.jpg" width=600 height=300>

  - pre-order every feature, it took nlog(n), then repeat for d times

  - I tried the algorithm above, but maybe because of numpy's efficiency, I somehow didn't get a faster version

In [272]:
class Decision_stump():

    def fit(self, X, y):
    
        accu_max = -np.inf
        model = []

        for feat_num, feature in enumerate(X.T):
                        
            sorted_feature = zip(feature, y)
            sorted_feature = sorted(sorted_feature, key=lambda x: x[0], reverse=False)
            _, sorted_y = zip(*sorted_feature)


            p_count_unsatis, f_count_unsatis = 0, 0
            p_count_satis, f_count_satis = sum(sorted_y), len(sorted_y) - sum(sorted_y)

            if p_count_satis < f_count_satis:
                sorted_y_hat = np.zeros_like(sorted_y)
            else:
                sorted_y_hat = np.ones_like(sorted_y)


            thresh_prev = sorted_feature[0][0]

            for idx, (thresh, label) in enumerate(sorted_feature):
                
                if label == 1:
                    p_count_unsatis += 1
                    p_count_satis -= 1
                else:
                    f_count_unsatis += 1
                    f_count_satis -= 1
                
                if thresh != thresh_prev or idx == len(sorted_feature) - 1:
                    
                    if p_count_unsatis < f_count_unsatis:
                        sorted_y_hat[:idx] = 0
                    else:
                        sorted_y_hat[:idx] = 1
                        
                    if p_count_satis < f_count_satis:
                        sorted_y_hat[idx:] = 0
                    else:
                        sorted_y_hat[idx:] = 1
                        
                    
                    accu = accuracy(sorted_y, sorted_y_hat)
                    if accu > accu_max:
                        model = [feat_num, thresh_prev, accu, sorted_y_hat[-1], sorted_y_hat[0]]
                        accu_max = accu
                        
                    thresh_prev = thresh

                    
        
        self.model = model        
    
    def predict(self, X):
        
        idx, thresh, _, y_yes_fill, y_no_fill = self.model
        prediction = np.where(X[:, idx]>thresh, y_yes_fill, y_no_fill)

        return prediction
        
    

In [273]:
decision_stump_1 = Decision_stump()
decision_stump_1.fit(X_train, y_train)

y_train_hat = decision_stump_1.predict(X_train)
y_test_hat = decision_stump_1.predict(X_test)

print("Train accuracy: ", accuracy(y_train, y_train_hat))
print("Test accuracy: ", accuracy(y_test, y_test_hat))

Train accuracy:  0.8229510199096128
Test accuracy:  0.8346360527601367


### Use entropy instead of using accuracy

#### Pseudo-Code

  Input: vector y

  ``` 
    counter_dict = dict
    for ele feature y
      dict[ele] += 1

    entropy = 0
    for i in dict:
      prob = dict[i] / n
      entropy -= prob * log(prob)
    
    return entropy
  ```



In [306]:
from collections import Counter

def entropy(y):
    
    p_dist = np.array(list(Counter(y).values()))
    p_dist = p_dist / p_dist.sum()
    ent = (-1 * np.log(p_dist) * p_dist).sum()
    
    return ent


In [314]:


class Decision_stump_entropy():
    
    @staticmethod
    def entropy(y):
        p_dist = np.array(list(Counter(y).values()))
        p_dist = p_dist / p_dist.sum()
        ent = (-1 * np.log(p_dist) * p_dist).sum()
        return ent

    def fit(self, X, y):
    
        gain_max = -np.inf
        model = []

        for idx, feature in enumerate(X.T):
            
            threshs = set(feature)
            for thresh in threshs:
                
                y_yes = y[feature > thresh]
                y_no = y[feature <= thresh]
                
                y_hat_yes = int(y_yes.sum() >= int(0.5 * len(y_yes)))
                y_hat_no = int(y_no.sum() >= int(0.5 * len(y_no)))
                
                gain = entropy(y) - (len(y_yes) * entropy(y_yes) + len(y_no) * entropy(y_no)) / len(y)

                if gain > gain_max:
                    model = [idx, thresh, gain, y_hat_yes, y_hat_no]
                    gain_max = gain
                
                print([idx, thresh, gain, y_hat_yes, y_hat_no])
        
        self.model = model        
    
    def predict(self, X):
        
        idx, thresh, _, y_yes_fill, y_no_fill = self.model
        prediction = np.where(X[:, idx]>thresh, y_yes_fill, y_no_fill)

        return prediction
        
    

In [333]:
np.linspace(0, 1, min(len(set(feature)), 100))

array([0.        , 0.01010101, 0.02020202, 0.03030303, 0.04040404,
       0.05050505, 0.06060606, 0.07070707, 0.08080808, 0.09090909,
       0.1010101 , 0.11111111, 0.12121212, 0.13131313, 0.14141414,
       0.15151515, 0.16161616, 0.17171717, 0.18181818, 0.19191919,
       0.2020202 , 0.21212121, 0.22222222, 0.23232323, 0.24242424,
       0.25252525, 0.26262626, 0.27272727, 0.28282828, 0.29292929,
       0.3030303 , 0.31313131, 0.32323232, 0.33333333, 0.34343434,
       0.35353535, 0.36363636, 0.37373737, 0.38383838, 0.39393939,
       0.4040404 , 0.41414141, 0.42424242, 0.43434343, 0.44444444,
       0.45454545, 0.46464646, 0.47474747, 0.48484848, 0.49494949,
       0.50505051, 0.51515152, 0.52525253, 0.53535354, 0.54545455,
       0.55555556, 0.56565657, 0.57575758, 0.58585859, 0.5959596 ,
       0.60606061, 0.61616162, 0.62626263, 0.63636364, 0.64646465,
       0.65656566, 0.66666667, 0.67676768, 0.68686869, 0.6969697 ,
       0.70707071, 0.71717172, 0.72727273, 0.73737374, 0.74747

In [332]:
feature = X.T[0]

np.quantile(a=feature, q=)

array([  0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   1.        ,   1.  

In [325]:
np.linspace(0, 1, 101)

array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
       0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
       0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
       0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
       0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
       0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
       0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
       0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
       0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
       0.99, 1.  ])

TypeError: _linspace_dispatcher() missing 1 required positional argument: 'stop'

In [315]:
decision_stump_2 = Decision_stump_entropy()
decision_stump_2.fit(X_train, y_train)

y_train_hat = decision_stump_2.predict(X_train)
y_test_hat = decision_stump_2.predict(X_test)


[0, 0.0, 0.0284530403314065, 0, 0]
[0, 1.0, 0.021214487602840948, 0, 0]
[0, 2.0, 0.017387139137669427, 0, 0]
[0, 3.0, 0.013676996738655856, 0, 0]
[0, 4.0, 0.01161409913446787, 0, 0]
[0, 5.0, 0.010912255512797953, 0, 0]
[0, 6.0, 0.009319290161863947, 0, 0]
[0, 7.0, 0.008128684466071623, 0, 0]
[0, 8.0, 0.00728743928094211, 0, 0]
[0, 9.0, 0.006245805970368612, 0, 0]
[0, 10.0, 0.005996483921499429, 0, 0]
[0, 11.0, 0.006113604871167855, 0, 0]
[0, 12.0, 0.0054740186610166, 0, 0]
[0, 13.0, 0.004921502979535863, 0, 0]
[0, 14.0, 0.004608298016505308, 0, 0]
[0, 15.0, 0.0041482891690227675, 0, 0]
[0, 16.0, 0.00363331726908922, 0, 0]
[0, 17.0, 0.0033948551157269025, 0, 0]
[0, 18.0, 0.003133551538771484, 0, 0]
[0, 19.0, 0.002944900526546723, 0, 0]
[0, 20.0, 0.002850156387742553, 0, 0]
[0, 21.0, 0.0025985053314266926, 0, 0]
[0, 22.0, 0.0025468788714316792, 0, 0]
[0, 23.0, 0.002344320115099441, 0, 0]
[0, 24.0, 0.002110114388868878, 0, 0]
[0, 25.0, 0.0019436267160480858, 0, 0]
[0, 26.0, 0.001684543054

In [308]:
decision_stump_2.model        
        
    
    
# accuracy(yy, yy_hat)

[0, 0.0, 0.5814292179902534, 0, 0]

In [42]:
feature

array([ 0.,  3.,  0., ...,  0.,  0., 16.])

In [43]:
y

array([0, 0, 1, ..., 0, 0, 0])

In [45]:
np.shape(y[feature > thresh])

(6228,)