In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os


In [2]:
# Load the raw payroll table from CSV.

dataPayment = '../Datasets/Employee_Payroll.csv'
payroll = pd.read_csv(dataPayment)

# Clamp negative values in every numeric column up to 0.
# (NaN handling happens in the fillna call further down, not here.)
numeric_cols = payroll.select_dtypes(include=[np.number]).columns
payroll[numeric_cols] = payroll[numeric_cols].clip(lower=0)


# Replace every remaining NaN (in all columns, not only the numeric ones)
# with 0, then store 'Office' as integers.
payroll.fillna(0, inplace=True)
payroll['Office'] = payroll['Office'].astype(int)

# define column for 1/4 year discretization
# Quarter q becomes the year fraction (q - 1) / 4 (1 -> 0.0, 2 -> 0.25, ...),
# so 'Fiscal Period' is the fiscal year plus that fraction.
payroll['Fiscal Quarter'] = payroll['Fiscal Quarter']*0.25 - 0.25
payroll['Fiscal Period'] = payroll['Fiscal Year'] + payroll['Fiscal Quarter']

# NOTE(review): the result of this call is not assigned, so the column itself
# is left unchanged; at most the call raises if a value cannot be parsed.
pd.to_datetime(payroll['Original Hire Date'])

# parse hire date to get hire year
# Keeps the third '/'-separated field of the date string as the year
# (presumably the dates look like M/D/YYYY - TODO confirm against the CSV).
payroll['Original Hire Date'] = payroll['Original Hire Date'].str.split('/').str[2]
payroll['Original Hire Date'] = payroll['Original Hire Date'].astype(int)

# Fractional years between the hire year and each fiscal period. Kept in its
# own variable so it can be attached to the table in a later cell.
_work_year = payroll["Fiscal Period"] - payroll['Original Hire Date']

In [3]:
# specify table column included

# .copy() makes the selection an independent frame, so adding the
# "Working Year" column below does not write into a slice of the original
# table (which can trigger pandas' SettingWithCopyWarning).
payroll = payroll[['Fiscal Year', 'Fiscal Period', 'Job Code', 'Job Title', 'Base Pay', 'Position ID', 'Employee Identifier']].copy()

# add Working Year Column
# _work_year is a Series computed from the full table; the assignment aligns
# it with the selected rows by index.
payroll["Working Year"] = _work_year

payroll.head(20)

Unnamed: 0,Fiscal Year,Fiscal Period,Job Code,Job Title,Base Pay,Position ID,Employee Identifier,Working Year
0,2016,2016.0,1172,Assistant State's Attorney,20088.0,9510200,6ac7ba3e-d286-44f5-87a0-191dc415e23c,11.0
1,2016,2016.25,1172,Assistant State's Attorney,23436.0,9510200,6ac7ba3e-d286-44f5-87a0-191dc415e23c,11.25
2,2016,2016.5,1172,Assistant State's Attorney,20422.82,9510200,6ac7ba3e-d286-44f5-87a0-191dc415e23c,11.5
3,2016,2016.75,1172,Assistant State's Attorney,23904.8,9510200,6ac7ba3e-d286-44f5-87a0-191dc415e23c,11.75
4,2017,2017.0,1172,Assistant State's Attorney,20745.8,9510200,6ac7ba3e-d286-44f5-87a0-191dc415e23c,12.0
5,2017,2017.25,1172,Assistant State's Attorney,24473.38,9510200,6ac7ba3e-d286-44f5-87a0-191dc415e23c,12.25
6,2017,2017.5,1172,Assistant State's Attorney,21217.35,9510200,6ac7ba3e-d286-44f5-87a0-191dc415e23c,12.5
7,2016,2016.0,5049,Residential Model Sr Anal III,17770.86,9500731,f313b1c3-1b1a-4b07-bb75-a8c850a91bac,18.0
8,2016,2016.25,5049,Residential Model Sr Anal III,20800.67,9500731,f313b1c3-1b1a-4b07-bb75-a8c850a91bac,18.25
9,2016,2016.5,5049,Residential Model Sr Anal III,17873.76,9500731,f313b1c3-1b1a-4b07-bb75-a8c850a91bac,18.5


In [4]:
class RandomForestRegressor:
    """Bagged ensemble of DecisionTreeRegressor trees whose predictions are averaged.

    Parameters
    ----------
    n_estimators : int
        Number of trees grown by fit().
    max_depth : int
        Depth budget handed to every tree.
    min_samples_split : int
        Passed through unchanged to every tree.
    """

    def __init__(self, n_estimators=10, max_depth=3, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, X, y):
        """Grow n_estimators trees, each on a bootstrap resample of (X, y).

        X is expected to be 2-D (rows are samples) and y 1-D; both are coerced
        with np.asarray so DataFrames / Series / lists are accepted.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        # Start from an empty ensemble so that calling fit() again replaces the
        # previous trees instead of appending to them.
        self.trees = []
        n_samples = X.shape[0]
        for _ in range(self.n_estimators):
            tree = DecisionTreeRegressor(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            # Bootstrap: draw n_samples row indices with replacement.
            idx = np.random.choice(n_samples, size=n_samples, replace=True)
            tree.fit(X[idx], y[idx])
            self.trees.append(tree)

    def predict(self, X):
        """Return, for every row of X, the mean of the individual tree predictions."""
        X = np.asarray(X)
        # One column per tree; averaged across columns at the end.
        predictions = np.zeros((X.shape[0], len(self.trees)))
        for i, tree in enumerate(self.trees):
            predictions[:, i] = tree.predict(X)
        return np.mean(predictions, axis=1)

class DecisionTreeRegressor:
    """One node of a recursive binary regression tree (variance-reduction splits).

    A fitted node is either a leaf (``prediction`` holds the mean target) or an
    internal node (``split_feature`` / ``split_value`` set, with ``left`` and
    ``right`` child nodes). Rows with feature < split_value go left, rows with
    feature >= split_value go right.

    Parameters
    ----------
    max_depth : int
        Remaining depth budget; a node with max_depth == 0 becomes a leaf.
    min_samples_split : int
        A node with fewer rows than this becomes a leaf, and a candidate split
        is rejected when either side would hold fewer rows than this.
    """

    def __init__(self, max_depth=3, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.split_feature = None
        self.split_value = None
        self.left = None
        self.right = None
        self.prediction = None

    def fit(self, X, y):
        """Fit this node (and, recursively, its children) on X (2-D) and y (1-D)."""
        X = np.asarray(X)
        y = np.asarray(y)

        # Clear anything left over from an earlier fit(); otherwise a node that
        # used to be a leaf would keep its old `prediction` and predict() would
        # never look at the new split.
        self.split_feature = None
        self.split_value = None
        self.left = None
        self.right = None
        self.prediction = None

        if self.max_depth == 0 or X.shape[0] < self.min_samples_split:
            self.prediction = np.mean(y)
            return
        best_feature, best_value = self.find_best_split(X, y)
        if best_feature is None or best_value is None:
            # No admissible split was found: fall back to a leaf.
            self.prediction = np.mean(y)
            return
        self.split_feature = best_feature
        self.split_value = best_value
        left_idx = X[:, best_feature] < best_value
        right_idx = X[:, best_feature] >= best_value
        self.left = DecisionTreeRegressor(max_depth=self.max_depth-1, min_samples_split=self.min_samples_split)
        self.left.fit(X[left_idx], y[left_idx])
        self.right = DecisionTreeRegressor(max_depth=self.max_depth-1, min_samples_split=self.min_samples_split)
        self.right.fit(X[right_idx], y[right_idx])

    def find_best_split(self, X, y):
        """Return (feature index, threshold) with the largest variance reduction.

        Every distinct value of every feature is tried as a threshold. Returns
        (None, None) when no candidate leaves at least min_samples_split rows
        on both sides.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        best_feature, best_value, best_variance_reduction = None, None, -float('inf')
        # Loop-invariant quantities, computed once instead of per candidate.
        n_total = len(y)
        parent_variance = np.var(y)
        for feature in range(X.shape[1]):
            column = X[:, feature]
            values = np.unique(column)
            if len(values) < 2:
                # A constant column cannot separate the rows.
                continue
            for value in values:
                left_idx = column < value
                right_idx = column >= value
                n_left = np.sum(left_idx)
                n_right = np.sum(right_idx)
                if n_left < self.min_samples_split or n_right < self.min_samples_split:
                    continue
                # Size-weighted average of the two child variances.
                total_variance = (n_left * np.var(y[left_idx]) + n_right * np.var(y[right_idx])) / n_total
                variance_reduction = parent_variance - total_variance
                if variance_reduction > best_variance_reduction:
                    best_feature = feature
                    best_value = value
                    best_variance_reduction = variance_reduction
        return best_feature, best_value

    def predict(self, X):
        """Return a 1-D array with one prediction per row of X."""
        X = np.asarray(X)
        if self.prediction is not None:
            return np.full((X.shape[0],), self.prediction)
        left_idx = X[:, self.split_feature] < self.split_value
        right_idx = X[:, self.split_feature] >= self.split_value
        # Rows matching neither mask (e.g. a NaN feature value) keep the 0.0 default.
        predictions = np.zeros((X.shape[0],))
        predictions[left_idx] = self.left.predict(X[left_idx])
        predictions[right_idx] = self.right.predict(X[right_idx])
        return predictions

In [5]:
from sklearn.model_selection import train_test_split

In [None]:
import random

In [None]:
#X = payroll[['Job Code','Working Year']]
#Y = payroll['Base Pay']
#X.head(10)

In [None]:
#X_train, X_test,y_train,y_test = train_test_split(X,Y,test_size =0.2)
# print the data
#X_train

In [None]:
# Manual 80/20 train/test split: shuffle the rows with a fixed seed, then cut
# the shuffled table at the 80% mark. 'Base Pay' is the target column.
features = ['Job Code', 'Working Year', 'Position ID', 'Fiscal Period', 'Fiscal Year']
nb_train = int(np.floor(0.8 * len(payroll)))
df = payroll.sample(frac=1, random_state=217)

# Feature sets stay DataFrames; targets are pulled out as plain NumPy arrays.
X_train, X_test = df[features].iloc[:nb_train], df[features].iloc[nb_train:]
y_train = df['Base Pay'].iloc[:nb_train].to_numpy()
y_test = df['Base Pay'].iloc[nb_train:].to_numpy()

In [None]:
# Binary entropy helper used by information_gain
def entropy(p):
    """Return the entropy, in bits, of a two-outcome distribution (p, 1 - p).

    p is the probability of one of the two outcomes. The degenerate cases
    p == 0 and p == 1 return 0 directly, which also avoids evaluating log2(0).
    """
    if p == 0 or p == 1:
        return 0
    q = 1 - p
    return -(p * np.log2(p) + q * np.log2(q))

# defining a function to calculate information gain of the decision tree
def information_gain(left_child, right_child):
    """Entropy reduction obtained by splitting a set of labels into two children.

    Parameters
    ----------
    left_child, right_child : sequence
        Labels that fall on each side of the split.

    Returns
    -------
    number
        entropy(parent) minus the size-weighted entropies of the two children,
        where each entropy is taken over the share of labels equal to 1.
        Returns 0 when both children are empty (nothing to split).

    NOTE(review): only labels equal to 1 are counted, i.e. this is a binary
    classification criterion. Elsewhere in this notebook it is fed 'Base Pay'
    values, for which the gain would be 0 for practically every split -
    confirm that this is intended.
    """
    left = list(left_child)
    right = list(right_child)
    parent = left + right
    n_parent = len(parent)
    if n_parent == 0:
        # Guard: the weighting below divides by the parent size.
        return 0

    def share_of_ones(labels):
        # Fraction of labels equal to 1; an empty side counts as 0.
        return labels.count(1) / len(labels) if len(labels) > 0 else 0

    IG_p = entropy(share_of_ones(parent))
    IG_l = entropy(share_of_ones(left))
    IG_r = entropy(share_of_ones(right))
    return IG_p - len(left) / n_parent * IG_l - len(right) / n_parent * IG_r

In [None]:
def draw_bootstrap(X_train, y_train):
    """Draw a bootstrap sample and the matching out-of-bag (OOB) rows.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature table; rows are selected positionally with .iloc.
    y_train : numpy.ndarray
        Targets, positionally aligned with X_train.

    Returns
    -------
    (X_bootstrap, y_bootstrap, X_oob, y_oob)
        The X parts are NumPy arrays (``.values`` of the selected rows). The
        bootstrap part has len(X_train) rows drawn with replacement; the OOB
        part holds, in ascending row order, every row that was never drawn.
    """
    n_samples = len(X_train)
    bootstrap_indices = np.random.choice(n_samples, n_samples, replace=True)

    # Mark the drawn rows in a boolean mask; the unmarked rows are out-of-bag.
    # This replaces an `i not in list` test per row, which was quadratic.
    in_bag = np.zeros(n_samples, dtype=bool)
    in_bag[bootstrap_indices] = True
    oob_indices = np.flatnonzero(~in_bag)

    X_bootstrap = X_train.iloc[bootstrap_indices].values
    y_bootstrap = y_train[bootstrap_indices]
    X_oob = X_train.iloc[oob_indices].values
    y_oob = y_train[oob_indices]
    return X_bootstrap, y_bootstrap, X_oob, y_oob

def oob_score(tree, X_test, y_test):
    """Return the out-of-bag error of one tree: the share of mismatched predictions.

    Parameters
    ----------
    tree : dict
        Tree node as consumed by predict_tree().
    X_test, y_test : indexable sequences
        Out-of-bag rows and their targets, aligned by position.

    Returns
    -------
    float
        Fraction of rows whose prediction is not exactly equal to the target,
        or NaN when there are no out-of-bag rows to score.

    NOTE(review): the comparison is exact equality, which suits class labels;
    for a continuous target almost every row would count as an error.
    """
    n_samples = len(X_test)
    if n_samples == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return float('nan')
    mis_label = sum(
        1 for i in range(n_samples)
        if predict_tree(tree, X_test[i]) != y_test[i]
    )
    return mis_label / n_samples

In [None]:
def find_split_point(X_bootstrap, y_bootstrap, max_features):
    """Choose the best single split of a bootstrap sample.

    A random subset of distinct feature columns is drawn; every distinct value
    in each of those columns is tried as a split point, and the candidate with
    the highest information_gain() over the resulting label lists wins. On a
    tie the candidate that was tried first is kept.

    Parameters
    ----------
    X_bootstrap : 2-D numpy.ndarray
        Feature rows (accessed as X_bootstrap[:, feature_idx]); must not be empty.
    y_bootstrap : sequence
        Label of each row of X_bootstrap.
    max_features : int
        Number of distinct features to consider, clamped to the range
        1 .. number of columns.

    Returns
    -------
    dict or None
        ``{'information_gain', 'left_child', 'right_child', 'split_point',
        'feature_idx'}`` where each child is ``{'X_bootstrap': ndarray,
        'y_bootstrap': list}``. None when there is no column to split on.
    """
    num_features = len(X_bootstrap[0])

    # Draw distinct feature indices. (Previously the duplicate check compared a
    # one-element list with ints, so it never matched: max_features + 1
    # features were drawn and the same feature could be drawn repeatedly.)
    n_candidates = min(num_features, max(1, int(max_features)))
    feature_ls = random.sample(range(num_features), n_candidates)

    best_info_gain = float('-inf')
    node = None
    for feature_idx in feature_ls:
        column = X_bootstrap[:, feature_idx]
        tried = set()
        for split_point in column:
            # A repeated value yields the same partition and the same gain, so
            # it could never beat the first occurrence; skip it.
            if split_point in tried:
                continue
            tried.add(split_point)

            # Numeric values (including NumPy scalars, which the former
            # `type(x) in [int, float]` test missed) are split with <=, the
            # same rule predict_tree applies; anything else is split by equality.
            is_numeric = (
                isinstance(split_point, (int, float, np.integer, np.floating))
                and not isinstance(split_point, bool)
            )
            if is_numeric:
                goes_left = column <= split_point
            else:
                goes_left = column == split_point
            goes_left = np.asarray(goes_left, dtype=bool)

            # Labels stay plain lists: later stages concatenate them with `+`
            # and call list.count on them.
            left_y = [y_bootstrap[i] for i in np.flatnonzero(goes_left)]
            right_y = [y_bootstrap[i] for i in np.flatnonzero(~goes_left)]

            split_info_gain = information_gain(left_y, right_y)
            if split_info_gain > best_info_gain:
                best_info_gain = split_info_gain
                # Row subsets are only materialised for a new best candidate.
                left_child = {'X_bootstrap': X_bootstrap[goes_left], 'y_bootstrap': left_y}
                right_child = {'X_bootstrap': X_bootstrap[~goes_left], 'y_bootstrap': right_y}
                node = {'information_gain': split_info_gain,
                        'left_child': left_child,
                        'right_child': right_child,
                        'split_point': split_point,
                        'feature_idx': feature_idx}

    return node

In [None]:
def terminal_node(node):
    """Return the leaf prediction for a node: the most frequent label.

    Reads node['y_bootstrap'] (a non-empty sequence of hashable labels). When
    several labels share the highest count, the one that appears first wins.
    Raises ValueError if the label sequence is empty.
    """
    from collections import Counter

    # Counter keeps keys in first-seen order and max() returns the first
    # maximal key, so ties resolve exactly like max(labels, key=labels.count)
    # but with a single counting pass instead of one pass per element.
    counts = Counter(node['y_bootstrap'])
    return max(counts, key=counts.get)


def split_node(node, max_features, min_samples_split, max_depth, depth):
    """Recursively expand a split node in place into a subtree.

    The node's temporary 'left_child' / 'right_child' entries are removed and
    replaced by 'left_split' / 'right_split', each of which is either a leaf
    value (from terminal_node) or another node dict (from find_split_point).

    Parameters
    ----------
    node : dict
        Node produced by find_split_point(); modified in place.
    max_features : int
        Forwarded to find_split_point() for every deeper split.
    min_samples_split : int
        A child with this many rows or fewer becomes a leaf.
    max_depth : int
        Children become leaves once depth >= max_depth.
    depth : int
        Depth of `node` (the root is expanded with depth 1).

    Returns
    -------
    dict
        The same `node` object, for convenience.
    """
    left_child = node.pop('left_child')
    right_child = node.pop('right_child')

    # Degenerate split (one side empty): both branches predict the overall mode.
    if len(left_child['y_bootstrap']) == 0 or len(right_child['y_bootstrap']) == 0:
        merged = {'y_bootstrap': list(left_child['y_bootstrap']) + list(right_child['y_bootstrap'])}
        node['left_split'] = node['right_split'] = terminal_node(merged)
        return node

    # Depth budget exhausted: both children become leaves.
    if depth >= max_depth:
        node['left_split'] = terminal_node(left_child)
        node['right_split'] = terminal_node(right_child)
        return node

    # Each side is handled independently, so finishing the right side can no
    # longer overwrite the left branch, and both recursions receive
    # max_features (the left one used to be given max_depth in its place).
    for side, child in (('left_split', left_child), ('right_split', right_child)):
        if len(child['X_bootstrap']) <= min_samples_split:
            node[side] = terminal_node(child)
            continue
        subtree = find_split_point(child['X_bootstrap'], child['y_bootstrap'], max_features)
        if subtree is None:
            # No split could be produced for this child; make it a leaf.
            node[side] = terminal_node(child)
            continue
        node[side] = subtree
        split_node(subtree, max_features, min_samples_split, max_depth, depth + 1)
    return node

In [None]:
def build_tree(X_bootstrap, y_bootstrap, max_depth, min_samples_split, max_features):
    """Grow one decision tree from a bootstrap sample and return its root node.

    The root split comes from find_split_point(); split_node() then expands
    that node in place, with the root counted as depth 1.
    """
    tree = find_split_point(X_bootstrap, y_bootstrap, max_features)
    split_node(tree, max_features, min_samples_split, max_depth, 1)
    return tree

def random_forest(X_train, y_train, n_estimators, max_features, max_depth, min_samples_split):
    """Train a forest of trees on bootstrap samples and report the mean OOB error.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, as accepted by draw_bootstrap().
    n_estimators : int
        Number of trees to build.
    max_features, max_depth, min_samples_split : int
        Hyper-parameters forwarded to build_tree() for every tree.

    Returns
    -------
    list
        The trained trees (root node dicts). The mean out-of-bag error over
        all trees is printed as a side effect.
    """
    tree_ls = []
    oob_ls = []
    for _ in range(n_estimators):
        X_bootstrap, y_bootstrap, X_oob, y_oob = draw_bootstrap(X_train, y_train)
        # Keyword arguments: build_tree's positional order is
        # (max_depth, min_samples_split, max_features), which differs from the
        # order used here and was previously mixed up.
        tree = build_tree(
            X_bootstrap,
            y_bootstrap,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            max_features=max_features,
        )
        tree_ls.append(tree)
        oob_ls.append(oob_score(tree, X_oob, y_oob))
    print("OOB estimate: {:.2f}".format(np.mean(oob_ls)))
    return tree_ls

In [None]:
def predict_tree(tree, X_test):
    """Return the prediction of one tree for a single sample.

    tree is a node dict with 'feature_idx', 'split_point', 'left_split' and
    'right_split'; X_test is one row, indexable by feature position. Starting
    at the root, the walk goes left when the row's value for the node's
    feature is <= the node's split point and right otherwise, until it reaches
    a branch that is not a dict; that value is the prediction.
    """
    current = tree
    while True:
        goes_left = X_test[current['feature_idx']] <= current['split_point']
        branch = current['left_split'] if goes_left else current['right_split']
        if type(branch) != dict:
            # Reached a leaf value.
            return branch
        current = branch

In [None]:
def predict_rf(tree_ls, X_test):
    """Predict every row of X_test by majority vote over the trees.

    Parameters
    ----------
    tree_ls : list
        Trees (root node dicts) as returned by random_forest().
    X_test : pandas.DataFrame or 2-D array-like
        Rows to predict.

    Returns
    -------
    numpy.ndarray
        One prediction per row: the value predicted by the most trees, with
        ties going to the value that occurs first in tree order.
    """
    # Convert once up front; the previous X_test.values[i] was re-evaluated
    # for every tree and every row.
    rows = np.asarray(X_test)
    pred_ls = []
    for row in rows:
        ensemble_preds = [predict_tree(tree, row) for tree in tree_ls]
        pred_ls.append(max(ensemble_preds, key=ensemble_preds.count))
    return np.array(pred_ls)

In [None]:
# Manual Tuning of the hyperparameters
n_estimators = 50
max_features = 5
max_depth = 10
min_samples_split = 2

# Seed both random generators used during training (np.random in
# draw_bootstrap, random in find_split_point) so a re-run rebuilds the same
# forest; 217 matches the seed used for the train/test shuffle.
random.seed(217)
np.random.seed(217)

model = random_forest(X_train, y_train, n_estimators, max_features, max_depth, min_samples_split)

In [None]:
# Score the forest on the held-out rows: "accuracy" here is the share of
# predictions that are exactly equal to the target value.
# NOTE(review): for a continuous target such as 'Base Pay', exact matches
# would be rare - confirm this is the intended metric.
preds = predict_rf(model, X_test)
n_correct = (preds == y_test).sum()
acc = n_correct / len(y_test)
print("Testing accuracy: {}".format(np.round(acc, 3)))

NameError: name 'model' is not defined

In [None]:
# DO NOT RUN THIS CELL YET

#arr = []

#try:
    #with open('./cache/ids.txt', "r") as f:
        #for _id in f:
            #arr.append(int(_id))
#except:
    #if not os.path.exists('./cache'):
        #os.mkdir('./cache')
        
    #_index = payroll['Job Code'].unique()
    #for _id in _index:
        #counts = len(payroll.groupby(['Job Code']).get_group(_id))
        #if counts > 1000:
            #arr.append(str(_id))
    #with open('cache/ids.txt', 'w') as f:
        #for _id in arr:
            #f.write('%s\n' % _id) 
#finally:
   #f.close()

# :return : <List> arr : list of unique job id

In [None]:
# Group Job Code 1172 training dataset <<< USE THIS ONE

#group_1172 = payroll.groupby(['Job Code']).get_group(1172)

#_sorted = group_1172.sort_values('Employee Identifier')
#fiscal = pd.pivot_table(_sorted, values='Base Pay', index=['Fiscal Period'], columns='Employee Identifier')

#fiscal = fiscal.diff()

In [None]:
# cleaned display table

#fiscal.fillna(0, inplace=True)
#fiscal = fiscal.drop(2016.00)

#fiscal

In [None]:


# neg_cols = (payroll[numeric_cols] < 0).any()

# if neg_cols.any():
#     print("Terdapat nilai negatif pada kolom: ", end="")
#     print(", ".join(neg_cols[neg_cols == True].index))
# else:
#     print("Tidak terdapat nilai negatif pada semua kolom numerik.")

In [None]:
# ALREADY SAFE 👍👍

# null_cols = payroll.isnull().any()

# if null_cols.any():
#     print("Terdapat nilai NaN pada kolom: ", end="")
#     print(", ".join(null_cols[null_cols == True].index))
# else:
#     print("Tidak terdapat nilai NaN pada semua kolom.")

In [None]:
# query job code indexing

#for val in arr:
    #new_df = payroll[payroll['Job Code'] == val]
    #print(f"DataFrame for job code {val}:")
    #display(new_df)