In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os


In [None]:
# Load the payroll table and derive the columns used by later cells:
#   * 'Fiscal Period'  - fiscal year plus the quarter offset (0, 0.25, 0.5, 0.75)
#   * _work_year       - fiscal period minus the calendar year of the original hire date

dataPayment = '../Datasets/Employee_Payroll.csv'
payroll = pd.read_csv(dataPayment)

# raise negative values in numeric columns to 0 (NaN is left untouched by clip)
numeric_cols = payroll.select_dtypes(include=[np.number]).columns
payroll[numeric_cols] = payroll[numeric_cols].clip(lower=0)

# set default value to 0 for every remaining NaN (all columns, as before);
# plain assignment instead of inplace=True keeps the step easy to chain and re-run
payroll = payroll.fillna(0)
payroll['Office'] = payroll['Office'].astype(int)

# define column for 1/4 year discretization: quarter 1..4 -> 0.0, 0.25, 0.5, 0.75
payroll['Fiscal Quarter'] = payroll['Fiscal Quarter']*0.25 - 0.25
payroll['Fiscal Period'] = payroll['Fiscal Year'] + payroll['Fiscal Quarter']

# Parse the hire date once and keep only its year.  The previous version called
# pd.to_datetime() but discarded the result, then re-derived the year by splitting
# the raw string on '/'.
payroll['Original Hire Date'] = pd.to_datetime(payroll['Original Hire Date']).dt.year

_work_year = payroll["Fiscal Period"] - payroll['Original Hire Date']

In [None]:
# specify table column included
# .copy() makes the subset an independent frame, so adding a column below does not
# trigger pandas' SettingWithCopyWarning (assignment on a slice of another frame).
payroll = payroll[['Fiscal Year', 'Fiscal Period', 'Job Code', 'Job Title', 'Base Pay', 'Position ID', 'Employee Identifier']].copy()

# add Working Year Column (aligned on the row index shared with _work_year)
payroll["Working Year"] = _work_year

In [None]:
# NOTE from the original author: please do not run this cell yet.

# Build `arr`: the job codes that occur in more than 1000 payroll rows.
# The list is cached in ./cache/ids.txt (one code per line) and reused when present.

arr = []

try:
    with open('./cache/ids.txt', "r") as f:
        # the assignment only happens if every line parses, so a corrupt cache
        # can never leave a half-filled list behind
        arr = [int(_id) for _id in f]
except (FileNotFoundError, ValueError):
    # cache missing or unreadable -> rebuild it from the payroll table
    os.makedirs('./cache', exist_ok=True)

    # count rows per job code once instead of re-grouping the frame for every code
    counts = payroll['Job Code'].value_counts()
    frequent = [
        str(_id)
        for _id in payroll['Job Code'].unique()   # keeps first-appearance order
        if counts.get(_id, 0) > 1000
    ]

    with open('./cache/ids.txt', 'w') as f:
        for _id in frequent:
            f.write('%s\n' % _id)

    # same values a fresh read of the cache file would produce
    arr = [int(_id) for _id in frequent]

# result: arr is a list of int job codes with more than 1000 payroll rows

In [None]:
# Draw a random sample of at most MAX_EMPLOYEES_PER_JOB distinct employees for every
# job code in `arr`; an employee already picked for an earlier job code is not
# picked again.
#
# Fixes compared with the previous version of this cell:
#   * the sample size was set to 75 whenever a job code had more than 50 employees,
#     so codes with 51-74 employees could index past the end of the id array;
#   * np.random.randint(0, l-1) excludes its upper bound, so the last candidate was
#     never chosen and a job code with a single employee raised ValueError;
#   * only the first l-1 ids were ever reachable, so the "random" draw was in effect
#     a fixed prefix of the employee list;
#   * `max = 0` shadowed the builtin and was never used.
# NOTE(review): the old code mixed the numbers 50 and 75; 75 is kept as the cap
# because that is what large job codes used before - confirm the intended value.
MAX_EMPLOYEES_PER_JOB = 75
rng = np.random.default_rng(0)  # fixed seed so Restart & Run All reproduces the sample

emps = []
seen = set()  # same content as emps, kept for O(1) membership tests
for _id in arr:
    emp_id = payroll.loc[payroll['Job Code'] == _id, 'Employee Identifier'].unique()
    candidates = [emp for emp in emp_id if emp not in seen]
    n_pick = min(len(candidates), MAX_EMPLOYEES_PER_JOB)
    if n_pick == 0:
        continue

    # sample positions (not values) so the ids keep their original dtype
    for pos in rng.choice(len(candidates), size=n_pick, replace=False):
        emps.append(candidates[pos])
        seen.add(candidates[pos])

len(emps)

In [None]:
# keep only the payroll rows that belong to one of the sampled employees
is_sampled = payroll['Employee Identifier'].isin(emps)
df_id = payroll[is_sampled]
df_id

In [None]:
class StandardScaler:
    """Column-wise standardisation: ``(data - mean) / std``.

    Attributes (both ``None`` until :meth:`fit` has been called):
        mean:     per-column mean of the fitted data.
        variance: per-column *standard deviation* (population, ddof=0) of the
                  fitted data.  The name is historical and kept for backward
                  compatibility; it is the divisor used by :meth:`transform`.
                  Columns with zero spread store 1 instead, so transforming a
                  constant column yields 0 rather than NaN/inf.
    """

    def __init__(self):
        self.variance = None
        self.mean = None

    def fit(self, data):
        """Learn the per-column mean and standard deviation of ``data``.

        ``data`` may be anything ``np.mean`` / ``np.std`` accept along axis 0
        (ndarray, DataFrame, Series, ...).
        """
        spread = np.std(data, axis=0)
        # (spread == 0) is 1 exactly where the spread is 0, so adding it swaps a
        # zero divisor for 1 while keeping the container type (ndarray / Series).
        self.variance = spread + (spread == 0)
        self.mean = np.mean(data, axis=0)

    def transform(self, data):
        """Return ``data`` standardised with the statistics learned by :meth:`fit`.

        Raises:
            RuntimeError: if :meth:`fit` has not been called yet.
        """
        if self.mean is None or self.variance is None:
            raise RuntimeError("StandardScaler.transform called before fit")
        scaled = (data - self.mean)/self.variance
        return scaled

In [None]:
# Model inputs: job code and working years; model target: base pay.
# NOTE(review): 'Base Pay' looks like a continuous amount, while the classifier
# trained on Y further down only outputs 0/1 - confirm whether Y should be
# turned into a binary label first.
feature_columns = ['Job Code', 'Working Year']
X = df_id[feature_columns]
Y = df_id['Base Pay']

In [None]:
# Standardise the feature columns with statistics learned from X itself.
scaler = StandardScaler()
scaler.fit(X)
x = scaler.transform(X)

# preview the first five scaled rows
x.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    Y,
    test_size=0.2,
    random_state=0,
)

In [None]:
class LogisticRegression:
    """Binary logistic regression without intercept, trained by batch gradient descent.

    Parameters:
        iterations: number of gradient-descent steps performed by :meth:`fit`.
        alpha:      learning rate (step size).

    Attributes set by :meth:`fit`:
        w:         weight column vector of shape ``(n_features, 1)``.
        cost_vals: list with the cross-entropy cost recorded at every iteration.
    """

    def __init__(self,iterations,alpha):
        self.iterations=iterations
        self.alpha=alpha

    def sigmoid(self,z):
        """Logistic function ``1 / (1 + exp(-z))``, applied element-wise."""
        return(1/(1+np.exp(-z)))

    def fit(self,x,y):
        """Fit the weights on features ``x`` (2-D, rows = samples) and labels ``y``.

        ``y`` is expected to hold 0/1 labels, one per row of ``x``.  Prints the
        final weight vector, as the previous implementation did.

        Raises:
            ValueError: if ``x`` has no rows.
        """
        features = np.asarray(x, dtype=float)
        targets = np.asarray(y, dtype=float).reshape(-1, 1)
        m, n_features = features.shape
        if m == 0:
            raise ValueError("cannot fit LogisticRegression on an empty data set")

        # every weight starts at 5 (as before), now sized to the actual number
        # of features instead of being hard-coded for exactly two
        self.w = np.full((n_features, 1), 5.0)
        self.cost_vals = []

        eps = 1e-12  # keeps log() away from 0 when the sigmoid saturates
        for _ in range(self.iterations):
            z = self.sigmoid(features @ self.w)           # (m, 1) probabilities

            p = np.clip(z, eps, 1 - eps)
            cost = -np.mean(targets*np.log(p) + (1 - targets)*np.log(1 - p))
            self.cost_vals.append(float(cost))

            # Gradient of the mean cross-entropy: one entry per weight.  The old
            # code subtracted a (1, m) row from the (m, 1) column, which broadcast
            # to an (m, m) matrix, and then averaged everything into one scalar.
            dw = features.T @ (z - targets) / m           # (n_features, 1)

            self.w = self.w - (self.alpha*dw)
        print(self.w)

    def predict(self,x,threshold=0.5):
        """Return a 1-D int array of 0/1 labels: 1 where P(y=1 | row) > ``threshold``.

        ``x`` is 2-D (DataFrame or ndarray) with the same columns used in :meth:`fit`.
        """
        features = np.asarray(x, dtype=float)
        probability = self.sigmoid(features @ self.w).ravel()
        return (probability > threshold).astype(int)

In [None]:
# 1000 gradient-descent steps with learning rate 0.1 on the training split
model = LogisticRegression(iterations=1000, alpha=0.1)
model.fit(x_train, y_train)

In [None]:
# Put the model's predictions ('expected') next to the held-out targets ('actual')
predicted = model.predict(x_test)
observed = np.array(y_test)
test = pd.DataFrame({'expected': predicted, 'actual': observed})

# accuracy = share of rows where prediction and target are exactly equal
num_all = len(test)
num_true = int((test['expected'] == test['actual']).sum())

accuracy = num_true/num_all
print("accuracy:", accuracy)