In [294]:
import pandas as pd
import numpy as np
import os

In [295]:
# read table

dataPayment = '../Datasets/Employee_Payroll.csv'
payroll = pd.read_csv(dataPayment)

# set default value to 0 for NaN numerical data
numeric_cols = payroll.select_dtypes(include=[np.number]).columns
payroll[numeric_cols] = payroll[numeric_cols].clip(lower=0)


payroll.fillna(0, inplace=True)
payroll['Office'] = payroll['Office'].astype(int)

# define column for 1/4 year discretization
payroll['Fiscal Quarter'] = payroll['Fiscal Quarter']*0.25 - 0.25
payroll['Fiscal Period'] = payroll['Fiscal Year'] + payroll['Fiscal Quarter']

pd.to_datetime(payroll['Original Hire Date'])

# parse hire date to get hire year
payroll['Original Hire Date'] = payroll['Original Hire Date'].str.split('/').str[2]
payroll['Original Hire Date'] = payroll['Original Hire Date'].astype(int)

_work_year = payroll["Fiscal Period"] - payroll['Original Hire Date']

In [296]:
# specify table column included

payroll = payroll[['Fiscal Year', 'Fiscal Period', 'Job Code', 'Job Title', 'Base Pay', 'Position ID', 'Employee Identifier']]

# add Working Year Column
payroll["Working Year"] = _work_year

In [297]:
# YANG INI JANGAN DIJALANKAN DULU YA GESSS :'

arr = []

try:
    with open('../cache/ids.txt', "r") as f:
        for _id in f:
            arr.append(int(_id))
except:
    if not os.path.exists('../cache'):
        os.mkdir('../cache')
        
    _index = payroll['Job Code'].unique()
    for _id in _index:
        counts = len(payroll.groupby(['Job Code']).get_group(_id))
        if counts > 1000:
            arr.append(str(_id))
    with open('../cache/ids.txt', 'w') as f:
        for _id in arr:
            f.write('%s\n' % _id)
    with open('../cache/ids.txt', 'r') as f:
        arr = []
        for _id in f:
            arr.append(int(_id)) 
finally:
   f.close()

# :return : <List> arr : list of unique job id

In [298]:
emps = []
max = 0;
for _id in arr:
    df = payroll[payroll['Job Code'] == _id]
    emp_id = df['Employee Identifier'].unique()
    if len(emp_id) > 50:
        l = 75
    else:
        l = len(emp_id)

    for i in range(l):
        for i in range(100):
            _index = np.random.randint(0, l-1)
            if emp_id[_index] not in emps:
                emps.append(emp_id[_index])
                break


In [299]:
df_id = payroll[payroll['Employee Identifier'].isin(emps)]
df_id.head(4)

Unnamed: 0,Fiscal Year,Fiscal Period,Job Code,Job Title,Base Pay,Position ID,Employee Identifier,Working Year
0,2016,2016.0,1172,Assistant State's Attorney,20088.0,9510200,6ac7ba3e-d286-44f5-87a0-191dc415e23c,11.0
1,2016,2016.25,1172,Assistant State's Attorney,23436.0,9510200,6ac7ba3e-d286-44f5-87a0-191dc415e23c,11.25
2,2016,2016.5,1172,Assistant State's Attorney,20422.82,9510200,6ac7ba3e-d286-44f5-87a0-191dc415e23c,11.5
3,2016,2016.75,1172,Assistant State's Attorney,23904.8,9510200,6ac7ba3e-d286-44f5-87a0-191dc415e23c,11.75


In [300]:
from numpy.linalg import inv

class MultivariableRegression():
    def __init__(self):
        self.b = []
    
    
    def _decompose(self,X):
        try:
            x = [
                X['Fiscal Period'].values,
                X['Job Code'].values,
                X['Working Year'].values
            ]
        except:
            x = [
                X['Fiscal Period'],
                X['Job Code'],
                X['Working Year']
            ]
        return x
    
    
    def sums_of_xy(self, x1, x2):
        sum = 0
        for i in range(len(x1)):
            x1diff = x1[i]
            x2diff = x2[i]
            e = x1diff*x2diff
            sum += e
        return sum
        
    
    def sums_of_x(self, x):
        sum = 0
        for i in range(len(x)):
            x_diff = x[i]
            sum += x_diff
        return sum
    
    
    def mean(self, x):
        x_mean = sum(x)/len(x)
        return x_mean
    
    
    def fit(self, X, Y):
        x = self._decompose(X)
        
        A = np.array([
            [len(x), self.sums_of_x(x[0]), self.sums_of_x(x[1]), self.sums_of_x(x[2])],
            [self.sums_of_x(x[0]), self.sums_of_xy(x[0],x[0]), self.sums_of_xy(x[0],x[1]), self.sums_of_xy(x[0],x[2])],
            [self.sums_of_x(x[1]), self.sums_of_xy(x[1],x[0]), self.sums_of_xy(x[1],x[1]), self.sums_of_xy(x[1],x[2])],
            [self.sums_of_x(x[2]), self.sums_of_xy(x[2],x[0]), self.sums_of_xy(x[2],x[1]), self.sums_of_xy(x[2],x[2])],
        ])
        
        g = np.array([
            [self.sums_of_x(Y)],
            [self.sums_of_xy(x[0],Y)],
            [self.sums_of_xy(x[1],Y)],
            [self.sums_of_xy(x[2],Y)]
        ])
        
        inv_A = inv(A)
        self.b = np.matmul(inv_A, g)
    
    
    def predict(self, x_input):
        x_input.index.name = 'ids'
        
        res = []
        for _index in x_input.index:
            x = x_input.loc[_index]
            xs = self._decompose(x)
            y = self.b[0][0] + self.b[1][0]*xs[0] + self.b[2][0]*xs[1] + self.b[3][0]*xs[2]
            res.append(y)
            
        return res        

In [301]:
X = df_id[['Fiscal Period', 'Job Code', 'Working Year']]
Y = df_id['Base Pay'].values

from sklearn.model_selection import train_test_split
x_train, x_test,y_train,y_test = train_test_split(X,Y,test_size =0.2)

In [302]:
mvr = MultivariableRegression()
mvr.fit(x_train, y_train)

In [303]:
y_pred = mvr.predict(x_test)

In [304]:
from sklearn.metrics import mean_absolute_error, explained_variance_score
print(mean_absolute_error(y_test, y_pred))
print(explained_variance_score(y_test, y_pred))

5873.8884247119395
0.02278883716129332
