In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split


In [2]:
# read table

dataPayment = '../Datasets/Employee_Payroll.csv'
payroll = pd.read_csv(dataPayment)

# set default value to 0 for NaN numerical data
numeric_cols = payroll.select_dtypes(include=[np.number]).columns
payroll[numeric_cols] = payroll[numeric_cols].clip(lower=0)


payroll.fillna(0, inplace=True)
payroll['Office'] = payroll['Office'].astype(int)

# define column for 1/4 year discretization
payroll['Fiscal Quarter'] = payroll['Fiscal Quarter']*0.25 - 0.25
payroll['Fiscal Period'] = payroll['Fiscal Year'] + payroll['Fiscal Quarter']

pd.to_datetime(payroll['Original Hire Date'])

# parse hire date to get hire year
payroll['Original Hire Date'] = payroll['Original Hire Date'].str.split('/').str[2]
payroll['Original Hire Date'] = payroll['Original Hire Date'].astype(int)

_work_year = payroll["Fiscal Period"] - payroll['Original Hire Date']

In [3]:
# specify table column included

payroll = payroll[['Fiscal Year', 'Fiscal Period', 'Job Code', 'Job Title', 'Base Pay', 'Position ID', 'Employee Identifier']]

# add Working Year Column
payroll["Working Year"] = _work_year

payroll.head(20)

Unnamed: 0,Fiscal Year,Fiscal Period,Job Code,Job Title,Base Pay,Position ID,Employee Identifier,Working Year
0,2016,2016.0,1172,Assistant State's Attorney,20088.0,9510200,6ac7ba3e-d286-44f5-87a0-191dc415e23c,11.0
1,2016,2016.25,1172,Assistant State's Attorney,23436.0,9510200,6ac7ba3e-d286-44f5-87a0-191dc415e23c,11.25
2,2016,2016.5,1172,Assistant State's Attorney,20422.82,9510200,6ac7ba3e-d286-44f5-87a0-191dc415e23c,11.5
3,2016,2016.75,1172,Assistant State's Attorney,23904.8,9510200,6ac7ba3e-d286-44f5-87a0-191dc415e23c,11.75
4,2017,2017.0,1172,Assistant State's Attorney,20745.8,9510200,6ac7ba3e-d286-44f5-87a0-191dc415e23c,12.0
5,2017,2017.25,1172,Assistant State's Attorney,24473.38,9510200,6ac7ba3e-d286-44f5-87a0-191dc415e23c,12.25
6,2017,2017.5,1172,Assistant State's Attorney,21217.35,9510200,6ac7ba3e-d286-44f5-87a0-191dc415e23c,12.5
7,2016,2016.0,5049,Residential Model Sr Anal III,17770.86,9500731,f313b1c3-1b1a-4b07-bb75-a8c850a91bac,18.0
8,2016,2016.25,5049,Residential Model Sr Anal III,20800.67,9500731,f313b1c3-1b1a-4b07-bb75-a8c850a91bac,18.25
9,2016,2016.5,5049,Residential Model Sr Anal III,17873.76,9500731,f313b1c3-1b1a-4b07-bb75-a8c850a91bac,18.5


In [4]:
len(payroll["Job Code"].value_counts())

2382

In [5]:
X = payroll[['Job Code','Working Year','Position ID','Fiscal Period', 'Fiscal Year']]
Y = payroll['Base Pay']
X.head(10)

Unnamed: 0,Job Code,Working Year,Position ID,Fiscal Period,Fiscal Year
0,1172,11.0,9510200,2016.0,2016
1,1172,11.25,9510200,2016.25,2016
2,1172,11.5,9510200,2016.5,2016
3,1172,11.75,9510200,2016.75,2016
4,1172,12.0,9510200,2017.0,2017
5,1172,12.25,9510200,2017.25,2017
6,1172,12.5,9510200,2017.5,2017
7,5049,18.0,9500731,2016.0,2016
8,5049,18.25,9500731,2016.25,2016
9,5049,18.5,9500731,2016.5,2016


In [7]:
Y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 234299 entries, 0 to 234298
Series name: Base Pay
Non-Null Count   Dtype  
--------------   -----  
234299 non-null  float64
dtypes: float64(1)
memory usage: 1.8 MB


In [6]:
x_train, x_test,y_train,y_test = train_test_split(X,Y,test_size =0.2)
# print the data
x_train

Unnamed: 0,Job Code,Working Year,Position ID,Fiscal Period,Fiscal Year
120462,1794,3.00,1300175,2016.00,2016
198058,1328,16.00,9505286,2018.00,2018
95298,83,4.75,9500074,2017.75,2017
157370,4858,27.00,9502041,2017.00,2017
68080,1941,9.25,9524958,2017.25,2017
...,...,...,...,...,...
60825,1361,20.75,9508541,2017.75,2017
121292,47,24.25,9503290,2017.25,2017
180377,1360,11.75,9507450,2017.75,2017
117852,1539,30.25,9514311,2016.25,2016


In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import RidgeCV
rf = RandomForestRegressor()
rf.fit(x_train, y_train)

In [11]:
y_pred = rf.predict(x_test)

In [12]:
from sklearn.metrics import mean_absolute_error, accuracy_score
# mean_absolute_error(y_test, y_pred)
accuracy_score(y_test, y_pred)

ValueError: continuous is not supported

In [None]:
from sklearn.linear_model import LinearRegression
clf = LinearRegression()

In [None]:
clf.fit(x_train,y_train)

In [None]:
predictions = clf.predict(x_test)
predictions

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
mean_absolute_error(y_test, predictions)

In [None]:
from sklearn.linear_model import Lasso

In [None]:
reg = Lasso(alpha=1)
reg.fit(x_train, y_train)

In [None]:
print('R squared training set', round(reg.score(x_train, y_train)*100, 2))
print('R squared test set', round(reg.score(x_test, y_test)*100, 2))

In [None]:
from sklearn.metrics import mean_squared_error

# Training data
pred_train = reg.predict(x_train)
mse_train = mean_squared_error(y_train, pred_train)
print('MSE training set', round(mse_train, 2))

# Test data
pred = reg.predict(x_test)
mse_test =mean_squared_error(y_test, pred)
print('MSE test set', round(mse_test, 2))

In [None]:
from sklearn.linear_model import Ridge

In [None]:
rr = Ridge(alpha=0.01)
rr.fit(x_train, y_train) 
pred_train_rr= rr.predict(x_train)
print(np.sqrt(mean_squared_error(y_train,pred_train_rr)))
#print(r2_score(y_train, pred_train_rr))

pred_test_rr= rr.predict(x_test)
print(np.sqrt(mean_squared_error(y_test,pred_test_rr))) 
#print(r2_score(y_test, pred_test_rr))

In [None]:
from sklearn.linear_model import ElasticNet

In [None]:
model_enet = ElasticNet(alpha = 0.01)
model_enet.fit(x_train, y_train) 
pred_train_enet= model_enet.predict(x_train)
print(np.sqrt(mean_squared_error(y_train,pred_train_enet)))
#print(r2_score(y_train, pred_train_enet))

pred_test_enet= model_enet.predict(x_test)
print(np.sqrt(mean_squared_error(y_test,pred_test_enet)))
#print(r2_score(y_test, pred_test_enet))

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# create a regressor object
regressor = DecisionTreeRegressor(random_state = 0) 
  
# fit the regressor with X and Y data
regressor.fit(X, Y)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
mean_absolute_error(y_test, predictions)

### xgb regressor

In [None]:
%pip install xgboost

In [None]:
import xgboost as xgb
xgb = xgb.XGBRegressor()
xgb.fit(x_train, y_train)
y_pred = xgb.predict(x_test)
mean_absolute_error(y_test, y_pred)

# apaa

In [None]:
# sum = 0
# for i in range(len(predictions)):
#     if predictions[i] == y_test[i]:
#         sum=sum+1

# accuracy = sum/len(predictions)*100
# accuracy

In [None]:
# YANG INI JANGAN DIJALANKAN DULU YA GESSS :'

arr = []

try:
    with open('./cache/ids.txt', "r") as f:
        for _id in f:
            arr.append(int(_id))
except:
    if not os.path.exists('./cache'):
        os.mkdir('./cache')
        
    _index = payroll['Job Code'].unique()
    for _id in _index:
        counts = len(payroll.groupby(['Job Code']).get_group(_id))
        if counts > 1000:
            arr.append(str(_id))
    with open('cache/ids.txt', 'w') as f:
        for _id in arr:
            f.write('%s\n' % _id) 
finally:
   f.close()

# :return : <List> arr : list of unique job id

In [None]:
# Group Job Code 1172 training dataset <<< PAKAI INI NGGIH

group_1172 = payroll.groupby(['Job Code']).get_group(1172)

_sorted = group_1172.sort_values('Employee Identifier')
fiscal = pd.pivot_table(_sorted, values='Base Pay', index=['Fiscal Period'], columns='Employee Identifier')

fiscal = fiscal.diff()

In [None]:
# cleaned display table

fiscal.fillna(0, inplace=True)
fiscal = fiscal.drop(2016.00)

fiscal

In [None]:


# neg_cols = (payroll[numeric_cols] < 0).any()

# if neg_cols.any():
#     print("Terdapat nilai negatif pada kolom: ", end="")
#     print(", ".join(neg_cols[neg_cols == True].index))
# else:
#     print("Tidak terdapat nilai negatif pada semua kolom numerik.")

In [None]:
# SUDAH AMAN 👍👍

# null_cols = payroll.isnull().any()

# if null_cols.any():
#     print("Terdapat nilai NaN pada kolom: ", end="")
#     print(", ".join(null_cols[null_cols == True].index))
# else:
#     print("Tidak terdapat nilai NaN pada semua kolom.")

In [None]:
# query job code indexing

for val in arr:
    new_df = payroll[payroll['Job Code'] == val]
    print(f"DataFrame for job code {val}:")
    display(new_df)