In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split


In [None]:
# Load the payroll table and derive the time features used by the models below.

dataPayment = '../Datasets/Employee_Payroll.csv'
payroll = pd.read_csv(dataPayment)

# Clamp negative values in the numeric columns to 0 (clip leaves NaN untouched).
numeric_cols = payroll.select_dtypes(include=[np.number]).columns
payroll[numeric_cols] = payroll[numeric_cols].clip(lower=0)

# Replace every remaining NaN, in any column, with 0 so 'Office' can be cast to int.
payroll = payroll.fillna(0)
payroll['Office'] = payroll['Office'].astype(int)

# Express the quarter as a fraction of a year, (q - 1) / 4, and add it to the
# fiscal year -- assuming quarters are numbered 1-4 this gives offsets 0.00-0.75.
payroll['Fiscal Quarter'] = payroll['Fiscal Quarter']*0.25 - 0.25
payroll['Fiscal Period'] = payroll['Fiscal Year'] + payroll['Fiscal Quarter']

# Keep only the hire year: the third '/'-separated field of the date string
# (presumably MM/DD/YYYY -- confirm against the dataset).
hire_year = payroll['Original Hire Date'].str.split('/').str[2].astype(int)
payroll['Original Hire Date'] = hire_year

# Time elapsed between the hire year and each fiscal period, in years.
_work_year = payroll["Fiscal Period"] - payroll['Original Hire Date']

In [None]:
# Keep only the columns used for modelling and attach the derived tenure column.
# .assign() builds a new frame, so adding the column does not write into a
# slice of the original table (which raises SettingWithCopyWarning).
payroll = payroll[
    ['Fiscal Year', 'Fiscal Period', 'Job Code', 'Job Title', 'Base Pay', 'Position ID', 'Employee Identifier']
].assign(**{"Working Year": _work_year})

payroll.head(20)

In [None]:
# Number of distinct job codes in the table.
payroll["Job Code"].nunique()

In [None]:
# Features: job code and tenure. Target: base pay.
feature_cols = ['Job Code', 'Working Year']
X = payroll[feature_cols]
Y = payroll['Base Pay']
X.head(10)

In [None]:
# Summarise the target series (dtype, non-null count, memory usage).
Y.info()

In [None]:
# Hold out 20% of the rows for evaluation. The split is seeded so that every
# metric below is reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Preview the training features instead of rendering the whole frame.
x_train.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import RidgeCV

# Random forest baseline. It is seeded because bootstrap sampling and feature
# selection are random, so an unseeded forest gives a different score each run.
rf = RandomForestRegressor(random_state=0)
rf.fit(x_train, y_train)

In [None]:
# Random-forest predictions for the held-out rows.
y_pred = rf.predict(x_test)

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the predictions currently stored in `y_pred`.
rf_mae = mean_absolute_error(y_test, y_pred)
rf_mae

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares baseline (despite the name, `clf` is a regressor).
clf = LinearRegression()

In [None]:
# Fit the linear model on the training split.
clf.fit(x_train,y_train)

In [None]:
# Linear-regression predictions for the held-out rows.
predictions = clf.predict(x_test)
predictions

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Mean absolute error of the linear-regression predictions on the test split.
linreg_mae = mean_absolute_error(y_test, predictions)
linreg_mae

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Lasso (L1-regularised linear regression) with penalty strength alpha=1.
reg = Lasso(alpha=1)
reg.fit(x_train, y_train)

In [None]:
# R^2 of the Lasso model on both splits, reported as a percentage.
r2_train = reg.score(x_train, y_train)
r2_test = reg.score(x_test, y_test)

print('R squared training set', round(r2_train*100, 2))
print('R squared test set', round(r2_test*100, 2))

In [None]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the Lasso model, first on the rows it was fitted on ...
pred_train = reg.predict(x_train)
mse_train = mean_squared_error(y_train, pred_train)
print('MSE training set', round(mse_train, 2))

# ... then on the held-out rows.
pred = reg.predict(x_test)
mse_test = mean_squared_error(y_test, pred)
print('MSE test set', round(mse_test, 2))

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Ridge (L2-regularised) regression with a small penalty, alpha=0.01.
rr = Ridge(alpha=0.01)
rr.fit(x_train, y_train)

# RMSE on the training split.
pred_train_rr = rr.predict(x_train)
rmse_train_rr = np.sqrt(mean_squared_error(y_train, pred_train_rr))
print(rmse_train_rr)

# RMSE on the held-out split.
pred_test_rr = rr.predict(x_test)
rmse_test_rr = np.sqrt(mean_squared_error(y_test, pred_test_rr))
print(rmse_test_rr)

In [None]:
from sklearn.linear_model import ElasticNet

In [None]:
# Elastic net (combined L1/L2 penalty) with alpha=0.01.
model_enet = ElasticNet(alpha=0.01)
model_enet.fit(x_train, y_train)

# RMSE on the training split.
pred_train_enet = model_enet.predict(x_train)
rmse_train_enet = np.sqrt(mean_squared_error(y_train, pred_train_enet))
print(rmse_train_enet)

# RMSE on the held-out split.
pred_test_enet = model_enet.predict(x_test)
rmse_test_enet = np.sqrt(mean_squared_error(y_test, pred_test_enet))
print(rmse_test_enet)

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Decision-tree regressor, seeded for reproducible tie-breaking between splits.
regressor = DecisionTreeRegressor(random_state=0)

# Fit on the training split only. Fitting on the full X/Y would let the tree
# see the held-out rows and make any test-set score meaninglessly optimistic.
regressor.fit(x_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Score the decision tree itself. `predictions` holds the LinearRegression
# output, so the tree needs its own predictions for the held-out rows.
tree_pred = regressor.predict(x_test)
mean_absolute_error(y_test, tree_pred)

### XGBoost regressor

In [None]:
%pip install xgboost

In [None]:
import xgboost as xgb

# Keep the fitted model under its own name so the `xgb` module alias is not
# replaced by a model instance (which would break any later `xgb.<...>` call).
xgb_model = xgb.XGBRegressor()
xgb_model.fit(x_train, y_train)

# NOTE: this reuses `y_pred`, replacing the random-forest predictions.
y_pred = xgb_model.predict(x_test)
mean_absolute_error(y_test, y_pred)

# apaa

In [None]:
# Disabled: share of linear-regression predictions that equal the target exactly.
# NOTE(review): exact equality between continuous predictions and targets will
# rarely hold, and y_test[i] looks rows up by index label on a shuffled split,
# so this is not a usable metric as written -- rely on MAE / RMSE instead.
# sum = 0
# for i in range(len(predictions)):
#     if predictions[i] == y_test[i]:
#         sum=sum+1

# accuracy = sum/len(predictions)*100
# accuracy

In [None]:
# Original author's note: do not run this cell yet.
# NOTE(review): the "query job code indexing" cell reads `arr`, so this cell
# has to run before that one.

# A job code is kept only when it has more than this many payroll rows.
MIN_ROWS_PER_JOB_CODE = 1000
CACHE_DIR = './cache'
CACHE_FILE = os.path.join(CACHE_DIR, 'ids.txt')

try:
    # Fast path: reuse the job codes written by a previous run (one per line).
    with open(CACHE_FILE, "r") as f:
        arr = [int(line) for line in f]
except (OSError, ValueError):
    # Cache missing, unreadable or corrupt: rebuild it from the payroll table.
    os.makedirs(CACHE_DIR, exist_ok=True)

    # Count rows per job code once instead of re-grouping the table per code.
    rows_per_code = payroll['Job Code'].value_counts()

    # Keep first-appearance order, and store ints so both paths yield the same
    # element type (int() assumes integer-valued job codes, as the cache reader does).
    arr = [
        int(code)
        for code in payroll['Job Code'].unique()
        if rows_per_code.loc[code] > MIN_ROWS_PER_JOB_CODE
    ]

    with open(CACHE_FILE, 'w') as f:
        f.writelines('%s\n' % code for code in arr)

# Result: `arr` -- list of job codes (ints) with more than MIN_ROWS_PER_JOB_CODE rows.

In [None]:
# Training subset for job code 1172 -- this is the one to use.

# Group by the plain column name: calling get_group() with a scalar key on a
# length-1 list grouper is deprecated since pandas 2.2.
group_1172 = payroll.groupby('Job Code').get_group(1172)

_sorted = group_1172.sort_values('Employee Identifier')

# Rows: fiscal period. Columns: employee. Cells: base pay (pivot_table's default
# aggregation averages any duplicate period/employee entries).
fiscal = pd.pivot_table(_sorted, values='Base Pay', index=['Fiscal Period'], columns='Employee Identifier')

# Turn pay levels into period-over-period changes for each employee.
fiscal = fiscal.diff()

In [None]:
# Cleaned table for display: missing differences become 0 and the 2016.00 row
# is removed (presumably the first period, whose difference is undefined --
# TODO confirm). errors='ignore' keeps the cell re-runnable once the row is gone.
fiscal = fiscal.fillna(0).drop(index=2016.00, errors='ignore')

fiscal

In [None]:


# Disabled sanity check: lists the numeric columns that still contain negative
# values. The messages are in Indonesian:
#   "Terdapat nilai negatif pada kolom" = "Negative values found in columns"
#   "Tidak terdapat nilai negatif pada semua kolom numerik." =
#       "No negative values in any numeric column."
# neg_cols = (payroll[numeric_cols] < 0).any()

# if neg_cols.any():
#     print("Terdapat nilai negatif pada kolom: ", end="")
#     print(", ".join(neg_cols[neg_cols == True].index))
# else:
#     print("Tidak terdapat nilai negatif pada semua kolom numerik.")

In [None]:
# Already verified OK 👍👍
# Disabled sanity check: lists the columns that still contain NaN values.
# The messages are in Indonesian:
#   "Terdapat nilai NaN pada kolom" = "NaN values found in columns"
#   "Tidak terdapat nilai NaN pada semua kolom." = "No NaN values in any column."

# null_cols = payroll.isnull().any()

# if null_cols.any():
#     print("Terdapat nilai NaN pada kolom: ", end="")
#     print(", ".join(null_cols[null_cols == True].index))
# else:
#     print("Tidak terdapat nilai NaN pada semua kolom.")

In [None]:
# Show the payroll rows of every job code collected in `arr`.

for val in arr:
    # Cast defensively: a job code held as a string would never compare equal
    # to the numeric 'Job Code' values and would give an empty frame.
    job_code = int(val)
    new_df = payroll[payroll['Job Code'] == job_code]
    print(f"DataFrame for job code {val}:")
    display(new_df)