In [99]:
# Data manipulation
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.model_selection import  train_test_split

import sys
path_append = "../"
sys.path.append(path_append)  # Go up one directory from where you are.

from ccnets.config import get_parser
from ccnets.ccnets import CCNets
from ccnets.resnets import ResNets
from nn.custom_deepfm import DeepFM
from nn.custom_dnn import ResMLP, MLP 
from ccnets.utils.log import create_log_details, create_log_name
from ccnets.utils.setting import set_random_seed
import torch
import os
from torch.utils.tensorboard import SummaryWriter


In [100]:
# Load the raw employee table from the notebook-relative data folder and preview it.
# NOTE(review): the preview output includes employee names and e-mail addresses —
# clear or redact the cell output before sharing this notebook.
df = pd.read_csv('./data/employee_data.csv')
df.head()

Unnamed: 0,EmpID,FirstName,LastName,StartDate,ExitDate,Title,Supervisor,ADEmail,BusinessUnit,EmployeeStatus,...,Division,DOB,State,JobFunctionDescription,GenderCode,LocationCode,RaceDesc,MaritalDesc,Performance Score,Current Employee Rating
0,3427,Uriah,Bridges,20-Sep-19,,Production Technician I,Peter Oneill,uriah.bridges@bilearner.com,CCDR,Active,...,Finance & Accounting,07-10-1969,MA,Accounting,Female,34904,White,Widowed,Fully Meets,4
1,3428,Paula,Small,11-Feb-23,,Production Technician I,Renee Mccormick,paula.small@bilearner.com,EW,Active,...,Aerial,30-08-1965,MA,Labor,Male,6593,Hispanic,Widowed,Fully Meets,3
2,3429,Edward,Buck,10-Dec-18,,Area Sales Manager,Crystal Walker,edward.buck@bilearner.com,PL,Active,...,General - Sga,06-10-1991,MA,Assistant,Male,2330,Hispanic,Widowed,Fully Meets,4
3,3430,Michael,Riordan,21-Jun-21,,Area Sales Manager,Rebekah Wright,michael.riordan@bilearner.com,CCDR,Active,...,Finance & Accounting,04-04-1998,ND,Clerk,Male,58782,Other,Single,Fully Meets,2
4,3431,Jasmine,Onque,29-Jun-19,,Area Sales Manager,Jason Kim,jasmine.onque@bilearner.com,TNS,Active,...,General - Con,29-08-1969,FL,Laborer,Female,33174,Other,Married,Fully Meets,3


In [101]:
# Column overview of the raw frame: dtypes and non-null counts
# (ExitDate is only partially populated, the other listed columns are complete).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 26 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   EmpID                       3000 non-null   int64 
 1   FirstName                   3000 non-null   object
 2   LastName                    3000 non-null   object
 3   StartDate                   3000 non-null   object
 4   ExitDate                    1533 non-null   object
 5   Title                       3000 non-null   object
 6   Supervisor                  3000 non-null   object
 7   ADEmail                     3000 non-null   object
 8   BusinessUnit                3000 non-null   object
 9   EmployeeStatus              3000 non-null   object
 10  EmployeeType                3000 non-null   object
 11  PayZone                     3000 non-null   object
 12  EmployeeClassificationType  3000 non-null   object
 13  TerminationType             3000 non-null   obje

In [102]:
# Columns removed before modelling: identifier-like fields (IDs, names, e-mail,
# supervisor), date fields, the termination description and the location code.
drop_cols = ['EmpID', 'ADEmail', 'Supervisor','FirstName', 'LastName', 'StartDate', 'ExitDate', 'TerminationDescription', 'DOB', 'LocationCode']

# Rebind instead of mutating in place, and tolerate already-removed columns, so
# the cell can be re-run without raising KeyError. `columns=` already selects
# the axis, so the redundant `axis=1` is dropped.
df = df.drop(columns=drop_cols, errors="ignore")
df.head()

Unnamed: 0,Title,BusinessUnit,EmployeeStatus,EmployeeType,PayZone,EmployeeClassificationType,TerminationType,DepartmentType,Division,State,JobFunctionDescription,GenderCode,RaceDesc,MaritalDesc,Performance Score,Current Employee Rating
0,Production Technician I,CCDR,Active,Contract,Zone C,Temporary,Unk,Production,Finance & Accounting,MA,Accounting,Female,White,Widowed,Fully Meets,4
1,Production Technician I,EW,Active,Contract,Zone A,Part-Time,Unk,Production,Aerial,MA,Labor,Male,Hispanic,Widowed,Fully Meets,3
2,Area Sales Manager,PL,Active,Full-Time,Zone B,Part-Time,Unk,Sales,General - Sga,MA,Assistant,Male,Hispanic,Widowed,Fully Meets,4
3,Area Sales Manager,CCDR,Active,Contract,Zone A,Full-Time,Unk,Sales,Finance & Accounting,ND,Clerk,Male,Other,Single,Fully Meets,2
4,Area Sales Manager,TNS,Active,Contract,Zone A,Temporary,Unk,Sales,General - Con,FL,Laborer,Female,Other,Married,Fully Meets,3


In [103]:
# Re-inspect the schema of the reduced frame (16 columns remain in the captured output).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Title                       3000 non-null   object
 1   BusinessUnit                3000 non-null   object
 2   EmployeeStatus              3000 non-null   object
 3   EmployeeType                3000 non-null   object
 4   PayZone                     3000 non-null   object
 5   EmployeeClassificationType  3000 non-null   object
 6   TerminationType             3000 non-null   object
 7   DepartmentType              3000 non-null   object
 8   Division                    3000 non-null   object
 9   State                       3000 non-null   object
 10  JobFunctionDescription      3000 non-null   object
 11  GenderCode                  3000 non-null   object
 12  RaceDesc                    3000 non-null   object
 13  MaritalDesc                 3000 non-null   obje

In [104]:
# Class balance of the target column: rating 3 dominates, ratings 1 and 5 are the rarest.
df['Current Employee Rating'].value_counts()

Current Employee Rating
3    1530
2     510
4     419
1     271
5     270
Name: count, dtype: int64

In [105]:
# Separate the label column from the feature columns.
target = 'Current Employee Rating'

y = df[target]
X = df.drop(columns=target)

In [106]:
# One-hot encode every feature column; the sparse result is densified so it can
# be handled as a regular array afterwards.
oh_encoder = OneHotEncoder()
oh_encoder.fit(X)
X = oh_encoder.transform(X).toarray()
X


array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [107]:
# Turn the rating into integer 0/1 indicator columns, one per distinct rating
# (the rating is stringified first, so the columns are named by rating text).
y = pd.get_dummies(y.astype(str)).astype(int)

In [108]:
# Wrap the dense one-hot array in a DataFrame; columns are plain integer positions.
X = pd.DataFrame(X)
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,208,209,210,211,212,213,214,215,216,217
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [109]:
# Sanity-check the encoded features and labels before building datasets.
print(X.shape)
print(y.shape)
# DataFrame.info() prints its summary itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
X.info()

(3000, 218)
(3000, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Columns: 218 entries, 0 to 217
dtypes: float64(218)
memory usage: 5.0 MB
None


In [110]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each feature row with its label.

    Args:
        x: Indexable container of feature rows; its length defines the
            dataset size.
        y: Indexable container of labels, addressed with the same index as
            ``x``.
    """

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        """Return the number of samples, taken from the feature container."""
        return len(self.x)

    def __getitem__(self, index):
        """Return ``(features, label)`` for ``index`` as float32 tensors.

        The label gets one extra trailing dimension, so a scalar label is
        returned with shape ``(1,)``.
        """
        features = torch.tensor(self.x[index], dtype=torch.float32)
        target = torch.tensor(self.y[index], dtype=torch.float32)
        return features, target.unsqueeze(-1)


In [111]:
# Load the project's default arguments, then pick the compute device:
# the first CUDA device when CUDA is available and GPUs are requested, else CPU.
args = get_parser()
if torch.cuda.is_available() and args.ngpu > 0:
    args.device = torch.device('cuda:0')
else:
    args.device = torch.device("cpu")

In [112]:
# Derive per-notebook output folders (models, temporary models, logs) from the
# notebook's own file name.
# NOTE(review): `__vsc_ipynb_file__` is looked up in the notebook namespace and
# appears to be provided by the VS Code Jupyter integration — this line raises
# KeyError in other front-ends; confirm before running elsewhere.
import IPython ; file_path = IPython.extract_module_locals()[1]['__vsc_ipynb_file__']
from pathlib import Path
file_name = Path(file_path).stem
model_path = path_append + f"models/{file_name}/"
temp_path = path_append + f"models/{'temp_'}{file_name}/"
log_path = path_append + f"log/{file_name}/"

# makedirs(exist_ok=True) also creates the missing parent folders ("models/",
# "log/") on a fresh checkout and is a no-op when the folder already exists,
# replacing the exists-then-mkdir sequence that failed without the parents.
for out_dir in (temp_path, model_path, log_path):
    os.makedirs(out_dir, exist_ok=True)

args.model_path = model_path
args.temp_path = temp_path

In [113]:
# --- Optimisation schedule ---
args.num_epoch = 1000
args.lr = 2e-4
args.batch_size = 64
args.step_size = 10

# --- Network size ---
args.num_layer = 3
args.hidden_size = 256

# --- Data dimensions ---
# Take the observation width from the encoded feature frame instead of
# hard-coding 218, so it stays correct if the categories in the data change.
args.obs_size = X.shape[1]
# A single label column is used (the split cell keeps one column of `y`).
args.label_size = 1
# Explanation width is half of the observation width, as before (218 // 2).
args.explain_size = args.obs_size // 2
args.seq_len = 0

args.num_checkpoints = 100
args.use_one_hot = False

# --- Model wiring options (values are interpreted by the ccnets package) ---
args.reasoner_joint_type = "add"
args.producer_joint_type = "add"
args.label_type = "C" 

args.obs_fn = "none"
args.label_fn = "none"

In [114]:
test_size = 0.2

# Full-range slice: keeps every row (presumably a hook for sub-sampling while
# experimenting — confirm with the author).
X = X[:]

# Ordered split without shuffling: the leading rows train, the trailing rows test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, shuffle=False
)

# Features become plain NumPy arrays.
X_train, X_test = X_train.to_numpy(), X_test.to_numpy()

# Only the last indicator column of the one-hot labels is kept, giving a single
# 0/1 target per row.
y_train = y_train.iloc[:, -1].to_numpy()
y_test = y_test.iloc[:, -1].to_numpy()

trainset, testset = Dataset(X_train, y_train), Dataset(X_test, y_test)



In [115]:
# Report the shape of each split, one per line: train/test features, then train/test labels.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(2400, 218)
(600, 218)
(2400,)
(600,)


In [116]:

# Loss / error settings for this run (interpreted by the ccnets package).
args.loss_type = "L1"
args.error_type = "Sub"
args.loss_reduction = "all"
args.error_reduction = "none"
# Build the TensorBoard log directory name from the current settings via the
# project's logging helpers, then attach the writer to args.
log_details = create_log_details(args)
args.log = SummaryWriter(log_dir=create_log_name(log_path, log_details))
# Seed before the model is constructed — presumably so initialisation and
# training are repeatable; confirm which RNGs the helper covers.
set_random_seed(0)

# Build CCNets from the three network classes and train it; the test set is
# passed alongside the training set (the captured log reports an "auc" value,
# presumably computed on it — confirm in CCNets.train).
ccnets = CCNets(args, MLP, DeepFM, ResMLP)
ccnets.train(trainset, testset)

# Disabled alternative run: ResNets with an MSE loss, kept for reference only.
# args.loss_type = "MSE"
# args.loss_reduction = "all"
# log_details = create_log_details(args)
# args.log = SummaryWriter(log_dir=create_log_name(log_path, log_details))
# set_random_seed(0)
# resnets = ResNets(args, MLP, DeepFM)
# resnets.train(trainset, testset)

  0%|          | 0/1000 [00:00<?, ?it/s]

[2/1000][25/38][Time 2.31]
Opt-Adam lr: 0.00019908190206581545
Inf: 0.0007	Gen: 0.0757	Rec: 0.0755	E: 0.0009	R: 0.0005	P: 0.1506
auc: 0.5142
[5/1000][14/38][Time 2.18]
Opt-Adam lr: 0.0001981680186507147
Inf: 0.0001	Gen: 0.0690	Rec: 0.0690	E: 0.0001	R: 0.0001	P: 0.1379
auc: 0.4624
[8/1000][3/38][Time 2.25]
Opt-Adam lr: 0.0001972583304079914
Inf: 0.0001	Gen: 0.0686	Rec: 0.0686	E: 0.0001	R: 0.0001	P: 0.1371
auc: 0.4787
[10/1000][29/38][Time 2.46]
Opt-Adam lr: 0.00019635281807975005
Inf: 0.0001	Gen: 0.0680	Rec: 0.0680	E: 0.0002	R: 0.0001	P: 0.1358
auc: 0.5005
[13/1000][18/38][Time 2.19]
Opt-Adam lr: 0.0001954514624964984
Inf: 0.0002	Gen: 0.0649	Rec: 0.0648	E: 0.0003	R: 0.0001	P: 0.1295
auc: 0.4889
[16/1000][7/38][Time 2.21]
Opt-Adam lr: 0.00019455424457674148
Inf: 0.0002	Gen: 0.0598	Rec: 0.0597	E: 0.0002	R: 0.0001	P: 0.1193
auc: 0.5215
[18/1000][33/38][Time 2.35]
Opt-Adam lr: 0.00019366114532657779
Inf: 0.0001	Gen: 0.0593	Rec: 0.0593	E: 0.0001	R: 0.0001	P: 0.1186
auc: 0.4661
[21/1000][22/3