In [38]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data.dataset import random_split, TensorDataset
from torchvision import datasets
from sklearn.metrics import confusion_matrix

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [96]:
data_train = pd.read_csv('train.csv')
data_train_len = data_train.shape[0]
data_test = pd.read_csv('test.csv')
y = data_train.pop('Attrition')
data = pd.concat([data_train, data_test])

data = data.drop(columns=['EmployeeID', 'StandardHours', 'Over18', 'EmployeeCount'])
data.replace({'JobRole':{'Admin': 'Administrative'}}, inplace=True)
# for colname in data.select_dtypes("O"):
#     data[colname], _ = data[colname].factorize()
data.replace({'Education':{1:'Below_college', 2:'College', 3:'Bachelor', 4:'Masters', 5:'Doctor'}}, inplace=True)
data = pd.get_dummies(data)
data = data.drop(columns=['Gender_Female','OverTime_No'])

train = data.iloc[:data_train_len, :]
test = data.iloc[data_train_len:, :]

In [97]:
train.shape

(1340, 48)

In [98]:
def y_enco(y):
    if y == 'No':
        return 0
    elif y == 'Yes':
        return 1
    else:
        pass

y_encoder = y.apply(y_enco)

In [99]:
from imblearn.over_sampling import ADASYN 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
# Data Normalization

scaler = MinMaxScaler(feature_range = (0,1))
scaler.fit(train)
X_norm = scaler.transform(train)

ada = ADASYN(random_state=42)
X_res, y_res = ada.fit_resample(X_norm, y_encoder)
X_res.shape

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.1, random_state=42)
X_train.shape
# ada = ADASYN(random_state=42)
# X_res, y_res = ada.fit_resample(X_train, y_train)
# X_res.shape, y_res.shape

(2116, 48)

In [100]:
X_data = torch.tensor(X_train, dtype = torch.float)
y_data = torch.tensor(y_train.to_numpy().reshape((-1,1)), dtype = torch.float)

dataset = TensorDataset(X_data, y_data)

# train_set, val_set = random_split(dataset, [1071, 269])
train_set, val_set = random_split(dataset, [1500, 616])

train_loader = torch.utils.data.DataLoader(train_set, batch_size=128, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_set, batch_size=128, shuffle=False)


In [127]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(48, 100)
        self.fc2 = nn.Linear(100, 100)
        self.fc3 = nn.Linear(100, 100)
        self.fc4 = nn.Linear(100, 100)
        self.fc5 = nn.Linear(100, 100)
        self.fc6 = nn.Linear(100, 100)
        self.fc7 = nn.Linear(100, 1)
        self.dropout = nn.Dropout(p=0.2)
    def forward(self, x):
        x = torch.sigmoid(self.fc1(x))
        #x = self.dropout(x)
        x = F.softplus(self.fc2(x))
        #x = self.dropout(x)
        x = F.softplus(self.fc3(x))
        #x = self.dropout(x)
        x = F.softplus(self.fc4(x))
        #x = self.dropout(x)
        x = F.softplus(self.fc5(x))
        x = F.softplus(self.fc6(x))
        x = torch.sigmoid(self.fc7(x))
        return x

In [128]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device =='cuda':
    print("Train on GPU...")
else:
    print("Train on CPU...")
max_epochs = 80

Train on GPU...


In [129]:
loss_list, acc_list = [], []

criterion = nn.MSELoss()
net = Net().to(device)
optimizer = optim.Adam(net.parameters(), lr=0.01)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)

for epoch in range(max_epochs):
    net.train()
    epoch_loss = 0.0
    for batch_idx, (data, labels) in enumerate(train_loader):
        data, labels = data.to(device), labels.to(device)

        optimizer.zero_grad()
        output = net(data)
        loss = criterion(output, labels)

        loss.backward()
        optimizer.step()
        
        epoch_loss += loss

    avg_loss = epoch_loss/(batch_idx+1)
    loss_list.append(avg_loss.cpu().detach().numpy())

    # validation
    net.eval()
    with torch.no_grad():
        loss_val = 0.0
        correct_val = 0
        for batch_idx, (data, labels) in enumerate(val_loader):
            data, labels = data.to(device), labels.to(device)
            output = net(data)
            loss = criterion(output, labels)
  
            loss_val += loss
            
        avg_loss_val = loss_val/(batch_idx+1)
        avg_acc_val = correct_val/(len(val_loader.dataset))
        
    scheduler.step()

    print('[epoch %d] loss: %.5f val loss: %.5f' % (epoch + 1, avg_loss,  avg_loss_val))

[epoch 1] loss: 0.29093 val loss: 0.25232
[epoch 2] loss: 0.25913 val loss: 0.25398
[epoch 3] loss: 0.25282 val loss: 0.24999
[epoch 4] loss: 0.24940 val loss: 0.24899
[epoch 5] loss: 0.22490 val loss: 0.19079
[epoch 6] loss: 0.14015 val loss: 0.10574
[epoch 7] loss: 0.09870 val loss: 0.07969
[epoch 8] loss: 0.08598 val loss: 0.07255
[epoch 9] loss: 0.08641 val loss: 0.08812
[epoch 10] loss: 0.08007 val loss: 0.06891
[epoch 11] loss: 0.07780 val loss: 0.07427
[epoch 12] loss: 0.07225 val loss: 0.07334
[epoch 13] loss: 0.07094 val loss: 0.06469
[epoch 14] loss: 0.06807 val loss: 0.06575
[epoch 15] loss: 0.06944 val loss: 0.06286
[epoch 16] loss: 0.06457 val loss: 0.07405
[epoch 17] loss: 0.06736 val loss: 0.07354
[epoch 18] loss: 0.06288 val loss: 0.06416
[epoch 19] loss: 0.06459 val loss: 0.05872
[epoch 20] loss: 0.06183 val loss: 0.05841
[epoch 21] loss: 0.05626 val loss: 0.05890
[epoch 22] loss: 0.05396 val loss: 0.05908
[epoch 23] loss: 0.05824 val loss: 0.05756
[epoch 24] loss: 0.0

In [130]:
scaler.fit(test)

X_test_scaler = scaler.transform(test)
X_test_scaler = torch.tensor(X_test_scaler, dtype=torch.float).to(device)
X_test_scaler.shape

torch.Size([336, 48])

In [131]:
y_pred = net(X_test_scaler).cpu().detach().numpy()
y_pred_show = np.zeros((y_pred.shape), dtype = int)
for i in range(y_pred.shape[0]):
    if y_pred[i] >= 0.5:
        y_pred_show[i] = 1
    else:
        y_pred_show[i] = 0


In [132]:
id = range(0,len(y_pred_show))
y_test = pd.DataFrame()
y_test['Id'] = id
y_test['Predicted'] = y_pred_show
y_test.to_csv('pred_dnn_ad.csv', index = 0)

In [63]:
from sklearn.metrics import f1_score
for x,y in test_loader:
    x = x.to(device)

    y_pred_val = net(x).cpu().detach().numpy()
    y_pred_show = np.zeros((y_pred_val.shape), dtype = int)
    for i in range(y_pred_val.shape[0]):
        if y_pred_val[i] >= 0.5:
            y_pred_show[i] = 1
        else:
            y_pred_show[i] = 0

    print(f1_score(y, y_pred_show))

0.9619047619047619
1.0
