## 1.导入依赖包

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

## 2.数据预处理

### 2.1读取数据集并查看前5条数据

In [2]:
# Load the HR dataset from a path relative to the notebook and preview the first five rows.
data = pd.read_csv('../../data/HR_comma_sep.csv')
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,part,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


### 2.2查看数据结构

In [3]:
# (n_rows, n_columns) of the raw table.
data.shape

(14999, 10)

### 2.3查看数据信息

可以看到一共有10列（9个特征加上标签列left），其中part和salary两列是字符串类型，需要转成独热编码

In [4]:
# Per-column dtype and non-null counts; the object-dtype columns are the ones encoded later.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   part                   14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


### 2.4查看一共有几个部门

In [5]:
# Distinct values of the categorical `part` column.
data.part.unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

### 2.5查看一共有几种薪资水平

In [6]:
# Distinct values of the categorical `salary` column.
data.salary.unique()

array(['low', 'medium', 'high'], dtype=object)

### 2.6查看每个薪资水平下各部门有多少人

In [7]:
# Row count for every (salary, part) combination.
data.groupby(['salary', 'part']).size()

salary  part       
high    IT               83
        RandD            51
        accounting       74
        hr               45
        management      225
        marketing        80
        product_mng      68
        sales           269
        support         141
        technical       201
low     IT              609
        RandD           364
        accounting      358
        hr              335
        management      180
        marketing       402
        product_mng     451
        sales          2099
        support        1146
        technical      1372
medium  IT              535
        RandD           372
        accounting      335
        hr              359
        management      225
        marketing       376
        product_mng     383
        sales          1772
        support         942
        technical      1147
dtype: int64

### 2.7对薪资水平进行独热编码

In [8]:
# Preview only: one indicator column per distinct `salary` value; `data` is not modified here.
pd.get_dummies(data.salary)

Unnamed: 0,high,low,medium
0,0,1,0
1,0,0,1
2,0,0,1
3,0,1,0
4,0,1,0
...,...,...,...
14994,0,1,0
14995,0,1,0
14996,0,1,0
14997,0,1,0


### 2.8将薪资水平的独热编码加入数据集中，并将原来字符串特征进行删除

In [9]:
# Replace the string `salary` column with one indicator column per salary level.
# The membership guard makes the cell idempotent: once `salary` has been removed,
# re-running it is a no-op instead of an AttributeError on `data.salary`.
if 'salary' in data.columns:
    data = data.join(pd.get_dummies(data.salary))
    del data['salary']
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,part,high,low,medium
0,0.38,0.53,2,157,3,0,1,0,sales,0,1,0
1,0.8,0.86,5,262,6,0,1,0,sales,0,0,1
2,0.11,0.88,7,272,4,0,1,0,sales,0,0,1
3,0.72,0.87,5,223,5,0,1,0,sales,0,1,0
4,0.37,0.52,2,159,3,0,1,0,sales,0,1,0


### 2.9将部门进行独热编码并删除原来的字符串特征

In [10]:
# Replace the string `part` (department) column with one indicator column per value.
# The membership guard makes the cell idempotent: once `part` has been removed,
# re-running it is a no-op instead of an AttributeError on `data.part`.
if 'part' in data.columns:
    data = data.join(pd.get_dummies(data.part))
    del data['part']
data

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,high,low,...,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
0,0.38,0.53,2,157,3,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,0.80,0.86,5,262,6,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.11,0.88,7,272,4,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.72,0.87,5,223,5,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,0.37,0.52,2,159,3,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
14995,0.37,0.48,2,160,3,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
14996,0.37,0.53,2,143,3,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
14997,0.11,0.96,6,280,4,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0


### 3.0查看各个标签出现的次数

In [11]:
# Class balance of the `left` column, which is used as the training target below.
data.left.value_counts()

0    11428
1     3571
Name: left, dtype: int64

### 3.1改变Y的维度

In [12]:
# Reshape the `left` labels into an (n_rows, 1) column so they line up with
# the model's single-unit output.
Y_data = data.left.values.reshape(-1, 1)
Y_data.shape

(14999, 1)

### 3.2将Y转成张量

In [13]:
# Wrap the label array as a tensor and cast it to float32 (torch.FloatTensor)
# for use as the loss target.
Y = torch.from_numpy(Y_data).type(torch.FloatTensor)

### 3.3将X转成张量

In [14]:
# Feature matrix: every column except the `left` target.
# The explicit float64 cast matters on pandas >= 2.0, where `get_dummies` yields
# bool columns by default; mixed with the int/float columns, plain `.values`
# then returns an object array, which `torch.from_numpy` cannot convert.
X_data = data[[c for c in data.columns if c != 'left']].values.astype(np.float64)
X = torch.from_numpy(X_data).type(torch.FloatTensor)

### 3.4查看X和Y的维度

In [15]:
# Sanity check: X and Y must have the same number of rows.
X.size(), Y.size()

(torch.Size([14999, 20]), torch.Size([14999, 1]))

## 4.搭建模型

### 4.1方法一：

In [16]:
class Model_1(nn.Module):
    """Fully connected binary classifier that uses module-style activations.

    Twenty input features pass through two 64-unit hidden layers, each followed
    by ReLU, and a final single-unit layer followed by a sigmoid.
    """

    def __init__(self):
        super().__init__()
        # Layers are registered in this exact order so that parameter
        # initialisation and the printed module repr stay the same.
        self.linear_1 = nn.Linear(20, 64)
        self.linear_2 = nn.Linear(64, 64)
        self.linear_3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, input):
        """Return the sigmoid output for a batch of feature rows."""
        hidden = self.relu(self.linear_1(input))
        hidden = self.relu(self.linear_2(hidden))
        return self.sigmoid(self.linear_3(hidden))


model_1 = Model_1()
model_1

Model_1(
  (linear_1): Linear(in_features=20, out_features=64, bias=True)
  (linear_2): Linear(in_features=64, out_features=64, bias=True)
  (linear_3): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (sigmoid): Sigmoid()
)

### 4.2方法二：

In [17]:
class Model_2(nn.Module):
    """Fully connected binary classifier that uses functional activations.

    Args:
        in_features: number of input features per sample (defaults to the 20
            columns produced by the preprocessing above).
        hidden_features: width of both hidden layers.
    """

    def __init__(self, in_features=20, hidden_features=64):
        super().__init__()
        self.linear_1 = nn.Linear(in_features, hidden_features)
        self.linear_2 = nn.Linear(hidden_features, hidden_features)
        self.linear_3 = nn.Linear(hidden_features, 1)

    def forward(self, input):
        """Return the sigmoid output, shape (batch, 1), for a batch of feature rows."""
        x = F.relu(self.linear_1(input))
        x = F.relu(self.linear_2(x))
        # torch.sigmoid replaces F.sigmoid, which emits a deprecation warning
        # on several PyTorch 1.x releases; the result is identical.
        return torch.sigmoid(self.linear_3(x))


model_2 = Model_2()
model_2

Model_2(
  (linear_1): Linear(in_features=20, out_features=64, bias=True)
  (linear_2): Linear(in_features=64, out_features=64, bias=True)
  (linear_3): Linear(in_features=64, out_features=1, bias=True)
)

## 5.设置超参数

In [18]:
lr = 0.0001                              # learning rate passed to the optimizer
loss_fn = nn.BCELoss()                   # binary cross-entropy loss
batch_size = 64                          # samples per mini-batch
# Mini-batches per epoch for the manually sliced training loops. Ceiling
# division keeps the final partial batch; floor division skipped the last
# len(data) % batch_size rows in every epoch. Slicing past the end of a tensor
# just returns the shorter remainder, so the loops need no extra bounds check.
iteration = (len(data) + batch_size - 1) // batch_size
epochs = 100                             # full passes over the data

## 6.获取模型

In [19]:
def get_model():
    """Create a freshly initialised Model_2 together with an Adam optimizer for it.

    Returns:
        tuple: ``(model, optimizer)``; the optimizer uses the notebook-level ``lr``.
    """
    network = Model_2()
    optimizer = torch.optim.Adam(network.parameters(), lr=lr)
    return network, optimizer


model, optim = get_model()

## 7.训练模型

### 7.1方法一：用切分的方法切割数据集

In [20]:
# Method 1: walk X and Y in fixed-size slices, one optimizer step per slice.
for epoch in range(epochs):
    for i in range(iteration):
        start = i * batch_size
        end = start + batch_size
        x = X[start : end]
        y = Y[start : end]
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        optim.zero_grad()   # clear gradients accumulated by the previous step
        loss.backward()
        optim.step()
    # Report the loss over the whole dataset once per epoch. Inside no_grad the
    # result carries no graph, so .item() alone is enough (no legacy `.data`).
    with torch.no_grad():
        print('epoch: ', epoch, 'loss: ', loss_fn(model(X), Y).item())



epoch:  0 loss:  0.6863037347793579
epoch:  1 loss:  0.6943723559379578
epoch:  2 loss:  0.6791393160820007
epoch:  3 loss:  0.6683120131492615
epoch:  4 loss:  0.657041072845459
epoch:  5 loss:  0.6455378532409668
epoch:  6 loss:  0.6336991190910339
epoch:  7 loss:  0.6348214745521545
epoch:  8 loss:  0.6224115490913391
epoch:  9 loss:  0.6115456819534302
epoch:  10 loss:  0.6017561554908752
epoch:  11 loss:  0.6121525764465332
epoch:  12 loss:  0.6101523041725159
epoch:  13 loss:  0.5942492485046387
epoch:  14 loss:  0.5839657783508301
epoch:  15 loss:  0.5773881077766418
epoch:  16 loss:  0.571406364440918
epoch:  17 loss:  0.5671684741973877
epoch:  18 loss:  0.5637549161911011
epoch:  19 loss:  0.5612894296646118
epoch:  20 loss:  0.5592552423477173
epoch:  21 loss:  0.5575058460235596
epoch:  22 loss:  0.5567304491996765
epoch:  23 loss:  0.5560505986213684
epoch:  24 loss:  0.5566777586936951
epoch:  25 loss:  0.5561623573303223
epoch:  26 loss:  0.5562872290611267
epoch:  27 lo

### 7.2方法二：用TensorDataset的方法切割数据集

In [21]:
# Method 2: TensorDataset pairs X and Y so a single slice yields both tensors.
hr_dataset = TensorDataset(X, Y)
# The original bare `len(hr_dataset)` / `hr_dataset[66 : 68]` expressions were
# mid-cell and therefore displayed nothing; an explicit check replaces them.
assert len(hr_dataset) == len(X)

model, optim = get_model()

for epoch in range(epochs):
    for i in range(iteration):
        x, y = hr_dataset[i * batch_size: i * batch_size + batch_size]
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        optim.zero_grad()   # clear gradients accumulated by the previous step
        loss.backward()
        optim.step()
    # Full-dataset loss once per epoch; .item() alone suffices under no_grad.
    with torch.no_grad():
        print('epoch: ', epoch, 'loss: ', loss_fn(model(X), Y).item())

epoch:  0 loss:  0.6456089615821838
epoch:  1 loss:  0.617047131061554
epoch:  2 loss:  0.6136002540588379
epoch:  3 loss:  0.6083982586860657
epoch:  4 loss:  0.6053360104560852
epoch:  5 loss:  0.5993931889533997
epoch:  6 loss:  0.5838913917541504
epoch:  7 loss:  0.6005063056945801
epoch:  8 loss:  0.5776329040527344
epoch:  9 loss:  0.5715966820716858
epoch:  10 loss:  0.5686569213867188
epoch:  11 loss:  0.566743791103363
epoch:  12 loss:  0.5750707387924194
epoch:  13 loss:  0.5684741139411926
epoch:  14 loss:  0.5660810470581055
epoch:  15 loss:  0.5636332631111145
epoch:  16 loss:  0.5621346235275269
epoch:  17 loss:  0.5612703561782837
epoch:  18 loss:  0.5605245232582092
epoch:  19 loss:  0.5604021549224854
epoch:  20 loss:  0.5603036880493164
epoch:  21 loss:  0.5601687431335449
epoch:  22 loss:  0.5606539845466614
epoch:  23 loss:  0.5607375502586365
epoch:  24 loss:  0.5615357160568237
epoch:  25 loss:  0.5617081522941589
epoch:  26 loss:  0.5625395774841309
epoch:  27 lo

### 7.3方法三：用DataLoader的方法切割数据集

In [22]:
# Method 3: DataLoader handles batching and reshuffles the rows every epoch.
hr_ds = TensorDataset(X, Y)
hr_dl = DataLoader(hr_ds, batch_size=batch_size, shuffle=True)

model, optim = get_model()

for epoch in range(epochs):
    for x, y in hr_dl:
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        optim.zero_grad()   # clear gradients accumulated by the previous step
        loss.backward()
        optim.step()
    # Full-dataset loss once per epoch; .item() alone suffices under no_grad.
    with torch.no_grad():
        print('epoch: ', epoch, 'loss: ', loss_fn(model(X), Y).item())

epoch:  0 loss:  0.5662083029747009
epoch:  1 loss:  0.5643975138664246
epoch:  2 loss:  0.5578533411026001
epoch:  3 loss:  0.5519608855247498
epoch:  4 loss:  0.5452340841293335
epoch:  5 loss:  0.5377640724182129
epoch:  6 loss:  0.5298526883125305
epoch:  7 loss:  0.5202850699424744
epoch:  8 loss:  0.5134329199790955
epoch:  9 loss:  0.5031102299690247
epoch:  10 loss:  0.4879625737667084
epoch:  11 loss:  0.476669043302536
epoch:  12 loss:  0.46836423873901367
epoch:  13 loss:  0.45677119493484497
epoch:  14 loss:  0.46092063188552856
epoch:  15 loss:  0.43292322754859924
epoch:  16 loss:  0.4233585000038147
epoch:  17 loss:  0.41258731484413147
epoch:  18 loss:  0.4035184979438782
epoch:  19 loss:  0.39519158005714417
epoch:  20 loss:  0.3984805941581726
epoch:  21 loss:  0.3806788921356201
epoch:  22 loss:  0.38321778178215027
epoch:  23 loss:  0.3673507571220398
epoch:  24 loss:  0.3596048057079315
epoch:  25 loss:  0.3575997054576874
epoch:  26 loss:  0.35204359889030457
epoc

### 7.4方法四：用train_test_split的方法切割数据集，并将其划分为训练集和测试集

In [23]:
# Hold out a test set (scikit-learn's default split size). A fixed random_state
# makes the split, and therefore the reported metrics, reproducible across
# kernel restarts.
train_x, test_x, train_y, test_y = train_test_split(X_data, Y_data, random_state=42)
train_x = torch.from_numpy(train_x).type(torch.float32)
train_y = torch.from_numpy(train_y).type(torch.float32)
test_x = torch.from_numpy(test_x).type(torch.float32)
test_y = torch.from_numpy(test_y).type(torch.float32)

train_ds = TensorDataset(train_x, train_y)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

# Evaluation data does not need reshuffling; a fixed order keeps batches stable.
test_ds = TensorDataset(test_x, test_y)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

## 8.添加准确率

In [24]:
def acc(y_pred, y_true, threshold=0.5):
    """Fraction of predictions whose thresholded class equals the label.

    Args:
        y_pred: tensor of predicted probabilities.
        y_true: tensor of labels (expected to hold 0/1 values) that is
            broadcast-compatible with ``y_pred``.
        threshold: probabilities strictly greater than this count as class 1.

    Returns:
        A 0-dim float tensor holding the mean accuracy.
    """
    predicted_class = (y_pred > threshold).type(torch.int32)
    return (predicted_class == y_true).float().mean()

model, optim = get_model()

for epoch in range(epochs):
    for x, y in train_dl:
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        optim.zero_grad()   # clear gradients accumulated by the previous step
        loss.backward()
        optim.step()
    with torch.no_grad():
        # One forward pass per split feeds both the loss and the accuracy,
        # instead of running the model twice on the same data.
        train_pred = model(train_x)
        test_pred = model(test_x)
        epoch_acc = acc(train_pred, train_y)
        epoch_loss = loss_fn(train_pred, train_y)
        epoch_test_acc = acc(test_pred, test_y)
        epoch_test_loss = loss_fn(test_pred, test_y)
        print('epoch: ', epoch, 
              'loss: ', round(epoch_loss.item(), 3), 
              'accuracy: ', round(epoch_acc.item(), 3), 
              'test_loss: ', round(epoch_test_loss.item(), 3), 
              'test_accuracy: ', round(epoch_test_acc.item(), 3))

epoch:  0 loss:  0.564 accuracy:  0.761 test_loss:  0.562 test_accuracy:  0.765
epoch:  1 loss:  0.564 accuracy:  0.761 test_loss:  0.562 test_accuracy:  0.765
epoch:  2 loss:  0.563 accuracy:  0.761 test_loss:  0.561 test_accuracy:  0.765
epoch:  3 loss:  0.562 accuracy:  0.761 test_loss:  0.559 test_accuracy:  0.765
epoch:  4 loss:  0.561 accuracy:  0.761 test_loss:  0.558 test_accuracy:  0.765
epoch:  5 loss:  0.56 accuracy:  0.761 test_loss:  0.558 test_accuracy:  0.765
epoch:  6 loss:  0.559 accuracy:  0.761 test_loss:  0.557 test_accuracy:  0.765
epoch:  7 loss:  0.559 accuracy:  0.761 test_loss:  0.557 test_accuracy:  0.765
epoch:  8 loss:  0.556 accuracy:  0.761 test_loss:  0.554 test_accuracy:  0.765
epoch:  9 loss:  0.556 accuracy:  0.761 test_loss:  0.554 test_accuracy:  0.765
epoch:  10 loss:  0.554 accuracy:  0.761 test_loss:  0.551 test_accuracy:  0.765
epoch:  11 loss:  0.553 accuracy:  0.761 test_loss:  0.55 test_accuracy:  0.765
epoch:  12 loss:  0.554 accuracy:  0.761