## 1.导入依赖包

In [1]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 2.读取数据集

### 2.1可以看到数据集有653行，16列，数据集没有表头，需要设置header=None

In [2]:
data = pd.read_csv('../../data/credit-a.csv', header=None)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0,30.83,0.000,0,0,9,0,1.25,0,0,1,1,0,202,0.0,-1
1,1,58.67,4.460,0,0,8,1,3.04,0,0,6,1,0,43,560.0,-1
2,1,24.50,0.500,0,0,8,1,1.50,0,1,0,1,0,280,824.0,-1
3,0,27.83,1.540,0,0,9,0,3.75,0,0,5,0,0,100,3.0,-1
4,0,20.17,5.625,0,0,9,0,1.71,0,1,0,1,2,120,0.0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
648,0,21.08,10.085,1,1,11,1,1.25,1,1,0,1,0,260,0.0,1
649,1,22.67,0.750,0,0,0,0,2.00,1,0,2,0,0,200,394.0,1
650,1,25.25,13.500,1,1,13,7,2.00,1,0,1,0,0,200,1.0,1
651,0,17.92,0.205,0,0,12,0,0.04,1,1,0,1,0,280,750.0,1


### 2.2查看数据集信息，没有缺失值

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 653 entries, 0 to 652
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       653 non-null    int64  
 1   1       653 non-null    float64
 2   2       653 non-null    float64
 3   3       653 non-null    int64  
 4   4       653 non-null    int64  
 5   5       653 non-null    int64  
 6   6       653 non-null    int64  
 7   7       653 non-null    float64
 8   8       653 non-null    int64  
 9   9       653 non-null    int64  
 10  10      653 non-null    int64  
 11  11      653 non-null    int64  
 12  12      653 non-null    int64  
 13  13      653 non-null    int64  
 14  14      653 non-null    float64
 15  15      653 non-null    int64  
dtypes: float64(4), int64(12)
memory usage: 81.8 KB


## 3.定义X和Y

### 3.1数据集的前15列作为X

In [4]:
X = data.iloc[:, :-1]
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0,30.83,0.0,0,0,9,0,1.25,0,0,1,1,0,202,0.0
1,1,58.67,4.46,0,0,8,1,3.04,0,0,6,1,0,43,560.0
2,1,24.5,0.5,0,0,8,1,1.5,0,1,0,1,0,280,824.0
3,0,27.83,1.54,0,0,9,0,3.75,0,0,5,0,0,100,3.0
4,0,20.17,5.625,0,0,9,0,1.71,0,1,0,1,2,120,0.0


### 3.2数据集的最后一列作为Y

In [5]:
Y = data.iloc[:, -1].replace(-1, 0)
Y.unique()

array([0, 1], dtype=int64)

### 3.3将X转化成张量

In [6]:
X = torch.from_numpy(X.values).type(torch.float32)
X.shape

torch.Size([653, 15])

### 3.4将Y转化成张量

In [7]:
Y = torch.from_numpy(Y.values.reshape(-1, 1)).type(torch.float32)
Y.size()

torch.Size([653, 1])

## 4.定义模型

In [8]:
model = torch.nn.Sequential(
    torch.nn.Linear(15, 1),
    torch.nn.Sigmoid()
)

### 4.1查看模型结构

In [9]:
model

Sequential(
  (0): Linear(in_features=15, out_features=1, bias=True)
  (1): Sigmoid()
)

## 5.定义损失函数

In [10]:
loss_fn = torch.nn.BCELoss()

## 6.定义优化器

In [11]:
opt = torch.optim.Adam(model.parameters(), lr=0.001)

## 7.设置超参数

### 7.1定义batch_size

In [12]:
batch_size = 16
iteration = Y.size(0)//16

### 7.2定义epoches

In [13]:
epoches = 1000

## 8.训练模型

In [14]:
for epoch in range(epoches):
    for batch in range(iteration):
        start = batch * batch_size
        end = start + batch_size
        x = X[start : end]
        y = Y[start : end]
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        opt.zero_grad()
        loss.backward()
        opt.step()

## 9.查看各参数的权重

In [15]:
model.state_dict()

OrderedDict([('0.weight',
              tensor([[-2.9088e-01, -7.8744e-03,  3.6571e-02,  4.1217e-01,  1.4214e-01,
                       -4.6635e-03,  2.0556e-01, -1.5504e-01,  4.4945e+00,  3.1283e-01,
                       -1.4354e-01, -2.7798e-01, -2.2207e-01,  2.1885e-03,  9.9200e-05]])),
             ('0.bias', tensor([-0.5532]))])

## 10.查看模型准确率

In [16]:
((model(X).data.numpy() > 0.5).astype('int') == Y.numpy()).mean()

0.8499234303215927