In [13]:
# 调用需要的库
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [14]:
# 定义 MLP 模型

class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.hidden_layer = nn.Linear(input_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.hidden_layer(x)
        x = self.relu(x)
        x = self.output_layer(x)
        x = self.sigmoid(x)
        return x

In [15]:
class MyDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        # if torch.is_tensor(idx):
        #     idx = idx.tolist()
        # 确保索引在范围内
        if idx >= len(self.data):
            raise IndexError("Index out of range")
        
        sample = self.data.iloc[idx].values  # 按行号索引数据
        label = self.labels.iloc[idx].values 
        """
        此处data为dataFrame类型,labels为Series类型
        dataFrame和series类型经过切片之后均为series类型
        此处.values将他们转换为numpy数组类型
        """

        return sample, label

def Standardization(data):
    """
    标准化函数:将一组数据转化为均值为0,标准差为1的标准正态分布
    """
    return (data-data.mean())/data.std()
def Normolization(data):
    """
    归一化函数：将一组数据按比例缩放到(0,1)
    """
    return (data-data.min())/(data.max()-data.min())

def train_test_split(X, y, test_size=0.2, random_state=None):
    """
    将原有数据集拆分成数据集与测试集的函数
    X: 特征向量
    y: 标签值
    test_size: test数据集占整个数据集的比例
    randem_state: 随机种子
    """

    # 设置随机种子
    if random_state is not None:
        np.random.seed(random_state)
    
    # 确定测试集的大小（样本数）
    num_test = int(len(X) * test_size)
    
    # 生成随机索引
    indices = np.random.permutation(len(X))
    
    # 切分数据集
    X_train = X.iloc[indices[num_test:]]
    X_test = X.iloc[indices[:num_test]]
    y_train = y.iloc[indices[num_test:]]
    y_test = y.iloc[indices[:num_test]]
    
    return X_train, X_test, y_train, y_test

def data_preprocess(data_raw):
    """
    数据预处理函数：
        我们通过补中位数的方式,对age列进行补全
        我们通过补众数的方式来,对Embarked进行补全
        由于cabin缺了大部分数据,所以我们直接用U代表Unkown对其进行填补
    """
    # 数据补全与特征缩放
    data_raw['Age'].fillna(data_raw['Age'].median(),inplace=True)
    data_raw['Embarked'] = data_raw['Embarked'].fillna(data_raw['Embarked'].mode().iloc[0]) 
    data_raw['Cabin'].fillna('U',inplace=True)
    data_raw['Age']  = Standardization(data_raw['Age'])
    data_raw['Fare']  = Standardization(data_raw['Fare'])

    # 标签
    target = ['Survived'] 
    y = data_raw[target]
    # 用于预测的特征
    fetures = ['Pclass','Sex','Age','Fare','Parch','SibSp','Embarked'] 
    X = pd.get_dummies(data_raw[fetures],dtype=int)
    
    return X,y

In [16]:
# 主程序

# 读取数据集
data_raw = pd.read_csv("..\week1\dataProcess\泰坦尼克号数据.csv")

# 数据预处理
X, y = data_preprocess(data_raw)

# 将数据集拆分成训练集与测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# 制作数据集
train_set = MyDataset(X_train, y_train)
test_set = MyDataset(X_test, y_test)

# 构造dataLoader
batch_size = 32
train_dataloader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True, num_workers=0)



In [17]:
## 实例化模型
input_size = 10 # 输入特征数
hidden_size = 3 # 隐藏神经元数
output_size = 1 # 输出类别数
model = MLP(input_size,hidden_size,output_size)

In [43]:
## 训练模型

# 定义损失函数和优化器
criterion = nn.BCELoss() # 二分类交叉熵损失
optimizer = optim.Adam(model.parameters(),lr = 1e-5)

# 训练模型
epochs = 1000
for epoch in range(epochs):
    for X_batch,y_batch in train_dataloader:
        # 确保输入数据是 Float 类型
        X_batch = X_batch.float()
        y_batch = y_batch.float()
        
        # 前向传播
        y_hat = model(X_batch)

        # 计算损失
        loss = criterion(y_hat,y_batch)

        # 反向传播与优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

Epoch [10/1000], Loss: 0.7763
Epoch [20/1000], Loss: 0.4132
Epoch [30/1000], Loss: 0.2514
Epoch [40/1000], Loss: 0.3584
Epoch [50/1000], Loss: 0.2945
Epoch [60/1000], Loss: 0.5126
Epoch [70/1000], Loss: 0.1685
Epoch [80/1000], Loss: 0.3682
Epoch [90/1000], Loss: 0.1624
Epoch [100/1000], Loss: 0.4804
Epoch [110/1000], Loss: 0.4041
Epoch [120/1000], Loss: 0.2678
Epoch [130/1000], Loss: 0.3746
Epoch [140/1000], Loss: 0.4208
Epoch [150/1000], Loss: 0.2386
Epoch [160/1000], Loss: 0.2942
Epoch [170/1000], Loss: 0.8145
Epoch [180/1000], Loss: 0.5073
Epoch [190/1000], Loss: 0.2214
Epoch [200/1000], Loss: 0.5488
Epoch [210/1000], Loss: 0.7114
Epoch [220/1000], Loss: 0.4448
Epoch [230/1000], Loss: 0.4847
Epoch [240/1000], Loss: 0.2743
Epoch [250/1000], Loss: 0.3619
Epoch [260/1000], Loss: 0.5608
Epoch [270/1000], Loss: 0.2880
Epoch [280/1000], Loss: 0.3903
Epoch [290/1000], Loss: 0.3251
Epoch [300/1000], Loss: 0.1892
Epoch [310/1000], Loss: 0.3305
Epoch [320/1000], Loss: 0.2525
Epoch [330/1000],

In [48]:
# 模型评估
model.eval()  # 设置模型为评估模式
with torch.no_grad():
    y_hat_test = model(torch.tensor(X_test.values,dtype=torch.float32))
    y_hat_class = (y_hat_test > 0.5).float()
    accuracy = (y_hat_class == torch.tensor(y_test.values,dtype=torch.float32)).float().mean()
    test_loss = criterion(y_hat_test, torch.tensor(y_test.values,dtype=torch.float32))

print(f'Accuracy: {accuracy * 100:.2f}%     Loss: {test_loss:.4f}')

Accuracy: 82.58%     Loss: 0.4009
