# AD Competition
### 本baseline采用pytorch框架，应用ModelArts的Notebook进行开发

## 加载依赖项

In [1]:
import moxing as mox

import os
import sys
import numpy as np
import warnings
import pandas as pd
from pandas.errors import EmptyDataError

import torch
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim

print("依赖项已加载")

INFO:root:Using MoXing-v2.0.0.rc0-19e4d3ab
INFO:root:Using OBS-Python-SDK-3.20.9.1


No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
依赖项已加载


## 加载数据
### 将您OBS桶中的数据文件加载到此notebook中
### *请将以下代码中"ad-competiton/baseline/train_data"修改成您OBS桶地址中存放训练数据的路径,
### *这里的数据包括所有数据集及标签文件，共2601个

In [2]:
mox.file.copy_parallel('obs://ad-competiton/baseline/train_data','./train/')
print('数据已加载')

INFO:root:Listing OBS: 1000
INFO:root:Listing OBS: 2000
INFO:root:pid: None.	1000/2601
INFO:root:pid: None.	2000/2601


数据已加载


## 生成数据集

In [3]:
def load_data(subject_dir, csv_path):
    df = pd.read_csv(csv_path, index_col=0)
    subjects = os.listdir(subject_dir)

    x = []
    y = []
    for subject in subjects:
        features_path = os.path.join(subject_dir, subject)
        if not os.path.exists(features_path) or not features_path.endswith('npy'):
            continue
        else:
            row = df.loc[subject.split('.')[0]]
            label = int(row['Label'])

            x.append(np.load(features_path))
            y.append(label)

    x = np.array(x)
    y = np.array(y)
    return x, y

class MyDataset(data.Dataset):
    def __init__(self, x, y, device):
        self.x = torch.from_numpy(x).to(torch.float32)
        self.y = torch.from_numpy(y)
        self.device = device

    def __getitem__(self, index):
        xi = self.x[index].to(self.device)
        yi = self.y[index].to(self.device)
        return xi, yi

    def __len__(self):
        return len(self.y)

train_x, train_y = load_data(r'./train/train', r'./train/train_open.csv')

# 数据预处理
train_x = np.nan_to_num(train_x, nan=0.0, posinf=0, neginf=0)
mean = np.mean(train_x, axis=0)
std = np.std(train_x, axis=0)
train_x = (train_x - mean) / std
train_x = np.nan_to_num(train_x, nan=0.0, posinf=0, neginf=0)

# 生成数据集
dataset = MyDataset(train_x, train_y, torch.device("cpu:0"))
data_loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

print('数据集已生成')




数据集已生成


## 创建模型
### 使用pytorch自定义3层神经网络、softmax输出

In [4]:
class Linear(nn.Module):
    def __init__(self, in_dim, 
                 n_hidden_1, n_hidden_2,
                 out_dim, dropout_p=0.5):
        super().__init__()
        self.layer1 = nn.Linear(in_dim, n_hidden_1)
        self.layer2 = nn.Linear(n_hidden_1, n_hidden_2)
        self.layer3 = nn.Linear(n_hidden_2, out_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_p)
        self.softmax = nn.Softmax(dim=1)
 
    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.layer2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.layer3(x)
        x = self.softmax(x)
        return x

model = Linear(28169, 4096, 512, 3, dropout_p=0.)

print('模型已创建')

模型已创建


## 加载模型
### 若OBS桶中存在已保存的模型，可加载后继续训练
### *请将以下代码中"ad-competiton/baseline/model/model.pth"修改成您OBS桶地址中存放的模型文件名

In [5]:
model_fileName = 'obs://ad-competiton/baseline/model/model.pth'
if mox.file.exists(model_fileName):
    # 如果模型存在，则加载之
    mox.file.copy(model_fileName,'./train/model.pth')
    model.load_state_dict(torch.load('./train/model.pth', map_location ='cpu'))
    print('模型加载成功')
else:
    print('不存在已保存的模型')

不存在已保存的模型


## 训练模型
### 损失函数采用CrossEntropy，训练方法采用SGD

In [6]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# 训练次数设为10次
for epoch in range(10):
    running_loss = 0.0
    for i, data in enumerate(data_loader, 0):
        inputs, labels = data

        # 训练模型
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # 输出模型当前状态
        running_loss += loss.item()
        if i % 20 == 19:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 20))
            running_loss = 0.0

print('模型训练完毕')

[1,    20] loss: 1.040
[1,    40] loss: 0.990
[1,    60] loss: 0.969
[1,    80] loss: 0.944
[2,    20] loss: 0.891
[2,    40] loss: 0.909
[2,    60] loss: 0.920
[2,    80] loss: 0.919
[3,    20] loss: 0.881
[3,    40] loss: 0.851
[3,    60] loss: 0.876
[3,    80] loss: 0.868
[4,    20] loss: 0.824
[4,    40] loss: 0.824
[4,    60] loss: 0.838
[4,    80] loss: 0.818
[5,    20] loss: 0.827
[5,    40] loss: 0.804
[5,    60] loss: 0.778
[5,    80] loss: 0.811
[6,    20] loss: 0.794
[6,    40] loss: 0.745
[6,    60] loss: 0.771
[6,    80] loss: 0.781
[7,    20] loss: 0.753
[7,    40] loss: 0.742
[7,    60] loss: 0.780
[7,    80] loss: 0.747
[8,    20] loss: 0.770
[8,    40] loss: 0.756
[8,    60] loss: 0.768
[8,    80] loss: 0.743
[9,    20] loss: 0.729
[9,    40] loss: 0.747
[9,    60] loss: 0.710
[9,    80] loss: 0.757
[10,    20] loss: 0.721
[10,    40] loss: 0.698
[10,    60] loss: 0.722
[10,    80] loss: 0.719
模型训练完毕


## 保存模型
### 将模型保存到notebook本地并复制到OBS中
### *请将以下代码中"ad-competiton/baseline/model"修改成您OBS桶地址中存放模型的路径

In [8]:
np.save('./mean.npy', mean)
np.save('./std.npy', std)
torch.save(model.state_dict(), './model.pth')

model_path = 'obs://ad-competiton/baseline/model'

mox.file.copy('./mean.npy', model_path + '/mean.npy')
mox.file.copy('./std.npy', model_path + '/std.npy')
mox.file.copy('./model.pth', model_path + '/model.pth')

print('模型已保存')

模型已保存
