In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision import transforms
import torch.utils.data as data
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

import os
import copy
import argparse
import time
import numpy as np
import pandas as pd

In [2]:
def make_dataset(dir):
    """Load the yeast feature table and return standardized features with one-hot labels.

    Reads ``yeast_his3.csv`` from ``dir``, keeps the ``Cgroup`` column plus
    nine feature columns, drops every row that has a missing value in any
    of them, and standardizes the features with a ``StandardScaler`` fitted
    on all remaining rows.

    Args:
        dir: Directory that contains ``yeast_his3.csv``.

    Returns:
        A ``(features, labels)`` pair of equally long lists. ``features``
        holds one float32 array of 9 values per row; ``labels`` holds one
        length-4 one-hot array per row in the order
        no / small / medium / large. A row whose ``Cgroup`` is none of
        those four names gets an all-zero label.
    """
    feature_columns = ["C101", "C103", "C104", "C115", "A101", "A120", "A121", "A122", "A123"]
    # One-hot encoding for each known group name.
    one_hot_by_group = {
        "no": [1, 0, 0, 0],
        "small": [0, 1, 0, 0],
        "medium": [0, 0, 1, 0],
        "large": [0, 0, 0, 1],
    }

    # Load the table and keep only complete rows of the selected columns.
    table = pd.read_csv(os.path.join(dir, "yeast_his3.csv"))
    complete_rows = table[["Cgroup"] + feature_columns].dropna()
    group_names = np.array(complete_rows["Cgroup"])

    # Standardize the feature columns.
    scaled = preprocessing.StandardScaler().fit_transform(complete_rows[feature_columns])

    # Emit one float32 feature vector and one one-hot label per row.
    features = [np.array(row.astype(np.float32)) for row in scaled]
    labels = [np.array(one_hot_by_group.get(name, [0, 0, 0, 0])) for name in group_names]
    return features, labels

In [3]:

# Load the standardized features and their one-hot labels.
X, y = make_dataset("data")
# Fixed seed for both splits: without a random_state, train_test_split
# shuffles differently on every run, so the splits (and every accuracy
# reported below) would change on each Restart & Run All.
SPLIT_SEED = 0
# Hold out 20% of all samples as the test set.
X_tmp, X_test, y_tmp, y_test = train_test_split(
    X, y, test_size=0.20, random_state=SPLIT_SEED)
# Split the remaining samples 75% / 25% into training and validation sets.
X_train, X_val, y_train, y_val = train_test_split(
    X_tmp, y_tmp, test_size=0.25, random_state=SPLIT_SEED)

In [4]:
class DatasetFolder(data.Dataset):
    """Dataset over two parallel, indexable collections of NumPy arrays."""

    def __init__(self, X, y):
        """Store the samples ``X`` and their targets ``y`` without copying."""
        self.samples = X
        self.targets = y

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.samples)

    def __getitem__(self, index):
        """Return the ``index``-th ``(sample, target)`` pair as tensors.

        Both entries are converted with ``torch.from_numpy``.
        """
        raw_sample, raw_target = self.samples[index], self.targets[index]
        return torch.from_numpy(raw_sample), torch.from_numpy(raw_target)

# One dataset per split, keyed by split name.
feature_datasets = dict(
    train=DatasetFolder(X_train, y_train),
    val=DatasetFolder(X_val, y_val),
    test=DatasetFolder(X_test, y_test),
)

In [5]:
# Show the first training example: its feature tensor, then its one-hot target.
sample, target = feature_datasets['train'][0]
print(sample, target, sep="\n")

tensor([ 0.3911,  0.8372,  0.3250,  0.6466,  0.0129, -0.0699,  0.3997, -0.7443,
        -1.0469])
tensor([0, 1, 0, 0], dtype=torch.int32)


In [6]:
# Build one DataLoader per split; each yields mini-batches of `batch_size`.
# Only the training split uses shuffle=True, so its samples are read in a
# random order; validation and test do not need shuffling.
batch_size = 64
workers = 0
dataloaders = {
    split: torch.utils.data.DataLoader(
        feature_datasets[split],
        batch_size=batch_size,
        shuffle=(split == 'train'),
        num_workers=workers,
    )
    for split in ('train', 'val', 'test')
}
# Number of samples in each split.
dataset_sizes = {split: len(feature_datasets[split]) for split in ('train', 'val', 'test')}

In [7]:
class Net(nn.Module):
    """Two-layer fully connected classifier: Linear -> ReLU -> Linear.

    Args:
        in_features: Number of input features per sample (default 9).
        hidden_features: Width of the hidden layer (default 9).
        num_classes: Number of output logits (default 4).

    With the defaults this is the 9 -> 9 -> 4 network; the sizes are
    parameters so the same class can be reused for other feature sets.
    """

    def __init__(self, in_features=9, hidden_features=9, num_classes=4):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, num_classes)

    def forward(self, x):
        """Return the raw class logits for ``x`` (no softmax is applied)."""
        return self.fc2(F.relu(self.fc1(x)))

# From here on: move the network defined above to the chosen device.
# To use a GPU instead of the CPU, set device_name to "cuda"
# (or to a specific device such as "cuda:0").
device_name = "cpu"
device = torch.device(device_name)
model = Net().to(device)

In [8]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Train ``model`` and return it loaded with its best-validation weights.

    Each epoch runs a training pass followed by a validation pass and
    prints loss, accuracy and elapsed time for both. Whenever validation
    accuracy exceeds the best seen so far, a copy of the weights is kept;
    that copy is loaded back into ``model`` before returning.

    Relies on the notebook-level globals ``dataloaders`` and
    ``dataset_sizes`` (both indexed by 'train' and 'val') and ``device``.

    Args:
        model: Network to train; it is updated in place.
        criterion: Loss, called as ``criterion(outputs, class_indices)``.
        optimizer: Optimizer over the parameters of ``model``.
        scheduler: Learning-rate scheduler, stepped once per epoch after
            the training pass.
        num_epochs: Number of epochs to run.

    Returns:
        The same ``model`` object, holding the best-validation weights.
    """
    since = time.time()
    # Snapshot of the best weights so far (initially the starting weights).
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    # Reference point for the per-phase timing printout.
    end = time.time()

    print(model)
    print()

    for epoch in range(num_epochs):
        print('Epoch:{}/{}'.format(epoch, num_epochs - 1), end="")

        # Every epoch: one training pass, then one validation pass.
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            model.train(is_training)

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.float().to(device)

                if is_training:
                    optimizer.zero_grad()

                # Track gradients only while training.
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    # One-hot labels -> class indices for the loss.
                    _, classnums = torch.max(labels, 1)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, classnums)
                    if is_training:
                        loss.backward()
                        optimizer.step()

                # loss is a batch mean; weight it by the batch size so the
                # epoch figure is a per-sample average.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == classnums).item()

            if is_training:
                # Step the schedule only after this epoch's optimizer steps.
                # Stepping before them (as PyTorch >= 1.1 warns) skips the
                # first learning-rate value of the schedule.
                scheduler.step()

            # Per-sample averages over the whole split.
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print('\t{} Loss: {:.4f} Acc: {:.4f} Time: {:.4f}'.format(phase, epoch_loss, epoch_acc, time.time()-end), end="")

            # Keep a copy of the weights whenever validation accuracy improves.
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            end = time.time()

        print()

    time_elapsed = time.time() - since
    print()
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val acc: {:.4f}'.format(best_acc))

    # Restore the best weights before handing the model back.
    model.load_state_dict(best_model_wts)
    return model

In [9]:
def print_test_accuracy(model, criterion, optimizer, phase):
    """Evaluate ``model`` on one split and print its mean loss and accuracy.

    Relies on the notebook-level globals ``dataloaders`` and
    ``dataset_sizes`` (both indexed by ``phase``) and ``device``.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        criterion: Loss, called as ``criterion(outputs, class_indices)``.
        optimizer: Unused; kept so existing calls keep working.
        phase: Name of the split to evaluate, e.g. 'test'.
    """
    running_loss = 0.0
    running_corrects = 0
    model.eval()

    # Evaluation never needs gradients, whichever split `phase` names.
    with torch.no_grad():
        for inputs, labels in dataloaders[phase]:
            inputs = inputs.to(device)
            labels = labels.float().to(device)

            outputs = model(inputs)
            # One-hot labels -> class indices for the loss.
            _, classnums = torch.max(labels, 1)
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, classnums)

            # loss is a batch mean; weight it by the batch size so the
            # final figure is a per-sample average.
            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == classnums).item()

    # Per-sample averages over the whole split.
    epoch_loss = running_loss / dataset_sizes[phase]
    epoch_acc = running_corrects / dataset_sizes[phase]
    print('On Test:\tLoss: {:.4f} Acc: {:.4f}'.format(epoch_loss, epoch_acc))

In [11]:
# Hyperparameters for this run.
epochs = 100
batch_size = 64
lr = 0.1
momentum = 0.9
outdir = "."

# Define the loss function (cross-entropy), the parameter optimizer
# (SGD with momentum) and the learning-rate schedule (StepLR).
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=model.parameters(), lr=lr, momentum=momentum)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, gamma=0.7, step_size=10)

# Run the actual training.
model = train_model(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=exp_lr_scheduler,
    num_epochs=epochs,
)
# Measure loss and accuracy on the test data.
print_test_accuracy(model, criterion, optimizer, phase='test')

Net(
  (fc1): Linear(in_features=9, out_features=9, bias=True)
  (fc2): Linear(in_features=9, out_features=4, bias=True)
)

Epoch:0/99	train Loss: 0.4910 Acc: 0.8198 Time: 0.0050	val Loss: 0.6161 Acc: 0.7067 Time: 0.0010
Epoch:1/99	train Loss: 0.4706 Acc: 0.8378 Time: 0.0050	val Loss: 0.5978 Acc: 0.7333 Time: 0.0010
Epoch:2/99	train Loss: 0.4449 Acc: 0.8739 Time: 0.0040	val Loss: 0.5845 Acc: 0.7333 Time: 0.0020
Epoch:3/99	train Loss: 0.4182 Acc: 0.8694 Time: 0.0040	val Loss: 0.5646 Acc: 0.7467 Time: 0.0010
Epoch:4/99	train Loss: 0.3860 Acc: 0.9009 Time: 0.0040	val Loss: 0.5613 Acc: 0.7067 Time: 0.0010
Epoch:5/99	train Loss: 0.3600 Acc: 0.9144 Time: 0.0040	val Loss: 0.5349 Acc: 0.7600 Time: 0.0020
Epoch:6/99	train Loss: 0.3336 Acc: 0.9189 Time: 0.0040	val Loss: 0.5130 Acc: 0.7467 Time: 0.0010
Epoch:7/99	train Loss: 0.3097 Acc: 0.9234 Time: 0.0040	val Loss: 0.5012 Acc: 0.7600 Time: 0.0010
Epoch:8/99	train Loss: 0.2875 Acc: 0.9099 Time: 0.0040	val Loss: 0.5061 Acc: 0.7467 Time: 0.0010
Epo