# Dataset Configuration

In [2]:
# Extract Texture features

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import pickle
import time
import copy

from collections import Counter

from sklearn.preprocessing import StandardScaler

texture_features = ['../result/left_mask.csv', '../result/right_mask.csv']
label_file = '../data/label_dict.pickle'
clinical_file = '../data/clinical_data.pickle'

with open(label_file, 'rb') as file: label_dict = pickle.load(file)
with open(clinical_file, 'rb') as file: clinical_dict = pickle.load(file)
    
feature_names = np.array(list(pd.read_csv(texture_features[0]).columns)[1:])

whole_feature = []
whole_label = []
whole_subjects = []

for texture_feature in texture_features:
#     print(texture_feature)
    
    for key, value in pd.read_csv(texture_feature).iterrows():

        subject = '_'.join(list(value)[0].split('_')[:-1])
        label = label_dict[subject]
        clinic_data = clinical_dict[subject] # is male, is female, age

#         print(key, list(value)[0], len(list(value[1:])), label, clinic_data)
        
        whole_feature.append(list(value[1:]) + clinic_data)
        whole_label.append(label)
        whole_subjects.append(list(value)[0])

#         break

whole_feature = np.array(whole_feature)
whole_label = np.array(whole_label)
        
print('Whole feature :',np.array(whole_feature).shape) # (270 = 135 + 135, 854 = 851 + 3)
print('Whole labels :', np.array(whole_label).shape)
print('Whole subjects :', len(whole_subjects))
print('Number by labels :',Counter(whole_label))

Whole feature : (270, 854)
Whole labels : (270,)
Whole subjects : 270
Number by labels : Counter({1: 92, 2: 90, 0: 88})


In [110]:
class ad_dataset(object):
    def __init__(self, binary = False, phase='train'):
        
        self.phase = phase
        
        self.whole_feature, self.whole_label, self.subjects = self.read_data(binary=binary)
        
        if self.phase == 'train':
            self.label_file = label_file
            self.label_dict = self.read_pickle(self.label_file)
            self.image_ids = list(self.label_dict.keys())
    
    def __len__(self):
        return len(self.whole_feature)
    
    def __getitem__(self, index):
        
        return {'data' : self.ToTensor(self.whole_feature[index]),
                'label' : int(self.whole_label[index]),
                'name' : self.subjects[index]}
    
    def ToTensor(self, data):
        return torch.from_numpy(data).float()
    
    def read_pickle(self, file):
        with open(file, 'rb') as f:
            loaded_file = pickle.load(f)
            
        return loaded_file
    
    def read_data(self, binary = False):
        texture_features = ['../result/left_mask.csv', '../result/right_mask.csv']
        label_file = '../data/label_dict.pickle'
        clinical_file = '../data/clinical_data.pickle'
        
        label_dict = self.read_pickle(label_file)
        clinical_dict = self.read_pickle(clinical_file)

        feature_names = np.array(list(pd.read_csv(texture_features[0]).columns)[1:])

        whole_feature = []
        whole_label = []
        whole_subjects = []

        for texture_feature in texture_features:
        #     print(texture_feature)

            for key, value in pd.read_csv(texture_feature).iterrows():

                subject = '_'.join(list(value)[0].split('_')[:-1])
                label = label_dict[subject]
                clinic_data = clinical_dict[subject] # is male, is female, age
                
                if binary:
                    if label == 1:
                        continue
                    else:
                        if label == 2:
                            label = 1 
                        whole_feature.append(list(value[1:]) + clinic_data)
                        whole_label.append(label)
                        whole_subjects.append(list(value)[0])
                        
                else:
                    whole_feature.append(list(value[1:]) + clinic_data)
                    whole_label.append(label)
                    whole_subjects.append(list(value)[0])

        whole_feature = np.array(whole_feature)
        whole_label = np.array(whole_label)
        
        scaler = StandardScaler()
        scaler.fit(whole_feature)
        whole_feature = scaler.transform(whole_feature)

        print('Whole feature :',np.array(whole_feature).shape) # (270 = 135 + 135, 854 = 851 + 3)
        print('Whole labels :', np.array(whole_label).shape)
        print('Whole subjects :', len(whole_subjects))
        print('Number by labels :',Counter(whole_label))

        return whole_feature, whole_label, whole_subjects

In [111]:
train_dataset = ad_dataset(binary=True)
# print(dataset)

train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=2)

for batch in train_dataloader:
    
    x = batch['data']
    y = batch['label']
    name = batch['name']
    
    print(x.size(), y, name)
    
    break

Whole feature : (178, 854)
Whole labels : (178,)
Whole subjects : 178
Number by labels : Counter({1: 90, 0: 88})
torch.Size([1, 854]) tensor([1]) ['082_S_1079_R']


## Main Training

In [181]:
# Training Function
def binarize(label):
    x = np.zeros(2, dtype=np.float)
    x[label] = 1
    return torch.Tensor(x)

def train_model(model, criterion, optimizer, scheduler, num_epochs=1):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for batch in dataloaders[phase]:
                
                inputs = batch['data']
                labels = batch['label']

                inputs = torch.squeeze(inputs) # 1 * 854 => 854

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 0)

                    
                    outputs = torch.unsqueeze(outputs, 0)
                    
                    labels = binarize(labels)
                    labels = torch.unsqueeze(labels, 0)
                    
                    loss = criterion(outputs, labels)


                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                if preds.item() == int(batch['label']):
                    running_corrects += 1

            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()
        

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

In [197]:
from torch.utils.data import DataLoader, random_split

import torch
from torch.nn import Linear, Softmax, ReLU, Sigmoid
from torch.nn.modules.loss import BCELoss, CrossEntropyLoss
import torch.nn.functional as F
from torch.optim import Adam, SGD, lr_scheduler

ad_model = torch.nn.Sequential(
    
    Linear(in_features=854, out_features=256, bias=False),
    ReLU(),
    
    Linear(in_features=256, out_features=128, bias=False),
    ReLU(),
    
    Linear(in_features=128, out_features=2, bias=False),
    Sigmoid()
#     Softmax()
)

In [198]:
binary = True

full_dataset = ad_dataset(binary=binary)

train_ratio = 0.8
learning_rate = 0.1

train_size = int(train_ratio * len(full_dataset))
test_size = len(full_dataset) - train_size
train_dataset, val_dataset = random_split(full_dataset, [train_size, test_size])

train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=2)
val_dataloader = DataLoader(val_dataset, batch_size=1, shuffle=True, num_workers=2)

model = ad_model

if not binary:
    criterion = CrossEntropyLoss()
else:
    criterion = BCELoss()
    
# optimizer = Adam(model.parameters())
optimizer = SGD(model.parameters(), lr=learning_rate)
scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

dataloaders = dict(train=train_dataloader, val=val_dataloader)
dataset_sizes = dict(train=len(train_dataloader), val=len(val_dataloader))

device = torch.device('cpu')

best_model = train_model(model, criterion, optimizer, scheduler, num_epochs=150)

Whole feature : (178, 854)
Whole labels : (178,)
Whole subjects : 178
Number by labels : Counter({1: 90, 0: 88})
Epoch 1/150
----------
train Loss: 488.6038 Acc: 0.7042
val Loss: 417.6018 Acc: 0.8333

Epoch 2/150
----------
train Loss: 435.2228 Acc: 0.7606
val Loss: 515.6987 Acc: 0.8056

Epoch 3/150
----------
train Loss: 291.5513 Acc: 0.8380
val Loss: 261.6161 Acc: 0.7778

Epoch 4/150
----------
train Loss: 661.2816 Acc: 0.8310
val Loss: 496.6759 Acc: 0.7222

Epoch 5/150
----------
train Loss: 588.9488 Acc: 0.8099
val Loss: 1105.7948 Acc: 0.7500

Epoch 6/150
----------
train Loss: 1055.7388 Acc: 0.7606
val Loss: 1123.8763 Acc: 0.5000

Epoch 7/150
----------
train Loss: 2179.3049 Acc: 0.6268
val Loss: 1747.7667 Acc: 0.3333

Epoch 8/150
----------
train Loss: 2524.4809 Acc: 0.5634
val Loss: 1115.8606 Acc: 0.2778

Epoch 9/150
----------
train Loss: 1482.9801 Acc: 0.5211
val Loss: 1148.8647 Acc: 0.3333

Epoch 10/150
----------
train Loss: 1279.8263 Acc: 0.5704
val Loss: 1147.5940 Acc: 0.4

KeyboardInterrupt: 

In [None]:
test_dataset = osteoporosis_tf_dataset(data_file='../data/mlp_data/scaled_test_data.pickle', label_file=None, phase='test')

for sample in test_dataset:
#     print(sample['data'].size())
    
    input_ = sample['data']
    outputs = best_model(input_)
#     _, preds = torch.max(outputs, 1)
    
    print(int(outputs.data > 0.5) == 1 )
    
#     break