In [1]:
import pandas as pd
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix

In [2]:
def to_list(df_dict, heads):
    # {key: [], ...}
    out = {}
    for _key in list(heads):
        out[_key] = [df_dict[_key][key] for key in df_dict[_key].keys()]
    return out

def encode_input(data_range, data):
    return data_range.index(data)

def preprocess_data(df_list: dict, need_encoded: list, normalize=True) -> np.ndarray:
    output = []
    data_length = len(df_list['id'])
    data_ranges = {k : list(set(df_list[k])) for k in need_encoded}
    for i in range(data_length):
        item = []
        for key in list(df_list.keys())[1:-1]:
            if key in need_encoded:
                item.append(encode_input(data_ranges[key], df_list[key][i]))
            else:
                item.append(df_list[key][i])
        output.append(item)
    inputs_arr = np.array(output)
    targets_arr = np.array(df_list['stroke'])
    if normalize:
        _range = np.max(inputs_arr, axis=0) - np.min(inputs_arr, axis=0)
        inputs_arr = (inputs_arr-np.min(inputs_arr, axis=0)) / _range
    return inputs_arr, targets_arr

def prepare_data(inputs, targets, seed=1001):
    positive_mask = targets == 1
    negative_mask = targets == 0
    n_minimum = min(np.sum(positive_mask), np.sum(negative_mask))
    positive_inputs = inputs[positive_mask][0:n_minimum, :]
    positive_targets = targets[positive_mask][0:n_minimum]
    negative_inputs = inputs[negative_mask][0:n_minimum, :]
    negative_targets = targets[negative_mask][0:n_minimum]
    inputs = np.concatenate([positive_inputs, negative_inputs]).tolist()
    targets = np.concatenate([positive_targets, negative_targets]).tolist()
    np.random.seed(seed)
    np.random.shuffle(inputs)
    np.random.seed(seed)
    np.random.shuffle(targets)
    return np.array(inputs), np.array(targets)

def metrics(y_pred, y_true):
    _confusion_matrix = confusion_matrix(y_pred, y_true)
    tp = _confusion_matrix[0,0]
    fn = _confusion_matrix[1,0]
    fp = _confusion_matrix[0,1]
    tn = _confusion_matrix[1,1]
    # metrics
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    fscore = 2*tp/(2*tp + fp + fn)
    accuracy = (tp+tn)/(tp+tn+fp+fn)
    miss_rate = fn/(tn+tp)
    fall_out_rate = fp/(fp+tn)
    # return 
    return [precision, recall, fscore, accuracy, miss_rate, fall_out_rate]

In [3]:
path = './dataset/train_2v.csv'
df = pd.read_csv(path)
df_clear = df.dropna(axis=0)
df_dict = df_clear.to_dict()
heads = list(df_dict.keys())
df_list = to_list(df_dict, heads)
need_encoded = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

In [4]:
# CNN (resource limitation, only repeat 1)
epochs = 100
batchsize = 16

inputs, targets = preprocess_data(df_list, need_encoded)
inputs, targets = prepare_data(inputs, targets)
n_samples = inputs.shape[0]
tr_inputs = inputs[0:int(n_samples*0.7), :].reshape((-1, 1, 2, 5))
tr_targets = targets[0:int(n_samples*0.7)].reshape((-1, 1))
te_inputs = inputs[int(n_samples*0.7):, :].reshape((-1, 1, 2, 5))
te_targets = targets[int(n_samples*0.7):].reshape((-1, 1))

class StrokePred(Dataset):
    def __init__(self, inputs, targets) -> None:
        super().__init__()
        self.inputs = inputs
        self.targets = targets
        
    def __len__(self):
        return self.inputs.shape[0]

    def __getitem__(self, index):
        _input = torch.from_numpy(self.inputs[index]).type(torch.float32)
        _target = torch.from_numpy(self.targets[index]).type(torch.float32)
        return _input, _target

class StrokePredModel(nn.Module):
    def __init__(self):
        super(StrokePredModel, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=8, kernel_size=2)

        self.linear1 = nn.Linear(32, 16)
        self.linear2 = nn.Linear(16, 1)

    
    def forward(self, x):
        out = F.relu(self.conv1(x))
        out = F.relu(self.conv2(out))
        out = out.view((-1, 32))
        out = F.relu(self.linear1(out))
        return F.sigmoid(self.linear2(out))

train_set = StrokePred(tr_inputs, tr_targets)
val_set = StrokePred(te_inputs, te_targets)
train_loader = DataLoader(train_set, batch_size=batchsize)
val_loader = DataLoader(train_set, batch_size=1)

model = StrokePredModel()
citeration = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

for i in range(epochs):
    model.train()
    loss_ = 0
    acc_ = 0
    val_acc = 0
    for j, (input_, target_) in enumerate(train_loader):
        optimizer.zero_grad()
        out = model(input_)
        loss = citeration(out, target_)

        loss.backward()
        optimizer.step()

        pred = out.detach().numpy()
        pred_ = np.zeros_like(pred)
        pred_[pred>0.5] = 1
        pred_ = pred_.astype('float')
        acc = np.sum(pred_ == target_.numpy()) / batchsize
        loss_ += loss.item()
        acc_ += acc

    model.eval()
    for j, (input_, target_) in enumerate(val_loader):
        out = model(input_)
        pred = out.detach().numpy()
        pred_ = np.zeros_like(pred)
        pred_[pred>0.5] = 1
        pred_ = pred_.astype('float')
        acc = np.sum(pred_ == target_.numpy())
        val_acc += acc

    print("epochs: {}, loss: {}, val_acc: {}".format(
        i+1,
        loss_ / len(train_loader), 
        val_acc / len(val_loader)))

model.eval()
preds = []
labels = []
for j, (input_, target_) in enumerate(val_loader):
    out = model(input_)
    pred = out.detach().numpy()
    pred_ = np.zeros_like(pred)
    pred_[pred>0.5] = 1
    pred_ = pred_.astype('float')
    preds.append(pred_[0][0])
    labels.append(target_.numpy()[0][0])

metrics = metrics(preds, labels)
# [precision, recall, fscore, accuracy, miss_rate, fall_out_rate]
print(metrics)



epochs: 1, loss: 0.696440060933431, val_acc: 0.49934810951760106
epochs: 2, loss: 0.6954947772125403, val_acc: 0.49934810951760106
epochs: 3, loss: 0.6944990853468577, val_acc: 0.49934810951760106
epochs: 4, loss: 0.6933767460286617, val_acc: 0.49934810951760106
epochs: 5, loss: 0.6921489536762238, val_acc: 0.49934810951760106
epochs: 6, loss: 0.6907782976826032, val_acc: 0.49934810951760106
epochs: 7, loss: 0.6892017275094986, val_acc: 0.49934810951760106
epochs: 8, loss: 0.6874086260795593, val_acc: 0.49934810951760106
epochs: 9, loss: 0.6854263010124365, val_acc: 0.49934810951760106
epochs: 10, loss: 0.6832261867821217, val_acc: 0.5045632333767927
epochs: 11, loss: 0.6807494983077049, val_acc: 0.5619295958279009
epochs: 12, loss: 0.6778507133324941, val_acc: 0.6310299869621904
epochs: 13, loss: 0.6745705232024193, val_acc: 0.6701434159061278
epochs: 14, loss: 0.6708908466001352, val_acc: 0.711864406779661
epochs: 15, loss: 0.6667930086453756, val_acc: 0.7262059973924381
epochs: 16, 