In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix

In [2]:
def to_list(df_dict, heads):
    # {key: [], ...}
    out = {}
    for _key in list(heads):
        out[_key] = [df_dict[_key][key] for key in df_dict[_key].keys()]
    return out

def encode_input(data_range, data):
    return data_range.index(data)

def preprocess_data(df_list: dict, need_encoded: list, normalize=True) -> np.ndarray:
    output = []
    data_length = len(df_list['id'])
    data_ranges = {k : list(set(df_list[k])) for k in need_encoded}
    for i in range(data_length):
        item = []
        for key in list(df_list.keys())[1:-1]:
            if key in need_encoded:
                item.append(encode_input(data_ranges[key], df_list[key][i]))
            else:
                item.append(df_list[key][i])
        output.append(item)
    inputs_arr = np.array(output)
    targets_arr = np.array(df_list['stroke'])
    if normalize:
        _range = np.max(inputs_arr, axis=0) - np.min(inputs_arr, axis=0)
        inputs_arr = (inputs_arr-np.min(inputs_arr, axis=0)) / _range
    return inputs_arr, targets_arr

def prepare_data(inputs, targets, seed=1001):
    positive_mask = targets == 1
    negative_mask = targets == 0
    n_minimum = min(np.sum(positive_mask), np.sum(negative_mask))
    positive_inputs = inputs[positive_mask][0:n_minimum, :]
    positive_targets = targets[positive_mask][0:n_minimum]
    negative_inputs = inputs[negative_mask][0:n_minimum, :]
    negative_targets = targets[negative_mask][0:n_minimum]
    inputs = np.concatenate([positive_inputs, negative_inputs]).tolist()
    targets = np.concatenate([positive_targets, negative_targets]).tolist()
    np.random.seed(seed)
    np.random.shuffle(inputs)
    np.random.seed(seed)
    np.random.shuffle(targets)
    return np.array(inputs), np.array(targets)

def metrics(y_pred, y_true):
    _confusion_matrix = confusion_matrix(y_pred, y_true)
    tp = _confusion_matrix[0,0]
    fn = _confusion_matrix[1,0]
    fp = _confusion_matrix[0,1]
    tn = _confusion_matrix[1,1]
    # metrics
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    fscore = 2*tp/(2*tp + fp + fn)
    accuracy = (tp+tn)/(tp+tn+fp+fn)
    miss_rate = fn/(tn+tp)
    fall_out_rate = fp/(fp+tn)
    # return 
    return [precision, recall, fscore, accuracy, miss_rate, fall_out_rate]

In [3]:
path = './data/train_2v.csv'
df = pd.read_csv(path)
df_clear = df.dropna(axis=0)
df_dict = df_clear.to_dict()
heads = list(df_dict.keys())
print(heads)
df_list = to_list(df_dict, heads)

['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status', 'stroke']


In [4]:
gender_range = list(set(df_list['gender']))
married_range = list(set(df_list['ever_married']))
work_range = list(set(df_list['work_type']))
residence_range = list(set(df_list['Residence_type']))
smoking_range = list(set(df_list['smoking_status']))
need_encoded = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
print(gender_range)
print(married_range)
print(work_range)
print(residence_range)
print(smoking_range)

['Male', 'Other', 'Female']
['No', 'Yes']
['Govt_job', 'Self-employed', 'Never_worked', 'Private', 'children']
['Rural', 'Urban']
['formerly smoked', 'never smoked', 'smokes']


In [None]:
# distribution 
head = 'age'
plt.hist(df_list[head])

In [None]:
inputs, targets = preprocess_data(df_list, need_encoded)
inputs, targets = prepare_data(inputs, targets)
n_samples = inputs.shape[0]
tr_inputs = inputs[0:int(n_samples*0.7), :]
tr_targets = targets[0:int(n_samples*0.7)]
te_inputs = inputs[int(n_samples*0.7):, :]
te_targets = targets[int(n_samples*0.7):]

clf = MLPClassifier(learning_rate_init=1e-2,
    solver='sgd',activation='relu',max_iter=500,
    alpha=5e-4, hidden_layer_sizes=(32, 64, 32),random_state=1,
    verbose=True)
clf.fit(tr_inputs, tr_targets)
# clf.score(tr_inputs, tr_targets)
clf.score(te_inputs, te_targets)

In [7]:
n_repeat = 100
_metrics = []

for i in range(n_repeat):
    # generate data
    seed = random.randint(1, 1000)
    inputs, targets = preprocess_data(df_list, need_encoded)
    inputs, targets = prepare_data(inputs, targets, seed)
    n_samples = inputs.shape[0]
    tr_inputs = inputs[0:int(n_samples*0.7), :]
    tr_targets = targets[0:int(n_samples*0.7)]
    te_inputs = inputs[int(n_samples*0.7):, :]
    te_targets = targets[int(n_samples*0.7):]
    # train clf
    clf = MLPClassifier(learning_rate_init=1e-2,
        solver='sgd',activation='relu',max_iter=500,
        alpha=5e-4, hidden_layer_sizes=(16, 32, 16),random_state=1)
    clf.fit(tr_inputs, tr_targets)
    # evaluate clf
    te_pred = clf.predict(te_inputs)
    run_metrics = metrics(te_pred, te_targets)
    print(i+1, "complete", run_metrics)
    _metrics.append(run_metrics)

_metrics = np.array(_metrics)
_metrics = np.mean(_metrics, axis=0)
print(_metrics)




1 complete [0.7701863354037267, 0.7045454545454546, 0.7359050445103857, 0.729483282674772, 0.21666666666666667, 0.24183006535947713]




2 complete [0.75, 0.7222222222222222, 0.7358490566037735, 0.7446808510638298, 0.1836734693877551, 0.23353293413173654]
3 complete [0.8037974683544303, 0.7055555555555556, 0.7514792899408284, 0.7446808510638298, 0.2163265306122449, 0.2080536912751678]




4 complete [0.7592592592592593, 0.75, 0.754601226993865, 0.756838905775076, 0.1646586345381526, 0.23636363636363636]
5 complete [0.75, 0.7354838709677419, 0.742671009771987, 0.7598784194528876, 0.164, 0.21839080459770116]




6 complete [0.7960526315789473, 0.7515527950310559, 0.7731629392971247, 0.78419452887538, 0.15503875968992248, 0.18452380952380953]




7 complete [0.7469879518072289, 0.7337278106508875, 0.7402985074626866, 0.7355623100303952, 0.1859504132231405, 0.2625]




8 complete [0.8120805369127517, 0.7469135802469136, 0.7781350482315113, 0.790273556231003, 0.1576923076923077, 0.16766467065868262]




9 complete [0.7816901408450704, 0.69375, 0.7350993377483444, 0.756838905775076, 0.19678714859437751, 0.1834319526627219]




10 complete [0.8137931034482758, 0.7151515151515152, 0.7612903225806451, 0.7750759878419453, 0.1843137254901961, 0.16463414634146342]




11 complete [0.7712418300653595, 0.7329192546583851, 0.7515923566878981, 0.7629179331306991, 0.17131474103585656, 0.20833333333333334]




12 complete [0.7529411764705882, 0.7852760736196319, 0.7687687687687688, 0.7659574468085106, 0.1388888888888889, 0.25301204819277107]
