In [1]:
import pandas as pd
import numpy as np
import random
from sklearn import linear_model
from sklearn.metrics import confusion_matrix

In [2]:
def to_list(df_dict, heads):
    # {key: [], ...}
    out = {}
    for _key in list(heads):
        out[_key] = [df_dict[_key][key] for key in df_dict[_key].keys()]
    return out

def encode_input(data_range, data):
    return data_range.index(data)

def preprocess_data(df_list: dict, need_encoded: list, normalize=True) -> np.ndarray:
    output = []
    data_length = len(df_list['id'])
    data_ranges = {k : list(set(df_list[k])) for k in need_encoded}
    for i in range(data_length):
        item = []
        for key in list(df_list.keys())[1:-1]:
            if key in need_encoded:
                item.append(encode_input(data_ranges[key], df_list[key][i]))
            else:
                item.append(df_list[key][i])
        output.append(item)
    inputs_arr = np.array(output)
    targets_arr = np.array(df_list['stroke'])
    if normalize:
        _range = np.max(inputs_arr, axis=0) - np.min(inputs_arr, axis=0)
        inputs_arr = (inputs_arr-np.min(inputs_arr, axis=0)) / _range
    return inputs_arr, targets_arr

def prepare_data(inputs, targets, seed=1001):
    positive_mask = targets == 1
    negative_mask = targets == 0
    n_minimum = min(np.sum(positive_mask), np.sum(negative_mask))
    positive_inputs = inputs[positive_mask][0:n_minimum, :]
    positive_targets = targets[positive_mask][0:n_minimum]
    negative_inputs = inputs[negative_mask][0:n_minimum, :]
    negative_targets = targets[negative_mask][0:n_minimum]
    inputs = np.concatenate([positive_inputs, negative_inputs]).tolist()
    targets = np.concatenate([positive_targets, negative_targets]).tolist()
    np.random.seed(seed)
    np.random.shuffle(inputs)
    np.random.seed(seed)
    np.random.shuffle(targets)
    return np.array(inputs), np.array(targets)

def metrics(y_pred, y_true):
    _confusion_matrix = confusion_matrix(y_pred, y_true)
    tp = _confusion_matrix[0,0]
    fn = _confusion_matrix[1,0]
    fp = _confusion_matrix[0,1]
    tn = _confusion_matrix[1,1]
    # metrics
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    fscore = 2*tp/(2*tp + fp + fn)
    accuracy = (tp+tn)/(tp+tn+fp+fn)
    miss_rate = fn/(tn+tp)
    fall_out_rate = fp/(fp+tn)
    # return 
    return [precision, recall, fscore, accuracy, miss_rate, fall_out_rate]

In [3]:
path = './dataset/train_2v.csv'
df = pd.read_csv(path)
df_clear = df.dropna(axis=0)
df_dict = df_clear.to_dict()
heads = list(df_dict.keys())
df_list = to_list(df_dict, heads)
need_encoded = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

In [4]:
# ElasticNet
n_repeat = 100
_metrics = []

for i in range(n_repeat):
    # generate data
    seed = random.randint(1, 1000)
    inputs, targets = preprocess_data(df_list, need_encoded)
    inputs, targets = prepare_data(inputs, targets, seed)
    n_samples = inputs.shape[0]
    tr_inputs = inputs[0:int(n_samples*0.7), :]
    tr_targets = targets[0:int(n_samples*0.7)]
    te_inputs = inputs[int(n_samples*0.7):, :]
    te_targets = targets[int(n_samples*0.7):]
    # train clf
    elastic_model = linear_model.ElasticNetCV()
    elastic_model.fit(tr_inputs, tr_targets)
    # evaluate clf
    te_pred = elastic_model.predict(te_inputs)
    te_pred[te_pred>0.5] = 1
    te_pred[te_pred<0.5] = 0
    run_metrics = metrics(te_pred, te_targets)
    print(i+1, "complete", run_metrics)
    _metrics.append(run_metrics)

run_metrics = np.array(_metrics)
mean_metrics = np.mean(run_metrics, axis=0)


1 complete [0.8175675675675675, 0.6875, 0.7469135802469136, 0.7507598784194529, 0.22267206477732793, 0.17647058823529413]
2 complete [0.7761194029850746, 0.6459627329192547, 0.7050847457627119, 0.7355623100303952, 0.23553719008264462, 0.17857142857142858]
3 complete [0.8040540540540541, 0.695906432748538, 0.7460815047021944, 0.7537993920972644, 0.20967741935483872, 0.18354430379746836]
4 complete [0.8141025641025641, 0.7650602409638554, 0.7888198757763976, 0.7933130699088146, 0.14942528735632185, 0.17791411042944785]
5 complete [0.7610062893081762, 0.7289156626506024, 0.7446153846153846, 0.7477203647416414, 0.18292682926829268, 0.2331288343558282]
6 complete [0.7687074829931972, 0.70625, 0.7361563517915309, 0.7537993920972644, 0.18951612903225806, 0.20118343195266272]
7 complete [0.7866666666666666, 0.7329192546583851, 0.7588424437299035, 0.7720364741641338, 0.16929133858267717, 0.19047619047619047]
8 complete [0.8205128205128205, 0.7398843930635838, 0.7781155015197568, 0.7781155015197

In [5]:
print("\n\n\n========= MEAN OF 100 Experiments ========\n")
# 100 experiments => run_metrics
for i, key in enumerate(["precision", "recall", "fscore", "accuracy", "miss_rate", "fall_out_rate"]): 
    print(key, mean_metrics[i])





precision 0.7864974909892654
recall 0.7131396703769484
fscore 0.7470104038267058
accuracy 0.7595744680851065
miss_rate 0.18950122069164682
fall_out_rate 0.19300734657697366
