In [22]:
import pandas as pd
import numpy as np
import random
from sklearn import linear_model
from sklearn.metrics import confusion_matrix

In [23]:
def to_list(df_dict, heads):
    # {key: [], ...}
    out = {}
    for _key in list(heads):
        out[_key] = [df_dict[_key][key] for key in df_dict[_key].keys()]
    return out

def encode_input(data_range, data):
    return data_range.index(data)

def preprocess_data(df_list: dict, need_encoded: list, normalize=True) -> np.ndarray:
    output = []
    data_length = len(df_list['id'])
    data_ranges = {k : list(set(df_list[k])) for k in need_encoded}
    for i in range(data_length):
        item = []
        for key in list(df_list.keys())[1:-1]:
            if key in need_encoded:
                item.append(encode_input(data_ranges[key], df_list[key][i]))
            else:
                item.append(df_list[key][i])
        output.append(item)
    inputs_arr = np.array(output)
    targets_arr = np.array(df_list['stroke'])
    if normalize:
        _range = np.max(inputs_arr, axis=0) - np.min(inputs_arr, axis=0)
        inputs_arr = (inputs_arr-np.min(inputs_arr, axis=0)) / _range
    return inputs_arr, targets_arr

def prepare_data(inputs, targets, seed=1001):
    positive_mask = targets == 1
    negative_mask = targets == 0
    n_minimum = min(np.sum(positive_mask), np.sum(negative_mask))
    positive_inputs = inputs[positive_mask][0:n_minimum, :]
    positive_targets = targets[positive_mask][0:n_minimum]
    negative_inputs = inputs[negative_mask][0:n_minimum, :]
    negative_targets = targets[negative_mask][0:n_minimum]
    inputs = np.concatenate([positive_inputs, negative_inputs]).tolist()
    targets = np.concatenate([positive_targets, negative_targets]).tolist()
    np.random.seed(seed)
    np.random.shuffle(inputs)
    np.random.seed(seed)
    np.random.shuffle(targets)
    return np.array(inputs), np.array(targets)

def metrics(y_pred, y_true):
    _confusion_matrix = confusion_matrix(y_pred, y_true)
    tp = _confusion_matrix[0,0]
    fn = _confusion_matrix[1,0]
    fp = _confusion_matrix[0,1]
    tn = _confusion_matrix[1,1]
    # metrics
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    fscore = 2*tp/(2*tp + fp + fn)
    accuracy = (tp+tn)/(tp+tn+fp+fn)
    miss_rate = fn/(tn+tp)
    fall_out_rate = fp/(fp+tn)
    # return 
    return [precision, recall, fscore, accuracy, miss_rate, fall_out_rate]

In [24]:
path = './dataset/train_2v.csv'
df = pd.read_csv(path)
df_clear = df.dropna(axis=0)
df_dict = df_clear.to_dict()
heads = list(df_dict.keys())
df_list = to_list(df_dict, heads)
need_encoded = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

In [25]:
# Lasso
n_repeat = 100
_metrics = []

for i in range(n_repeat):
    # generate data
    seed = random.randint(1, 1000)
    inputs, targets = preprocess_data(df_list, need_encoded)
    inputs, targets = prepare_data(inputs, targets, seed)
    n_samples = inputs.shape[0]
    tr_inputs = inputs[0:int(n_samples*0.7), :]
    tr_targets = targets[0:int(n_samples*0.7)]
    te_inputs = inputs[int(n_samples*0.7):, :]
    te_targets = targets[int(n_samples*0.7):]
    # train clf
    lasso_model = linear_model.LassoCV()
    lasso_model.fit(tr_inputs, tr_targets)
    # evaluate clf
    te_pred = lasso_model.predict(te_inputs)
    te_pred[te_pred>0.5] = 1
    te_pred[te_pred<0.5] = 0
    run_metrics = metrics(te_pred, te_targets)
    print(i+1, "complete", run_metrics)
    _metrics.append(run_metrics)

run_metrics = np.array(_metrics)
mean_metrics = np.mean(run_metrics, axis=0)



1 complete [0.7586206896551724, 0.6586826347305389, 0.7051282051282052, 0.7203647416413373, 0.24050632911392406, 0.21604938271604937]
2 complete [0.782051282051282, 0.7134502923976608, 0.746177370030581, 0.7477203647416414, 0.1991869918699187, 0.21518987341772153]
3 complete [0.7697368421052632, 0.73125, 0.75, 0.7629179331306991, 0.17131474103585656, 0.20710059171597633]
4 complete [0.8, 0.7100591715976331, 0.7523510971786834, 0.7598784194528876, 0.196, 0.1875]
5 complete [0.7571428571428571, 0.6625, 0.7066666666666667, 0.7325227963525835, 0.22406639004149378, 0.20118343195266272]
6 complete [0.7533333333333333, 0.6932515337423313, 0.7220447284345048, 0.7355623100303952, 0.2066115702479339, 0.22289156626506024]
7 complete [0.7243589743589743, 0.6932515337423313, 0.7084639498432602, 0.7173252279635258, 0.211864406779661, 0.25903614457831325]
8 complete [0.7368421052631579, 0.7320261437908496, 0.7344262295081967, 0.7537993920972644, 0.16532258064516128, 0.22727272727272727]
9 complete [0

In [27]:
print("\n\n\n========= MEAN OF 100 Experiments ========\n")
# 100 experiments => run_metrics
for i, key in enumerate(["precision", "recall", "fscore", "accuracy", "miss_rate", "fall_out_rate"]): 
    print(key, mean_metrics[i])







precision 0.7823766367859758
recall 0.7173082447332974
fscore 0.7477451750393221
accuracy 0.7587537993920974
miss_rate 0.18680683950967653
fall_out_rate 0.19938307274013833
