In [1]:
from __future__ import print_function
from code import *

In [2]:
raw_train = read_file("adult.data.txt")
raw_test = read_file("adult.test.txt")

In [3]:
average_positive_target_rate = get_average_positive_target_rate(raw_train)
X_train, Y_train = make_dataset(raw_train, average_positive_target_rate)
X_train, Y_train = unique_dataset(X_train, Y_train)

In [4]:
X_train.shape, Y_train.shape

((15210, 75), (15210,))

In [5]:
train_size = 7000
test_size = 500
permupation = np.random.permutation(len(X_train))
train_indices = permupation[:train_size]
test_indices = permupation[train_size:(train_size + test_size)]

train_x = X_train[train_indices]
train_y = Y_train[train_indices]
test_x = X_train[test_indices]
test_y = Y_train[test_indices]

In [6]:
def make_kfold(X, Y, folds):
    kf = KFold(n_splits=folds)
    metrics = []
    stats = []
    for train_index, test_index in kf.split(X):
        train_x, test_x = X[train_index], X[test_index]
        train_y, test_y = Y[train_index], Y[test_index]
        new_metrics = []
        new_metrics.append(get_catboost_acc(train_x, train_y, test_x, test_y))
        
        pos_stats, neg_stats = lattices_stats(train_x, train_y, test_x)

        stats.append((pos_stats, neg_stats))
        
        prediction_0 = [
            1 if pos_stat[0] > neg_stat[0] else 0
            for pos_stat, neg_stat in zip(pos_stats, neg_stats)
        ]
        prediction_1 = [
            1 if pos_stat[0] > neg_stat[0] else 0
            for pos_stat, neg_stat in zip(pos_stats, neg_stats)
        ]
        prediction_2 = [
            1 if pos_stat[0] > neg_stat[0] else 0
            for pos_stat, neg_stat in zip(pos_stats, neg_stats)
        ]
        prediction_3 = [
            1 if pos_stat.sum() > neg_stat.sum() else 0
            for pos_stat, neg_stat in zip(pos_stats, neg_stats)
        ]

        new_metrics.append(accuracy_score(prediction_0, test_y))
        new_metrics.append(accuracy_score(prediction_1, test_y))
        new_metrics.append(accuracy_score(prediction_2, test_y))
        new_metrics.append(accuracy_score(prediction_3, test_y))
        new_metrics.append(accuracy_score(np.zeros_like(test_y), test_y))
        
        metrics.append(new_metrics)
    return np.array(metrics), stats

In [None]:
metrics, stats = make_kfold(train_x, train_y, 3)

0


In [28]:
test_Ys = []
kf = KFold(n_splits=3)
for train_index, test_index in kf.split(train_x):
    test_Ys.append(train_y[test_index])

In [29]:
test_Ys[1]

array([1., 0., 0., ..., 0., 0., 0.])

In [30]:
def get_predictions_accs(stats, Y):
    pos_stats, neg_stats = stats
    prediction_0 = [
        1 if pos_stat[0] > neg_stat[0] else 0
        for pos_stat, neg_stat in zip(pos_stats, neg_stats)
    ]
    prediction_1 = [
        1 if pos_stat[1] > neg_stat[1] else 0
        for pos_stat, neg_stat in zip(pos_stats, neg_stats)
    ]
    prediction_2 = [
        1 if pos_stat[2] > neg_stat[2] else 0
        for pos_stat, neg_stat in zip(pos_stats, neg_stats)
    ]
    prediction_3 = [
        1 if pos_stat.sum() > neg_stat.sum() else 0
        for pos_stat, neg_stat in zip(pos_stats, neg_stats)
    ]
    metrics = [
        accuracy_score(prediction_0, Y),
        accuracy_score(prediction_1, Y),
        accuracy_score(prediction_2, Y),
        accuracy_score(prediction_3, Y),
    ]
    return metrics

In [31]:
def get_all_accs(stats_list, Y_list):
    metrics = []
    for stats, Y in zip(stats_list, Y_list):
        metrics.append(get_predictions_accs(stats, Y))
    return np.array(metrics)

In [32]:
right_metrics = get_all_accs(stats, test_Ys)

In [8]:
metrics

array([[0.87617823, 0.85946872, 0.85946872, 0.85946872, 0.85946872,
        0.80891174],
       [0.8752679 , 0.86583798, 0.86583798, 0.86583798, 0.86712387,
        0.8084012 ],
       [0.87183883, 0.85426489, 0.85426489, 0.85426489, 0.85597943,
        0.79254179]])

In [33]:
right_metrics

array([[0.85946872, 0.83847472, 0.77720651, 0.85946872],
       [0.86583798, 0.85212173, 0.77496785, 0.86712387],
       [0.85426489, 0.83540506, 0.77453922, 0.85597943]])

In [36]:
result_metrics = np.concatenate([metrics[:, :1], metrics[:, -1:], right_metrics], axis=1)

In [38]:
print(result_metrics)

[[0.87617823 0.80891174 0.85946872 0.83847472 0.77720651 0.85946872]
 [0.8752679  0.8084012  0.86583798 0.85212173 0.77496785 0.86712387]
 [0.87183883 0.79254179 0.85426489 0.83540506 0.77453922 0.85597943]]
