In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv('heart.csv')
raw_data = pd.read_csv('heart.csv')

In [2]:
for feature in ['sex', 'fbs', 'exang']:
    positive = '{}_1'.format(feature)
    negative = '{}_0'.format(feature)
    data[positive] = 0
    data[positive][data[feature] == 1] = 1
    data[negative] = 0
    data[negative][data[feature] == 0] = 1
    data = data.drop([feature], axis='columns')

In [3]:
for feature in ['cp', 'restecg', 'slope', 'ca', 'thal']:
    values = {x for x in data[feature]}
    for value in values:
        column_name = '{}_{}'.format(feature, value)
        data[column_name] = 0
        data[column_name][data[feature] == value] = 1
    data = data.drop([feature], axis='columns')
    
numerical_data = data.copy()

In [4]:
BUCKETS = 6
NUMERICAL_FEATURES = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
for feature in NUMERICAL_FEATURES:
    bins = sorted({x for x in pd.cut(data[feature], BUCKETS)})
    for bucket in range(len(bins)):
        column_name = '{}_{}'.format(feature, bucket+1)
        data[column_name] = 0
        data[column_name][(data[feature] > bins[bucket].left) & (data[feature] <= bins[bucket].right)] = 1
        
data = data.drop(NUMERICAL_FEATURES, axis='columns')
data = data[[x for x in data.columns if x != 'target'] + ['target']]

In [5]:
def split_pos_neg(samples):
    return samples[samples[:, -1] == 1][:, :-1], samples[samples[:, -1] == 0][:, :-1]

def get_splits(data, seed=1):
    if seed:
        np.random.seed(seed)
    SPLITS = 5
    split_size = data.shape[0] // SPLITS
    print('positive: {}, negative: {}, ratio: {:.2f}'.format(
        data[data['target'] == 1].shape[0],
        data[data['target'] == 0].shape[0],
        data[data['target'] == 1].shape[0] / data[data['target'] == 0].shape[0],
    ))
    samples = data.to_numpy().astype('int64')
    np.random.shuffle(samples)
    splits = []
    for i in range(SPLITS):
        train = np.concatenate([samples[:i*split_size], samples[(i+1)*split_size:]], axis=0)
        test = samples[i*split_size:(i+1)*split_size]
        splits.append([train, test])
    return splits

In [6]:
class MetricsComputer:
    def __init__(self, algo_name):
        self.algo_name = algo_name
        self.runs = 0
        self.tp = 0
        self.fn = 0
        self.tn = 0
        self.fp = 0
        self.tpr = 0
        self.tnr = 0
        self.fpr = 0
        self.fnr = 0
        self.fdr = 0
        self.accuracy = 0
        self.negative_predictive_value = 0
        self.precision = 0
        self.recall = 0
        
    def update(self, tp, fn, tn, fp):
        self.runs += 1
        self.tp += tp
        self.fn += fn
        self.tn += tn
        self.fp += fp
        total = tp + fn + tn + fp
        self.tpr += tp / total
        self.tnr += tn / total
        self.fpr += fp / total
        self.fnr += fn / total
        if (tp + fp > 0) and self.fdr is not None:
            self.fdr += fp / (tp + fp)
        else:
            self.fdr = None
        self.accuracy += (tp + tn) / total
        self.negative_predictive_value += tn / (tn + fn)
        if (tp + fp > 0) and self.precision is not None:
            self.precision += tp / (tp + fp)
        else:
            self.precision = None
        self.recall += tp / (tp + fn)
        
    def describe(self):
        print('====== ' + self.algo_name + ' ======')
        print('TOTAL RUNS:', self.runs)
        print('TRUE POSITIVE:', self.tp / self.runs)
        print('TRUE NEGATIVE:', self.tn / self.runs)
        print('FALSE POSITIVE:', self.fp / self.runs)
        print('FALSE NEGATIVE:', self.fn / self.runs)
        print('TRUE POSITIVE RATE:', round(self.tpr / self.runs, 3))
        print('TRUE NEGATIVE RATE:', round(self.tnr / self.runs, 3))
        print('FALSE POSITIVE RATE:', round(self.fpr / self.runs, 3))
        print('FALSE NEGATIVE RATE:', round(self.fnr / self.runs, 3))
        print('NEGATIVE PREDICTIVE VALUE:', round(self.negative_predictive_value / self.runs, 3))
        print('FALSE DISCOVERY RATE:', (round(self.fdr / self.runs, 3)) if self.fdr is not None else '<NOT_DEFINED>')
        print('ACCURACY:', round(self.accuracy / self.runs, 3))
        print('PRECISION:', round(self.precision / self.runs, 3) if self.precision is not None else '<NOT_DEFINED>')
        print('RECALL:', round(self.recall / self.runs, 3))
        print('=======' + '='*len(self.algo_name) + '=======')
        
    def get_value(self, value_name):
        if value_name in ['runs', 'algo_name']:
            return getattr(self, value_name)
        else:
            value = getattr(self, value_name)
            if value is None:
                return None
            return value / self.runs

## Random Forest (raw data)

In [7]:
numerical_splits = get_splits(numerical_data)

positive: 165, negative: 138, ratio: 1.20


In [8]:
from sklearn.ensemble import RandomForestClassifier


metrics = MetricsComputer('Random Forest (with numerical features)')
for train, test in numerical_splits:
    pos_train, neg_train = split_pos_neg(train)
    pos_test, neg_test = split_pos_neg(test)
    X = np.concatenate([pos_train, neg_train])
    Y = np.concatenate([np.ones(shape=(pos_train.shape[0],)), np.zeros(shape=(neg_train.shape[0],))])
    clf = RandomForestClassifier(n_estimators=50, max_depth=10)
    clf = clf.fit(X, Y)
    metrics.update(
        tp=int(clf.predict(pos_test).sum()),
        fn=int((1 - clf.predict(pos_test)).sum()),
        tn=int((1 - clf.predict(neg_test)).sum()),
        fp=int(clf.predict(neg_test).sum()),
    )
metrics.describe()

TOTAL RUNS: 5
TRUE POSITIVE: 22.8
TRUE NEGATIVE: 35.8
FALSE POSITIVE: 1.2
FALSE NEGATIVE: 0.2
TRUE POSITIVE RATE: 0.38
TRUE NEGATIVE RATE: 0.597
FALSE POSITIVE RATE: 0.02
FALSE NEGATIVE RATE: 0.003
NEGATIVE PREDICTIVE VALUE: 0.994
FALSE DISCOVERY RATE: 0.054
ACCURACY: 0.977
PRECISION: 0.946
RECALL: 0.993


## Logistic Regression (raw)

In [9]:
from sklearn.linear_model import LogisticRegression

metrics = MetricsComputer('LogReg (with numerical features)')
for train, test in numerical_splits:
    pos_train, neg_train = split_pos_neg(train)
    pos_test, neg_test = split_pos_neg(test)
    X = np.concatenate([pos_train, neg_train])
    Y = np.concatenate([np.ones(shape=(pos_train.shape[0],)), np.zeros(shape=(neg_train.shape[0],))])
    clf = LogisticRegression(random_state=1)
    clf = clf.fit(X, Y)
    metrics.update(
        tp=int(clf.predict(pos_test).sum()),
        fn=int((1 - clf.predict(pos_test)).sum()),
        tn=int((1 - clf.predict(neg_test)).sum()),
        fp=int(clf.predict(neg_test).sum()),
    )
metrics.describe()

TOTAL RUNS: 5
TRUE POSITIVE: 23.0
TRUE NEGATIVE: 36.4
FALSE POSITIVE: 0.6
FALSE NEGATIVE: 0.0
TRUE POSITIVE RATE: 0.383
TRUE NEGATIVE RATE: 0.607
FALSE POSITIVE RATE: 0.01
FALSE NEGATIVE RATE: 0.0
NEGATIVE PREDICTIVE VALUE: 1.0
FALSE DISCOVERY RATE: 0.028
ACCURACY: 0.99
PRECISION: 0.972
RECALL: 1.0


## Random Forest

In [10]:
splits = get_splits(data)

positive: 165, negative: 138, ratio: 1.20


In [11]:
from sklearn.ensemble import RandomForestClassifier

metrics = MetricsComputer('Random Forest (with binarized features)')
for train, test in splits:
    pos_train, neg_train = split_pos_neg(train)
    pos_test, neg_test = split_pos_neg(test)
    X = np.concatenate([pos_train, neg_train])
    Y = np.concatenate([np.ones(shape=(pos_train.shape[0],)), np.zeros(shape=(neg_train.shape[0],))])
    clf = RandomForestClassifier(n_estimators=50, max_depth=10)
    clf = clf.fit(X, Y)
    metrics.update(
        tp=int(clf.predict(pos_test).sum()),
        fn=int((1 - clf.predict(pos_test)).sum()),
        tn=int((1 - clf.predict(neg_test)).sum()),
        fp=int(clf.predict(neg_test).sum()),
    )
metrics.describe()

TOTAL RUNS: 5
TRUE POSITIVE: 27.8
TRUE NEGATIVE: 21.2
FALSE POSITIVE: 6.2
FALSE NEGATIVE: 4.8
TRUE POSITIVE RATE: 0.463
TRUE NEGATIVE RATE: 0.353
FALSE POSITIVE RATE: 0.103
FALSE NEGATIVE RATE: 0.08
NEGATIVE PREDICTIVE VALUE: 0.819
FALSE DISCOVERY RATE: 0.184
ACCURACY: 0.817
PRECISION: 0.816
RECALL: 0.851


## Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression

metrics = MetricsComputer('LogReg (with binarized features)')
for train, test in splits:
    pos_train, neg_train = split_pos_neg(train)
    pos_test, neg_test = split_pos_neg(test)
    X = np.concatenate([pos_train, neg_train])
    Y = np.concatenate([np.ones(shape=(pos_train.shape[0],)), np.zeros(shape=(neg_train.shape[0],))])
    clf = LogisticRegression(random_state=1)
    clf = clf.fit(X, Y)
    metrics.update(
        tp=int(clf.predict(pos_test).sum()),
        fn=int((1 - clf.predict(pos_test)).sum()),
        tn=int((1 - clf.predict(neg_test)).sum()),
        fp=int(clf.predict(neg_test).sum()),
    )
metrics.describe()

TOTAL RUNS: 5
TRUE POSITIVE: 27.4
TRUE NEGATIVE: 21.2
FALSE POSITIVE: 6.2
FALSE NEGATIVE: 5.2
TRUE POSITIVE RATE: 0.457
TRUE NEGATIVE RATE: 0.353
FALSE POSITIVE RATE: 0.103
FALSE NEGATIVE RATE: 0.087
NEGATIVE PREDICTIVE VALUE: 0.807
FALSE DISCOVERY RATE: 0.184
ACCURACY: 0.81
PRECISION: 0.816
RECALL: 0.841


## Custom FCA algorithms

### Simple

In [13]:
def f_pos(sample, our_class, contestant_class):
    f_sum = 0
    for our_sample in our_class:
        intersection = sample * our_sample
        f_sum += (intersection.sum() / our_class.shape[0])
    return f_sum 

for COEF in [0.9, 1.05, 1.2]:
    metrics = MetricsComputer('Simple FCA (t={})'.format(COEF))
    for train, test in splits:
        pos_train, neg_train = split_pos_neg(train)
        pos_test, neg_test = split_pos_neg(test)
        pos_goods = 0
        pos_bads = 0
        for sample in pos_test:
            pos = f_pos(sample, pos_train, neg_train)
            neg = f_pos(sample, neg_train, pos_train)
            if pos / neg >= COEF:
                pos_goods += 1
            else:
                pos_bads += 1

        neg_goods = 0
        neg_bads = 0
        for sample in neg_test:
            pos = f_pos(sample, pos_train, neg_train)
            neg = f_pos(sample, neg_train, pos_train)
            if pos / neg < COEF:
                neg_goods += 1
            else:
                neg_bads += 1

        metrics.update(pos_goods, pos_bads, neg_goods, neg_bads)

    metrics.describe()

TOTAL RUNS: 5
TRUE POSITIVE: 31.8
TRUE NEGATIVE: 17.0
FALSE POSITIVE: 10.4
FALSE NEGATIVE: 0.8
TRUE POSITIVE RATE: 0.53
TRUE NEGATIVE RATE: 0.283
FALSE POSITIVE RATE: 0.173
FALSE NEGATIVE RATE: 0.013
NEGATIVE PREDICTIVE VALUE: 0.958
FALSE DISCOVERY RATE: 0.248
ACCURACY: 0.813
PRECISION: 0.752
RECALL: 0.974
TOTAL RUNS: 5
TRUE POSITIVE: 27.6
TRUE NEGATIVE: 22.2
FALSE POSITIVE: 5.2
FALSE NEGATIVE: 5.0
TRUE POSITIVE RATE: 0.46
TRUE NEGATIVE RATE: 0.37
FALSE POSITIVE RATE: 0.087
FALSE NEGATIVE RATE: 0.083
NEGATIVE PREDICTIVE VALUE: 0.824
FALSE DISCOVERY RATE: 0.163
ACCURACY: 0.83
PRECISION: 0.837
RECALL: 0.845
TOTAL RUNS: 5
TRUE POSITIVE: 22.4
TRUE NEGATIVE: 24.8
FALSE POSITIVE: 2.6
FALSE NEGATIVE: 10.2
TRUE POSITIVE RATE: 0.373
TRUE NEGATIVE RATE: 0.413
FALSE POSITIVE RATE: 0.043
FALSE NEGATIVE RATE: 0.17
NEGATIVE PREDICTIVE VALUE: 0.708
FALSE DISCOVERY RATE: 0.105
ACCURACY: 0.787
PRECISION: 0.895
RECALL: 0.686


### More complex

In [14]:
A_DEG = 50
B_DEG = 2.5

def f_pos(sample, our_class, contestant_class):
    f_sum = 0
    for our_sample in our_class:
        intersection = sample * our_sample
        upper = 2 ** (intersection.sum() * A_DEG / our_class.shape[0])
        counterexamples = np.all((contestant_class - intersection) >= 0, axis=1).sum()
        lower = 2 ** (counterexamples*B_DEG / contestant_class.shape[0])
        f_sum += upper / lower
    return f_sum 

for COEF in [0.75, 0.9, 1.05]:
    metrics = MetricsComputer('Complex FCA (t={})'.format(COEF))

    for train, test in splits:
        pos_train, neg_train = split_pos_neg(train)
        pos_test, neg_test = split_pos_neg(test)
        pos_goods = 0
        pos_bads = 0
        for sample in pos_test:
            pos = f_pos(sample, pos_train, neg_train)
            neg = f_pos(sample, neg_train, pos_train)
            if pos / neg >= COEF:
                pos_goods += 1
            else:
                pos_bads += 1

        neg_goods = 0
        neg_bads = 0
        for sample in neg_test:
            pos = f_pos(sample, pos_train, neg_train)
            neg = f_pos(sample, neg_train, pos_train)
            if pos / neg < COEF:
                neg_goods += 1
            else:
                neg_bads += 1

        metrics.update(pos_goods, pos_bads, neg_goods, neg_bads)

    metrics.describe()

TOTAL RUNS: 5
TRUE POSITIVE: 31.4
TRUE NEGATIVE: 17.8
FALSE POSITIVE: 9.6
FALSE NEGATIVE: 1.2
TRUE POSITIVE RATE: 0.523
TRUE NEGATIVE RATE: 0.297
FALSE POSITIVE RATE: 0.16
FALSE NEGATIVE RATE: 0.02
NEGATIVE PREDICTIVE VALUE: 0.94
FALSE DISCOVERY RATE: 0.236
ACCURACY: 0.82
PRECISION: 0.764
RECALL: 0.962
TOTAL RUNS: 5
TRUE POSITIVE: 28.2
TRUE NEGATIVE: 22.4
FALSE POSITIVE: 5.0
FALSE NEGATIVE: 4.4
TRUE POSITIVE RATE: 0.47
TRUE NEGATIVE RATE: 0.373
FALSE POSITIVE RATE: 0.083
FALSE NEGATIVE RATE: 0.073
NEGATIVE PREDICTIVE VALUE: 0.844
FALSE DISCOVERY RATE: 0.152
ACCURACY: 0.843
PRECISION: 0.848
RECALL: 0.862
TOTAL RUNS: 5
TRUE POSITIVE: 25.0
TRUE NEGATIVE: 23.6
FALSE POSITIVE: 3.8
FALSE NEGATIVE: 7.6
TRUE POSITIVE RATE: 0.417
TRUE NEGATIVE RATE: 0.393
FALSE POSITIVE RATE: 0.063
FALSE NEGATIVE RATE: 0.127
NEGATIVE PREDICTIVE VALUE: 0.761
FALSE DISCOVERY RATE: 0.135
ACCURACY: 0.81
PRECISION: 0.865
RECALL: 0.761
