# 1. Datasets information
**Dataset**: 1   
**Name**: "Congressional Voting Records Data Set"  
**Link**: https://archive.ics.uci.edu/ml/datasets/congressional+voting+records    
**Number of instances**: 435  
**Number of attributes**: 16  
**Missing values**: Yes  

**Dataset**: 2   
**Name**: "Tic-Tac-Toe Endgame Data Set"  
**Link**: https://archive.ics.uci.edu/ml/datasets/Tic-Tac-Toe+Endgame  
**Number of instances**: 958     
**Number of attributes**: 9 (27 after the one-hot encoding done in section 2.4)   
**Missing values**: No  

# 2. Datasets standardization

**2.1 Importing datasets**

In [None]:
import pandas as pd
import numpy as np

# Congressional Voting Records Data Set: target label first, then 16 feature columns
df_1 = pd.read_csv(
    "http://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data",
    sep=',',
    header=None,
)
headers_1 = ['target', *range(1, 17)]
df_1.columns = headers_1

# Tic-Tac-Toe Endgame Data Set: 9 feature columns first, target label last
df_2 = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/tic-tac-toe/tic-tac-toe.data",
    sep=',',
    header=None,
)
headers_2 = [*range(1, 10), 'target']
df_2.columns = headers_2

**2.2 Checking unprepared data**

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line per report.
df_1.info()
df_2.info()

**2.3 Preparation of Voting records dataset.**  
Replacing string values with boolean values and dropping the rows that contain NaNs.

In [None]:
# Encode the votes ('?' -> NaN, 'y'/'n' -> True/False), drop incomplete rows,
# then encode the party label and force every column to bool.
df_1 = (
    df_1.replace('?', np.nan)
    .replace('y', True)
    .replace('n', False)
    .dropna()
    .reset_index(drop=True)
    .replace('democrat', True)
    .replace('republican', False)
)
df_1 = df_1.astype('bool')
df_1.info()

**2.4 Preparation of Tic-Tac-Toe Dataset**  
Converting all features to dummy (one-hot) features and replacing the string target values with booleans.

In [None]:
# Encode the target label, one-hot encode the nine feature columns,
# then force every column to bool.
df_2 = df_2.replace('positive', True).replace('negative', False)
df_2 = pd.get_dummies(df_2, columns=list(range(1, 10)))
df_2 = df_2.astype('bool')
df_2.info()

**2.5 Saving standardized datasets**

In [None]:
import os

directory = r'C:/Users/Hp/PycharmProjects/OSDA/std_datasets/'
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(directory, exist_ok=True)
df_1.to_csv(os.path.join(directory, r'hv.csv'), index=False)
df_2.to_csv(os.path.join(directory, r'ttt.csv'), index=False)

# if using .py script
# script_dir = os.path.abspath(os.path.dirname(sys.argv[0]) or '.') 

# 3. Supporting functions

**3.1 Cross Validation function**

In [None]:
from sklearn import model_selection
import sys

def cross_validation(path_in, path_out, dataset_short_name, n_splits):
    """Split a dataset into shuffled K-fold train/test CSV files.

    The dataset is read from ``path_in``. For every fold number k (starting
    at 1) the files ``<dataset_short_name>_train_<k>.csv`` and
    ``<dataset_short_name>_test_<k>.csv`` are written to ``path_out``.

    Parameters
    ----------
    path_in : str
        Path to the .csv file holding the dataset
    path_out : str
        Directory that receives the train/test files of every fold
    dataset_short_name : str
        Short dataset name used as the file name prefix
    n_splits : int
        Number of KFold splits

    Returns
    -------
    str
        The fixed message 'Cross Validation Completed'
    """
    data = pd.read_csv(path_in)
    # random_state=None: the shuffling is not reproducible between calls.
    splitter = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=None)

    for fold, (train_rows, test_rows) in enumerate(splitter.split(data), start=1):
        for kind, rows in (('_train_', train_rows), ('_test_', test_rows)):
            file_name = dataset_short_name + kind + str(fold) + '.csv'
            data.iloc[rows].to_csv(os.path.join(path_out, file_name), index=False)

    return 'Cross Validation Completed'

**3.2 Data preprocessing**

In [None]:
def data_preprocessing(path_train, path_test, target_column_name = 'target'):
    """Load one train/test fold and convert it to lists of feature dictionaries.

    The train set is split by the target value into a plus context (target
    is True) and a minus context (target is False). The target column is
    removed from every returned record.

    Parameters
    ----------
    path_train : str
        Path to the train .csv file of the fold
    path_test : str
        Path to the test .csv file of the fold
    target_column_name : str
        Name of the target column; defaults to 'target'

    Returns
    -------
    X_plus : list of dict
        Train records whose target is True
    X_minus : list of dict
        Train records whose target is False
    X_test : list of dict
        Test records
    y_test : list
        Target values of the test records
    """
    train_df = pd.read_csv(path_train)
    test_df = pd.read_csv(path_test)

    def as_records(frame):
        # One {column: value} dict per row, without the target column.
        return frame.drop(target_column_name, axis=1).to_dict('records')

    is_positive = train_df[target_column_name] == True
    is_negative = train_df[target_column_name] == False

    return (
        as_records(train_df[is_positive]),
        as_records(train_df[is_negative]),
        as_records(test_df),
        test_df[target_column_name].tolist(),
    )

**3.3 Metrics evaluation**

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def metrics_test(y_real_list, y_pred_list):
    """Compute accuracy, ROC AUC, precision and recall of a prediction.

    Parameters
    ----------
    y_real_list : list
        The real target values
    y_pred_list : list
        The target values predicted by some algorithm

    Returns
    -------
    tuple
        (accuracy, ROC AUC, precision, recall), in this order
    """
    truth = np.array(y_real_list)
    predicted = np.array(y_pred_list)

    # Every scorer is called as scorer(y_true, y_pred).
    scorers = (accuracy_score, roc_auc_score, precision_score, recall_score)
    return tuple(scorer(truth, predicted) for scorer in scorers)

**3.4 Supporting functions for algorithms**

In [None]:
def dict_intersec(dict1, dict2, option='intersec'):
    """Intersect two dictionaries or test whether one is contained in the other.

    Parameters:
    -----------
    dict1 : dict
        The first dictionary
    dict2 : dict
        The second dictionary
    option : str
        'intersec' (default) returns a dict with the (key, value) pairs
        present in both dictionaries; 'subset' returns True when every
        (key, value) pair of dict1 is also present in dict2

    Raises
    ------
    ValueError
        If 'option' is neither 'intersec' nor 'subset'
    """
    if option == 'subset':
        return dict1.items() <= dict2.items()
    if option != 'intersec':
        raise ValueError('Unknown option was used.')

    shared_pairs = set(dict1.items()).intersection(dict2.items())
    return dict(shared_pairs)

# 4. Algorithms

### Algorithm 1
Voting-based algorithm: every plus-context object votes for the plus class if its intersection with the test object is a subset of at most `threshold_subset` minus-context objects (and vice versa for minus-context objects).  
The test object is classified as positive if there are more votes for the plus class (and as negative in the opposite case).  
The single threshold controls how many objects of the opposite context may contain the intersection of a plus/minus-context object with the test object.

In [None]:
import random

def Alg_1(X_plus, X_minus, X_test, threshold_subset):
    """Classify test objects by votes of the plus and minus context objects.

    For a test object, a plus-context object votes "plus" when its
    intersection with the test object is contained in at most
    ``threshold_subset`` minus-context objects; minus-context objects vote
    "minus" symmetrically. The votes of each class are compared as a share
    of that class' size, so a bigger class does not win merely by having
    more voters. Ties are broken pseudo-randomly (generator seeded with 1).

    Parameters
    ----------
    X_plus : list of dict
        The plus context of train dataset
    X_minus : list of dict
        The minus context of train dataset
    X_test : list of dict
        The test dataset
    threshold_subset : int
        Maximum number of opposite-context objects that may contain the
        intersection for the vote to be cast

    Returns
    -------
    y_pred : list of bool
        Predicted label of every test object
    randomness : float
        Share of test objects labelled by the tie-break (0.0 for an empty
        test dataset)
    """
    # One generator seeded once per call: reproducible results without
    # re-seeding the shared ``random`` module on every tie, which would make
    # every tie-break return the same value.
    tie_breaker = random.Random(1)
    n_plus, n_minus = len(X_plus), len(X_minus)
    y_pred = []
    random_counter = 0

    for ent_test in X_test:
        votes_plus = 0
        for ent_plus in X_plus:
            intersec_plus = dict_intersec(ent_test, ent_plus, option='intersec')
            counter_plus = sum(
                1 for ent_minus in X_minus
                if dict_intersec(intersec_plus, ent_minus, option='subset')
            )
            if counter_plus <= threshold_subset:
                votes_plus += 1

        votes_minus = 0
        for ent_minus in X_minus:
            intersec_minus = dict_intersec(ent_test, ent_minus, option='intersec')
            counter_minus = sum(
                1 for ent_plus in X_plus
                if dict_intersec(intersec_minus, ent_plus, option='subset')
            )
            if counter_minus <= threshold_subset:
                votes_minus += 1

        # True division keeps the class balance exact (integer weights would
        # round it); an empty context simply contributes a zero share.
        share_plus = votes_plus / n_plus if n_plus else 0.0
        share_minus = votes_minus / n_minus if n_minus else 0.0

        if share_plus > share_minus:
            y_pred.append(True)
        elif share_plus < share_minus:
            y_pred.append(False)
        else:
            random_counter += 1
            y_pred.append(bool(tie_breaker.getrandbits(1)))

    randomness = random_counter / len(X_test) if X_test else 0.0

    return y_pred, randomness

### Algorithm 2
Voting-based algorithm: every plus-context object votes for the plus class if the relative size of its intersection with the test object exceeds the threshold; minus-context objects vote for the minus class in the same way.  
The test object is classified as positive if there are more votes for the plus class (and as negative in the opposite case).   
It has `threshold_intersec`, which controls how large the intersection of a plus/minus-context object with the test object must be (as a share of the test object's features).  
It is a simpler version of Algorithm 1, because it does not search for subsets in the opposite context.

In [None]:
import warnings

def Alg_2(X_plus, X_minus, X_test, threshold_intersec):
    """Classify test objects by counting sufficiently similar context objects.

    A plus-context object votes "plus" when the size of its intersection
    with the test object, divided by the number of features of the test
    object, is strictly greater than ``threshold_intersec``; minus-context
    objects vote "minus" in the same way. The class with more votes wins;
    ties are broken pseudo-randomly (generator seeded with 1).

    Parameters
    ----------
    X_plus : list of dict
        The plus context of train dataset
    X_minus : list of dict
        The minus context of train dataset
    X_test : list of dict
        The test dataset
    threshold_intersec : float
        Lower bound (exclusive) for the relative intersection size

    Returns
    -------
    y_pred : list of bool
        Predicted label of every test object
    randomness : float
        Share of test objects labelled by the tie-break (0.0 for an empty
        test dataset)
    """
    # NOTE(review): this silences every warning for the whole process, not
    # only inside this function; kept because code that runs afterwards may
    # rely on it - confirm whether it is still wanted.
    warnings.filterwarnings('ignore')
    # One generator seeded once per call: reproducible results without
    # re-seeding the shared ``random`` module on every tie, which would make
    # every tie-break return the same value.
    tie_breaker = random.Random(1)
    y_pred = []
    random_counter = 0

    for ent_test in X_test:
        n_features = len(ent_test)

        votes_plus = 0
        for ent_plus in X_plus:
            intersec_plus = dict_intersec(ent_test, ent_plus, option='intersec')
            if len(intersec_plus) / n_features > threshold_intersec:
                votes_plus += 1

        votes_minus = 0
        for ent_minus in X_minus:
            intersec_minus = dict_intersec(ent_test, ent_minus, option='intersec')
            if len(intersec_minus) / n_features > threshold_intersec:
                votes_minus += 1

        if votes_plus > votes_minus:
            y_pred.append(True)
        elif votes_plus < votes_minus:
            y_pred.append(False)
        else:
            random_counter += 1
            y_pred.append(bool(tie_breaker.getrandbits(1)))

    randomness = random_counter / len(X_test) if X_test else 0.0

    return y_pred, randomness

### Algorithm 3
Voting-based algorithm: every plus-context object votes for the plus class if its intersection with the test object is large enough (`threshold_intersec`) and is a subset of at most `threshold_subset` minus-context objects (and vice versa for minus-context objects).
The test object is classified as positive if there are more votes for the plus class (and as negative in the opposite case).

This algorithm uses the same approach as Algorithm 1, with one modification: it has `threshold_intersec`, which controls how large the intersection of a plus/minus-context object with the test object must be (as a share of the test object's features).

In [None]:
def Alg_3(X_plus, X_minus, X_test, threshold_subset, threshold_intersec):
    """Classify test objects by votes of sufficiently similar context objects.

    A plus-context object votes "plus" when (a) the size of its intersection
    with the test object, divided by the number of features of the test
    object, is at least ``threshold_intersec`` and (b) that intersection is
    contained in at most ``threshold_subset`` minus-context objects.
    Minus-context objects vote "minus" symmetrically. The votes of each class
    are compared as a share of that class' size; ties are broken
    pseudo-randomly (generator seeded with 1).

    Parameters
    ----------
    X_plus : list of dict
        The plus context of train dataset
    X_minus : list of dict
        The minus context of train dataset
    X_test : list of dict
        The test dataset
    threshold_subset : int
        Maximum number of opposite-context objects that may contain the
        intersection for the vote to be cast
    threshold_intersec : float
        Lower bound (inclusive) for the relative intersection size

    Returns
    -------
    y_pred : list of bool
        Predicted label of every test object
    randomness : float
        Share of test objects labelled by the tie-break (0.0 for an empty
        test dataset)
    """
    # One generator seeded once per call: reproducible results without
    # re-seeding the shared ``random`` module on every tie, which would make
    # every tie-break return the same value.
    tie_breaker = random.Random(1)
    n_plus, n_minus = len(X_plus), len(X_minus)
    y_pred = []
    random_counter = 0

    for ent_test in X_test:
        n_features = len(ent_test)

        votes_plus = 0
        for ent_plus in X_plus:
            intersec_plus = dict_intersec(ent_test, ent_plus, option='intersec')
            if len(intersec_plus) / n_features < threshold_intersec:
                continue
            counter_plus = sum(
                1 for ent_minus in X_minus
                if dict_intersec(intersec_plus, ent_minus, option='subset')
            )
            if counter_plus <= threshold_subset:
                votes_plus += 1

        votes_minus = 0
        for ent_minus in X_minus:
            intersec_minus = dict_intersec(ent_test, ent_minus, option='intersec')
            if len(intersec_minus) / n_features < threshold_intersec:
                continue
            counter_minus = sum(
                1 for ent_plus in X_plus
                if dict_intersec(intersec_minus, ent_plus, option='subset')
            )
            if counter_minus <= threshold_subset:
                votes_minus += 1

        # True division keeps the class balance exact (integer weights would
        # round it); an empty context simply contributes a zero share.
        share_plus = votes_plus / n_plus if n_plus else 0.0
        share_minus = votes_minus / n_minus if n_minus else 0.0

        if share_plus > share_minus:
            y_pred.append(True)
        elif share_plus < share_minus:
            y_pred.append(False)
        else:
            random_counter += 1
            y_pred.append(bool(tie_breaker.getrandbits(1)))

    randomness = random_counter / len(X_test) if X_test else 0.0

    return y_pred, randomness

### Algorithm 4

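The algorithm classifies a test object by the maximum number of opposite-context objects that contain the intersection of the test object with a plus/minus-context object.  
The test object is classified as positive if this maximum is smaller for the plus context than for the minus context (and as negative otherwise).  
It has `threshold_intersec`, which controls how large the intersection of a plus/minus-context object with the test object must be (as a share of the test object's features) for that object to be considered.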

In [None]:
import random

def Alg_4(X_plus, X_minus, X_test, threshold_intersec):
    """Classify test objects by the worst-case number of counter-examples.

    For every plus-context object whose intersection with the test object is
    large enough (relative size of at least ``threshold_intersec``), the
    number of minus-context objects containing that intersection is counted;
    the maximum of these counts is kept. The same is done for minus-context
    objects against the plus context. The test object is labelled True when
    the plus maximum is strictly smaller than the minus maximum, and False
    otherwise (including when both maxima are equal).

    Parameters
    ----------
    X_plus : list of dict
        The plus context of train dataset
    X_minus : list of dict
        The minus context of train dataset
    X_test : list of dict
        The test dataset
    threshold_intersec : float
        Lower bound (inclusive) for the relative intersection size

    Returns
    -------
    y_pred : list of bool
        Predicted label of every test object
    randomness : int
        Always 0: this algorithm never uses a random tie-break
    """
    y_pred = []

    for ent_test in X_test:
        n_features = len(ent_test)

        max_counter_plus = 0
        for ent_plus in X_plus:
            intersec_plus = dict_intersec(ent_test, ent_plus, option='intersec')
            if len(intersec_plus) / n_features < threshold_intersec:
                continue
            counter_plus = sum(
                1 for ent_minus in X_minus
                if dict_intersec(intersec_plus, ent_minus, option='subset')
            )
            max_counter_plus = max(counter_plus, max_counter_plus)

        max_counter_minus = 0
        for ent_minus in X_minus:
            intersec_minus = dict_intersec(ent_test, ent_minus, option='intersec')
            if len(intersec_minus) / n_features < threshold_intersec:
                continue
            counter_minus = sum(
                1 for ent_plus in X_plus
                if dict_intersec(intersec_minus, ent_plus, option='subset')
            )
            max_counter_minus = max(counter_minus, max_counter_minus)

        y_pred.append(max_counter_plus < max_counter_minus)

    randomness = 0

    return y_pred, randomness

### BernoulliNB Algorithm

In [None]:
from sklearn.naive_bayes import BernoulliNB

def bernoulliNB(dataset_shortname, n_splits=10,
                cv_path='C:/Users/Hp/PycharmProjects/OSDA/cross_validated/'):
    """Evaluate sklearn's BernoulliNB on the stored cross-validation folds.

    For every fold i in 1..n_splits the files
    ``<dataset_shortname>_train_<i>.csv`` and
    ``<dataset_shortname>_test_<i>.csv`` are read from ``cv_path``, a
    BernoulliNB classifier is fitted on the train file and scored on the
    test file.

    Parameters
    ----------
    dataset_shortname : str
        Short dataset name used as the fold file name prefix
    n_splits : int
        Number of stored folds to evaluate; defaults to 10
    cv_path : str
        Directory holding the fold files

    Returns
    -------
    y_pred_arr : list
        Predictions of every fold
    y_test : numpy.ndarray or None
        Real target values of the LAST fold only (None when n_splits < 1)
    metrics : list of list
        One [accuracy, ROC AUC, precision, recall, 0] row per fold; the
        trailing 0 fills the "randomness" slot used by the other algorithms
    """
    y_pred_arr = []
    metrics = []
    y_test = None
    for i in range(1, n_splits + 1):
        test = pd.read_csv(os.path.join(cv_path, dataset_shortname + r'_test_' + str(i) + r'.csv'))
        train = pd.read_csv(os.path.join(cv_path, dataset_shortname + r'_train_' + str(i) + r'.csv'))

        X_train = train.drop('target', axis=1).to_numpy()
        y_train = train['target'].to_numpy()

        X_test = test.drop('target', axis=1).to_numpy()
        y_test = test['target'].to_numpy()

        clf = BernoulliNB()
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        acc, roc_auc, prc, rcl = metrics_test(y_pred_list=y_pred, y_real_list=y_test.tolist())
        metrics.append([acc, roc_auc, prc, rcl, 0])
        y_pred_arr.append(y_pred)

    return y_pred_arr, y_test, metrics

# 5. Evaluating algorithms

In [None]:
from datetime import datetime

def alg_eval(dataset_shortname, alg_n=1, n_splits=10, is_already_cv=True, 
             threshold_subset=0, threshold_intersec=0, print_time=True):
    """Run one of the four algorithms on every cross-validation fold.

    Parameters
    ----------
    dataset_shortname : str
        Short dataset name used as the file name prefix
    alg_n : int
        Number of the algorithm to run: 1, 2, 3 or 4
    n_splits : int
        Number of folds to evaluate (and to create when is_already_cv is False)
    is_already_cv : bool
        When False, the fold files are (re)created first with cross_validation
    threshold_subset : int
        Passed to Alg_1 and Alg_3
    threshold_intersec : float
        Passed to Alg_2, Alg_3 and Alg_4
    print_time : bool
        When True, the total runtime is printed

    Returns
    -------
    y_pred_arr : list
        Predictions of every fold
    y_real : list or None
        Real target values of the LAST fold only (None when n_splits < 1)
    metrics : list of list
        One [accuracy, ROC AUC, precision, recall, randomness] row per fold

    Raises
    ------
    ValueError
        If alg_n is not one of 1, 2, 3, 4
    """
    # Fail fast on an unknown algorithm instead of hitting an unbound
    # 'y_pred' after the first fold has already been loaded.
    if alg_n not in (1, 2, 3, 4):
        raise ValueError('Unknown algorithm number: {}'.format(alg_n))

    start = datetime.now()

    # vars
    std_path = 'C:/Users/Hp/PycharmProjects/OSDA/std_datasets/'
    cv_path = 'C:/Users/Hp/PycharmProjects/OSDA/cross_validated/'
    metrics = []
    y_pred_arr = []
    y_real = None

    # 1. step - Cross Validation
    if not is_already_cv:
        cross_validation(path_in=std_path+dataset_shortname+r'.csv',
                         path_out=cv_path,
                         dataset_short_name=dataset_shortname,
                         n_splits=n_splits)

    for i in range(1, n_splits+1):
        # 2. step - Data Preprocessing
        X_plus, X_minus, X_test, y_test = data_preprocessing(
            path_test=cv_path+dataset_shortname+r'_test_'+str(i)+r'.csv',
            path_train=cv_path+dataset_shortname+r'_train_'+str(i)+r'.csv')

        # 3. step - Algorithm
        y_real = y_test
        if alg_n == 1:
            y_pred, rdnm = Alg_1(X_plus, X_minus, X_test, threshold_subset=threshold_subset)
        elif alg_n == 2:
            y_pred, rdnm = Alg_2(X_plus, X_minus, X_test, threshold_intersec=threshold_intersec)
        elif alg_n == 3:
            y_pred, rdnm = Alg_3(X_plus, X_minus, X_test, threshold_subset=threshold_subset,
                                 threshold_intersec=threshold_intersec)
        else:
            y_pred, rdnm = Alg_4(X_plus, X_minus, X_test, threshold_intersec=threshold_intersec)

        # 4. step - Metrics
        acc, roc_auc, prc, rcl = metrics_test(y_pred_list=y_pred, y_real_list=y_real)
        metrics.append([acc, roc_auc, prc, rcl, rdnm])
        y_pred_arr.append(y_pred)

    if print_time:
        print ('Algorithm runtime: {}'.format(datetime.now()-start))
    return y_pred_arr, y_real, metrics

# 6. Experiments

**6.1 Supporting functions**

In [None]:
def avg_metrics(metrics):
    """Return the column-wise mean of a list of per-fold metric rows."""
    per_fold = np.array(metrics)
    return per_fold.mean(axis=0)

In [None]:
def print_metrics(metrics):
    """Print the five averaged metrics, one per line, with four decimals.

    ``metrics`` is indexed as [accuracy, ROC AUC, precision, recall,
    randomness].
    """
    templates = (
        'Accuracy score: {:.4f}',
        'Roc AUC Score: {:.4f}',
        'Precision Score: {:.4f}',
        'Recall Score: {:.4f}',
        'Randomness of prediction: {:.4f}',
    )
    for position, template in enumerate(templates):
        print(template.format(metrics[position]))

**6.2 Hyperparameters tuning**

In [None]:
import matplotlib.pyplot as plt

def hp_tuning(dataset_shortname, alg_n):
    """Sweep the threshold of one algorithm and plot the averaged metrics.

    Algorithm 1 is evaluated for threshold_subset = 0..5; algorithms 2, 3
    and 4 are evaluated for threshold_intersec = 0.0..1.0 in steps of 0.1
    with threshold_subset fixed to 0. The averaged metrics of every setting
    form one row of the returned array and one point per curve of the plot.

    Parameters
    ----------
    dataset_shortname : str
        Short dataset name passed on to alg_eval
    alg_n : int
        Number of the algorithm to tune

    Returns
    -------
    numpy.ndarray
        Array with one row of five averaged metrics per evaluated setting
    """
    if alg_n == 1:
        sweeps = [dict(threshold_subset=value) for value in range(0, 6)]
    elif alg_n in [2, 3, 4]:
        sweeps = [dict(threshold_subset=0, threshold_intersec=value)
                  for value in np.arange(0, 1.1, 0.1)]
    else:
        sweeps = []

    avg_m_arr = np.empty((0, 5), float)
    for thresholds in sweeps:
        _, _, fold_metrics = alg_eval(dataset_shortname=dataset_shortname, alg_n=alg_n,
                                      print_time=False, **thresholds)
        avg_m_arr = np.vstack([avg_m_arr, avg_metrics(fold_metrics)])

    plt.plot(avg_m_arr)
    plt.legend(['Accuracy score', 'ROC AUC', 'Precision','Recall','Prediction Randomness'],
               bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    plt.grid()
    plt.show()

    return avg_m_arr

In [None]:
# Sweep threshold_subset (0..5) for Algorithm 1 on the 'hv' dataset
hp_tuning(dataset_shortname='hv', alg_n=1)

In [None]:
# Sweep threshold_subset (0..5) for Algorithm 1 on the 'ttt' dataset
hp_tuning(dataset_shortname='ttt', alg_n=1)

In [None]:
# Sweep threshold_intersec (0.0..1.0, step 0.1) for Algorithm 2 on the 'hv' dataset
hp_tuning(dataset_shortname='hv', alg_n=2)

In [None]:
# Sweep threshold_intersec (0.0..1.0, step 0.1) for Algorithm 2 on the 'ttt' dataset
hp_tuning(dataset_shortname='ttt', alg_n=2)

In [None]:
# Sweep threshold_intersec (0.0..1.0, step 0.1) for Algorithm 3 on the 'hv' dataset;
# hp_tuning keeps threshold_subset at 0 for this algorithm
hp_tuning(dataset_shortname='hv', alg_n=3)

In [None]:
# Sweep threshold_intersec (0.0..1.0, step 0.1) for Algorithm 3 on the 'ttt' dataset;
# hp_tuning keeps threshold_subset at 0 for this algorithm
hp_tuning(dataset_shortname='ttt', alg_n=3)

In [None]:
# Sweep threshold_intersec (0.0..1.0, step 0.1) for Algorithm 4 on the 'hv' dataset
hp_tuning(dataset_shortname='hv', alg_n=4)

In [None]:
# Sweep threshold_intersec (0.0..1.0, step 0.1) for Algorithm 4 on the 'ttt' dataset
hp_tuning(dataset_shortname='ttt', alg_n=4)

**6.3 Hyperparameters results**

The best hyperparameter for the first algorithm is threshold_subset = 0 for both datasets.  
The best hyperparameter for the second algorithm is threshold_intersec = 0.7 and 0.8 for the two datasets respectively.  
The best hyperparameters for the third algorithm are threshold_subset = 0 (as found for the first algorithm) and threshold_intersec = 0.2 and 0.8 for the two datasets respectively.  
The best hyperparameter for the fourth algorithm is threshold_intersec = 0.4 and 0.7 for the two datasets respectively.  

**6.4 Testing with tuned hyperparameters**

In [None]:
# Algorithm 1 on dataset 1 ('hv'); alg_eval's default n_splits gives 10 folds
y_pred, y_real, metrics = alg_eval(dataset_shortname='hv', alg_n=1, threshold_subset=0)
avg_m_11 = avg_metrics(metrics)  # per-metric mean over the folds
print_metrics(avg_m_11)

In [None]:
# Algorithm 1 on dataset 2 ('ttt'); alg_eval's default n_splits gives 10 folds
y_pred, y_real, metrics = alg_eval(dataset_shortname='ttt', alg_n=1, threshold_subset=0)
avg_m_12 = avg_metrics(metrics)  # per-metric mean over the folds
print_metrics(avg_m_12)

In [None]:
# Algorithm 2 on dataset 1 ('hv'); alg_eval's default n_splits gives 10 folds
# NOTE(review): section 6.3 names threshold_intersec = 0.7 as the best value for
# the first dataset, but 0.8 is used here - confirm which one is intended.
y_pred, y_real, metrics = alg_eval(dataset_shortname='hv', alg_n=2, threshold_intersec=0.8)
avg_m_21 = avg_metrics(metrics)  # per-metric mean over the folds
print_metrics(avg_m_21)

In [None]:
# Algorithm 2 on dataset 2 ('ttt'); alg_eval's default n_splits gives 10 folds
y_pred, y_real, metrics = alg_eval(dataset_shortname='ttt', alg_n=2, threshold_intersec=0.8)
avg_m_22 = avg_metrics(metrics)  # per-metric mean over the folds
print_metrics(avg_m_22)

In [None]:
# Algorithm 3 on dataset 1 ('hv'); alg_eval's default n_splits gives 10 folds
y_pred, y_real, metrics = alg_eval(dataset_shortname='hv', alg_n=3, threshold_subset=0, threshold_intersec=0.2)
avg_m_31 = avg_metrics(metrics)  # per-metric mean over the folds
print_metrics(avg_m_31)

In [None]:
# Algorithm 3 on dataset 2 ('ttt'); alg_eval's default n_splits gives 10 folds
# NOTE(review): section 6.3 names threshold_intersec = 0.8 as the best value for
# the second dataset, but 0.5 is used here - confirm which one is intended.
y_pred, y_real, metrics = alg_eval(dataset_shortname='ttt', alg_n=3, threshold_subset=0, threshold_intersec=0.5)
avg_m_32 = avg_metrics(metrics)  # per-metric mean over the folds
print_metrics(avg_m_32)

In [None]:
# Algorithm 4 on dataset 1 ('hv') with threshold_intersec = 0.5
# NOTE(review): section 6.3 names 0.4 as the best value; the next cell re-runs
# with 0.4 and overwrites avg_m_41, so this result is not the one kept.
y_pred, y_real, metrics = alg_eval(dataset_shortname='hv', alg_n=4, threshold_intersec=0.5)
avg_m_41 = avg_metrics(metrics)  # per-metric mean over the folds
print_metrics(avg_m_41)

In [None]:
# Algorithm 4 on dataset 1 ('hv') with threshold_intersec = 0.4;
# this assignment replaces the avg_m_41 computed in the previous cell
y_pred, y_real, metrics = alg_eval(dataset_shortname='hv', alg_n=4, threshold_intersec=0.4)
avg_m_41 = avg_metrics(metrics)  # per-metric mean over the folds
print_metrics(avg_m_41)

In [None]:
# Algorithm 4 on dataset 2 ('ttt'); alg_eval's default n_splits gives 10 folds
y_pred, y_real, metrics = alg_eval(dataset_shortname='ttt', alg_n=4, threshold_intersec=0.7)
avg_m_42 = avg_metrics(metrics)  # per-metric mean over the folds
print_metrics(avg_m_42)

# 7. Results

In [None]:
# One row per algorithm (1-4) for the 'hv' dataset; columns follow the order
# accuracy, ROC AUC, precision, recall, randomness
avg_metrics_hv = np.vstack([avg_m_11, avg_m_21,avg_m_31,avg_m_41])
print(avg_metrics_hv)

Best: **Algorithm 4**  
Algorithm runtime: 0:00:01.578181  
Accuracy score: 0.9139  
Roc AUC Score: 0.9173  
Precision Score: 0.9779  
Recall Score: 0.8664  
Randomness of prediction: 0.0000

In [None]:
# One row per algorithm (1-4) for the 'ttt' dataset; columns follow the order
# accuracy, ROC AUC, precision, recall, randomness
avg_metrics_ttt = np.vstack([avg_m_12, avg_m_22,avg_m_32,avg_m_42])
print(avg_metrics_ttt)

Best: **Algorithm 2**  
Algorithm runtime: 0:00:05.664447  
Accuracy score: 0.9906  
Roc AUC Score: 0.9873  
Precision Score: 0.9853  
Recall Score: 1.0000  
Randomness of prediction: 0.0042

**Comparing to Bernoulli Naive Bayes**

In [None]:
# Bernoulli Naive Bayes baseline on the stored folds of the 'hv' dataset
y_pred, y_real, metrics = bernoulliNB(dataset_shortname='hv')
avg_m_bNB_hv = avg_metrics(metrics)  # per-metric mean over the folds
print_metrics(avg_m_bNB_hv)

In [None]:
# Bernoulli Naive Bayes baseline on the stored folds of the 'ttt' dataset
y_pred, y_real, metrics = bernoulliNB(dataset_shortname='ttt')
avg_m_bNB_ttt = avg_metrics(metrics)  # per-metric mean over the folds
print_metrics(avg_m_bNB_ttt)

**Histogram plots**

In [None]:
# Append the Bernoulli Naive Bayes row below the four algorithm rows of each dataset
avg_metrics_ttt_full = np.vstack([avg_metrics_ttt, avg_m_bNB_ttt])
avg_metrics_hv_full = np.vstack([avg_metrics_hv, avg_m_bNB_hv])

In [None]:
def hist_plot(data, name):
    """Draw a grouped bar chart of the averaged metrics of every algorithm.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with one row per algorithm and one column per metric
    name : str
        Title of the plot
    """
    n_rows, n_cols = data.shape
    x = np.arange(n_rows)
    # Horizontal offset of every metric's bar inside one algorithm group.
    dx = (np.arange(n_cols)-n_cols/2.)/(n_cols+2.)
    d = 1./(n_cols+2.)

    labels = ['Alg1','Alg2','Alg3','Alg4','bNB']
    # One legend entry per plotted column, so the randomness bars are
    # labelled as well.
    metric_names = ['Accuracy score', 'ROC AUC', 'Precision', 'Recall', 'Prediction Randomness']

    _, ax = plt.subplots()
    for i in range(n_cols):
        ax.bar(x+dx[i], data[:,i], width=d)
    # Tick positions follow the number of rows actually present in the data.
    plt.xticks(x[:len(labels)], labels[:n_rows])
    plt.title(name)
    plt.legend(metric_names[:n_cols], bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    plt.grid()
    plt.show()

In [None]:
# Grouped bar chart of algorithms 1-4 and Bernoulli NB on the 'ttt' dataset
hist_plot(avg_metrics_ttt_full, name= 'tic tac toe dataset')

In [None]:
# Grouped bar chart of algorithms 1-4 and Bernoulli NB on the 'hv' dataset
hist_plot(avg_metrics_hv_full, name= 'house votes dataset')