# Panteleeva Svetlana

## Importing dataset

https://archive.ics.uci.edu/ml/datasets/Tic-Tac-Toe+Endgame

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('tic-tac-toe.csv', sep=',')
df

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive
...,...,...,...,...,...,...,...,...,...,...
953,o,x,x,x,o,o,o,x,x,negative
954,o,x,o,x,x,o,x,o,x,negative
955,o,x,o,x,o,x,x,o,x,negative
956,o,x,o,o,x,x,x,o,x,negative


In [3]:
df.describe()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10
count,958,958,958,958,958,958,958,958,958,958
unique,3,3,3,3,3,3,3,3,3,2
top,x,x,x,x,x,x,x,x,x,positive
freq,418,378,418,378,458,378,418,378,418,626


## 1.2 Подготовка данных
1) преобразуем категориальные данные в бинарные

2) cross_validation

3) positive, negative в 1 и 0

4) отделяем последняю колонку - target feature

5) найдем данные при которых target positive и target negative - plus context и minus context 

In [4]:
import copy
def dummy_encode_categorical_columns(data):
    result_data = copy.deepcopy(data)
    for column in data.columns.values:
        result_data = pd.concat([result_data, pd.get_dummies(result_data[column], prefix = column, prefix_sep = ': ')], axis = 1)
        del result_data[column]
    return result_data
df_bool = dummy_encode_categorical_columns(df)
# print(df_bool.head())
# df_bool.describe()

### Cross Validation
Makes cross validation of dataset from selected path in N splits and saves to selected path. 

In [5]:
from sklearn import model_selection
import sys

def cross_validation(path_in, path_out, n_splits):
    
    df_prep = path_in
    kf = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=None)
    kf.get_n_splits(df_prep)
    k = 1
    
    for train_index, test_index in kf.split(df_prep):
        df_prep.iloc[train_index].to_csv('_train_'+str(k)+'.csv',index=False)
        df_prep.iloc[test_index].to_csv('_test_'+str(k)+'.csv',index=False)
        k += 1
        
    return 'Cross Validation Completed'

In [6]:
def devide_x_y_and_make_bool_y(name):
    df = name #pd.read_csv(name)#, sep=',')
    df = df.replace(to_replace='positive', value=1)
    df = df.replace(to_replace='negative', value=0)
    y = np.array(df['V10: positive'], df['V10: negative'])
    del df['V10: positive']
    del df['V10: negative']
    bin_df = dummy_encode_categorical_columns(df)
    return np.array(bin_df).astype(int), y
    

In [7]:
def data_preprocessing(path_train, path_test):
    train = pd.read_csv(path_train)
    test = pd.read_csv(path_test)
        
    X_train, y_train = devide_x_y_and_make_bool_y(train)
    X_test, y_test = devide_x_y_and_make_bool_y(test)
#         print(X_train, y_train)
        
    X_train_pos = X_train[y_train == 1] #plus_context
    X_train_neg = X_train[y_train == 0] #minus_context
#         print(X_train_pos, X_train_neg)

    return X_train_pos, X_train_neg, X_train, y_train, X_test, y_test

In [8]:
#Функция которая из массивов делает sets из строк (), каждой строкой является "название колонки и элемент (1 или 0)"   
attrib_names = list(df_bool)

def make_set(example):
    return set([i+':'+str(k) for i, k in zip(attrib_names,example)])

### Алгоритм 0 (Very simple)

In [9]:
def alg_0(x_test, x_train_pos, x_train_neg):
    y_pred = []
    for test_obj in x_test:
        pos = np.sum(test_obj == x_train_pos) / float(len(x_train_pos))
        neg = np.sum(test_obj == x_train_neg) / float(len(x_train_neg))
        if (pos > neg):
            y_pred.append(1)
        else:
            y_pred.append(0)
    return y_pred

### Алгоритм 1

In [10]:
import random

def alg_1(x_test, x_train_pos, x_train_neg, threshold):
    
    y_pred = []
    clas = 0
    
    for el in x_test:
        labels = {'positive':0, 'negative':0}
        x_test_dict = make_set(el)
        
        for el_pos in x_train_pos:
            pos_dict = make_set(el_pos)
            intersection = pos_dict & x_test_dict
            neg_intersect = [make_set(i) for i in x_train_neg if make_set(i).issuperset(intersection)] #генератор списка:
            if len(neg_intersect) > threshold: #порог
                labels["positive"] = 0
                break
        for el_neg in x_train_neg:
            neg_string = make_set(el_neg)
            intersection = neg_string & x_test_dict
            pos_intersect = [make_set(i) for i in x_train_pos if make_set(i).issuperset(intersection)]
            if len(pos_intersect) > threshold:
                labels["negative"] = 0
                break
    
        if labels["positive"] == labels["negative"]:
            clas = random.choice([0,1])
        if labels["positive"] > labels["negative"]:
            clas = 1
        if labels["positive"] < labels["negative"]:
            clas = 0
        y_pred.append(clas)
        
    return y_pred

### Алгоритм 2

In [11]:
def alg_2(x_test, x_train_pos, x_train_neg, threshold_subset, threshold_intersec):
    
    norm_plus = (len(x_train_pos)+len(x_train_neg))//len(x_train_pos)
    norm_minus = (len(x_train_neg)+len(x_train_neg))//len(x_train_neg)
    
    import warnings
    warnings.filterwarnings('ignore')
    
    y_pred = []
    for el in x_test:
        labels = {'plus':0, 'minus':0}
        x_test_set = make_set(el)
        
        for el_plus in  x_train_pos:
            counter_plus = 0 
            x_train_plus_set = make_set(el_plus)
            intersec_plus = x_test_set & x_train_plus_set
            intersec_plus_len_norm = len(intersec_plus)/len(x_test_set)
            
            if intersec_plus_len_norm >= threshold_intersec: #в процентах
                for el_minus in x_train_neg,:
                    x_train_minus_set = make_set(el_minus)
                    if x_train_minus_set.issuperset(intersec_plus):#intersec_plus <= x_train_minus_set: 
                        counter_plus += 1
                if counter_plus <= threshold_subset: #в числах
                    labels['plus'] += 1
                    
                    
        for ent_minus in x_train_neg,:
            counter_minus = 0
            x_train_neg_set = make_set(ent_minus)
            intersec_minus = x_test_set & x_train_neg_set
            intersec_minus_len_norm = len(intersec_minus)/len(x_test_set)
            if intersec_minus_len_norm >= threshold_intersec:
                for ent_plus in  x_train_pos:
                    x_train_pos_set = make_set(ent_plus)
                    if intersec_minus <= x_train_pos_set: 
                        counter_minus += 1
                if counter_minus <= threshold_subset:
                    labels['minus'] += 1
        
        labels['plus'] = labels['plus']*norm_plus
        labels['minus'] = labels['minus']*norm_minus
        if labels['plus'] > labels['minus']:
            clas = 1
        elif labels['plus'] < labels['minus']:
            clas = 0
        else:
            clas = random.choice([0,1])
        y_pred.append(clas)

    
    return y_pred

### Оценка метрик качества (Metrics evaluation)

In [12]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

In [13]:
def metrics_test(y_test, y_pred):
    
    y_test = np.array(y_test)
    y_pred = np.array(y_pred)
    
    TP = np.sum(y_test * y_pred)
    TN = np.sum(y_test + y_pred == 0)
    FP = np.sum((y_test  == 0) * (y_pred == 1))
    FN = np.sum((y_test  == 1) * (y_pred == 0))
    TPR = float(TP) / np.sum(y_test == 1)
    TNR = float(TN) / np.sum(y_test == 0)
    FPR = float(FP) / (TP + FN)
    NPV = float(TN) / (TN + FN)
    FDR = float(FP) / (TP + FP)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    
    return TP, TN, FP, FN, TPR, TNR, FPR, NPV, FDR, acc, prec, rec, roc_auc

In [14]:
def print_metrics(metrics):
    
    print('True Positive: {:.4f}'.format(metrics[0]))
    print('True Negative: {:.4f}'.format(metrics[1]))
    print('False Positive: {:.4f}'.format(metrics[2]))
    print('False Negativee: {:.4f}'.format(metrics[3]))
    print('True Positive Rate: {:.4f}'.format(metrics[4]))
    print('True Negative Rate: {:.4f}'.format(metrics[5]))
    print('False Positive Rate: {:.4f}'.format(metrics[6]))
    print('Negative Predictive Value: {:.4f}'.format(metrics[7]))
    print('False Discovery Rate: {:.4f}'.format(metrics[8]))
    print('Accuracy score: {:.4f}'.format(metrics[9]))
    print('Precision Score: {:.4f}'.format(metrics[10]))
    print('Recall Score: {:.4f}'.format(metrics[11]))
    print('Roc AUC Score: {:.4f}'.format(metrics[3]))
    

## Launching and evaluating algorithms

In [15]:
from tqdm.notebook import tqdm as tq

def main(alg_n, n_splits):
    
#     # time on
    import timeit
    start = timeit.default_timer()
    
    y_pred_arr = []
    # 1. step - Cross Validation
    path_out = './'
    cross_validation(path_in=df_bool,
                        path_out = path_out,
                        n_splits=n_splits)
    metrics = []
    # 2. step - Data Preprocessing
    for i in tq(range(1,n_splits+1)):
        path_train = r'_train_'+str(i)+r'.csv' 
        path_test = r'_test_'+str(i)+r'.csv'
        X_train_pos, X_train_neg, X_train, y_train, X_test, y_test = data_preprocessing(path_train = path_train,
                                                                                        path_test = path_test)
        # 3. step - Launching algorithms
        if alg_n == 0:
            y_pred = alg_0(X_test, X_train_pos, X_train_neg)
            
        if alg_n == 1:
            y_pred = alg_1(X_test, X_train_pos, X_train_neg, threshold=0)
            
        if alg_n == 2:
            y_pred = alg_2(X_test, X_train_pos, X_train_neg, threshold_subset= 1, threshold_intersec=0.7)
        
        # 4. step - Evaluating algorithms
        metrics_ = []
        TP, TN, FP, FN, TPR, TNR, FPR, NPV, FDR, acc, prec, rec, roc_auc = metrics_test(y_test=y_test, y_pred=y_pred)
        for metric in [TP, TN, FP, FN, TPR, TNR, FPR, NPV, FDR, acc, prec, rec, roc_auc]:
            metrics_.append(metric)
        metrics.append(metrics_)

    avg_metrics = np.mean(np.array(metrics), axis=0)
    
#     # time off    
    stop = timeit.default_timer()
    time = stop - start
    print ('Algorithm runtime: {}'.format(time))
    return avg_metrics

In [29]:
# Algoritm 0 
metrics = main(alg_n=0, n_splits = 4)
print_metrics(metrics)

A Jupyter Widget


Algorithm runtime: 1.5760992000000442
True Positive: 101.0000
True Negative: 57.5000
False Positive: 25.5000
False Negativee: 55.5000
True Positive Rate: 0.6451
True Negative Rate: 0.6928
False Positive Rate: 0.1632
Negative Predictive Value: 0.5084
False Discovery Rate: 0.2015
Accuracy score: 0.6618
Precision Score: 0.7985
Recall Score: 0.6451
Roc AUC Score: 0.6690


In [16]:
# Algoritm 1 
metrics = main(alg_n=1, n_splits = 4)
print_metrics(metrics)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))


Algorithm runtime: 98.49571789999999
True Positive: 83.5000
True Negative: 40.7500
False Positive: 42.2500
False Negativee: 73.0000
True Positive Rate: 0.5336
True Negative Rate: 0.4930
False Positive Rate: 0.2721
Negative Predictive Value: 0.3590
False Discovery Rate: 0.3357
Accuracy score: 0.5188
Precision Score: 0.6643
Recall Score: 0.5336
Roc AUC Score: 73.0000


In [24]:
# Algoritm 2 
metrics = main(alg_n=2, n_splits = 4)
print_metrics(metrics)

A Jupyter Widget


Algorithm runtime: 831.2897407
True Positive: 156.5000
True Negative: 0.0000
False Positive: 83.0000
False Negativee: 0.0000
True Positive Rate: 1.0000
True Negative Rate: 0.0000
False Positive Rate: 0.5338
Negative Predictive Value: nan
False Discovery Rate: 0.3466
Accuracy score: 0.6534
Precision Score: 0.6534
Recall Score: 1.0000
Roc AUC Score: 0.5000
