# Connecting google drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.chdir("drive/My Drive/KNN_application")

FileNotFoundError: [Errno 2] No such file or directory: 'drive/My Drive/KNN_application'

# Processing dataset for KNN and GVG-CNC
We will be processing monk1, monk2, monk3, balance-scale, tic-tac-toe, car_evaluation, kr-vs-kp datasets.

In [None]:
# use a dictionary to store all the dataset
total_dataset = dict()

## Data preprocessing -- Monk1 to Monk3

In [None]:
import os
import numpy as np
import pandas as pd

try:
    root
except NameError:
    root = os.getcwd()

In [None]:
# change directory and inspect existing files
os.chdir(os.path.join(root, "monk"))
os.listdir()

FileNotFoundError: [Errno 2] No such file or directory: '/content/monk'

In [None]:
# explore data structure

df = pd.read_csv('monks-2.train', delimiter=' ')
df.head()

# need to remove the first and last column
# column "1" is label, 1.1 to 1.5 are features

Unnamed: 0.1,Unnamed: 0,0,1,1.1,1.2,1.3,2,2.1,data_4
0,,0,1,1,1,1,4,1,data_7
1,,0,1,1,1,2,1,1,data_9
2,,0,1,1,1,2,1,2,data_10
3,,0,1,1,1,2,2,1,data_11
4,,0,1,1,1,2,3,1,data_13


In [None]:
# read all monk datasets

names = [ 'monks-1.test',
 'monks-1.train',
 'monks-2.test',
 'monks-2.train',
 'monks-3.test',
 'monks-3.train']

# create a dictionary to store all the monk datasets
monk_dict = dict(zip(names, names))


# rename the variable names to a til f, then define a1-a5, ..., f1-f5 as dummy variables
for name in names:
    df = pd.read_csv(name, delimiter=' ')
    df = df.iloc[:, range(1, 8)]
    df.columns= ['label', 'a', 'b', 'c', 'd', 'e', 'f']
    df['a1'] = df['a'] == 1
    df['a2'] = df['a'] == 2
    df['a3'] = df['a'] == 3
    df['a4'] = df['a'] == 4
    df['a5'] = df['a'] == 5

    df['b1'] = df['b'] == 1
    df['b2'] = df['b'] == 2
    df['b3'] = df['b'] == 3
    df['b4'] = df['b'] == 4
    df['b5'] = df['b'] == 5

    df['c1'] = df['c'] == 1
    df['c2'] = df['c'] == 2
    df['c3'] = df['c'] == 3
    df['c4'] = df['c'] == 4
    df['c5'] = df['c'] == 5

    df['d1'] = df['d'] == 1
    df['d2'] = df['d'] == 2
    df['d3'] = df['d'] == 3
    df['d4'] = df['d'] == 4
    df['d5'] = df['d'] == 5

    df['e1'] = df['e'] == 1
    df['e2'] = df['e'] == 2
    df['e3'] = df['e'] == 3
    df['e4'] = df['e'] == 4
    df['e5'] = df['e'] == 5

    df['f1'] = df['f'] == 1
    df['f2'] = df['f'] == 2
    df['f3'] = df['f'] == 3
    df['f4'] = df['f'] == 4
    df['f5'] = df['f'] == 5

    new_columns = "label a1 a2 a3 a4 a5 b1 b2 b3 b4 b5 c1 c2 c3 c4 c5 d1 d2 d3 d4 d5 e1 e2 e3 e4 e5 f1 f2 f3 f4 f5".split(' ')
    new_columns = "label a1 a2 b1 b2 c1 d1 d2 e1 e2 e3 f1".split(' ')
    df = df[new_columns]
    monk_dict[name] = df

# # append monks into total dataset
# total_dataset |= monk_dict

In [None]:
# inspect the dimension of dummy-variable-ized dataset
for name in names:
    n_instances, n_features_ = monk_dict[name].shape
#     print(monk_dict[name].sum())
    print(f"{name} has {n_instances} instances and {n_features_-1} features.")

monks-1.test has 431 instances and 11 features.
monks-1.train has 123 instances and 11 features.
monks-2.test has 431 instances and 11 features.
monks-2.train has 168 instances and 11 features.
monks-3.test has 431 instances and 11 features.
monks-3.train has 121 instances and 11 features.


In [None]:
# combine the train and test dataset together

monk1 = pd.concat([monk_dict['monks-1.train'], monk_dict['monks-1.test']], axis=0)
monk2 = pd.concat([monk_dict['monks-2.train'], monk_dict['monks-2.test']], axis=0)
monk3 = pd.concat([monk_dict['monks-3.train'], monk_dict['monks-3.test']], axis=0)


monk1.index= range(monk1.shape[0])
monk2.index= range(monk2.shape[0])
monk3.index= range(monk3.shape[0])

In [None]:
# store them into total dataset
total_dataset['monk1'] = monk1
total_dataset['monk2'] = monk2
total_dataset['monk3'] = monk3


## Data Preprocessing -- Balance_scale

In [None]:
# change directory and inspect files

os.chdir(os.path.join(root, "balance_scale"))
os.listdir()

['balance-scale.data', 'balance-scale.names', 'Index']

In [None]:
# inspect data structure

balance_scale = pd.read_csv("balance-scale.data")
balance_scale.head()

Unnamed: 0,B,1,1.1,1.2,1.3
0,R,1,1,1,2
1,R,1,1,1,3
2,R,1,1,1,4
3,R,1,1,1,5
4,R,1,1,2,1


In [None]:
# observe possible values for each variable
df = balance_scale
df.columns=['label', 'a', 'b', 'c', 'd']
for col in df.columns:
    print(col, df[col].unique())

label ['R' 'L' 'B']
a [1 2 3 4 5]
b [1 2 3 4 5]
c [1 2 3 4 5]
d [2 3 4 5 1]


In [None]:
# create dummy variables named a1-a5, ..., d1-d5

df['a1'] = df['a'] == 1
df['a2'] = df['a'] == 2
df['a3'] = df['a'] == 3
df['a4'] = df['a'] == 4
df['a5'] = df['a'] == 5

df['b1'] = df['b'] == 1
df['b2'] = df['b'] == 2
df['b3'] = df['b'] == 3
df['b4'] = df['b'] == 4
df['b5'] = df['b'] == 5

df['c1'] = df['c'] == 1
df['c2'] = df['c'] == 2
df['c3'] = df['c'] == 3
df['c4'] = df['c'] == 4
df['c5'] = df['c'] == 5

df['d1'] = df['d'] == 1
df['d2'] = df['d'] == 2
df['d3'] = df['d'] == 3
df['d4'] = df['d'] == 4
df['d5'] = df['d'] == 5

In [None]:
# classes are "Balanced" "Left" and "Right"
set(df["label"])

{'B', 'L', 'R'}

In [None]:
# select only newly created columns as a new dataframe
new_columns= "label a1 a2 a3 a4 b1 b2 b3 b4 c1 c2 c3 c4 d1 d2 d3 d4".split(" ")
df = df[new_columns]

KeyError: "['a1', 'a2', 'a3', 'a4', 'b1', 'b2', 'b3', 'b4', 'c1', 'c2', 'c3', 'c4', 'd1', 'd2', 'd3', 'd4'] not in index"

In [None]:
# create a dataset, where target is such that "B" (stands for Balanced) ->1 and other case->0
balance_scale_B = df.replace({'B':1, 'L':0, 'R':0})

In [None]:
# create a dataset, where target is such that "L" (stands for Left) ->1 and other case->0
balance_scale_L = df.replace({'B':0, 'L':1, 'R':0})

In [None]:
# store them into total dataset
total_dataset |= dict(balance_scale_B=balance_scale_B)
total_dataset |= dict(balance_scale_L=balance_scale_L)

## Data preprocessing -- tic-tac-toe

In [None]:
# change directory and inspect files
os.chdir(os.path.join(root, "tic+tac+toe+endgame"))
os.listdir()

['Index', 'tic-tac-toe.data', 'tic-tac-toe.names']

In [None]:
# read and inspect data
df = pd.read_csv("tic-tac-toe.data")
df

In [None]:
# give columns meaningful names
df = df[['positive', 'x', 'x.1', 'x.2', 'x.3', 'x.4', 'o', 'o.1', 'o.2', 'o.3']]
tic = df
df.columns = ['label', '1', '2', '3', '4', '5', '6', '7', '8', '9']
# check and found out that feature is not binary
set(df['1'])

KeyError: "None of [Index(['positive', 'x', 'x.1', 'x.2', 'x.3', 'x.4', 'o', 'o.1', 'o.2', 'o.3'], dtype='object')] are in the [columns]"

In [None]:
# transform features into binary. Knowing that each original feature represent a configuration in a cell out of 3x3 grid.
# which can either take value 'o' or 'x' or 'nothing'.

if df['label'][0] in ['positive', 'negative']:
    df['label'] = df['label'] == 'positive'

df['x1'] = df['1'] == 'x'
df['x2'] = df['2'] == 'x'
df['x3'] = df['3'] == 'x'

df['x4'] = df['4'] == 'x'
df['x5'] = df['5'] == 'x'
df['x6'] = df['6'] == 'x'

df['x7'] = df['7'] == 'x'
df['x8'] = df['8'] == 'x'
df['x9'] = df['9'] == 'x'


df['o1'] = df['1'] == 'o'
df['o2'] = df['2'] == 'o'
df['o3'] = df['3'] == 'o'

df['o4'] = df['4'] == 'o'
df['o5'] = df['5'] == 'o'
df['o6'] = df['6'] == 'o'

df['o7'] = df['7'] == 'o'
df['o8'] = df['8'] == 'o'
df['o9'] = df['9'] == 'o'

KeyError: '1'

In [None]:
# select newly created variable and store into total dataset
tic_bin = df[['label', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'o1', 'o2', 'o3', 'o4', 'o5', 'o6', 'o7', 'o8', 'o9']]
total_dataset |= dict(tic_bin=tic_bin)

KeyError: "['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'o1', 'o2', 'o3', 'o4', 'o5', 'o6', 'o7', 'o8', 'o9'] not in index"

## Data preprocessing -- car_evaluation

In [None]:
# change directory and inspect files

os.chdir(os.path.join(root, 'car+evaluation'))
os.listdir()

['car.c45-names', 'car.data', 'car.names']

In [None]:
# read and give meaningful names to each column
df = pd.read_csv('car.data')
df.columns=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'label']
df = df[['label', 'buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']]

In [None]:
# See for each variable, what are the possible values.
for col in df.columns:
    print(col, df[col].unique())

label ['unacc' 'acc' 'vgood' 'good']
buying ['vhigh' 'high' 'med' 'low']
maint ['vhigh' 'high' 'med' 'low']
doors ['2' '3' '4' '5more']
persons ['2' '4' 'more']
lug_boot ['small' 'med' 'big']
safety ['med' 'high' 'low']


In [None]:
# Give each variable numerical encodings since they are all ordinal variables
df['label'].replace({"unacc":0, "acc":1, "vgood":2, "good":3}, inplace=True)
df['buying'].replace({'vhigh':3, 'high':2, 'med':1, 'low':0}, inplace=True)
df['maint'].replace({'vhigh':3, 'high':2, 'med':1, 'low':0}, inplace=True)
df['doors'].replace({'2':2, '3':3, '4':4, '5more':5}, inplace=True)
df['persons'].replace({'2':2, '4':4, 'more':5}, inplace=True)
df['lug_boot'].replace({'small':0, 'med':1, 'big':2}, inplace=True)
df['safety'].replace({'med':1, 'high':2, 'low':0}, inplace=True)
None

In [None]:
# create binary variables based on different thresholds
df['labelvgood'] = df['label'] == 3
df['labelgood'] = df['label'] >= 2
df['labelacc'] = df['label'] >= 1

df['buyingvhigh'] = df['buying'] == 3
df['buyinghigh'] = df['buying'] >= 2
df['buyingmed'] = df['buying'] >= 1

df['maintvhigh'] = df['maint'] == 3
df['mainthigh'] = df['maint'] >= 2
df['maintmed'] = df['maint'] >= 1

df["doors5"] = df['doors'] >= 5
df["doors4"] = df['doors'] >= 4
df["doors3"] = df['doors'] >= 3

df["persons5"] = df['persons']>=5
df["persons4"] = df['persons']>=4

df["lug_boot2"] = df["lug_boot"] >=2
df["lug_boot1"] = df["lug_boot"] >=1

df["safety2"] = df["safety"] >=2
df["safety1"] = df["safety"] >=1

In [None]:
# select newly created features and make 3 datasets based on target labels
features = ['buyingvhigh', 'buyinghigh', 'buyingmed',
            'maintvhigh', 'mainthigh', 'maintmed',
            "doors5", "doors4", "doors3",
            "persons5", "persons4",
            "lug_boot2", "lug_boot1",
            "safety2", "safety1"]
car_evaluation_vgood = df[["labelvgood"]+features]
car_evaluation_good = df[["labelgood"]+features]
car_evaluation_acc = df[["labelacc"]+features]

In [None]:
# store them into total dataset
total_dataset |= dict(car_evaluation_vgood=car_evaluation_vgood,
                     car_evaluation_good=car_evaluation_good,
                     car_evaluation_acc=car_evaluation_acc)

## Data preprocessing -- kr-vs-kp

In [None]:
# change directory and inspect files
os.chdir(root, "kr-vs-kp")
os.listdir()

['.ipynb_checkpoints',
 'balance_scale',
 'car+evaluation',
 'KNN_data_preprocessing.ipynb',
 'kr-vs-kp_csv.csv',
 'monk',
 'monk1.csv',
 'monk1_graph.csv',
 'monk1_graph.txt',
 'monk1_phifile.txt',
 'monk1_solution.txt',
 'monk1_v_predefined.txt',
 'monk1_weight.csv',
 'monk1_weight.txt',
 'output_solution.txt',
 'output_table.txt',
 'PRINT_IVAN_LP.lp',
 'Project1.exe',
 'run_and_analysis.ipynb',
 'tic+tac+toe+endgame']

In [None]:
# read and insepct possible values for each columns to see if they are already binary, and what are current values
df = pd.read_csv("kr-vs-kp_csv.csv")
for col in df.columns:
    print(col)
    print(df[col].unique())

df['katri_n'] = df['katri'] == 'n'
df['katri_b'] = df['katri'] == 'b' # binary-ize all features

bkblk
['f' 't']
bknwy
['f' 't']
bkon8
['f' 't']
bkona
['f' 't']
bkspr
['f' 't']
bkxbq
['f' 't']
bkxcr
['f' 't']
bkxwp
['f' 't']
blxwp
['f' 't']
bxqsq
['f' 't']
cntxt
['f' 't']
dsopp
['f' 't']
dwipd
['l' 'g']
hdchk
['f' 't']
katri
['n' 'w' 'b']
mulch
['f' 't']
qxmsq
['f' 't']
r2ar8
['t' 'f']
reskd
['f' 't']
reskr
['f' 't']
rimmx
['f' 't']
rkxwp
['f' 't']
rxmsq
['f' 't']
simpl
['f' 't']
skach
['f' 't']
skewr
['t' 'f']
skrxp
['f' 't']
spcop
['f' 't']
stlmt
['f' 't']
thrsk
['f' 't']
wkcti
['f' 't']
wkna8
['f' 't']
wknck
['f' 't']
wkovl
['t' 'f']
wkpos
['t' 'f']
wtoeg
['n' 't']
class
['won' 'nowin']


In [None]:
# reorder the columns so that the label is the first (index 0) appearing.
new_columns = ['class', 'bkblk', 'bknwy', 'bkon8', 'bkona', 'bkspr', 'bkxbq', 'bkxcr', 'bkxwp', 'blxwp', 'bxqsq', 'cntxt', 'dsopp', 'dwipd', 'hdchk', 'mulch', 'qxmsq', 'r2ar8', 'reskd', 'reskr', 'rimmx', 'rkxwp', 'rxmsq', 'simpl', 'skach', 'skewr', 'skrxp', 'spcop', 'stlmt', 'thrsk', 'wkcti', 'wkna8', 'wknck', 'wkovl', 'wkpos', 'wtoeg', 'katri_n', 'katri_b']
df = df[new_columns] # reorder

In [None]:
# rename the features to be 0 1 while making sure each column is treaty correctly
df.replace({"won":1, "nowin":0, "t":0, "f":1}, inplace=True)
df.replace({"l":0, "g":1, "n":1}, inplace=True)

df.columns = ['label'] + list(df.columns)[1:]

In [None]:
# store it into total dataset
total_dataset['kr-vs-kp'] = df

## Getting statistics for each dataset

In [None]:
dimensions = dict()
for key, df in total_dataset.items():
    dimensions[key] = df.shape
dimensions

{'monk1': (554, 12),
 'monk2': (599, 12),
 'monk3': (552, 12),
 'balance_scale_B': (624, 17),
 'balance_scale_L': (624, 17),
 'tic_bin': (957, 19),
 'car_evaluation_vgood': (1727, 16),
 'car_evaluation_good': (1727, 16),
 'car_evaluation_acc': (1727, 16),
 'kr-vs-kp': (3196, 38)}

## Write the data down

In [None]:
# create a new directory (if not already existing) called df (stands for dataframe)
os.chdir(root)
if "df" not in os.listdir():
    os.mkdir("df")

In [None]:
# write down data in csv format
for (name, df) in total_dataset.items():
    df.to_csv(os.path.join(root, "df", name+".csv"))

# Generating data for C++ code
Need phi_file, v_predefined_file and w_file
## phi_file
For telling who are friends and who are enemies. K-nearest are friends and K' farthest are enemies.

In [None]:
# define the distance between instances, i.e. two given rows. Euclidien 2-distance squared is used.
def distance(df, i, j):
        dff = df.astype(int)
        return sum((dff.iloc[i, 1:] - dff.iloc[j, 1:])**2)

# For the instance in i-th row, calculate the distance of it with all the other instances.
# Reorder the distances and pick the K-first and K-last instances. If equality, pick all the instances of the same distance.
def find_K_neighbors(df, i, K=3):
    distances = [(j, distance(df, i, j)) for j in range(df.shape[0])] # attention : (i, 0) will be inside
    distances.sort(key=lambda x: x[1])

    # create a dictionary to count : how many instances, j, are at a given distance (to the anchor instance i) ?
    counter = dict()
    for (j, d) in distances:
        try:
            counter[d] += 1
        except KeyError:
            counter[d] = 1

    # find nearest neighbors
    # pick a distance d_from0 such that (j, d) with d<d_from0 counts more than K
    d_from0 = 0 # we pick d<d_thr
    count = 0
    while count < K+1: # (i, 0) are in and we want to exclude this
        try:
            count += counter[d_from0]
        except KeyError:
            pass
        d_from0 += 1


    # find farthest neighbors
    # pick a distance d_tomax such that (j, d) with d > d_tomax counts more than K
    d_tomax = max(d for (j, d) in distances)
    count = 0
    while count < K:
        try:
            count += counter[d_tomax]
        except KeyError:
            pass
        d_tomax -= 1


    # return a list, in order of index number, of whether it is among K-nearest (1), or among K-farthest(2), or neither case(0).
    phidata_list = []
    for (j, d) in sorted(distances, key=lambda x:x[0]):
#     for (j, d) in sorted(distances, key=lambda x:x[1]):
        if j == i:
            phidata_list.append(0) # no link with itself
        elif d < d_from0:
            phidata_list.append(1) # 1 for phiup
        elif d > d_tomax:
            phidata_list.append(2) # 2 for phidown
        else:
            phidata_list.append(0) # 0 for no link

    return phidata_list



# find_K_neighbors(total_dataset['monk1'], 0, K = 3)

In [None]:
from time import time

# run the find_K_neighbors n times (n = df.shape[0] is the number of rows). Store the lists into a matrix, saved in csv.
def create_phifile(df, filename, K=3):
    begin = time()
    phifile = pd.DataFrame(index=range(df.shape[0]), columns=range(df.shape[0]))
    for i in range(df.shape[0]):
        phidata_list = find_K_neighbors(df, i, K)
        phifile.iloc[i, :] = np.array(phidata_list)
        if i%10 == 1:
            current = time()
            print(f"{current-begin} sec passed, {i} instances processed, {df.shape[0]-i} instances left, ETA={(df.shape[0]-i)*(current-begin)/i} secs")


    phifile.to_csv(filename, header=False, index=False, sep=' ')
    return phifile

phifile = create_phifile(total_dataset['monk1'], "monk1_phifile.csv")

0.2380380630493164 sec passed, 1 instances processed, 553 instances left, ETA=131.63504886627197 secs
1.3080434799194336 sec passed, 11 instances processed, 543 instances left, ETA=64.5697826905684 secs
2.4350509643554688 sec passed, 21 instances processed, 533 instances left, ETA=61.80391257149832 secs
3.563039541244507 sec passed, 31 instances processed, 523 instances left, ETA=60.11192516357668 secs
4.69512152671814 sec passed, 41 instances processed, 513 instances left, ETA=58.74627666357087 secs
5.8245017528533936 sec passed, 51 instances processed, 503 instances left, ETA=57.445576111475624 secs
6.956801891326904 sec passed, 61 instances processed, 493 instances left, ETA=56.22464479383875 secs
8.060261487960815 sec passed, 71 instances processed, 483 instances left, ETA=54.83248308007146 secs
9.189079761505127 sec passed, 81 instances processed, 473 instances left, ETA=53.65968799002377 secs
10.29706859588623 sec passed, 91 instances processed, 463 instances left, ETA=52.3905797

In [None]:
create_phifile(total_dataset['monk2'], "monk2_phifile.csv")

0.24704742431640625 sec passed, 1 instances processed, 598 instances left, ETA=147.73435974121094 secs
1.4256434440612793 sec passed, 11 instances processed, 588 instances left, ETA=76.20712228254838 secs
2.6057217121124268 sec passed, 21 instances processed, 578 instances left, ETA=71.71938807623727 secs
3.8166489601135254 sec passed, 31 instances processed, 568 instances left, ETA=69.93085836595104 secs
5.012679100036621 sec passed, 41 instances processed, 558 instances left, ETA=68.22133994683986 secs
6.219496011734009 sec passed, 51 instances processed, 548 instances left, ETA=66.82909440059288 secs
7.414624929428101 sec passed, 61 instances processed, 538 instances left, ETA=65.39456085298882 secs
8.622133731842041 sec passed, 71 instances processed, 528 instances left, ETA=64.1195297241211 secs
9.847583293914795 sec passed, 81 instances processed, 518 instances left, ETA=62.97590304009708 secs
11.077240705490112 sec passed, 91 instances processed, 508 instances left, ETA=61.83778

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,589,590,591,592,593,594,595,596,597,598
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594,0,2,0,0,2,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
595,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
596,0,2,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
597,0,0,2,0,0,0,0,0,0,0,...,1,0,1,0,1,0,1,0,0,1


In [None]:
create_phifile(total_dataset['monk3'], "monk3_phifile.csv")

0.25008726119995117 sec passed, 1 instances processed, 551 instances left, ETA=137.7980809211731 secs
1.4959774017333984 sec passed, 11 instances processed, 541 instances left, ETA=73.57488857616077 secs
2.7879772186279297 sec passed, 21 instances processed, 531 instances left, ETA=70.49599538530622 secs
3.9833967685699463 sec passed, 31 instances processed, 521 instances left, ETA=66.94676504596588 secs
5.189645528793335 sec passed, 41 instances processed, 511 instances left, ETA=64.68070402959498 secs
6.371550798416138 sec passed, 51 instances processed, 501 instances left, ETA=62.59111666679382 secs
7.53463077545166 sec passed, 61 instances processed, 491 instances left, ETA=60.64760181552074 secs
8.701757431030273 sec passed, 71 instances processed, 481 instances left, ETA=58.95134259613467 secs
9.868827104568481 sec passed, 81 instances processed, 471 instances left, ETA=57.3854020524908 secs
11.030409574508667 sec passed, 91 instances processed, 461 instances left, ETA=55.8793276

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,542,543,544,545,546,547,548,549,550,551
0,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
547,0,0,2,0,2,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
548,0,2,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
549,2,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
550,0,2,0,0,0,2,0,0,0,0,...,1,0,1,0,1,0,1,0,0,1


In [None]:
create_phifile(total_dataset["balance_scale_B"], "balance_scale_B_phifile.csv")

0.293182373046875 sec passed, 1 instances processed, 623 instances left, ETA=182.65261840820312 secs
1.8041274547576904 sec passed, 11 instances processed, 613 instances left, ETA=100.5391027060422 secs
3.221510887145996 sec passed, 21 instances processed, 603 instances left, ETA=92.50338404519218 secs
4.590327978134155 sec passed, 31 instances processed, 593 instances left, ETA=87.80853196882433 secs
5.968287467956543 sec passed, 41 instances processed, 583 instances left, ETA=84.86613643460157 secs
7.321277856826782 sec passed, 51 instances processed, 573 instances left, ETA=82.25671003846561 secs
8.69523310661316 sec passed, 61 instances processed, 563 instances left, ETA=80.25272522988867 secs
10.167433738708496 sec passed, 71 instances processed, 553 instances left, ETA=79.19142052825067 secs
11.540913581848145 sec passed, 81 instances processed, 543 instances left, ETA=77.36686512275979 secs
12.927087783813477 sec passed, 91 instances processed, 533 instances left, ETA=75.7157998

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,614,615,616,617,618,619,620,621,622,623
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619,2,2,2,0,0,2,2,2,0,0,...,1,0,0,0,0,0,0,0,0,1
620,0,2,2,0,2,0,2,2,0,2,...,0,1,0,0,0,0,0,0,0,1
621,2,0,2,0,2,2,0,2,0,2,...,0,0,1,0,0,0,0,0,0,1
622,2,2,0,0,2,2,2,0,0,2,...,0,0,0,1,0,0,0,0,0,1


In [None]:
create_phifile(total_dataset["balance_scale_L"], "balance_scale_L_phifile.csv")

0.31722307205200195 sec passed, 1 instances processed, 623 instances left, ETA=197.62997388839722 secs
1.7846968173980713 sec passed, 11 instances processed, 613 instances left, ETA=99.45628627863798 secs
3.293459177017212 sec passed, 21 instances processed, 603 instances left, ETA=94.56932779720852 secs
4.729998826980591 sec passed, 31 instances processed, 593 instances left, ETA=90.48030014191905 secs
6.128119707107544 sec passed, 41 instances processed, 583 instances left, ETA=87.13887290838288 secs
7.55239462852478 sec passed, 51 instances processed, 573 instances left, ETA=84.85337494401371 secs
8.958381175994873 sec passed, 61 instances processed, 563 instances left, ETA=82.68145249319858 secs
10.393271207809448 sec passed, 71 instances processed, 553 instances left, ETA=80.95040813969894 secs
11.87490439414978 sec passed, 81 instances processed, 543 instances left, ETA=79.60584056818927 secs
13.320183038711548 sec passed, 91 instances processed, 533 instances left, ETA=78.018214

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,614,615,616,617,618,619,620,621,622,623
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
619,2,2,2,0,0,2,2,2,0,0,...,1,0,0,0,0,0,0,0,0,1
620,0,2,2,0,2,0,2,2,0,2,...,0,1,0,0,0,0,0,0,0,1
621,2,0,2,0,2,2,0,2,0,2,...,0,0,1,0,0,0,0,0,0,1
622,2,2,0,0,2,2,2,0,0,2,...,0,0,0,1,0,0,0,0,0,1


In [None]:
create_phifile(total_dataset["tic_bin"], "tic_bin_phifile.csv")

0.46902918815612793 sec passed, 1 instances processed, 956 instances left, ETA=448.3919038772583 secs
2.7521002292633057 sec passed, 11 instances processed, 946 instances left, ETA=236.6806197166443 secs
4.846218109130859 sec passed, 21 instances processed, 936 instances left, ETA=216.00286429268974 secs
6.860399484634399 sec passed, 31 instances processed, 926 instances left, ETA=204.92677170230496 secs
8.859587907791138 sec passed, 41 instances processed, 916 instances left, ETA=197.93615911065078 secs
10.824916124343872 sec passed, 51 instances processed, 906 instances left, ETA=192.3014511501088 secs
12.856912612915039 sec passed, 61 instances processed, 896 instances left, ETA=188.84907706839138 secs
14.933042287826538 sec passed, 71 instances processed, 886 instances left, ETA=186.347541788934 secs
16.95012903213501 sec passed, 81 instances processed, 876 instances left, ETA=183.3125065697564 secs
18.947154760360718 sec passed, 91 instances processed, 866 instances left, ETA=180.

168.0092089176178 sec passed, 801 instances processed, 156 instances left, ETA=32.72089462065965 secs
170.18834972381592 sec passed, 811 instances processed, 146 instances left, ETA=30.638099950280054 secs
172.41963171958923 sec passed, 821 instances processed, 136 instances left, ETA=28.561595510187743 secs
174.69998383522034 sec passed, 831 instances processed, 126 instances left, ETA=26.488806213282505 secs
176.82321453094482 sec passed, 841 instances processed, 116 instances left, ETA=24.389408900819976 secs
178.96720838546753 sec passed, 851 instances processed, 106 instances left, ETA=22.292037707238023 secs
181.0844702720642 sec passed, 861 instances processed, 96 instances left, ETA=20.190602957163954 secs
183.3821337223053 sec passed, 871 instances processed, 86 instances left, ETA=18.106617106909592 secs
185.49245500564575 sec passed, 881 instances processed, 76 instances left, ETA=16.001619274039815 secs
187.67571902275085 sec passed, 891 instances processed, 66 instances le

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,947,948,949,950,951,952,953,954,955,956
0,0,0,1,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,2,0
1,0,0,1,1,0,1,0,0,1,0,...,0,0,2,0,0,0,0,0,0,0
2,1,1,0,1,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,0,1,1,0,1,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,1,1,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
952,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
953,0,0,0,0,0,0,0,0,0,0,...,2,2,0,0,0,0,0,0,0,0
954,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
955,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
create_phifile(total_dataset["car_evaluation_vgood"], "car_evaluation_vgood_phifile.csv")

0.9258983135223389 sec passed, 1 instances processed, 1726 instances left, ETA=1598.1004891395569 secs
5.651883363723755 sec passed, 11 instances processed, 1716 instances left, ETA=881.6938047409058 secs
9.767988204956055 sec passed, 21 instances processed, 1706 instances left, ETA=793.532756078811 secs
13.771376609802246 sec passed, 31 instances processed, 1696 instances left, ETA=753.4275719427293 secs
17.73696494102478 sec passed, 41 instances processed, 1686 instances left, ETA=729.378607087019 secs
21.734830379486084 sec passed, 51 instances processed, 1676 instances left, ETA=714.2661905101701 secs
25.919797897338867 sec passed, 61 instances processed, 1666 instances left, ETA=707.9079229010911 secs
30.187159538269043 sec passed, 71 instances processed, 1656 instances left, ETA=704.0836083855428 secs
34.167412996292114 sec passed, 81 instances processed, 1646 instances left, ETA=694.3155776777385 secs
38.134573221206665 sec passed, 91 instances processed, 1636 instances left, ET

322.91083431243896 sec passed, 801 instances processed, 926 instances left, ETA=373.30266238866227 secs
326.85709381103516 sec passed, 811 instances processed, 916 instances left, ETA=369.1752132316994 secs
330.8118522167206 sec passed, 821 instances processed, 906 instances left, ETA=365.06155677021786 secs
334.7836923599243 sec passed, 831 instances processed, 896 instances left, ETA=360.97014242417833 secs
338.7104346752167 sec passed, 841 instances processed, 886 instances left, ETA=356.8340607874459 secs
342.80867171287537 sec passed, 851 instances processed, 876 instances left, ETA=352.879431751444 secs
346.78051924705505 sec passed, 861 instances processed, 866 instances left, ETA=348.7943434006384 secs
350.74997448921204 sec passed, 871 instances processed, 856 instances left, ETA=344.70950420524167 secs
354.73239755630493 sec passed, 881 instances processed, 846 instances left, ETA=340.6397370404472 secs
358.67301058769226 sec passed, 891 instances processed, 836 instances lef

644.9380741119385 sec passed, 1601 instances processed, 126 instances left, ETA=50.757150117491726 secs
648.909912109375 sec passed, 1611 instances processed, 116 instances left, ETA=46.72473606746586 secs
652.9529237747192 sec passed, 1621 instances processed, 106 instances left, ETA=42.697723578112424 secs
656.9311347007751 sec passed, 1631 instances processed, 96 instances left, ETA=38.66670075491994 secs
660.9124438762665 sec passed, 1641 instances processed, 86 instances left, ETA=34.63648395695242 secs
664.857146024704 sec passed, 1651 instances processed, 76 instances left, ETA=30.605174499017263 secs
668.8468101024628 sec passed, 1661 instances processed, 66 instances left, ETA=26.57669444115746 secs
672.7974798679352 sec passed, 1671 instances processed, 56 instances left, ETA=22.547372155957135 secs
677.3772325515747 sec passed, 1681 instances processed, 46 instances left, ETA=18.536200295878903 secs
681.4865140914917 sec passed, 1691 instances processed, 36 instances left, E

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1717,1718,1719,1720,1721,1722,1723,1724,1725,1726
0,0,1,0,1,0,0,0,0,0,1,...,2,0,0,0,2,0,2,2,2,2
1,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,2,0,0,2,2,0
2,0,0,0,1,0,1,0,0,0,0,...,2,0,2,2,0,0,2,0,2,2
3,1,0,1,0,1,0,1,0,0,0,...,0,2,0,2,0,0,0,2,0,2
4,0,1,0,1,0,0,0,1,0,0,...,0,2,2,0,2,0,0,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1722,0,2,0,0,0,2,0,2,0,0,...,0,0,1,0,1,0,1,0,1,0
1723,2,0,2,0,0,2,2,0,2,0,...,0,0,0,1,0,1,0,0,0,1
1724,2,2,0,0,2,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1725,2,2,2,0,2,0,0,0,2,0,...,0,0,0,0,0,1,0,1,0,1


In [None]:
create_phifile(total_dataset["car_evaluation_good"], "car_evaluation_good_phifile.csv")

0.9194748401641846 sec passed, 1 instances processed, 1726 instances left, ETA=1587.0135741233826 secs
5.818016052246094 sec passed, 11 instances processed, 1716 instances left, ETA=907.6105041503906 secs
10.283809900283813 sec passed, 21 instances processed, 1706 instances left, ETA=835.4371280897232 secs
14.767885684967041 sec passed, 31 instances processed, 1696 instances left, ETA=807.9462619904549 secs
18.76536226272583 sec passed, 41 instances processed, 1686 instances left, ETA=771.6683115842866 secs
22.711644172668457 sec passed, 51 instances processed, 1676 instances left, ETA=746.3669732037713 secs
26.690358877182007 sec passed, 61 instances processed, 1666 instances left, ETA=728.9530801538561 secs
31.070555925369263 sec passed, 71 instances processed, 1656 instances left, ETA=724.6878959494577 secs
35.17928218841553 sec passed, 81 instances processed, 1646 instances left, ETA=714.8777590386661 secs
39.12439298629761 sec passed, 91 instances processed, 1636 instances left, E

322.3676128387451 sec passed, 801 instances processed, 926 instances left, ETA=372.674668525191 secs
326.3849153518677 sec passed, 811 instances processed, 916 instances left, ETA=368.64190192640046 secs
330.3792986869812 sec passed, 821 instances processed, 906 instances left, ETA=364.584219988313 secs
334.41207671165466 sec passed, 831 instances processed, 896 instances left, ETA=360.5694593665976 secs
338.42837285995483 sec passed, 841 instances processed, 886 instances left, ETA=356.5369064850416 secs
342.428031206131 sec passed, 851 instances processed, 876 instances left, ETA=352.4876090911525 secs
346.50705766677856 sec passed, 861 instances processed, 866 instances left, ETA=348.5192937740189 secs
350.53285932540894 sec passed, 871 instances processed, 856 instances left, ETA=344.4961281085534 secs
354.54580330848694 sec passed, 881 instances processed, 846 instances left, ETA=340.46055573096476 secs
358.5224597454071 sec passed, 891 instances processed, 836 instances left, ETA

646.0267624855042 sec passed, 1601 instances processed, 126 instances left, ETA=50.84283077649814 secs
650.1408779621124 sec passed, 1611 instances processed, 116 instances left, ETA=46.81337172166669 secs
654.2192511558533 sec passed, 1621 instances processed, 106 instances left, ETA=42.78053092074056 secs
658.1971626281738 sec passed, 1631 instances processed, 96 instances left, ETA=38.741218646416115 secs
662.3301455974579 sec passed, 1641 instances processed, 86 instances left, ETA=34.710781548678476 secs
666.3548278808594 sec passed, 1651 instances processed, 76 instances left, ETA=30.674116849754885 secs
670.3077285289764 sec passed, 1661 instances processed, 66 instances left, ETA=26.63474417995933 secs
674.292174577713 sec passed, 1671 instances processed, 56 instances left, ETA=22.59746366029439 secs
678.3817076683044 sec passed, 1681 instances processed, 46 instances left, ETA=18.563687419834626 secs
682.4424834251404 sec passed, 1691 instances processed, 36 instances left, E

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1717,1718,1719,1720,1721,1722,1723,1724,1725,1726
0,0,1,0,1,0,0,0,0,0,1,...,2,0,0,0,2,0,2,2,2,2
1,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,2,0,0,2,2,0
2,0,0,0,1,0,1,0,0,0,0,...,2,0,2,2,0,0,2,0,2,2
3,1,0,1,0,1,0,1,0,0,0,...,0,2,0,2,0,0,0,2,0,2
4,0,1,0,1,0,0,0,1,0,0,...,0,2,2,0,2,0,0,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1722,0,2,0,0,0,2,0,2,0,0,...,0,0,1,0,1,0,1,0,1,0
1723,2,0,2,0,0,2,2,0,2,0,...,0,0,0,1,0,1,0,0,0,1
1724,2,2,0,0,2,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1725,2,2,2,0,2,0,0,0,2,0,...,0,0,0,0,0,1,0,1,0,1


In [None]:
create_phifile(total_dataset["car_evaluation_acc"], "car_evaluation_acc_phifile.csv")

0.8744232654571533 sec passed, 1 instances processed, 1726 instances left, ETA=1509.2545561790466 secs
5.479294776916504 sec passed, 11 instances processed, 1716 instances left, ETA=854.7699851989746 secs
10.813854455947876 sec passed, 21 instances processed, 1706 instances left, ETA=878.4969381831941 secs
15.353267669677734 sec passed, 31 instances processed, 1696 instances left, ETA=839.9723215410786 secs
19.32003426551819 sec passed, 41 instances processed, 1686 instances left, ETA=794.4775066259431 secs
23.28443193435669 sec passed, 51 instances processed, 1676 instances left, ETA=765.1903514114081 secs
27.464216232299805 sec passed, 61 instances processed, 1666 instances left, ETA=750.0882662788766 secs
31.451993227005005 sec passed, 71 instances processed, 1656 instances left, ETA=733.5845180833843 secs
35.41245245933533 sec passed, 81 instances processed, 1646 instances left, ETA=719.6160092353821 secs
39.41365647315979 sec passed, 91 instances processed, 1636 instances left, ET

324.02211475372314 sec passed, 801 instances processed, 926 instances left, ETA=374.5873636229059 secs
328.05459117889404 sec passed, 811 instances processed, 916 instances left, ETA=370.5277503327583 secs
332.01800870895386 sec passed, 821 instances processed, 906 instances left, ETA=366.39258939136687 secs
335.9658019542694 sec passed, 831 instances processed, 896 instances left, ETA=362.24471546453117 secs
340.0025327205658 sec passed, 841 instances processed, 886 instances left, ETA=358.1952960647102 secs
343.9760673046112 sec passed, 851 instances processed, 876 instances left, ETA=354.0811221607984 secs
347.92605352401733 sec passed, 861 instances processed, 866 instances left, ETA=349.9465300253182 secs
351.8816602230072 sec passed, 871 instances processed, 856 instances left, ETA=345.8217005176741 secs
355.83563804626465 sec passed, 881 instances processed, 846 instances left, ETA=341.6991484530532 secs
359.83329248428345 sec passed, 891 instances processed, 836 instances left,

644.2928342819214 sec passed, 1601 instances processed, 126 instances left, ETA=50.70636921893947 secs
648.2378947734833 sec passed, 1611 instances processed, 116 instances left, ETA=46.67634748213784 secs
652.5334033966064 sec passed, 1621 instances processed, 106 instances left, ETA=42.67029041334996 secs
656.6140079498291 sec passed, 1631 instances processed, 96 instances left, ETA=38.648034802687675 secs
660.6117334365845 sec passed, 1641 instances processed, 86 instances left, ETA=34.62072460423295 secs
664.5876157283783 sec passed, 1651 instances processed, 76 instances left, ETA=30.592767289737584 secs
668.5480732917786 sec passed, 1661 instances processed, 66 instances left, ETA=26.564824104309082 secs
672.6479704380035 sec passed, 1671 instances processed, 56 instances left, ETA=22.54236166638432 secs
676.5813543796539 sec passed, 1681 instances processed, 46 instances left, ETA=18.514421357206473 secs
680.5539546012878 sec passed, 1691 instances processed, 36 instances left, 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1717,1718,1719,1720,1721,1722,1723,1724,1725,1726
0,0,1,0,1,0,0,0,0,0,1,...,2,0,0,0,2,0,2,2,2,2
1,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,2,0,0,2,2,0
2,0,0,0,1,0,1,0,0,0,0,...,2,0,2,2,0,0,2,0,2,2
3,1,0,1,0,1,0,1,0,0,0,...,0,2,0,2,0,0,0,2,0,2
4,0,1,0,1,0,0,0,1,0,0,...,0,2,2,0,2,0,0,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1722,0,2,0,0,0,2,0,2,0,0,...,0,0,1,0,1,0,1,0,1,0
1723,2,0,2,0,0,2,2,0,2,0,...,0,0,0,1,0,1,0,0,0,1
1724,2,2,0,0,2,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1725,2,2,2,0,2,0,0,0,2,0,...,0,0,0,0,0,1,0,1,0,1


In [None]:
create_phifile(total_dataset["kr-vs-kp"], "kr-vs-kp_phifile.csv")

1.7743725776672363 sec passed, 1 instances processed, 3195 instances left, ETA=5669.12038564682 secs
10.2435142993927 sec passed, 11 instances processed, 3185 instances left, ETA=2965.9630039605227 secs
19.2714581489563 sec passed, 21 instances processed, 3175 instances left, ETA=2913.6609344255357 secs
27.9416184425354 sec passed, 31 instances processed, 3165 instances left, ETA=2852.749108729824 secs
36.83521270751953 sec passed, 41 instances processed, 3155 instances left, ETA=2834.5145388347346 secs
47.02004933357239 sec passed, 51 instances processed, 3145 instances left, ETA=2899.5697089036307 secs
56.467162132263184 sec passed, 61 instances processed, 3135 instances left, ETA=2902.041857125329 secs
65.96383857727051 sec passed, 71 instances processed, 3125 instances left, ETA=2903.337965548878 secs
75.06417179107666 sec passed, 81 instances processed, 3115 instances left, ETA=2886.7271003605406 secs
83.95466732978821 sec passed, 91 instances processed, 3105 instances left, ETA=2

844.6058552265167 sec passed, 801 instances processed, 2395 instances left, ETA=2525.3820515199845 secs
855.0824329853058 sec passed, 811 instances processed, 2385 instances left, ETA=2514.6382277064786 secs
866.1894216537476 sec passed, 821 instances processed, 2375 instances left, ETA=2505.7245754295377 secs
877.7612793445587 sec passed, 831 instances processed, 2365 instances left, ETA=2498.0811379661627 secs
887.792286157608 sec passed, 841 instances processed, 2355 instances left, ETA=2486.0295290144672 secs
898.5528681278229 sec passed, 851 instances processed, 2345 instances left, ETA=2476.0358117035776 secs
909.2104344367981 sec passed, 861 instances processed, 2335 instances left, ETA=2465.744906399447 secs
920.5716528892517 sec passed, 871 instances processed, 2325 instances left, ETA=2457.323872522974 secs
931.2789437770844 sec passed, 881 instances processed, 2315 instances left, ETA=2447.1177694028947 secs
941.6614906787872 sec passed, 891 instances processed, 2305 instanc

1746.8370385169983 sec passed, 1591 instances processed, 1605 instances left, ETA=1762.2083260966576 secs
1760.3566677570343 sec passed, 1601 instances processed, 1595 instances left, ETA=1753.7594535118487 secs
1773.7519180774689 sec passed, 1611 instances processed, 1585 instances left, ETA=1745.1252576988134 secs
1786.923131942749 sec passed, 1621 instances processed, 1575 instances left, ETA=1736.2146408450524 secs
1800.3458921909332 sec passed, 1631 instances processed, 1565 instances left, ETA=1727.4931460936914 secs
1813.575511932373 sec passed, 1641 instances processed, 1555 instances left, ETA=1718.5313351949055 secs
1826.8464534282684 sec passed, 1651 instances processed, 1545 instances left, ETA=1709.556493365642 secs
1841.109201669693 sec passed, 1661 instances processed, 1535 instances left, ETA=1701.4464928133527 secs
1854.256453037262 sec passed, 1671 instances processed, 1525 instances left, ETA=1692.2448180022886 secs
1867.4337022304535 sec passed, 1681 instances proce

2939.218735933304 sec passed, 2371 instances processed, 825 instances left, ETA=1022.7142375136971 secs
2955.87131357193 sec passed, 2381 instances processed, 815 instances left, ETA=1011.7745151453687 secs
2974.5442130565643 sec passed, 2391 instances processed, 805 instances left, ETA=1001.4672068216371 secs
2994.0197348594666 sec passed, 2401 instances processed, 795 instances left, ETA=991.3559721837884 secs
3012.7387273311615 sec passed, 2411 instances processed, 785 instances left, ETA=980.9207386789556 secs
3031.648512363434 sec passed, 2421 instances processed, 775 instances left, ETA=970.4781483195627 secs
3046.541736841202 sec passed, 2431 instances processed, 765 instances left, ETA=958.7019451598187 secs
3061.120099067688 sec passed, 2441 instances processed, 755 instances left, ETA=946.8028163851309 secs
3079.923467874527 sec passed, 2451 instances processed, 745 instances left, ETA=936.1660479667574 secs
3100.514059782028 sec passed, 2461 instances processed, 735 instance

4362.821889400482 sec passed, 3171 instances processed, 25 instances left, ETA=34.39626213655379 secs
4380.1127009391785 sec passed, 3181 instances processed, 15 instances left, ETA=20.65441386799361 secs
4397.148980379105 sec passed, 3191 instances processed, 5 instances left, ETA=6.889923190816522 secs


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3186,3187,3188,3189,3190,3191,3192,3193,3194,3195
0,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,2,0
4,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3191,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3192,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3193,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
3194,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [None]:
# create the graph file, i.e. the adjacency matrix of the (fully-connected) network, where each row is an instance.
def create_graphfile(df, filename):
    graphfile = pd.DataFrame(index=range(df.shape[0]), columns=range(df.shape[0]), dtype=np.int8)
    graphfile.iloc[:, :] = 1
    np.fill_diagonal(graphfile.values, 0)
    graphfile.to_csv(filename, header=False, index=False, sep= ' ')

create_graphfile(total_dataset['monk1'], "monk1_graph.txt")


In [None]:
# create the weight file, i.e. the weighted adjacency matrix of the (fully-connected) network, where each row is an instance.
def create_weightfile(df, filename):
    weightfile = pd.DataFrame(index=range(df.shape[0]), columns=range(df.shape[0]), dtype=np.int8)
    weightfile.iloc[:, :] = 1
    np.fill_diagonal(weightfile.values, 0)
    weightfile.to_csv(filename, header=False, index=False, sep= ' ')

create_weightfile(total_dataset['monk1'], "monk1_weight.txt")

In [None]:
# create the predefined value for each IF (individual-feature) pair.
# 1 : the IF pair is forced to select 1
# 0 : the IF pair is forced to select 0
# -1 :the IF pair is free to choose between 0 and 1

def create_v_predefined(df, filename, percentage=0.2): # percentage of test data vs all data
    a = np.linspace(0, 1, df.shape[0])
    a = a < 1-percentage  # create a bool array of length number of rows, beginning with 80% of 1's followed by 20% of 0's
    a = pd.Series(a)
    a = a.astype(np.int8)
    a = (df['label'] * a)-1 + a # turns the beginning 80% into the same as df['label'] and remaining 20% all equals -1
    a.to_csv(filename, header=False, index=False, sep=' ')
    print(a)

v_predefined = create_v_predefined(total_dataset['monk1'], 'monk1_v_predefined.txt')

0      1
1      1
2      1
3      1
4      1
      ..
549   -1
550   -1
551   -1
552   -1
553   -1
Length: 554, dtype: int64


In [None]:
v_predefined

# KNN classification

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
# testing monk dataset

In [None]:
# Assuming df_train and df_test are your training and test DataFrames respectively

# Load your data and separate features (X) and labels (y)

def calculate_accuracy_train_test_KNN(df_train, df_test, K=5):

    X_train = df_train.iloc[:, 1:]  # Features from second column onwards
    y_train = df_train.iloc[:, 0]   # Labels - first column

    X_test = df_test.iloc[:, 1:]    # Features from second column onwards
    y_test = df_test.iloc[:, 0]     # Labels - first column

    # Initialize KNN classifier
    knn = KNeighborsClassifier(n_neighbors=K)  # You can adjust the number of neighbors as needed

    # Fit the classifier to the training data
    knn.fit(X_train, y_train)

    # Predict labels for the test set
    y_pred = knn.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    print("Accuracy:", accuracy)
    return accuracy

def calculate_accuracy_split_KNN(df, test_size=0.3, K=5, random_state=42):

    # Assuming df is your DataFrame with both features and labels

    # Separate features (X) and labels (y)
    X = df.iloc[:, 1:]  # Features from the second column onwards
    y = df.iloc[:, 0]   # Labels - first column

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    # Adjust the test_size according to the percentage you want to allocate to the test set

    # Initialize KNN classifier
    knn = KNeighborsClassifier(n_neighbors=K)  # You can adjust the number of neighbors as needed

    # Fit the classifier to the training data
    knn.fit(X_train, y_train)

    # Predict labels for the test set
    y_pred = knn.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    print("Accuracy:", accuracy)

In [None]:
print("monk1")
calculate_accuracy_KNN(total_dataset['monks-1.train'], total_dataset['monks-1.test'])
print("monk2")
calculate_accuracy_KNN(total_dataset['monks-2.train'], total_dataset['monks-2.test'])
print("monk3")
calculate_accuracy_KNN(total_dataset['monks-3.train'], total_dataset['monks-3.test'])

monk1
Accuracy: 0.8491879350348028
monk2
Accuracy: 0.7122969837587007
monk3
Accuracy: 0.7540603248259861


0.7540603248259861

In [None]:
total_dataset.keys()

dict_keys(['monks-1.test', 'monks-1.train', 'monks-2.test', 'monks-2.train', 'monks-3.test', 'monks-3.train', 'balance_scale_B', 'balance_scale_L', 'tic_bin', 'car_evaluation_vgood', 'car_evaluation_good', 'car_evaluation_acc', 'kr-vs-kp'])

In [None]:
for K in [3, 5, 7]:
    print(K)
    for key, df in total_dataset.items():
        if key.startswith('monks'):
            continue
        print(key)
        calculate_accuracy_split_KNN(df, random_state=52, K=K)
    print()

3


NameError: name 'total_dataset' is not defined

## Test how KNN algorithm works
I create my own function of prediction and compare with KNN classifier from sklearn package.

Conclusion : every instance (except one, surprisingly) has the same result out of sklearn-KNN and custom-KNN.

In [None]:
# Import necessary libraries
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import statistics


In [None]:
# Load the iris dataset
iris = load_iris()
X = iris.data  # Features
y = iris.target  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=43)

# Initialize the KNN classifier with k=3
knn = KNeighborsClassifier(n_neighbors=3)

# Fit the classifier to the training data
knn.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = knn.predict(X_test)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9466666666666667


In [None]:
def my_predict(X_train, y_train, x_test):
    def distance(quad1, quad2):
        return sum((quad1[i]-quad2[i])**2 for i in range(4))
    # find 3 closest
    candidates = []
    for index, x_train in enumerate(X_train):
        new_distance = distance(x_train, x_test)
        if len(candidates) < 3:
            candidates.append(index)
            continue
        if new_distance < distance(X_train[candidates[0]], x_test):
            candidates[0] = index
        elif new_distance < distance(X_train[candidates[1]], x_test):
            candidates[1] = index
        elif new_distance < distance(X_train[candidates[2]], x_test):
            candidates[2] = index
    print("found the three closest instances are of distance and of label : ")
    for i in range(3):
        print(distance(x_test, X_train[candidates[i]]), "of label ", y_train[candidates[i]])
    print("the label is predicted to be")
    mode_value = statistics.mode([y_train[candidates[i]] for i in range(3)])
    print("Mode:", mode_value)
    return mode_value


wrong = []
for i in range(len(y_pred)):
    if my_predict(X_train, y_train, X_test[i]) == y_pred[i]:
        pass
    else:

        wrong.append(i)



# my_predict(X_train, y_train, X_test[i])
wrong

found the three closest instances are of distance and of label : 
0.01999999999999995 of label  0
0.020000000000000122 of label  0
0.050000000000000086 of label  0
the label is predicted to be
Mode: 0
found the three closest instances are of distance and of label : 
0.020000000000000122 of label  0
0.020000000000000122 of label  0
0.05 of label  0
the label is predicted to be
Mode: 0
found the three closest instances are of distance and of label : 
0.020000000000000035 of label  2
0.09999999999999983 of label  2
0.1899999999999999 of label  2
the label is predicted to be
Mode: 2
found the three closest instances are of distance and of label : 
0.10999999999999988 of label  1
0.1400000000000002 of label  1
0.1499999999999999 of label  1
the label is predicted to be
Mode: 1
found the three closest instances are of distance and of label : 
0.27999999999999986 of label  2
0.6499999999999992 of label  2
0.6699999999999999 of label  2
the label is predicted to be
Mode: 2
found the three clos

[72]

In [None]:
my_predict(X_train, y_train, X_test[72])
y_pred[72]

found the three closest instances are of distance and of label : 
0.2900000000000001 of label  1
0.4399999999999999 of label  2
0.4999999999999994 of label  1
the label is predicted to be
Mode: 1


2