# Processing dataset for KNN and GVG-CNC
We will be processing monk1, monk2, monk3, balance-scale, tic-tac-toe, car_evaluation, kr-vs-kp datasets.

In [1]:
# use a dictionary to store all the dataset
total_dataset = dict()

## Data preprocessing -- Monk1 to Monk3

In [2]:
import os
import numpy as np
import pandas as pd

try:
    root
except NameError:
    root = os.getcwd()

In [3]:
# change directory and inspect existing files
os.chdir(os.path.join(root, "monk"))
os.listdir()

['Index',
 'monks-1.test',
 'monks-1.train',
 'monks-2.test',
 'monks-2.train',
 'monks-3.test',
 'monks-3.train',
 'monks.names',
 'thrun.comparison.dat',
 'thrun.comparison.ps.Z',
 'update']

In [4]:
# explore data structure

df = pd.read_csv('monks-2.train', delimiter=' ')
df.head()

# need to remove the first and last column
# column "1" is label, 1.1 to 1.5 are features

Unnamed: 0.1,Unnamed: 0,0,1,1.1,1.2,1.3,2,2.1,data_4
0,,0,1,1,1,1,4,1,data_7
1,,0,1,1,1,2,1,1,data_9
2,,0,1,1,1,2,1,2,data_10
3,,0,1,1,1,2,2,1,data_11
4,,0,1,1,1,2,3,1,data_13


In [5]:
# read all monk datasets

names = [ 'monks-1.test',
 'monks-1.train',
 'monks-2.test',
 'monks-2.train',
 'monks-3.test',
 'monks-3.train']

# create a dictionary to store all the monk datasets
monk_dict = dict(zip(names, names))


# rename the variable names to a til f, then define a1-a5, ..., f1-f5 as dummy variables
for name in names:
    df = pd.read_csv(name, delimiter=' ')
    df = df.iloc[:, range(1, 8)]
    df.columns= ['label', 'a', 'b', 'c', 'd', 'e', 'f']
    df['a1'] = df['a'] == 1
    df['a2'] = df['a'] == 2
    df['a3'] = df['a'] == 3
    df['a4'] = df['a'] == 4
    df['a5'] = df['a'] == 5
    
    df['b1'] = df['b'] == 1
    df['b2'] = df['b'] == 2
    df['b3'] = df['b'] == 3
    df['b4'] = df['b'] == 4
    df['b5'] = df['b'] == 5
    
    df['c1'] = df['c'] == 1
    df['c2'] = df['c'] == 2
    df['c3'] = df['c'] == 3
    df['c4'] = df['c'] == 4
    df['c5'] = df['c'] == 5
    
    df['d1'] = df['d'] == 1
    df['d2'] = df['d'] == 2
    df['d3'] = df['d'] == 3
    df['d4'] = df['d'] == 4
    df['d5'] = df['d'] == 5
    
    df['e1'] = df['e'] == 1
    df['e2'] = df['e'] == 2
    df['e3'] = df['e'] == 3
    df['e4'] = df['e'] == 4
    df['e5'] = df['e'] == 5
    
    df['f1'] = df['f'] == 1
    df['f2'] = df['f'] == 2
    df['f3'] = df['f'] == 3
    df['f4'] = df['f'] == 4
    df['f5'] = df['f'] == 5
    
    new_columns = "label a1 a2 a3 a4 a5 b1 b2 b3 b4 b5 c1 c2 c3 c4 c5 d1 d2 d3 d4 d5 e1 e2 e3 e4 e5 f1 f2 f3 f4 f5".split(' ')
    new_columns = "label a1 a2 b1 b2 c1 d1 d2 e1 e2 e3 f1".split(' ')
    df = df[new_columns]
    monk_dict[name] = df
    
# # append monks into total dataset
# total_dataset |= monk_dict

In [6]:
# inspect the dimension of dummy-variable-ized dataset
for name in names:
    n_instances, n_features_ = monk_dict[name].shape
#     print(monk_dict[name].sum())
    print(f"{name} has {n_instances} instances and {n_features_-1} features.")

monks-1.test has 431 instances and 11 features.
monks-1.train has 123 instances and 11 features.
monks-2.test has 431 instances and 11 features.
monks-2.train has 168 instances and 11 features.
monks-3.test has 431 instances and 11 features.
monks-3.train has 121 instances and 11 features.


In [7]:
# combine the train and test dataset together

monk1 = pd.concat([monk_dict['monks-1.train'], monk_dict['monks-1.test']], axis=0)
monk2 = pd.concat([monk_dict['monks-2.train'], monk_dict['monks-2.test']], axis=0)
monk3 = pd.concat([monk_dict['monks-3.train'], monk_dict['monks-3.test']], axis=0)


monk1.index= range(monk1.shape[0])
monk2.index= range(monk2.shape[0])
monk3.index= range(monk3.shape[0])

In [8]:
# store them into total dataset
total_dataset['monk1'] = monk1
total_dataset['monk2'] = monk2
total_dataset['monk3'] = monk3


## Data Preprocessing -- Balance_scale 

In [9]:
# change directory and inspect files

os.chdir(os.path.join(root, "balance_scale"))
os.listdir()

['balance-scale.data', 'balance-scale.names', 'Index']

In [10]:
# inspect data structure

balance_scale = pd.read_csv("balance-scale.data")
balance_scale.head()

Unnamed: 0,B,1,1.1,1.2,1.3
0,R,1,1,1,2
1,R,1,1,1,3
2,R,1,1,1,4
3,R,1,1,1,5
4,R,1,1,2,1


In [11]:
# observe possible values for each variable
df = balance_scale
df.columns=['label', 'a', 'b', 'c', 'd']
for col in df.columns:
    print(col, df[col].unique())

label ['R' 'L' 'B']
a [1 2 3 4 5]
b [1 2 3 4 5]
c [1 2 3 4 5]
d [2 3 4 5 1]


In [12]:
# create dummy variables named a1-a5, ..., d1-d5

df['a1'] = df['a'] == 1
df['a2'] = df['a'] == 2
df['a3'] = df['a'] == 3
df['a4'] = df['a'] == 4
df['a5'] = df['a'] == 5

df['b1'] = df['b'] == 1
df['b2'] = df['b'] == 2
df['b3'] = df['b'] == 3
df['b4'] = df['b'] == 4
df['b5'] = df['b'] == 5

df['c1'] = df['c'] == 1
df['c2'] = df['c'] == 2
df['c3'] = df['c'] == 3
df['c4'] = df['c'] == 4
df['c5'] = df['c'] == 5

df['d1'] = df['d'] == 1
df['d2'] = df['d'] == 2
df['d3'] = df['d'] == 3
df['d4'] = df['d'] == 4
df['d5'] = df['d'] == 5

In [13]:
# classes are "Balanced" "Left" and "Right"
set(df["label"])

{'B', 'L', 'R'}

In [14]:
# select only newly created columns as a new dataframe
new_columns= "label a1 a2 a3 a4 b1 b2 b3 b4 c1 c2 c3 c4 d1 d2 d3 d4".split(" ")
df = df[new_columns]

In [15]:
# create a dataset, where target is such that "B" (stands for Balanced) ->1 and other case->0
balance_scale_B = df.replace({'B':1, 'L':0, 'R':0})

In [16]:
# create a dataset, where target is such that "L" (stands for Left) ->1 and other case->0
balance_scale_L = df.replace({'B':0, 'L':1, 'R':0})

In [17]:
# store them into total dataset
total_dataset |= dict(balance_scale_B=balance_scale_B)
total_dataset |= dict(balance_scale_L=balance_scale_L)

## Data preprocessing -- tic-tac-toe

In [18]:
# change directory and inspect files
os.chdir(os.path.join(root, "tic+tac+toe+endgame"))
os.listdir()

['Index', 'tic-tac-toe.data', 'tic-tac-toe.names']

In [19]:
# read and inspect data
df = pd.read_csv("tic-tac-toe.data")
df

Unnamed: 0,x,x.1,x.2,x.3,o,o.1,x.4,o.2,o.3,positive
0,x,x,x,x,o,o,o,x,o,positive
1,x,x,x,x,o,o,o,o,x,positive
2,x,x,x,x,o,o,o,b,b,positive
3,x,x,x,x,o,o,b,o,b,positive
4,x,x,x,x,o,o,b,b,o,positive
...,...,...,...,...,...,...,...,...,...,...
952,o,x,x,x,o,o,o,x,x,negative
953,o,x,o,x,x,o,x,o,x,negative
954,o,x,o,x,o,x,x,o,x,negative
955,o,x,o,o,x,x,x,o,x,negative


In [20]:
# give columns meaningful names
df = df[['positive', 'x', 'x.1', 'x.2', 'x.3', 'x.4', 'o', 'o.1', 'o.2', 'o.3']]
tic = df
df.columns = ['label', '1', '2', '3', '4', '5', '6', '7', '8', '9']
# check and found out that feature is not binary
set(df['1'])

{'b', 'o', 'x'}

In [21]:
# transform features into binary. Knowing that each original feature represent a configuration in a cell out of 3x3 grid.
# which can either take value 'o' or 'x' or 'nothing'. 

if df['label'][0] in ['positive', 'negative']:
    df['label'] = df['label'] == 'positive'

df['x1'] = df['1'] == 'x'
df['x2'] = df['2'] == 'x'
df['x3'] = df['3'] == 'x'

df['x4'] = df['4'] == 'x'
df['x5'] = df['5'] == 'x'
df['x6'] = df['6'] == 'x'

df['x7'] = df['7'] == 'x'
df['x8'] = df['8'] == 'x'
df['x9'] = df['9'] == 'x'


df['o1'] = df['1'] == 'o'
df['o2'] = df['2'] == 'o'
df['o3'] = df['3'] == 'o'

df['o4'] = df['4'] == 'o'
df['o5'] = df['5'] == 'o'
df['o6'] = df['6'] == 'o'

df['o7'] = df['7'] == 'o'
df['o8'] = df['8'] == 'o'
df['o9'] = df['9'] == 'o'

In [22]:
# select newly created variable and store into total dataset
tic_bin = df[['label', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'o1', 'o2', 'o3', 'o4', 'o5', 'o6', 'o7', 'o8', 'o9']]
total_dataset |= dict(tic_bin=tic_bin)

## Data preprocessing -- car_evaluation

In [23]:
# change directory and inspect files

os.chdir(os.path.join(root, 'car+evaluation'))
os.listdir()

['car.c45-names', 'car.data', 'car.names']

In [24]:
# read and give meaningful names to each column
df = pd.read_csv('car.data')
df.columns=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'label']
df = df[['label', 'buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']]

In [25]:
# See for each variable, what are the possible values.
for col in df.columns:
    print(col, df[col].unique())

label ['unacc' 'acc' 'vgood' 'good']
buying ['vhigh' 'high' 'med' 'low']
maint ['vhigh' 'high' 'med' 'low']
doors ['2' '3' '4' '5more']
persons ['2' '4' 'more']
lug_boot ['small' 'med' 'big']
safety ['med' 'high' 'low']


In [26]:
# Give each variable numerical encodings since they are all ordinal variables
df['label'].replace({"unacc":0, "acc":1, "vgood":2, "good":3}, inplace=True)
df['buying'].replace({'vhigh':3, 'high':2, 'med':1, 'low':0}, inplace=True)
df['maint'].replace({'vhigh':3, 'high':2, 'med':1, 'low':0}, inplace=True)
df['doors'].replace({'2':2, '3':3, '4':4, '5more':5}, inplace=True)
df['persons'].replace({'2':2, '4':4, 'more':5}, inplace=True)
df['lug_boot'].replace({'small':0, 'med':1, 'big':2}, inplace=True)
df['safety'].replace({'med':1, 'high':2, 'low':0}, inplace=True)
None

In [27]:
# create binary variables based on different thresholds
df['labelvgood'] = df['label'] == 3
df['labelgood'] = df['label'] >= 2
df['labelacc'] = df['label'] >= 1

df['buyingvhigh'] = df['buying'] == 3
df['buyinghigh'] = df['buying'] >= 2
df['buyingmed'] = df['buying'] >= 1

df['maintvhigh'] = df['maint'] == 3
df['mainthigh'] = df['maint'] >= 2
df['maintmed'] = df['maint'] >= 1

df["doors5"] = df['doors'] >= 5
df["doors4"] = df['doors'] >= 4
df["doors3"] = df['doors'] >= 3

df["persons5"] = df['persons']>=5
df["persons4"] = df['persons']>=4

df["lug_boot2"] = df["lug_boot"] >=2
df["lug_boot1"] = df["lug_boot"] >=1

df["safety2"] = df["safety"] >=2
df["safety1"] = df["safety"] >=1

In [28]:
# select newly created features and make 3 datasets based on target labels
features = ['buyingvhigh', 'buyinghigh', 'buyingmed', 
            'maintvhigh', 'mainthigh', 'maintmed',
            "doors5", "doors4", "doors3",
            "persons5", "persons4",
            "lug_boot2", "lug_boot1",
            "safety2", "safety1"]
car_evaluation_vgood = df[["labelvgood"]+features]
car_evaluation_good = df[["labelgood"]+features]
car_evaluation_acc = df[["labelacc"]+features]

car_evaluation_vgood = car_evaluation_vgood.rename(columns={"labelvgood": "label"})
car_evaluation_good = car_evaluation_good.rename(columns={"labelgood": "label"})
car_evaluation_acc = car_evaluation_acc.rename(columns={"labelacc": "label"})

In [29]:
# store them into total dataset
total_dataset |= dict(car_evaluation_vgood=car_evaluation_vgood,
                     car_evaluation_good=car_evaluation_good,
                     car_evaluation_acc=car_evaluation_acc)

## Data preprocessing -- kr-vs-kp

In [30]:
# change directory and inspect files
os.chdir(os.path.join(root, "kr-vs-kp"))
os.listdir()

['kr-vs-kp_csv.csv']

In [31]:
# read and insepct possible values for each columns to see if they are already binary, and what are current values
df = pd.read_csv("kr-vs-kp_csv.csv")
for col in df.columns:
    print(col)
    print(df[col].unique())
    
df['katri_n'] = df['katri'] == 'n'
df['katri_b'] = df['katri'] == 'b' # binary-ize all features 

bkblk
['f' 't']
bknwy
['f' 't']
bkon8
['f' 't']
bkona
['f' 't']
bkspr
['f' 't']
bkxbq
['f' 't']
bkxcr
['f' 't']
bkxwp
['f' 't']
blxwp
['f' 't']
bxqsq
['f' 't']
cntxt
['f' 't']
dsopp
['f' 't']
dwipd
['l' 'g']
hdchk
['f' 't']
katri
['n' 'w' 'b']
mulch
['f' 't']
qxmsq
['f' 't']
r2ar8
['t' 'f']
reskd
['f' 't']
reskr
['f' 't']
rimmx
['f' 't']
rkxwp
['f' 't']
rxmsq
['f' 't']
simpl
['f' 't']
skach
['f' 't']
skewr
['t' 'f']
skrxp
['f' 't']
spcop
['f' 't']
stlmt
['f' 't']
thrsk
['f' 't']
wkcti
['f' 't']
wkna8
['f' 't']
wknck
['f' 't']
wkovl
['t' 'f']
wkpos
['t' 'f']
wtoeg
['n' 't']
class
['won' 'nowin']


In [32]:
# reorder the columns so that the label is the first (index 0) appearing. 
new_columns = ['class', 'bkblk', 'bknwy', 'bkon8', 'bkona', 'bkspr', 'bkxbq', 'bkxcr', 'bkxwp', 'blxwp', 'bxqsq', 'cntxt', 'dsopp', 'dwipd', 'hdchk', 'mulch', 'qxmsq', 'r2ar8', 'reskd', 'reskr', 'rimmx', 'rkxwp', 'rxmsq', 'simpl', 'skach', 'skewr', 'skrxp', 'spcop', 'stlmt', 'thrsk', 'wkcti', 'wkna8', 'wknck', 'wkovl', 'wkpos', 'wtoeg', 'katri_n', 'katri_b']
df = df[new_columns] # reorder

In [33]:
# rename the features to be 0 1 while making sure each column is treaty correctly
df.replace({"won":1, "nowin":0, "t":0, "f":1}, inplace=True)
df.replace({"l":0, "g":1, "n":1}, inplace=True)

df.columns = ['label'] + list(df.columns)[1:]

In [34]:
# store it into total dataset
total_dataset['kr-vs-kp'] = df

## Getting statistics for each dataset

In [35]:
dimensions = dict()
for key, df in total_dataset.items():
    dimensions[key] = df.shape
dimensions

{'monk1': (554, 12),
 'monk2': (599, 12),
 'monk3': (552, 12),
 'balance_scale_B': (624, 17),
 'balance_scale_L': (624, 17),
 'tic_bin': (957, 19),
 'car_evaluation_vgood': (1727, 16),
 'car_evaluation_good': (1727, 16),
 'car_evaluation_acc': (1727, 16),
 'kr-vs-kp': (3196, 38)}

## Write the data down

In [36]:
# create a new directory (if not already existing) called df (stands for dataframe)
os.chdir(root)
if "df" not in os.listdir():
    os.mkdir("df")

In [37]:
# write down data in csv format
for (name, df) in total_dataset.items():
    df.to_csv(os.path.join(root, "df", name+".csv"))

# Generating data for C++ code
Need phi_file, v_predefined_file and w_file
## phi_file
For telling who are friends and who are enemies. K-nearest are friends and K' farthest are enemies.

In [38]:
# define the distance between instances, i.e. two given rows. Euclidien 2-distance squared is used. 
def distance(df, i, j):
        dff = df.astype(int)
        return sum((dff.iloc[i, 1:] - dff.iloc[j, 1:])**2)

# For the instance in i-th row, calculate the distance of it with all the other instances.
# Reorder the distances and pick the K-first and K-last instances. If equality, pick all the instances of the same distance.
def find_K_neighbors(df, i, K=3):
    distances = [(j, distance(df, i, j)) for j in range(df.shape[0])] # attention : (i, 0) will be inside
    distances.sort(key=lambda x: x[1])
    
    # create a dictionary to count : how many instances, j, are at a given distance (to the anchor instance i) ? 
    counter = dict()
    for (j, d) in distances:
        try:
            counter[d] += 1
        except KeyError:
            counter[d] = 1    
    
    # find nearest neighbors
    # pick a distance d_from0 such that (j, d) with d<d_from0 counts more than K
    d_from0 = 0 # we pick d<d_thr
    count = 0
    while count < K+1: # (i, 0) are in and we want to exclude this
        try:
            count += counter[d_from0]
        except KeyError:
            pass
        d_from0 += 1
        
    
    # find farthest neighbors
    # pick a distance d_tomax such that (j, d) with d > d_tomax counts more than K
    d_tomax = max(d for (j, d) in distances)
    count = 0
    while count < K:
        try:
            count += counter[d_tomax]
        except KeyError:
            pass
        d_tomax -= 1
        
    
    # return a list, in order of index number, of whether it is among K-nearest (1), or among K-farthest(2), or neither case(0).
    phidata_list = []
    for (j, d) in sorted(distances, key=lambda x:x[0]):
#     for (j, d) in sorted(distances, key=lambda x:x[1]):
        if j == i:
            phidata_list.append(0) # no link with itself
        elif d < d_from0:
            phidata_list.append(1) # 1 for phiup
        elif d > d_tomax:
            phidata_list.append(2) # 2 for phidown
        else:
            phidata_list.append(0) # 0 for no link
    
    return phidata_list
            

    
# find_K_neighbors(total_dataset['monk1'], 0, K = 3)

In [39]:
from time import time

# run the find_K_neighbors n times (n = df.shape[0] is the number of rows). Store the lists into a matrix, saved in csv.  
def create_phifile(df, filename, K=3):
    parent = os.path.dirname(filename)
    if os.path.basename(filename) in os.listdir(parent):
        print(f"{os.path.basename(filename)} already existed. Finished.")
        return None
    begin = time()
    phifile = pd.DataFrame(index=range(df.shape[0]), columns=range(df.shape[0]))
    for i in range(df.shape[0]):
        phidata_list = find_K_neighbors(df, i, K)
        phifile.iloc[i, :] = np.array(phidata_list)
        if i%10 == 1:
            current = time()
            print(f"{current-begin} sec passed, {i} instances processed, {df.shape[0]-i} instances left, ETA={(df.shape[0]-i)*(current-begin)/i} secs")

            
    phifile.to_csv(filename, header=False, index=False, sep=' ')
    return phifile

create_phifile(total_dataset['monk1'], os.path.join(root, "data2run", "monk1_phifile.csv"))

monk1_phifile.csv already existed. Finished.


In [40]:
create_phifile(total_dataset['monk2'], os.path.join(root, "data2run","monk2_phifile.csv"))

monk2_phifile.csv already existed. Finished.


In [41]:
create_phifile(total_dataset['monk3'], os.path.join(root, "data2run", "monk3_phifile.csv"))

monk3_phifile.csv already existed. Finished.


In [42]:
create_phifile(total_dataset["balance_scale_B"], os.path.join(root, "data2run", "balance_scale_B_phifile.csv"))

balance_scale_B_phifile.csv already existed. Finished.


In [43]:
create_phifile(total_dataset["balance_scale_L"], os.path.join(root, "data2run", "balance_scale_L_phifile.csv"))

balance_scale_L_phifile.csv already existed. Finished.


In [44]:
create_phifile(total_dataset["tic_bin"], os.path.join(root, "data2run", "tic_bin_phifile.csv"))

tic_bin_phifile.csv already existed. Finished.


In [45]:
create_phifile(total_dataset["car_evaluation_vgood"], os.path.join(root, "data2run", "car_evaluation_vgood_phifile.csv"))

car_evaluation_vgood_phifile.csv already existed. Finished.


In [46]:
create_phifile(total_dataset["car_evaluation_good"], os.path.join(root, "data2run", "car_evaluation_good_phifile.csv"))

car_evaluation_good_phifile.csv already existed. Finished.


In [47]:
create_phifile(total_dataset["car_evaluation_acc"], os.path.join(root, "data2run", "car_evaluation_acc_phifile.csv"))

car_evaluation_acc_phifile.csv already existed. Finished.


In [48]:
create_phifile(total_dataset["kr-vs-kp"], os.path.join(root, "data2run", "kr-vs-kp_phifile.csv"))

kr-vs-kp_phifile.csv already existed. Finished.


In [54]:
# create the graph file, i.e. the adjacency matrix of the (fully-connected) network, where each row is an instance.
def create_graphfile(df, filename):
    parent = os.path.dirname(filename)
    if os.path.basename(filename) in os.listdir(parent):
        print(f"{os.path.basename(filename)} already existed. Finished.")
        return None
    graphfile = pd.DataFrame(index=range(df.shape[0]), columns=range(df.shape[0]), dtype=np.int8)
    graphfile.iloc[:, :] = 1
    np.fill_diagonal(graphfile.values, 0)
    graphfile.to_csv(filename, header=False, index=False, sep= ' ')
    
for data in total_dataset.keys():
    create_graphfile(total_dataset[data], os.path.join(root, "data2run", f"{data}_graph.txt"))
    print(f"The graph of {data} has been processed.")
    print()

monk1_graph.txt already existed. Finished.
The graph of monk1 has been processed.

monk2_graph.txt already existed. Finished.
The graph of monk2 has been processed.

monk3_graph.txt already existed. Finished.
The graph of monk3 has been processed.

balance_scale_B_graph.txt already existed. Finished.
The graph of balance_scale_B has been processed.

balance_scale_L_graph.txt already existed. Finished.
The graph of balance_scale_L has been processed.

tic_bin_graph.txt already existed. Finished.
The graph of tic_bin has been processed.

car_evaluation_vgood_graph.txt already existed. Finished.
The graph of car_evaluation_vgood has been processed.

car_evaluation_good_graph.txt already existed. Finished.
The graph of car_evaluation_good has been processed.

car_evaluation_acc_graph.txt already existed. Finished.
The graph of car_evaluation_acc has been processed.

kr-vs-kp_graph.txt already existed. Finished.
The graph of kr-vs-kp has been processed.



In [55]:
# create the weight file, i.e. the weighted adjacency matrix of the (fully-connected) network, where each row is an instance.
def create_weightfile(df, filename):
    parent = os.path.dirname(filename)
    if os.path.basename(filename) in os.listdir(parent):
        print(f"{os.path.basename(filename)} already existed. Finished.")
        return None    
    weightfile = pd.DataFrame(index=range(df.shape[0]), columns=range(df.shape[0]), dtype=np.int8)
    weightfile.iloc[:, :] = 1
    np.fill_diagonal(weightfile.values, 0)
    weightfile.to_csv(filename, header=False, index=False, sep= ' ')
    
for data in total_dataset.keys():
    create_weightfile(total_dataset[data], os.path.join(root, "data2run", f"{data}_weight.txt"))
    print(f"The weight of {data} has been processed.")
    print()

monk1_weight.txt already existed. Finished.
The weight of monk1 has been processed.

monk2_weight.txt already existed. Finished.
The weight of monk2 has been processed.

monk3_weight.txt already existed. Finished.
The weight of monk3 has been processed.

balance_scale_B_weight.txt already existed. Finished.
The weight of balance_scale_B has been processed.

balance_scale_L_weight.txt already existed. Finished.
The weight of balance_scale_L has been processed.

tic_bin_weight.txt already existed. Finished.
The weight of tic_bin has been processed.

car_evaluation_vgood_weight.txt already existed. Finished.
The weight of car_evaluation_vgood has been processed.

car_evaluation_good_weight.txt already existed. Finished.
The weight of car_evaluation_good has been processed.

car_evaluation_acc_weight.txt already existed. Finished.
The weight of car_evaluation_acc has been processed.

kr-vs-kp_weight.txt already existed. Finished.
The weight of kr-vs-kp has been processed.



In [57]:
# create the predefined value for each IF (individual-feature) pair. 
# 1 : the IF pair is forced to select 1
# 0 : the IF pair is forced to select 0
# -1 :the IF pair is free to choose between 0 and 1

def create_v_predefined(df, filename, percentage=0.2): # percentage of test data vs all data
    parent = os.path.dirname(filename)
    if os.path.basename(filename) in os.listdir(parent):
        print(f"{os.path.basename(filename)} already existed. Finished.")
        return None    
    a = np.linspace(0, 1, df.shape[0])
    a = a < 1-percentage  # create a bool array of length number of rows, beginning with 80% of 1's followed by 20% of 0's 
    a = pd.Series(a)
    a = a.astype(np.int8)
    a = (df['label'] * a)-1 + a # turns the beginning 80% into the same as df['label'] and remaining 20% all equals -1
    a.to_csv(filename, header=False, index=False, sep=' ')
    
    
for data in total_dataset.keys():
    
    v_predefined = create_v_predefined(
        total_dataset[data], os.path.join(root, "data2run", f'{data}_v_predefined.txt'))
    print(f"The predefined label of {data} has been processed.")
    print()

monk1_v_predefined.txt already existed. Finished.
The predefined label of monk1 has been processed.

monk2_v_predefined.txt already existed. Finished.
The predefined label of monk2 has been processed.

monk3_v_predefined.txt already existed. Finished.
The predefined label of monk3 has been processed.

balance_scale_B_v_predefined.txt already existed. Finished.
The predefined label of balance_scale_B has been processed.

balance_scale_L_v_predefined.txt already existed. Finished.
The predefined label of balance_scale_L has been processed.

tic_bin_v_predefined.txt already existed. Finished.
The predefined label of tic_bin has been processed.

car_evaluation_vgood_v_predefined.txt already existed. Finished.
The predefined label of car_evaluation_vgood has been processed.

car_evaluation_good_v_predefined.txt already existed. Finished.
The predefined label of car_evaluation_good has been processed.

car_evaluation_acc_v_predefined.txt already existed. Finished.
The predefined label of car

# KNN classification

In [93]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd

In [94]:
# testing monk dataset

In [60]:
# Assuming df_train and df_test are your training and test DataFrames respectively

# Load your data and separate features (X) and labels (y)

def calculate_accuracy_train_test_KNN(df_train, df_test, K=5):

    X_train = df_train.iloc[:, 1:]  # Features from second column onwards
    y_train = df_train.iloc[:, 0]   # Labels - first column

    X_test = df_test.iloc[:, 1:]    # Features from second column onwards
    y_test = df_test.iloc[:, 0]     # Labels - first column

    # Initialize KNN classifier
    knn = KNeighborsClassifier(n_neighbors=K)  # You can adjust the number of neighbors as needed

    # Fit the classifier to the training data
    knn.fit(X_train, y_train)

    # Predict labels for the test set
    y_pred = knn.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    print("Accuracy:", accuracy)
    return accuracy

def calculate_accuracy_split_KNN(df, test_size=0.3, K=5, random_state=42):

    # Assuming df is your DataFrame with both features and labels

    # Separate features (X) and labels (y)
    X = df.iloc[:, 1:]  # Features from the second column onwards
    y = df.iloc[:, 0]   # Labels - first column

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    # Adjust the test_size according to the percentage you want to allocate to the test set

    # Initialize KNN classifier
    knn = KNeighborsClassifier(n_neighbors=K)  # You can adjust the number of neighbors as needed

    # Fit the classifier to the training data
    knn.fit(X_train, y_train)

    # Predict labels for the test set
    y_pred = knn.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    print("Accuracy:", accuracy)

In [62]:
print("monk1")
calculate_accuracy_split_KNN(total_dataset['monks-1.train'], total_dataset['monks-1.test'])
print("monk2")
calculate_accuracy_split_KNN(total_dataset['monks-2.train'], total_dataset['monks-2.test'])
print("monk3")
calculate_accuracy_split_KNN(total_dataset['monks-3.train'], total_dataset['monks-3.test'])

monk1


NameError: name 'monk_dataset' is not defined

In [97]:
total_dataset.keys()

dict_keys(['monks-1.test', 'monks-1.train', 'monks-2.test', 'monks-2.train', 'monks-3.test', 'monks-3.train', 'balance_scale_B', 'balance_scale_L', 'tic_bin', 'car_evaluation_vgood', 'car_evaluation_good', 'car_evaluation_acc', 'kr-vs-kp'])

In [58]:
for K in [3, 5, 7]:
    print(K)
    for key, df in total_dataset.items():
        if key.startswith('monks'):
            continue
        print(key)
        calculate_accuracy_split_KNN(df, random_state=52, K=K)
    print()

3
monk1


NameError: name 'calculate_accuracy_split_KNN' is not defined

## Test how KNN algorithm works
I create my own function of prediction and compare with KNN classifier from sklearn package. 

Conclusion : every instance (except one, surprisingly) has the same result out of sklearn-KNN and custom-KNN. 

In [15]:
# Import necessary libraries
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import statistics


In [29]:
# Load the iris dataset
iris = load_iris()
X = iris.data  # Features
y = iris.target  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=43)

# Initialize the KNN classifier with k=3
knn = KNeighborsClassifier(n_neighbors=3)

# Fit the classifier to the training data
knn.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = knn.predict(X_test)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9466666666666667


In [33]:
def my_predict(X_train, y_train, x_test):
    def distance(quad1, quad2):
        return sum((quad1[i]-quad2[i])**2 for i in range(4))
    # find 3 closest
    candidates = []
    for index, x_train in enumerate(X_train):
        new_distance = distance(x_train, x_test)
        if len(candidates) < 3:
            candidates.append(index)
            continue
        if new_distance < distance(X_train[candidates[0]], x_test):
            candidates[0] = index
        elif new_distance < distance(X_train[candidates[1]], x_test):
            candidates[1] = index
        elif new_distance < distance(X_train[candidates[2]], x_test):
            candidates[2] = index
    print("found the three closest instances are of distance and of label : ")
    for i in range(3):
        print(distance(x_test, X_train[candidates[i]]), "of label ", y_train[candidates[i]])
    print("the label is predicted to be")
    mode_value = statistics.mode([y_train[candidates[i]] for i in range(3)])
    print("Mode:", mode_value)
    return mode_value
        
    
wrong = []
for i in range(len(y_pred)):
    if my_predict(X_train, y_train, X_test[i]) == y_pred[i]:
        pass
    else:
        
        wrong.append(i)



# my_predict(X_train, y_train, X_test[i])
wrong

found the three closest instances are of distance and of label : 
0.01999999999999995 of label  0
0.020000000000000122 of label  0
0.050000000000000086 of label  0
the label is predicted to be
Mode: 0
found the three closest instances are of distance and of label : 
0.020000000000000122 of label  0
0.020000000000000122 of label  0
0.05 of label  0
the label is predicted to be
Mode: 0
found the three closest instances are of distance and of label : 
0.020000000000000035 of label  2
0.09999999999999983 of label  2
0.1899999999999999 of label  2
the label is predicted to be
Mode: 2
found the three closest instances are of distance and of label : 
0.10999999999999988 of label  1
0.1400000000000002 of label  1
0.1499999999999999 of label  1
the label is predicted to be
Mode: 1
found the three closest instances are of distance and of label : 
0.27999999999999986 of label  2
0.6499999999999992 of label  2
0.6699999999999999 of label  2
the label is predicted to be
Mode: 2
found the three clos

[72]

In [35]:
my_predict(X_train, y_train, X_test[72])
y_pred[72]

found the three closest instances are of distance and of label : 
0.2900000000000001 of label  1
0.4399999999999999 of label  2
0.4999999999999994 of label  1
the label is predicted to be
Mode: 1


2