# Processing dataset for KNN and GVG-CNC
We will be preprocessing monk1, monk2, monk3, balance-scale, tic-tac-toe, car_evaluation, kr-vs-kp datasets.
In order for C++ code of GVG-CNC model to run, graph_data, weight_data and v_predefined_data need to be generated in a txt format with blank instead of comma.

## This notebook follows the order :
<a id='TableOfContent'> </a>
1. <a href=#github> Github download </a>
1. Dataset preparation
    1. <a href=#Monk> Monk1, Monk2 and Monk3 </a>
    1. <a href=#Balance> Balance scale </a>
    1. <a href=#Tic> Tic Tac Toe </a>
    1. <a href=#Car> Car evaluation </a>
    1. <a href=#Krkp> Kr vs Kp </a>
1. Dataset Processing 
    1. <a href=#DistanceCalculation> Calculating distances between rows </a>
    1. <a href=#DatasetProcessing> Processing Datasets </a>
    1. <a href=#MissingValueCreation> Creating missing values </a>
    1. <a href=#WriteDownDatasets> Write down Dataset </a>
1. Generation
    1. <a href=#Graphfile> Graph weight and v predefined file generation </a>
    1. <a href=#Phi> Phi file generation </a>
    1. <a href=#Weight> Weight file generation </a>

<a id='Github'> </a>
## Import data from github and change directory

In [1]:
!ls

import os
import numpy as np
import pandas as pd

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [2]:
#----------------------
# Clone the GITHUB directory
#----------------------

# !git clone -b master https://github.com/StefanoNasini/GVG_CNC_KNN_application

In [3]:
#----------------------
# Set working directory
#----------------------

# os.chdir("GVG_CNC_KNN_application")

<a id='Monk'> </a>
## Data preprocessing -- Monk1 to Monk3
<a href=#TableOfContent> Back to Table of Content </a>

In [4]:
#----------------------
# use a dictionary to store all the dataset
#----------------------

total_dataset = dict()

try:
    root
except NameError:
    root = os.getcwd()


In [5]:
#----------------------
# Monk direcotry
#----------------------

os.chdir(os.path.join(root, "monk"))

# Inspect existing files

os.listdir()

['Index',
 'monks-1.test',
 'monks-1.train',
 'monks-2.test',
 'monks-2.train',
 'monks-3.test',
 'monks-3.train',
 'monks.names',
 'thrun.comparison.dat',
 'thrun.comparison.ps.Z',
 'update']

In [6]:
#----------------------
# explore data structure
#----------------------

df = pd.read_csv('monks-2.train', delimiter=' ')
df.head()

# need to remove the first and last column
# column "1" is label, 1.1 to 1.5 are features

Unnamed: 0.1,Unnamed: 0,0,1,1.1,1.2,1.3,2,2.1,data_4
0,,0,1,1,1,1,4,1,data_7
1,,0,1,1,1,2,1,1,data_9
2,,0,1,1,1,2,1,2,data_10
3,,0,1,1,1,2,2,1,data_11
4,,0,1,1,1,2,3,1,data_13


In [7]:
#----------------------
# read all monk datasets
#----------------------

names = [ 'monks-1.test',
 'monks-1.train',
 'monks-2.test',
 'monks-2.train',
 'monks-3.test',
 'monks-3.train']

# create a dictionary to store all the monk datasets

monk_dict = dict(zip(names, names))

# rename the variable from 'a' to 'f', then define a1-a5, ..., f1-f5 as dummy variables

for name in names:
    df = pd.read_csv(name, delimiter=' ')
    df = df.iloc[:, range(1, 8)]
    df.columns= ['label', 'a', 'b', 'c', 'd', 'e', 'f']
    df['a1'] = df['a'] == 1
    df['a2'] = df['a'] == 2
    df['a3'] = df['a'] == 3
    df['a4'] = df['a'] == 4
    df['a5'] = df['a'] == 5

    df['b1'] = df['b'] == 1
    df['b2'] = df['b'] == 2
    df['b3'] = df['b'] == 3
    df['b4'] = df['b'] == 4
    df['b5'] = df['b'] == 5

    df['c1'] = df['c'] == 1
    df['c2'] = df['c'] == 2
    df['c3'] = df['c'] == 3
    df['c4'] = df['c'] == 4
    df['c5'] = df['c'] == 5

    df['d1'] = df['d'] == 1
    df['d2'] = df['d'] == 2
    df['d3'] = df['d'] == 3
    df['d4'] = df['d'] == 4
    df['d5'] = df['d'] == 5

    df['e1'] = df['e'] == 1
    df['e2'] = df['e'] == 2
    df['e3'] = df['e'] == 3
    df['e4'] = df['e'] == 4
    df['e5'] = df['e'] == 5

    df['f1'] = df['f'] == 1
    df['f2'] = df['f'] == 2
    df['f3'] = df['f'] == 3
    df['f4'] = df['f'] == 4
    df['f5'] = df['f'] == 5

    new_columns = "label a1 a2 a3 a4 a5 b1 b2 b3 b4 b5 c1 c2 c3 c4 c5 d1 d2 d3 d4 d5 e1 e2 e3 e4 e5 f1 f2 f3 f4 f5".split(' ')
    new_columns = "label a1 a2 b1 b2 c1 d1 d2 e1 e2 e3 f1".split(' ')
    df = df[new_columns]
    monk_dict[name] = df

In [8]:
#----------------------
# inspect the dimension of dummy-variable-ized dataset
#----------------------

for name in names:
    n_instances, n_features_ = monk_dict[name].shape
#     print(monk_dict[name].sum())
    print(f"{name} has {n_instances} instances and {n_features_-1} features.")

monks-1.test has 431 instances and 11 features.
monks-1.train has 123 instances and 11 features.
monks-2.test has 431 instances and 11 features.
monks-2.train has 168 instances and 11 features.
monks-3.test has 431 instances and 11 features.
monks-3.train has 121 instances and 11 features.


In [9]:
#----------------------
# combine the train and test dataset together
#----------------------

monk1 = pd.concat([monk_dict['monks-1.train'], monk_dict['monks-1.test']], axis=0)
monk2 = pd.concat([monk_dict['monks-2.train'], monk_dict['monks-2.test']], axis=0)
monk3 = pd.concat([monk_dict['monks-3.train'], monk_dict['monks-3.test']], axis=0)

monk1.index= range(monk1.shape[0])
monk2.index= range(monk2.shape[0])
monk3.index= range(monk3.shape[0])

In [10]:
#----------------------
# store them into total dataset
#----------------------

total_dataset['monk1'] = monk1
total_dataset['monk2'] = monk2
total_dataset['monk3'] = monk3


<a id='Balance'> </a>
## Data Preprocessing -- Balance_scale
<a href=#TableOfContent> Back to Table of Content </a>

In [11]:
#----------------------
# change directory and inspect files
#----------------------

os.chdir(os.path.join(root, "balance_scale"))
os.listdir()

['balance-scale.data', 'balance-scale.names', 'Index']

In [12]:
# inspect data structure

balance_scale = pd.read_csv("balance-scale.data")
balance_scale.head()

Unnamed: 0,B,1,1.1,1.2,1.3
0,R,1,1,1,2
1,R,1,1,1,3
2,R,1,1,1,4
3,R,1,1,1,5
4,R,1,1,2,1


In [13]:
# observe possible values for each variable
df = balance_scale
df.columns=['label', 'a', 'b', 'c', 'd']
for col in df.columns:
    print(col, df[col].unique())

label ['R' 'L' 'B']
a [1 2 3 4 5]
b [1 2 3 4 5]
c [1 2 3 4 5]
d [2 3 4 5 1]


In [14]:
# create dummy variables named a1-a5, ..., d1-d5

df['a1'] = df['a'] == 1
df['a2'] = df['a'] == 2
df['a3'] = df['a'] == 3
df['a4'] = df['a'] == 4
df['a5'] = df['a'] == 5

df['b1'] = df['b'] == 1
df['b2'] = df['b'] == 2
df['b3'] = df['b'] == 3
df['b4'] = df['b'] == 4
df['b5'] = df['b'] == 5

df['c1'] = df['c'] == 1
df['c2'] = df['c'] == 2
df['c3'] = df['c'] == 3
df['c4'] = df['c'] == 4
df['c5'] = df['c'] == 5

df['d1'] = df['d'] == 1
df['d2'] = df['d'] == 2
df['d3'] = df['d'] == 3
df['d4'] = df['d'] == 4
df['d5'] = df['d'] == 5

In [15]:
# classes are "Balanced" "Left" and "Right"
set(df["label"])

{'B', 'L', 'R'}

In [16]:
# select only newly created columns as a new dataframe
new_columns= "label a1 a2 a3 a4 b1 b2 b3 b4 c1 c2 c3 c4 d1 d2 d3 d4".split(" ")
df = df[new_columns]

In [17]:
# create a dataset, where target is such that "B" (stands for Balanced) ->1 and other case->0
balance_scale_B = df.replace({'B':1, 'L':0, 'R':0})

In [18]:
# create a dataset, where target is such that "L" (stands for Left) ->1 and other case->0
balance_scale_L = df.replace({'B':0, 'L':1, 'R':0})

In [19]:
# store them into total dataset
total_dataset |= dict(balance_scale_B=balance_scale_B)
total_dataset |= dict(balance_scale_L=balance_scale_L)

<a id='Tic'> </a>
## Data preprocessing -- tic-tac-toe
<a href=#TableOfContent> Back to Table of Content </a>

In [20]:
# change directory and inspect files
os.chdir(os.path.join(root, "tic+tac+toe+endgame"))
os.listdir()

['Index', 'tic-tac-toe.data', 'tic-tac-toe.names']

In [21]:
# read and inspect data
df = pd.read_csv("tic-tac-toe.data")
df

Unnamed: 0,x,x.1,x.2,x.3,o,o.1,x.4,o.2,o.3,positive
0,x,x,x,x,o,o,o,x,o,positive
1,x,x,x,x,o,o,o,o,x,positive
2,x,x,x,x,o,o,o,b,b,positive
3,x,x,x,x,o,o,b,o,b,positive
4,x,x,x,x,o,o,b,b,o,positive
...,...,...,...,...,...,...,...,...,...,...
952,o,x,x,x,o,o,o,x,x,negative
953,o,x,o,x,x,o,x,o,x,negative
954,o,x,o,x,o,x,x,o,x,negative
955,o,x,o,o,x,x,x,o,x,negative


In [22]:
# give columns meaningful names
df = df[['positive', 'x', 'x.1', 'x.2', 'x.3', 'x.4', 'o', 'o.1', 'o.2', 'o.3']]
tic = df
df.columns = ['label', '1', '2', '3', '4', '5', '6', '7', '8', '9']
# check and found out that feature is not binary
set(df['1'])

{'b', 'o', 'x'}

In [23]:
# transform features into binary. Knowing that each original feature represent a configuration in a cell out of 3x3 grid.
# which can either take value 'o' or 'x' or 'nothing'.

if df['label'][0] in ['positive', 'negative']:
    df['label'] = df['label'] == 'positive'

df['x1'] = df['1'] == 'x'
df['x2'] = df['2'] == 'x'
df['x3'] = df['3'] == 'x'

df['x4'] = df['4'] == 'x'
df['x5'] = df['5'] == 'x'
df['x6'] = df['6'] == 'x'

df['x7'] = df['7'] == 'x'
df['x8'] = df['8'] == 'x'
df['x9'] = df['9'] == 'x'


df['o1'] = df['1'] == 'o'
df['o2'] = df['2'] == 'o'
df['o3'] = df['3'] == 'o'

df['o4'] = df['4'] == 'o'
df['o5'] = df['5'] == 'o'
df['o6'] = df['6'] == 'o'

df['o7'] = df['7'] == 'o'
df['o8'] = df['8'] == 'o'
df['o9'] = df['9'] == 'o'

In [24]:
# select newly created variable and store into total dataset
tic_bin = df[['label', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'o1', 'o2', 'o3', 'o4', 'o5', 'o6', 'o7', 'o8', 'o9']]
total_dataset |= dict(tic_bin=tic_bin)

<a id='Car'> </a>
## Data preprocessing -- car_evaluation
<a href=#TableOfContent> Back to Table of Content </a>

In [25]:
# change directory and inspect files

os.chdir(os.path.join(root, 'car+evaluation'))
os.listdir()

['car.c45-names', 'car.data', 'car.names']

In [26]:
# read and give meaningful names to each column
df = pd.read_csv('car.data')
df.columns=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'label']
df = df[['label', 'buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']]

In [27]:
# See for each variable, what are the possible values.
for col in df.columns:
    print(col, df[col].unique())

label ['unacc' 'acc' 'vgood' 'good']
buying ['vhigh' 'high' 'med' 'low']
maint ['vhigh' 'high' 'med' 'low']
doors ['2' '3' '4' '5more']
persons ['2' '4' 'more']
lug_boot ['small' 'med' 'big']
safety ['med' 'high' 'low']


In [28]:
# Give each variable numerical encodings since they are all ordinal variables
df['label'].replace({"unacc":0, "acc":1, "vgood":2, "good":3}, inplace=True)
df['buying'].replace({'vhigh':3, 'high':2, 'med':1, 'low':0}, inplace=True)
df['maint'].replace({'vhigh':3, 'high':2, 'med':1, 'low':0}, inplace=True)
df['doors'].replace({'2':2, '3':3, '4':4, '5more':5}, inplace=True)
df['persons'].replace({'2':2, '4':4, 'more':5}, inplace=True)
df['lug_boot'].replace({'small':0, 'med':1, 'big':2}, inplace=True)
df['safety'].replace({'med':1, 'high':2, 'low':0}, inplace=True)
None

In [29]:
# create binary variables based on different thresholds
df['labelvgood'] = df['label'] == 3
df['labelgood'] = df['label'] >= 2
df['labelacc'] = df['label'] >= 1

df['buyingvhigh'] = df['buying'] == 3
df['buyinghigh'] = df['buying'] >= 2
df['buyingmed'] = df['buying'] >= 1

df['maintvhigh'] = df['maint'] == 3
df['mainthigh'] = df['maint'] >= 2
df['maintmed'] = df['maint'] >= 1

df["doors5"] = df['doors'] >= 5
df["doors4"] = df['doors'] >= 4
df["doors3"] = df['doors'] >= 3

df["persons5"] = df['persons']>=5
df["persons4"] = df['persons']>=4

df["lug_boot2"] = df["lug_boot"] >=2
df["lug_boot1"] = df["lug_boot"] >=1

df["safety2"] = df["safety"] >=2
df["safety1"] = df["safety"] >=1

In [30]:
# select newly created features and make 3 datasets based on target labels
features = ['buyingvhigh', 'buyinghigh', 'buyingmed',
            'maintvhigh', 'mainthigh', 'maintmed',
            "doors5", "doors4", "doors3",
            "persons5", "persons4",
            "lug_boot2", "lug_boot1",
            "safety2", "safety1"]
car_evaluation_vgood = df[["labelvgood"]+features]
car_evaluation_good = df[["labelgood"]+features]
car_evaluation_acc = df[["labelacc"]+features]

car_evaluation_vgood = car_evaluation_vgood.rename(columns={"labelvgood": "label"})
car_evaluation_good = car_evaluation_good.rename(columns={"labelgood": "label"})
car_evaluation_acc = car_evaluation_acc.rename(columns={"labelacc": "label"})

In [31]:
# store them into total dataset
total_dataset |= dict(car_evaluation_vgood=car_evaluation_vgood,
                     car_evaluation_good=car_evaluation_good,
                     car_evaluation_acc=car_evaluation_acc)

<a id='Krkp'> </a>
## Data preprocessing -- kr-vs-kp
<a href=#TableOfContent> Back to Table of Content </a>

In [32]:
# change directory and inspect files
os.chdir(os.path.join(root, "kr-vs-kp"))
os.listdir()

['kr-vs-kp_csv.csv']

In [33]:
# read and insepct possible values for each columns to see if they are already binary, and what are current values
df = pd.read_csv("kr-vs-kp_csv.csv")
for col in df.columns:
    print(col)
    print(df[col].unique())

df['katri_n'] = df['katri'] == 'n'
df['katri_b'] = df['katri'] == 'b' # binary-ize all features

bkblk
['f' 't']
bknwy
['f' 't']
bkon8
['f' 't']
bkona
['f' 't']
bkspr
['f' 't']
bkxbq
['f' 't']
bkxcr
['f' 't']
bkxwp
['f' 't']
blxwp
['f' 't']
bxqsq
['f' 't']
cntxt
['f' 't']
dsopp
['f' 't']
dwipd
['l' 'g']
hdchk
['f' 't']
katri
['n' 'w' 'b']
mulch
['f' 't']
qxmsq
['f' 't']
r2ar8
['t' 'f']
reskd
['f' 't']
reskr
['f' 't']
rimmx
['f' 't']
rkxwp
['f' 't']
rxmsq
['f' 't']
simpl
['f' 't']
skach
['f' 't']
skewr
['t' 'f']
skrxp
['f' 't']
spcop
['f' 't']
stlmt
['f' 't']
thrsk
['f' 't']
wkcti
['f' 't']
wkna8
['f' 't']
wknck
['f' 't']
wkovl
['t' 'f']
wkpos
['t' 'f']
wtoeg
['n' 't']
class
['won' 'nowin']


In [34]:
# reorder the columns so that the label is the first (index 0) appearing.
new_columns = ['class', 'bkblk', 'bknwy', 'bkon8', 'bkona', 'bkspr', 'bkxbq', 'bkxcr', 'bkxwp', 'blxwp', 'bxqsq', 'cntxt', 'dsopp', 'dwipd', 'hdchk', 'mulch', 'qxmsq', 'r2ar8', 'reskd', 'reskr', 'rimmx', 'rkxwp', 'rxmsq', 'simpl', 'skach', 'skewr', 'skrxp', 'spcop', 'stlmt', 'thrsk', 'wkcti', 'wkna8', 'wknck', 'wkovl', 'wkpos', 'wtoeg', 'katri_n', 'katri_b']
df = df[new_columns] # reorder

In [35]:
# rename the features to be 0 1 while making sure each column is treaty correctly
df.replace({"won":1, "nowin":0, "t":0, "f":1}, inplace=True)
df.replace({"l":0, "g":1, "n":1}, inplace=True)

df.columns = ['label'] + list(df.columns)[1:]

In [36]:
# store it into total dataset
total_dataset['kr-vs-kp'] = df

<a id='DatasetProcessing'> </a>
# Processing Datasets
## Getting statistics for each dataset
<a href=#TableOfContent> Back to Table of Content </a>

In [37]:
dimensions = dict()
for key, df in total_dataset.items():
    dimensions[key] = df.shape
dimensions

{'monk1': (554, 12),
 'monk2': (599, 12),
 'monk3': (552, 12),
 'balance_scale_B': (624, 17),
 'balance_scale_L': (624, 17),
 'tic_bin': (957, 19),
 'car_evaluation_vgood': (1727, 16),
 'car_evaluation_good': (1727, 16),
 'car_evaluation_acc': (1727, 16),
 'kr-vs-kp': (3196, 38)}

<a id='DistanceCalculation'> </a>
## Calculating distances between rows
<a href=#TableOfContent> Back to Table of Content </a>

In [38]:
# define the distance between instances, i.e. two given rows. Euclidien 2-distance squared is used.
def distance(df, i, j):
    dff = df.astype(int)
    return sum((dff.iloc[i, 1:] - dff.iloc[j, 1:])**2)
    
# For the instance in i-th row, calculate the distance of it with all the other instances.
# Reorder the distances and pick the K-first and K-last instances. If equality, pick all the instances of the same distance.
def calculate_distances(df, i, distmatrix=None):
    if distmatrix is not None:
        return list(zip(range(df.shape[0]), distmatrix[i]))
    distances = [(j, distance(df, i, j)) for j in range(df.shape[0])] # attention : (i, 0) will be inside
    return distances

def calculate_distmatrix(df):
    distmatrix = pd.DataFrame(index=range(df.shape[0]), columns=range(df.shape[0]), dtype=np.int8)
    distmatrix.iloc[:, :] = 0
    for t in range(1, df.shape[1]):
        df_t = np.array(df.iloc[:, t])
        symmetric_diff = df_t[:, np.newaxis] ^ df_t
        distmatrix += symmetric_diff.astype(np.int8)
    return distmatrix

In [39]:
def calculate_d_from0_tomax(distances, K=3, training=None):
    # create a dictionary to count : how many instances, j, are at a given distance (to the anchor instance i) ?
    # training is an array of indices indicating which ones are training. So only training is taken into account when it 
    # comes to selecting the K nearest and farest neighbors
    if training is None:
        training = range(len(distances))
#     distances.sort(key=lambda x: x[1])
    
    
#     # count how many instances is of certain distance to i, the given instance.
#     counter_df = pd.DataFrame(distances, columns=["index", "distance_to_i"])
#     counter = counter_df.iloc[training, 1].value_counts()
#     counter = dict(counter)
    counter = pd.Series(distances, dtype=np.int8).value_counts()
    counter = dict(counter)

    # find nearest neighbors
    # pick a distance d_from0 such that (j, d) with d<d_from0 counts more than K
    d_from0 = 0 # we pick d<d_thr
    count = 0
    while count < K+1: # (i, 0) are in and we want to exclude this
        try:
            count += counter[d_from0]
        except KeyError:
            pass
        d_from0 += 1

    # find farthest neighbors
    # pick a distance d_tomax such that (j, d) with d > d_tomax counts more than K
#     d_tomax = max(d for (j, d) in distances)
    d_tomax = max(distances)
    count = 0
    while count < K:
        try:
            count += counter[d_tomax]
        except KeyError:
            pass
        d_tomax -= 1
    return d_from0, d_tomax

def create_phidata_list(zip_distances, d_from0, d_tomax, i, draw_take_all=True, training=None):
    # return a list, in order of index number, of whether it is among K-nearest (1), or among K-farthest(2), or neither case(0).
    distances = zip_distances
    # if distance is small (d < d_from0) then we select as attractive neighbor ; if big (d>d_tomax) then repulsive
    def aux(d, d_from0, d_tomax):
        if d < d_from0:
            return 1 # 1 for phiup, attractive
        elif d > d_tomax:
            return 2 # 2 for phidown, repulsive
        else:
            return 0 # 0 for no link

    phidata_list = list(map(lambda jd: aux(jd[1], d_from0, d_tomax), sorted(distances, key=lambda jd: jd[0]) ))
    phidata_list[i] = 0  # you don't select yourself, i is closest to itself but not useful in this model for social influence
    if training is None:
        return phidata_list

    # if not in training set, we shouldn't select
    for j in range(len(distances)):
        if j not in training:
            phidata_list[j] = 0
    return phidata_list
        

    
def find_distances_and_selected_neighbors(df, i, K=3, draw_take_all=True, training=None, distances=None):
    if training is not None and i in training:
        return [0]*df.shape[0]
    
    if distances is None:
        distances = calculate_distmatrix(df)[i]
    
    if draw_take_all:
        d_from0, d_tomax = calculate_d_from0_tomax(distances, K=K, training=training)
        zip_distances = list(enumerate(distances))
        phidata_list = create_phidata_list(zip_distances, 
                                           d_from0, d_tomax, i, draw_take_all=draw_take_all, training=training)
        return phidata_list
    
    # pick K nearest distances
    indices = list(np.argsort(distances))
    phidata_list = [0]*df.shape[0]
    j = 0
    while j < K:
        if indices[j] not in training:
            indices.pop(j)
        else:
            phidata_list[indices[j]] = 1
            j+=1
    
    for i in range(K):
        phidata_list[indices[i]] = 1 # first K of them are friends
        phidata_list[indices[-1-i]] = 2 # last K of them are enemies
    return phidata_list

# for debugging
# phidata_list = find_distances_and_selected_neighbors(
#     total_dataset['monk1'].iloc[:30, :], 0, K = 3, draw_take_all=False, training=each_missing['monk1'])

# print()
# print(phidata_list)
# print(list(zip(phidata_list, distances)))

<a id='MissingValueCreation'> </a>
## Manually create missing values
<a href=#TableOfContent> Back to Table of Content </a>

For each dataset, we will randomly select 1% of rows/individuals and remove them and 10 of their closest friends too. So that the missing data is always clustered "close to each other" centered around the 1% selected rows.

In [40]:
import random
random.seed(40)

def generate_missing_values(df, perc=0.01, K=10):
    
    # calculate the amount of 1% selected rows 
    amount = int(df.shape[0]*perc)
    
    # select the missing_centers. Create a list storing all the missing rows to be calculated.
    missing_center = random.sample(range(df.shape[0]), amount)
    missing = []+ missing_center
    distmatrix = calculate_distmatrix(df)

    # for each missing center, add their 10 closest neighbors
    for i in missing_center:
        phidata_list = find_distances_and_selected_neighbors(df, i, K=K, training=None, distances=distmatrix[i])
        
        # for the j-th row with distance d to i. 
        for (j, d) in enumerate(distmatrix[i]):
            if phidata_list[j] == 1: # if j is amongst the K-nearest to the missing center i
                missing.append(j)
                
    # remove the potential duplicates
    return list(set(missing))

<a id='WriteDownDataset'> </a>
## Write the dataset down
<a href=#TableOfContent> Back to Table of Content </a>

In [41]:
# create a new directory (if not already existing) called df (stands for dataframe)
os.chdir(root)
if "df" not in os.listdir():
    os.mkdir("df")

In [42]:
# write down data in csv format
for (name, df) in total_dataset.items():
    df.to_csv(os.path.join(root, "df", name+".csv"))

In [43]:
# calculate the missing data for each dataset and write it as json file, 
# unless it has been written already, in which case we read instead of calculate again.

if "missing_data.json" in os.listdir(os.path.join(root, "df")) and False:
    with open(os.path.join(root, "df", "missing_data.json"), 'r') as f:
        each_missing = eval(f.read()) 
else:
    each_missing = dict()
    for (name, df) in total_dataset.items():
        missing = generate_missing_values(df, perc=0.01, K=10)
        each_missing[name] = sorted(missing)
        print(f"Missing value processed for {name}.")

    # write the missing instance down in the folder df.
    with open(os.path.join(root, "df", "missing_data.json"), 'w') as f:
        f.write(str(each_missing))

Missing value processed for monk1.
Missing value processed for monk2.
Missing value processed for monk3.
Missing value processed for balance_scale_B.
Missing value processed for balance_scale_L.
Missing value processed for tic_bin.
Missing value processed for car_evaluation_vgood.
Missing value processed for car_evaluation_good.
Missing value processed for car_evaluation_acc.
Missing value processed for kr-vs-kp.


In [44]:
len(each_missing['monk1']) / len(total_dataset['monk1']) # about 20 percent missing data.

0.2292418772563177

# Generating data for C++ code
Need phi_file, v_predefined_file and w_file
<a id='Graphfile'></a>
## Graph file, weight file and v predefined file
<a href=#TableOfContent> Back to Table of Content </a>

In [45]:
# create the graph file, i.e. the adjacency matrix of the (fully-connected) network, where each row is an instance.
def create_graphfile(df, filename):
    parent = os.path.dirname(filename)
    if os.path.basename(filename) in os.listdir(parent):
        print(f"{os.path.basename(filename)} already existed. Finished.")
        return None
    graphfile = pd.DataFrame(index=range(df.shape[0]), columns=range(df.shape[0]), dtype=np.int8)
    graphfile.iloc[:, :] = 1
    np.fill_diagonal(graphfile.values, 0)
    graphfile.to_csv(filename, header=False, index=False, sep= ' ', float_format='%.0f')

for data in total_dataset.keys():
    create_graphfile(total_dataset[data], os.path.join(root, "data2run", f"{data}_graph.txt"))
    print(f"The graph of {data} has been processed.")
    print()


The graph of monk1 has been processed.

The graph of monk2 has been processed.

The graph of monk3 has been processed.

The graph of balance_scale_B has been processed.

The graph of balance_scale_L has been processed.

The graph of tic_bin has been processed.

The graph of car_evaluation_vgood has been processed.

The graph of car_evaluation_good has been processed.

The graph of car_evaluation_acc has been processed.

The graph of kr-vs-kp has been processed.



In [46]:
# create the weight file uniform, i.e. the weighted adjacency matrix of the (fully-connected) network, where each row is an instance.
def create_weightfile_uniform(df, filename, overwrite=False):
    parent = os.path.dirname(filename)
    if not overwrite and os.path.basename(filename) in os.listdir(parent):
        print(f"{os.path.basename(filename)} already existed. Finished.")
        return None
    weightfile = pd.DataFrame(index=range(df.shape[0]), columns=range(df.shape[0]), dtype=np.int8)
    weightfile.iloc[:, :] = 1
    np.fill_diagonal(weightfile.values, 0)
    weightfile.to_csv(filename, header=False, index=False, sep= ' ', float_format='%.0f')

for data in total_dataset.keys():
    create_weightfile_uniform(total_dataset[data], os.path.join(root, "data2run", f"{data}_weight.txt"))
    print(f"The weight of {data} has been processed.")
    print()

The weight of monk1 has been processed.

The weight of monk2 has been processed.

The weight of monk3 has been processed.

The weight of balance_scale_B has been processed.

The weight of balance_scale_L has been processed.

The weight of tic_bin has been processed.

The weight of car_evaluation_vgood has been processed.

The weight of car_evaluation_good has been processed.

The weight of car_evaluation_acc has been processed.

The weight of kr-vs-kp has been processed.



In [47]:
def create_v_predefined(df, filename, percentage=0.2, testing=None, overwrite=False): # percentage of test data vs all data
    parent = os.path.dirname(filename)
    if not overwrite and os.path.basename(filename) in os.listdir(parent):
        print(f"{os.path.basename(filename)} already existed. Finished.")
        return None
    
    if testing is not None:
        # turn indices from testing into -1
        v_predefined = (df.iloc[:, 0]).copy() # the labels
        for i in testing:
            v_predefined[i] = -1
        v_predefined.to_csv(filename, header=False, index=False, sep=' ', float_format='%.0f')
    else:
        a = np.linspace(0, 1, df.shape[0])
        a = a < 1-percentage  # create a bool array of length number of rows, beginning with 80% of 1's followed by 20% of 0's
        a = pd.Series(a)
        a = a.astype(np.int8)
        a = (df['label'] * a)-1 + a # turns the beginning 80% into the same as df['label'] and remaining 20% all equals -1
        a.to_csv(filename, header=False, index=False, sep=' ', float_format='%.0f')

    
for data in total_dataset.keys():

    v_predefined = create_v_predefined(
        total_dataset[data], 
        os.path.join(root, "data2run", f'{data}_v_predefined.txt'),
        testing=each_missing[data],
        overwrite=True)
    print(f"The predefined label of {data} has been processed.")
    print()

The predefined label of monk1 has been processed.

The predefined label of monk2 has been processed.

The predefined label of monk3 has been processed.

The predefined label of balance_scale_B has been processed.

The predefined label of balance_scale_L has been processed.

The predefined label of tic_bin has been processed.

The predefined label of car_evaluation_vgood has been processed.

The predefined label of car_evaluation_good has been processed.

The predefined label of car_evaluation_acc has been processed.

The predefined label of kr-vs-kp has been processed.



<a id='Phi'></a>
## phi_file 
<a href=#TableOfContent> Back to Table of Content </a>

For telling who are friends and who are enemies. K-nearest are friends and K' farthest are enemies.

In [48]:
# run the find_K_neighbors n times (n = df.shape[0] is the number of rows). Store the lists into a matrix, saved in csv.
def create_phifile(df, filename, K=3, training=None, testing=None, overwrite=True):
    if testing is not None:
        training = [i for i in range(df.shape[0]) if i not in testing]
        
    parent = os.path.dirname(filename)
    if not overwrite and os.path.basename(filename) in os.listdir(parent):
        print(f"{os.path.basename(filename)} already existed. Finished.")
        return None

    phifile = pd.DataFrame(index=range(df.shape[0]), columns=range(df.shape[0]))
    distmatrix = calculate_distmatrix(df)
    for i in range(df.shape[0]):
        distances = distmatrix[i]
        phidata_list = find_distances_and_selected_neighbors(df, i, K, draw_take_all=False,
                                    training=training, distances=distances)
        # ignore all the enemies
        phidata_list_no_enemies = []
        for phi in phidata_list:
            if phi == 2:  # correspond to where phidown[i,j]==1
                phidata_list_no_enemies.append(0)  # store it as 0, meaning ignoring enemies
            else:
                phidata_list_no_enemies.append(phi)
        phidata_list = phidata_list_no_enemies
        
        phifile.iloc[i, :] = np.array(phidata_list)


    phifile.to_csv(filename, header=False, index=False, sep=' ', float_format='%.0f')


In [49]:
for name in total_dataset.keys():
    print(f"Processing {name} dataset.")
    create_phifile(total_dataset[name], 
                   os.path.join(root, "data2run", f"{name}_phifile.txt"), 
                   testing=each_missing[name],
                   overwrite=True)
print("Done")

Processing monk1 dataset.
Processing monk2 dataset.
Processing monk3 dataset.
Processing balance_scale_B dataset.
Processing balance_scale_L dataset.
Processing tic_bin dataset.
Processing car_evaluation_vgood dataset.
Processing car_evaluation_good dataset.
Processing car_evaluation_acc dataset.
Processing kr-vs-kp dataset.
Done


<a id='Weight'> </a>
## Create distance-parametered weight file
<a href=#TableOfContent> Back to Table of Content </a>

Calculate $d(i, j)$ the distance between i and j. 
Define for attractive neighbors (where j are one of those closer to i). The closer the distance, the higher the weight. We denote $M = \lceil median(d(i, :)) \rceil$
$$w(i, j) = M - d(i, j) + 1 \quad \mbox{ if } \quad d(i, j) <= M$$
Define for repulsive neighbors (where j are one of those closer to i). The farther the distance, the higher the weight.
$$w(i, j) = d(i, j) - M + 1 \quad \mbox{ if } \quad d(i, j) > M$$


In [50]:
from time import time

def weight(distance, median, dmax):
    if distance <= median:
        return median - distance + 1
    else:
        return distance - median +1
    
def calculate_weights_i(df, i, distances=None):
    median = distances.median()
    weights = distances.copy()
    for j in range(df.shape[0]):
        if distances[j] <= median:
            weights[j] = median -  distances[j] + 1
        else:
            weights[j] =  distances[j] - median +1
    return weights
    
        
        
def create_weightfile_distanced(df, filename, overwrite=False):
    parent = os.path.dirname(filename)
    
    if not overwrite and os.path.basename(filename) in os.listdir(parent):
        print(f"{os.path.basename(filename)} already existed. Finished.")
        return None
    
    weightfile = pd.DataFrame(index=range(df.shape[0]), columns=range(df.shape[0]), dtype=np.int8)
    begin=time()
    distmatrix = calculate_distmatrix(df)
    for i in range(df.shape[0]):
        weightfile.iloc[i, :] = calculate_weights_i(df, i, distances=distmatrix[i])

    weightfile.to_csv(filename, header=False, index=False, sep= ' ', float_format='%.0f')

In [51]:
for data in total_dataset.keys():
    print("-"*70 + f"\nRunning {data}\n" + "-"*70)
    create_weightfile_distanced(total_dataset[data], 
                            os.path.join(root, "data2run", f"{data}_weightfile_distance.txt"),
                               overwrite=True)


----------------------------------------------------------------------
Running monk1
----------------------------------------------------------------------
----------------------------------------------------------------------
Running monk2
----------------------------------------------------------------------
----------------------------------------------------------------------
Running monk3
----------------------------------------------------------------------
----------------------------------------------------------------------
Running balance_scale_B
----------------------------------------------------------------------
----------------------------------------------------------------------
Running balance_scale_L
----------------------------------------------------------------------
----------------------------------------------------------------------
Running tic_bin
----------------------------------------------------------------------
------------------------------------------