In [1]:
import numpy as np
from matplotlib import pyplot as plt
import PIL
import cv2
import time
import pickle
import methods as mt
import csv

In [2]:
# Create list of nodes/edges of each molecule
# (the former `[[]]` placeholder initialisations were dead code: both names
# are assigned from the parsed result below before anything reads them)
roots = mt.importMolecule('gxl')

# moleculeToList's result is indexed positionally: [0] = node lists, [1] = edge lists
listedMolecules = mt.moleculeToList(roots)
nodesList = listedMolecules[0]
edgesList = listedMolecules[1]

# Sanity check: show the first molecule of each list.
# Judging by the printed output, each entry is [molecule id, payload].
print(nodesList[0])
print(edgesList[0])

[10071, ['O', 'C', 'C', 'C', 'O', 'C', 'C', 'C', 'C', 'C', 'O', 'C']]
[10071, [[1, 2], [1, 3], [3, 4], [3, 5], [4, 6], [4, 7], [5, 8], [6, 9], [7, 10], [8, 11], [9, 12], [6, 8], [10, 12]]]


In [3]:
# Transform lists to dictionaries.
# nodesDict and edgesCountDict are later indexed by the molecule ids read from
# train.txt / valid.txt when the distance matrix is built.
nodesDict = mt.moleculeNodeListToDict(nodesList)
edgesDict = mt.moleculeEdgeListToDict(edgesList)
# Count number of adjacent edges of each node
edgesCountDict = mt.moleculeEdgesListToEdgeCountDict(nodesList, edgesDict)

## KNN Algorithm:
1. Compare every molecule from the train set with every molecule from the valid set:
 - for each pair, calculate the GED (graph edit distance) between the two molecules
 - to be more precise: approximate the GED using bipartite graph matching
 - build the cost matrix using the Dirac cost function (see lecture 9, slide 36)
 - use the Hungarian algorithm to find the optimal assignment (using an existing framework!)
2. For every molecule from the train set, find the K closest molecules in the valid set:
 - i.e. the K molecules with the shortest GED (since we only have two classes, active and inactive: use an odd K!)
 - using valid.txt, check whether the KNN (the K closest neighbour molecules) are active or inactive
 - assign the train molecule to the majority class among its K neighbours (and check whether the assignment was correct using train.txt, for accuracy measurement)
 - also: optimize for K (task for later)

In [4]:
# Read in train and valid files (len(train) + len(valid) = len(nodesList) (=len(edgesList)))
def _readLabelFile(path):
    """Parse a space-delimited label file.

    Each non-empty row is read as `<molecule id> <class label>`; the id is
    converted to int and the label is kept as a string.

    Returns a tuple (data, labelDict):
      data      -- list of [id, label] pairs in file order
      labelDict -- {id: {'class': label, 'value': 1 if label == 'a' else 0}}

    Raises ValueError if an id is not an integer and IndexError if a
    non-empty row has no second field.
    """
    data = []
    labelDict = dict()
    with open(path, newline='') as labelFile:
        for row in csv.reader(labelFile, delimiter=' '):
            if not row:
                # csv.reader yields an empty list for a blank line (e.g. a
                # trailing newline at the end of the file); skip it instead
                # of failing on row[0].
                continue
            moleculeId = int(row[0])
            label = row[1]
            data.append([moleculeId, label])
            # 'value' is the numeric vote used by the KNN cell: 1 for 'a', 0 otherwise
            labelDict[moleculeId] = {'class': label, 'value': 1 if label == 'a' else 0}
    return data, labelDict


trainData, trainDict = _readLabelFile('train.txt')
validData, validDict = _readLabelFile('valid.txt')

In [5]:
# Create distance matrix (takes approx. 23 min)
# Rows follow the order of trainData, columns the order of validData.
# A plain ndarray is used instead of np.matrix (which NumPy no longer
# recommends); element assignment, printing and np.save work the same way.
distanceMatrix = np.zeros((len(trainData), len(validData)))
start_time = time.time()
for i, trainEntry in enumerate(trainData):
    molecule1 = trainEntry[0]
    # These lookups depend only on the train molecule, so do them once per row
    # rather than once per (train, valid) pair.
    nodes1 = nodesDict[molecule1]
    edgesCount1 = edgesCountDict[molecule1]
    for j, validEntry in enumerate(validData):
        molecule2 = validEntry[0]
        # using Cn = Ce = 1
        dist = mt.BP(nodes1, nodesDict[molecule2], edgesCount1, edgesCountDict[molecule2])
        distanceMatrix[i, j] = dist
print("--- %s seconds ---" % (time.time() - start_time))
print(distanceMatrix)
# Persist the result so the expensive computation does not have to be repeated by hand
np.save("distanceMatrix.npy", distanceMatrix)

--- 1360.571976184845 seconds ---
[[19. 28. 29. ... 16. 12. 18.]
 [12. 13. 14. ... 17. 19. 21.]
 [14.  3.  4. ... 17. 19. 23.]
 ...
 [17. 26. 29. ... 18. 14. 18.]
 [ 9. 14. 15. ...  6.  8. 14.]
 [ 9. 18. 23. ...  6.  8.  8.]]


In [6]:
# Calculate accuracy
# `accuracy` holds the NUMBER of correctly classified train molecules;
# the ratio is printed separately at the end of the cell.
accuracy = 0
k = 5
for r in range(len(trainData)):
    # Flatten row r to 1-D (works whether distanceMatrix is an np.matrix or an ndarray)
    row = np.squeeze(np.asarray(distanceMatrix[r]))
    # extract k nearest neighbours: argpartition places the indices of the k
    # smallest distances in the first k positions (in no particular order)
    idx = np.argpartition(row, k)

    # Count how many of the k neighbours are active ('value' is 1 for 'a', 0 otherwise).
    # Named activeVotes rather than `sum` so the builtin sum() is not shadowed
    # for the rest of the kernel session.
    activeVotes = 0
    for i in range(k):
        activeVotes += validDict[validData[idx[i]][0]]['value']

    # assign molecule to class by majority vote: active if more than half of
    # the neighbours are active, inactive otherwise
    predictedClass = 'a' if activeVotes > k/2 else 'i'
    if trainDict[trainData[r][0]]['class'] == predictedClass:
        accuracy += 1

print(accuracy)
print(accuracy/len(trainData))

249
0.996
