In [1]:
import doctest
import random
import re
import time
import pickle
import xml.etree.ElementTree as ET
import json
import os
import sys
from nltk.tokenize import word_tokenize

In [2]:
def p_equal(p1, p2):
    if len(p1) > 0 and p1[-1] == '$':
        p1 = p1[:-1]
    if len(p2) > 0 and p2[-1] == '$':
        p2 = p2[:-1]
    return p1 == p2

def p_match(p1, p2):
    if p1[-1:] == ['$']:
        if len(p1) != len(p2) or p2[-1:] != ['$']:
            return False
    else:
        if len(p1) > len(p2):
            return False
    return p1 == p2[:len(p1)]

def p_diff(p1, p2):
    for i in range(len(p1)):
        if i >= len(p2) or p1[i] != p2[i]:
            return p1[i:]
    return []

In [5]:
SIMILARITY_THRESHOLD = 5
ADD_LINK_TIME = 10000
DISCRIMINATION_TIME = 10000
FAMILIARISATION_TIME = 2000

In [6]:
class Stm:
    class Item:   #create an item container with head and pointer to next container                
        def __init__(self, data):    
            self.next = None
            self.data = data

    def __init__(self, maxlen): #STM constructor
        self.maxlen = maxlen #STM length passed from Modality
        self.size = 0
        self.head = None
        self.tail = None

    def push(self, data):
        self.remove(data)
        e = Stm.Item(data)
        if self.size == 0:
            self.head = self.tail = e
        else:
            check = self.head.data.image.copy()
            k = 0
            # print('check similarity', check, data.image)
            matches = []
            for i in data.image:
                if i in check:
                    k += 1
                    matches.append((data.idx, i, check))
                    if k == SIMILARITY_THRESHOLD:
                        self.head.data.similarity_links.add(data)
                        data.similarity_links.add(self.head.data)
#                         print('------------------------')
#                         for idx, i, check in matches:
#                             print(data.idx, 'FOUND', i, 'in', check)
#                         print('------------------------')
                        break
                    check.remove(i)
            self.tail.next = e
            self.tail = e
        self.size += 1
        if self.size > self.maxlen:
            self.pop()

    def pop(self):
        if self.size == 0:
            return None
        else:
            e = self.head.data
            self.head = self.head.next
            self.size -= 1
            return e

    def remove(self, value):
        if self.size == 0:
            return
        if self.head.data == value:
            self.head = self.head.next
            self.size -= 1
        else:
            p = self.head
            e = self.head.next
            while e:
                if e.data == value:
                    p.next = e.next
                    self.size -= 1
                    if e == self.tail:
                        self.tail = p
                    break
                p = p.next
                e = e.next

    class Iterator:
        def __init__(self, e):
            self.e = e

        def __next__(self):
            if self.e:
                e = self.e
                self.e = e.next
                return e.data
            raise StopIteration()

    def __iter__(self):
        return Stm.Iterator(self.head)

In [7]:
class Link:
    def __init__(self, test, child):
        self.test = test
        self.child = child
    
    def passes(self, pattern):
        return p_match(self.test, pattern)

In [8]:
class Node:
    def __init__(self, nodes, contents, image, children):
        self.contents = contents
        self.image = image
        if len(self.image) > 1 and self.image[-1] == '$':
            self.image.pop()
        self.children = children
        self.idx = len(nodes)
        nodes.append(self)
        self.label = None
        self.similarity_links = set()

In [9]:
class Modality:
    def __init__(self, cogact, name, stm_size=3):
        self.cogact = cogact
        self.name = name
        self.stm = Stm(stm_size)
        self.net = Node(nodes=cogact.nodes, contents=['ROOT NODE'], image=[], children=[])

    def recognise(self, pattern):
        current_node = self.net
        children = current_node.children
        sorted_pattern = pattern
        next_link = 0
        
        while next_link < len(children):
            link = children[next_link]
            if link.passes(sorted_pattern):
                current_node = link.child
                children = current_node.children
                next_link = 0
                sorted_pattern = p_diff(sorted_pattern, link.test)
            else:
                next_link += 1
        self.stm.push(current_node)
        return current_node
    
    def recognise_and_learn(self, pattern):
        current_node = self.recognise(pattern)
        if current_node.image != pattern:
            if current_node == self.net or \
               not p_match(current_node.image, pattern) or \
               current_node.image[-1:] == ['$']:
                current_node = self.discriminate(current_node, pattern)
            else:
                current_node = self.familiarise(current_node, pattern)
        self.stm.push(current_node)
        return current_node
    
    def get_first(self, pattern):
        fst = pattern[:1]
        if len(fst) == 0 or fst == ['$']:
            return []
        else:
            return fst
    
    def familiarise(self, node, pattern):
        new_info = self.get_first(p_diff(pattern, node.image))
        if len(new_info) == 0:
            return node
        
        retrieved_chunk = self.recognise(new_info)
        if retrieved_chunk == self.net:
            return self.learn_primitive(new_info)
        
        if node.image[:-1] == ['$']:
            node.image.pop()
        node.image += new_info
        self.cogact.clock += FAMILIARISATION_TIME
        return node
    
    def discriminate(self, node, pattern):
        new_info = p_diff(pattern, node.contents)
        
        if len(new_info) == 0 or new_info == ['$']:
            new_info = ['$']
            if self.recognise(new_info).contents == new_info:
                return self.add_test(node, new_info)
            else:
                child = Node(nodes=self.cogact.nodes, contents=new_info, image=new_info, children=[])
                self.net.children.append(Link(new_info, child))
                return child
        
        chunk = self.recognise(new_info)
        if chunk == self.net:
            return self.learn_primitive(self.get_first(new_info) or ['$'])
        elif p_match(chunk.contents, new_info):
            return self.add_test(node, chunk.contents.copy())
        else:
            return self.add_test(node, self.get_first(new_info))
            

    def add_test(self, node, pattern):
        for child in node.children:
            if child.test == pattern:
                return node
        if node == self.net:
            pat = pattern.copy()
        else:
            pat = node.contents + pattern
        child = Node(nodes=self.cogact.nodes, contents=pat, image=pat.copy(), children=[])
        node.children.append(Link(pattern, child))
        self.cogact.clock += DISCRIMINATION_TIME
        return child
    
    def learn_primitive(self, pattern):
        assert len(pattern) == 1
        child = Node(nodes=self.cogact.nodes, contents=pattern, image=[], children=[])
        self.net.children.append(Link(pattern, child))
        self.cogact.clock += DISCRIMINATION_TIME
        return child
    
    def print_tree(self, node=None, parent=[], level=0):
        if not node:
            node = self.net
        indent = '-------' * level
        contents = '< ' + ' '.join(p_diff(node.contents, parent)) + ' >'
        text = 'Node: ' + str(node.idx)
        image = '< '+ ' '.join(node.image) + ' >'
        
        string = [indent + contents, text, image]
        if node.label is not None:
            string.append('(' + str(node.label) + ')')
        if node.similarity_links:
            string.append({i.idx for i in node.similarity_links})
        print(*string)

        for child in node.children:
            self.print_tree(child.child, node.contents, level + 1)

    def print_stm(self):
        print('STM: ', end='')
        for node in self.stm:
            print(node.idx, '<', *node.image, '>', end=', ')
        print()


In [10]:
class CogAct:
    def __init__(self):
        self.nodes = []
        self.visual = Modality(self, 'visual')
        self.verbal = Modality(self, 'verbal')
        self.action = Modality(self, 'action')
        self.modalities = {
            'visual': self.visual,
            'verbal': self.verbal,
            'action': self.action,
        }
        self.clock = 0

# TESTS ====================================================

### Binary logic functions

In [11]:
def binary_logic(): #logical functions like AND, OR, XOR
    
    cogact = CogAct() #comment out to learn on top of previous knowledge, uncomment to train from blank slate
    inp = [list(map(list, x.split('~'))) for x in ['11$~T$', '00$~F$','01~F$', '10~F$']]
    print('inp=', inp)

    number_of_nodes =0

    #random.shuffle(inp) #shuffle input patterns
    for i in range(5000): #set an arbitary numebr of training cycles
        errors = 0        #set a stop condition for he training loop
        for pattern in inp:
            vis, verb = pattern
            time = cogact.clock   #check time before learning
            node1 = cogact.visual.recognise_and_learn(vis) #if learning is neccessary, it will add to time
            node2 = cogact.verbal.recognise_and_learn(verb)
            
            if node1.idx>number_of_nodes: #node counter
                number_of_nodes = node1.idx
            
            if cogact.clock == time: #if no time spent on learning means the pattern is learnt -> create naming link
                node1.label = node2.idx
                node2.label = node1.idx
            else:
                errors += 1
        if errors == 0:
            break

    cogact.visual.print_tree()
    cogact.verbal.print_tree()
    
        
    tests = list(map(list, (
         '11$','01$','10$','00$'
    )))
    window_size = 7 #attention window size
    
    print('\n-------- MAX IMAGE METHOD --------\n') 
    try:
        for test in tests: #for each of the test patterns
            big_chunks ={}
            print()
            print('test_pattern:', ''.join(test))
            label = None
            max_image = []  #intiialise the largest chunk activation
            for window_start in range(len(test)-2): #for each attention window start points from 0 to length of pattern-2
                
                if window_start + window_size < len(test):   #if the attention window size fits inside the pattern                 
                    window_end = window_start + window_size
                else:                                        #if the attention window size does not fit inside the pattern
                    window_end=None
                    
                attention_window = test[window_start:window_end] #
                print('Attention window:', ''.join(attention_window))

                node = cogact.visual.recognise(attention_window)
               
                if node.label is not None: #if label is not None                    
                    if len(node.image) > len(max_image): #if the size of the current chunk is bigger than the largest chunk
                        max_image = node.image   #set the current chunk to be the largest chunk
                        label = node.label       #take the naming link of the current chunk to be the label
                        big_chunks[node.label] = big_chunks.get(node.label, 0) + 1
                    print('recognises', ''.join(test),'as',''.join(max_image),'label:', cogact.nodes[label].image)
            lab_sum = lab_sum = sum(big_chunks.values())
            cogact_normalised = [(k, v / lab_sum) for k, v in big_chunks.items()]
            top = sorted(cogact_normalised, key=lambda x: -x[1])
            for x in top:
                print('%s (prob = %s)' % (''.join(cogact.nodes[x[0]].image), round(x[1], 2)))
            winner = ''.join(cogact.nodes[top[0][0]].image) if len(top) > 0 else None
            #print('correct:', winner == correct_label)

            print('final big_chunks scores', big_chunks)

    except:
            
        print('None found!')
#     print(cogact.visual.recognise(list('LIVERPOOZ$')).image )
    print('number_of_nodes=', number_of_nodes)

binary_logic()

inp= [[['1', '1', '$'], ['T', '$']], [['0', '0', '$'], ['F', '$']], [['0', '1'], ['F', '$']], [['1', '0'], ['F', '$']]]
< ROOT NODE > Node: 0 <  >
-------< 1 > Node: 3 < 1 1 > (4)
--------------< 0 > Node: 8 < 1 0 > (6)
-------< 0 > Node: 5 < 0 0 > (6)
--------------< 1 > Node: 7 < 0 1 > (6)
< ROOT NODE > Node: 1 <  >
-------< T > Node: 4 < T > (3)
-------< F > Node: 6 < F > (8)

-------- MAX IMAGE METHOD --------


test_pattern: 11$
Attention window: 11$
recognises 11$ as 11 label: ['T']
T (prob = 1.0)
final big_chunks scores {4: 1}

test_pattern: 01$
Attention window: 01$
recognises 01$ as 01 label: ['F']
F (prob = 1.0)
final big_chunks scores {6: 1}

test_pattern: 10$
Attention window: 10$
recognises 10$ as 10 label: ['F']
F (prob = 1.0)
final big_chunks scores {6: 1}

test_pattern: 00$
Attention window: 00$
recognises 00$ as 00 label: ['F']
F (prob = 1.0)
final big_chunks scores {6: 1}
number_of_nodes= 8


### 5-4 Faces / Verbal occlusions

In [12]:
def zliverpool_atten_big_chunk(): #the 5-4 artifical categories binary faces experiment
    
    cogact = CogAct() #comment out to learn on top of previous knowledge, uncomment to train from blank slate
    #inp = [list(map(list, x.split('~'))) for x in ['1110$~A$', '1010$~A$','1011$~A$','1101$~A$','0111$~A$','1100$~B$',
    #                                               '1100$~B$','0110$~B$','0001$~B$','0000$~B$',]]
    inp = [list(map(list, x.split('~'))) for x in ['LIVERPOOL$~A$', 'MANCHESTER$~B$', 'LONDON$~C$']]
    number_of_nodes =0

    #random.shuffle(inp) #shuffle input patterns
    for i in range(5000): #set an arbitary numebr of training cycles
        errors = 0        #set a stop condition for he training loop
        for pattern in inp:
            vis, verb = pattern
            time = cogact.clock   #check time before learning
            node1 = cogact.visual.recognise_and_learn(vis) #if learning is neccessary, it will add to time
            node2 = cogact.verbal.recognise_and_learn(verb)
            
            if node1.idx>number_of_nodes: #node counter
                number_of_nodes = node1.idx
            
            if cogact.clock == time: #if no time spent on learning means the pattern is learnt -> create naming link
                node1.label = node2.idx
                node2.label = node1.idx
            else:
                errors += 1
        if errors == 0:
            break

    cogact.visual.print_tree()
    cogact.verbal.print_tree()
    
        
    #tests = list(map(list, ('1001$','1000$','1111$','0010$','0101$','0011$','0100$'  )))
    tests = list(map(list, (
         'ZZLIVERZOOL$','LIVERPOOZ$','LZVERPOOL$', 'LIVER$', 'L$')))
    
    window_size = 7 #attention window size
    
    print('\n-------- MAX IMAGE METHOD --------\n') 
    try:
        for test in tests: #for each of the test patterns
            big_chunks ={}
            print()
            print('test_pattern:', ''.join(test))
            label = None
            max_image = []  #intiialise the largest chunk activation
            for window_start in range(len(test)-2): #for each attention window start points from 0 to length of pattern-2
                
                if window_start + window_size < len(test):   #if the attention window size fits inside the pattern                 
                    window_end = window_start + window_size
                else:                                        #if the attention window size does not fit inside the pattern
                    window_end=None
                    
                attention_window = test[window_start:window_end] #
                print('Attention window:', ''.join(attention_window))

                node = cogact.visual.recognise(attention_window)
               
                if node.label is not None: #if label is not None                    
                    if len(node.image) > len(max_image): #if the size of the current chunk is bigger than the largest chunk
                        max_image = node.image   #set the current chunk to be the largest chunk
                        label = node.label       #take the naming link of the current chunk to be the label
                        big_chunks[node.label] = big_chunks.get(node.label, 0) + 1
                    print('recognises', ''.join(test),'as',''.join(max_image),'label:', cogact.nodes[label].image)
            lab_sum = lab_sum = sum(big_chunks.values())
            cogact_normalised = [(k, v / lab_sum) for k, v in big_chunks.items()]
            top = sorted(cogact_normalised, key=lambda x: -x[1])
            for x in top:
                print('%s (prob = %s)' % (''.join(cogact.nodes[x[0]].image), round(x[1], 2)))
            winner = ''.join(cogact.nodes[top[0][0]].image) if len(top) > 0 else None
            print('Winner label:', winner)
            #print('correct:', winner == correct_label)

            print('final big_chunks scores', big_chunks)

    except:
            
        print('None found!')
#     print(cogact.visual.recognise(list('LIVERPOOZ$')).image )
    print('number_of_nodes=', number_of_nodes)
zliverpool_atten_big_chunk()

< ROOT NODE > Node: 0 <  >
-------< L > Node: 3 < L I V E R P O O L > (4)
--------------< O > Node: 11 < L O N D O N > (7)
-------< M > Node: 5 < M A N C H E S T E R > (6)
-------< I > Node: 8 <  >
-------< O > Node: 9 <  >
-------< A > Node: 10 <  >
-------< V > Node: 12 <  >
-------< N > Node: 13 <  >
-------< E > Node: 14 <  >
-------< C > Node: 15 <  >
-------< D > Node: 16 <  >
-------< R > Node: 17 <  >
-------< H > Node: 18 <  >
-------< P > Node: 19 <  >
-------< S > Node: 20 <  >
-------< T > Node: 21 <  >
< ROOT NODE > Node: 1 <  >
-------< A > Node: 4 < A > (3)
-------< B > Node: 6 < B > (5)
-------< C > Node: 7 < C > (11)

-------- MAX IMAGE METHOD --------


test_pattern: ZZLIVERZOOL$
Attention window: ZZLIVER
Attention window: ZLIVERZ
Attention window: LIVERZO
recognises ZZLIVERZOOL$ as LIVERPOOL label: ['A']
Attention window: IVERZOO
Attention window: VERZOOL
Attention window: ERZOOL$
Attention window: RZOOL$
Attention window: ZOOL$
Attention window: OOL$
Attention windo

## Chess

In [13]:
#train

In [14]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Mit
[nltk_data]     Ere\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
sep_sent = '\.|!|\?|\.\.\.|\;|…'
sep_phrases = '\.|!|\?|\.\.\.|…|,|:|;|-|—|"|\'|&'
sep_number = 64

In [16]:
def get_sentences_test(text, max_len=120): #chess only
    sentences = []
    #words = [*filter(lambda word: word not in stopwords, re.split('[^a-zа-я]+', text.lower()))] #uncomment to sanitise text
    #words = re.split('[^a-zа-я]+', text.lower())
    words = re.split('$\n', text)
    for i in range(len(words)):
        sentences.append(' '.join(words[i:i + max_len]))

    return sentences

In [17]:
def nonblank_lines(f):
    for l in f:
        line = l.rstrip()
        if line:
            yield line

In [18]:
def get_label_pattern_pairs(folder_location): #chess only

    output_labels = []
    output_sentences = []

    #folder = 'literature/train_texts'
    folder = folder_location
    for label in os.listdir(folder):
        p = os.path.join(folder, label)
        if os.path.isdir(p):
            for file in os.listdir(p):
                file = os.path.join(p, file)
                with open(file, 'r',  encoding='utf-8', errors='ignore') as f:
#                     for sentence in get_sentences(f.read()):
#                         #print('SNETENCE****', sentence)
#                         output_labels.append(label)
#                         output_sentences.append(sentence)

                     for line in nonblank_lines(f):
                        #text = line.split('-')[0].split('(')[0].strip() #strip text after - or (
                        text = line.split('\n') #strip text after "-" or "space"
                        #text_list.append(text)
                        output_sentences.extend(text)
                        #print(text)
                        output_labels.append(label + '$')
    ####################                    
    train_zip = zip(output_labels, output_sentences)  
    train_zip_list = list(train_zip)
    random.shuffle(train_zip_list) #uncomment to RANDOM shuffle data, see end page for refresh on zip/shuffle
    output_labels_tuple, output_sentences_tuple = zip(*train_zip_list)
    output_labels = list(output_labels_tuple)
    output_sentences = list(output_sentences_tuple)
    #return output_labels, output_sentences
    return  train_zip_list

In [19]:
#Training chess
test_folder = 'chess tests/train_c'
inp_train = list(get_label_pattern_pairs(test_folder))
inp_train

[('sicilian$',
  ' _ _ _ _ _ r k _ / _ _ r b _ p p p / p _ _ R p _ _ _ / _ p _ _ _ _ P n / _ _ _ _ P P _ _ / P _ _ _ n _ _ _ / _ P P _ N _ _ P / _ K _ _ _ B _ R;'),
 ('french$',
  ' _ _ _ _ _ _ _ _ / p _ _ _ _ _ p k / _ _ _ _ p r N p / _ p _ p _ _ _ _ / _ _ p n _ _ _ P / q _ P _ _ _ Q _ / _ _ P _ _ _ P K / _ _ _ _ R _ _ _;'),
 ('french$',
  ' _ _ _ _ _ _ k _ / p _ _ _ r _ p p / R _ b _ _ _ _ q / _ _ _ p _ p _ P / _ _ p P r P _ _ / _ _ P _ _ Q _ _ / _ _ P K B R _ _ / _ _ _ _ _ _ _ _;'),
 ('french$',
  ' r _ _ _ _ _ r _ / _ _ _ _ k _ _ p / p _ _ _ b _ _ _ / _ p _ p _ _ _ _ / _ _ _ R p P _ P / _ P _ _ _ _ _ _ / P _ P _ B P _ _ / _ _ K _ _ _ _ R;'),
 ('french$',
  ' _ _ _ _ k _ _ r / _ _ _ _ n p _ _ / _ p _ _ p _ _ _ / _ q b _ P p N P / p _ p _ _ P _ _ / _ _ P r _ _ _ _ / P P _ _ Q _ P _ / _ R B _ K _ _ R;'),
 ('french$',
  ' _ _ _ r _ _ _ _ / p p _ _ _ _ p k / _ _ n _ _ _ _ q / _ _ _ _ P _ n _ / _ _ _ _ _ _ _ _ / R _ _ _ _ _ _ _ / P P b _ N Q P P / R _ _ _ K _ _ _;'),
 ('french$',
  ' _ _

In [20]:
#Testing
test_folder = 'chess tests/test_c'
inp_test = list(get_label_pattern_pairs(test_folder))
inp_test

[('french$',
  ' _ _ k r _ _ _ _ / p b _ q n p _ _ / B p _ _ p _ _ _ / _ _ _ _ _ _ P _ / _ _ _ _ _ B _ _ / P _ p _ _ _ P _ / _ _ _ _ Q _ _ P / R _ _ _ K _ _ _;'),
 ('sicilian$',
  ' _ _ r _ _ _ r k / _ _ _ _ _ q _ p / p _ _ p _ b _ _ / _ p _ N _ _ _ _ / _ _ n N P _ _ _ / _ _ _ Q _ _ _ _ / P P P _ _ _ _ _ / _ K _ _ _ R _ R;'),
 ('french$',
  ' _ _ r _ _ _ _ k / p _ r _ n _ p p / b p _ _ _ _ _ _ / _ _ _ p _ p _ _ / _ _ _ P p P _ _ / P _ P _ B _ P _ / R _ _ _ N _ _ P / _ _ R _ _ _ K _;'),
 ('french$',
  ' _ _ _ _ _ _ _ Q / p k _ q _ _ _ p / _ p n _ N _ b _ / _ _ _ _ _ _ _ _ / _ _ p _ _ _ _ B / P _ _ r _ _ _ P / _ _ _ _ _ P P K / _ _ _ _ R _ _ _;'),
 ('sicilian$',
  ' _ q _ _ _ r _ _ / r _ _ _ _ _ k _ / p _ _ p _ _ n p / _ p _ N _ _ b _ / _ _ _ _ P _ _ _ / Q _ _ _ _ _ _ _ / P P P _ B _ _ _ / _ K _ _ _ _ R R;'),
 ('sicilian$',
  ' r _ _ _ _ r _ k / _ _ _ _ _ _ b p / p _ _ p B _ p _ / q p _ N n _ _ _ / _ _ _ _ P _ _ _ / _ _ _ _ _ _ _ _ / P P P _ Q _ _ P / _ K _ _ _ R R _;'),
 ('french$',
  '

In [21]:
cogact = CogAct() #comment out to learn on top of previous knowledge, uncomment to train from blank slate
def cogact_chess_learn(): 
    
    #cogact = CogAct() #comment out to learn on top of previous knowledge, uncomment to train from blank slate
#     inp = [list(map(list, x.split('~'))) for x in ['11$~T$', '00$~F$','01~F$', '10~F$']]
#     print('inp=', inp)
    inp = inp_train
    number_of_nodes =0

    random.shuffle(inp) #shuffle input patterns
    for i in range(5000): #set an arbitary numebr of training cycles
        errors = 0        #set a stop condition for he training loop
        for pattern in inp:
            verb, vis = pattern
            time = cogact.clock   #check time before learning
            node1 = cogact.visual.recognise_and_learn(word_tokenize(vis)) #if learning is neccessary, it will add to time
            node2 = cogact.verbal.recognise_and_learn(word_tokenize(verb))
            
            if node1.idx>number_of_nodes: #node counter
                number_of_nodes = node1.idx
            
            if cogact.clock == time: #if no time spent on learning means the pattern is learnt -> create naming link
                node1.label = node2.idx
                node2.label = node1.idx
            else:
                errors += 1
        if errors == 0:
            break

    cogact.visual.print_tree()
    cogact.verbal.print_tree()
    print('number_of_nodes=', number_of_nodes)

cogact_chess_learn()

< ROOT NODE > Node: 0 <  >
-------< _ > Node: 3 < _ _ _ _ _ > {5, 8, 9, 10, 11, 12, 14, 15, 16, 18, 20, 21, 23, 24, 25, 26, 29, 31, 32, 33, 35, 36, 37, 39, 40, 41, 42, 44, 45, 47, 49, 51, 53, 54, 55, 56, 57, 58, 59, 60, 61, 63, 64, 65, 66, 67, 68, 69, 70, 71, 76, 77, 78, 79, 81, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 96, 97, 98, 100, 101, 102, 103, 104, 105, 109, 110, 112, 113, 117, 118, 119, 121, 122, 124, 125, 127, 128, 129, 130, 131, 132, 135, 136, 137, 138, 140, 141, 143, 144, 145, 146, 147, 148, 149, 152, 153, 154, 155, 156, 159, 160, 161, 162, 163, 164, 165, 166, 167, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 182, 183, 184, 185, 186, 187, 188, 189, 190, 192, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 210, 212, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 229, 230, 232, 233, 234, 236, 237, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 

-----------------------------------< _ _ k > Node: 152 < _ _ r r _ _ _ k / _ _ _ _ _ p p _ / p _ b _ p _ _ p / _ p _ _ _ _ _ n / _ _ _ P N q _ _ / _ _ _ _ Q _ _ _ / P P _ N _ P P P / _ _ R _ R _ K _ ; > (4) {416, 3, 549, 550, 261, 585, 138, 490, 587, 368, 308, 309, 150, 151, 284}
----------------------------< _ k > Node: 32 < _ _ r _ k _ r _ / _ p > {3, 13, 18, 22, 23, 26, 27, 35, 36, 37, 42, 43, 45, 49, 50, 57, 58, 64, 65, 66, 76, 77, 92, 105}
-----------------------------------< _ _ _ > Node: 82 < _ _ r _ k _ _ _ / _ p > {98, 105, 106, 107, 81, 50, 83, 116, 84, 120, 90, 125}
------------------------------------------< / > Node: 139 < _ _ r _ k _ _ _ / _ _ > {224, 34, 164, 232, 138, 108, 174, 50, 83, 22, 57, 154, 127}
-------------------------------------------------< p > Node: 175 < _ _ r _ k _ _ _ / p _ _ q > {288, 193, 3, 287, 325, 71, 299, 300, 365, 250, 412, 127}
--------------------------------------------------------< _ _ > Node: 432 < _ _ r _ k _ _ _ / p _ _ q n r p p / _ p _ 

In [22]:
sep_sent = '\.|!|\?|\.\.\.|\;|…'
sep_phrases = '\.|!|\?|\.\.\.|…|,|:|;|-|—|"|\'|&'
sep_number = 8

In [23]:
def get_sentences(text, sep, min_len=3): #number of words?
    patterns = []
    
    if isinstance(sep, int): #take sep_number words otherwise split by punctuation
        words = [word for word in re.split('[^A-Za-zа-я0-9_/]+', text)] # if len(word) >= 3] #option to remove short words
        for i in range(0, len(words), sep):
            patterns.append(words[i:i + sep] )
    else:
        for sent in re.split(sep, text):
            sent = list(filter(lambda word : len(word) >= 3, re.split('[^A-Za-zа-я0-9_/]+', sent)))
            if len(sent) >= min_len:
                patterns.append(sent + ['$'])
    return patterns

In [24]:
def train(folder, get_patterns=None): #Same as above, easier interface for multiple categories
    inp = []

    for label in os.listdir(folder):
        p = os.path.join(folder, label)
        if os.path.isdir(p):
            for file in os.listdir(p):
                file = os.path.join(p, file)
                with open(file, 'r',  encoding='utf-8', errors='ignore') as f:
                    patterns = get_patterns(f.read())
                inp += list(zip((list(p) + ['$'] for p in patterns), [[label, '$']] * len(patterns)))
                #print('inp', inp)
    
    random.shuffle(inp) #shuffle input patterns
    cogact = CogAct() #comment out to learn on top of previous knowledge, uncomment to train from blank slate
    for i in range(500): #default 300
        errors = 0
#         if i//10==0:
        print(i, 'training cylce')
        i = 0
        n = len(inp)
        for pattern in inp:
            # print('handle pattern', i, '/', n)
            i += 1
            vis, verb = pattern
            t = cogact.clock
            node1 = cogact.visual.recognise_and_learn(vis)
            node2 = cogact.verbal.recognise_and_learn(verb)
            if cogact.clock != t:
                errors += 1
                continue
            node1.label = node2.idx
            node2.label = node1.idx
        if errors == 0:
            break
    return cogact

In [34]:
start = time.time()
def train_chess(folder, sep): 
    return train(folder, lambda text: get_sentences(text, sep))
end = time.time()
cogact = train_chess('chess tests/train_c',7)
print(end - start)

0 training cylce
1 training cylce
2 training cylce
3 training cylce
4 training cylce
5 training cylce
6 training cylce
7 training cylce
0.0


In [26]:
len(cogact.nodes)

2975

In [27]:
cogact.verbal.print_tree()

< ROOT NODE > Node: 1 <  >
-------< french > Node: 4 < french > (2038)
-------< sicilian > Node: 6 < sicilian > (2300)


In [28]:
cogact.visual.print_tree()

< ROOT NODE > Node: 0 <  >
-------< N > Node: 3 < N _ _ P P / R > (4) {1896, 41, 1354, 875, 1676, 845, 1808, 1905, 2673, 1713, 2385, 1137, 594, 443, 1179, 1694}
--------------< N > Node: 83 < N N _ / P _ _ > (6) {2361}
--------------< / p > Node: 105 < N / p p _ n _ > (4)
--------------< _ K > Node: 303 < N _ K _ P P / > (6) {2100}
---------------------< B > Node: 2480 < N _ K B _ _ _ > (4) {1457}
--------------< _ B > Node: 407 < N _ B R _ _ _ > (6)
---------------------< / > Node: 2531 < N _ B / _ _ _ > (6) {337, 409, 404}
--------------< _ _ _ / > Node: 540 < N _ _ _ / n _ > (4) {752, 2634, 751}
---------------------< _ _ > Node: 1154 < N _ _ _ / _ _ > (4) {1152, 571}
---------------------< _ P > Node: 1246 < N _ _ _ / _ P > (6) {1249, 362, 2571, 123, 1244}
--------------< _ _ n > Node: 558 < N _ _ n / _ _ > (6) {2576, 556, 557, 559}
--------------< P / > Node: 591 < N P / _ _ P _ > (4)
---------------------< P P _ > Node: 2588 < N P / P P _ _ > (6) {2589}
---------------------< _ _

-----------------------------------< P > Node: 613 < _ / _ P P B _ > (4) {384, 966, 7, 2952, 967}
------------------------------------------< Q > Node: 2596 < _ / _ P P Q _ > (6) {612, 614}
-----------------------------------< _ _ q > Node: 1313 < _ / _ P _ _ q > (6) {145, 1314}
-----------------------------------< R > Node: 1335 < _ / _ P R _ _ > (6) {1336, 1096, 941, 1334}
-----------------------------------< r > Node: 1688 < _ / _ P r _ _ > (6) {123, 148, 2406, 7}
------------------------------------------< P _ > Node: 2834 < _ / _ P r P _ > (4) {1687}
-----------------------------------< n > Node: 2068 < _ / _ P n _ p > (6) {2065, 2066, 2898, 2067, 2070}
-----------------------------------< _ P > Node: 2160 < _ / _ P _ P _ > (6) {185, 395, 2435}
-----------------------------------< _ p > Node: 2413 < _ / _ P _ p _ > (4) {2415, 760, 155, 156, 158}
----------------------------< N > Node: 273 < _ / _ N P _ _ > (6) {452, 2150, 2950, 7, 653, 243, 2675}
----------------------------------

-----------------------------------< p > Node: 2041 < _ _ p _ p / p > (4) {833, 464, 2161, 2162, 1301}
----------------------------< b > Node: 1185 < _ _ p b _ / _ > (6) {2817, 2466, 260, 263, 264, 2634, 749, 751, 436, 2102}
-----------------------------------< p _ / > Node: 2734 < _ _ p b p _ / > (6) {279}
-----------------------------------< _ / p > Node: 2904 < _ _ p b _ / p > (6) {568, 716, 2101, 2102}
----------------------------< k _ / _ > Node: 1252 < _ _ p k _ / _ > (4) {1250, 1253, 1254}
----------------------------< _ p > Node: 1301 < _ _ p _ p _ / > (4) {643, 708, 132, 2185, 2187, 2162, 599, 2041, 730, 1303}
----------------------------< _ Q > Node: 1656 < _ _ p _ Q p / > (4) {1654}
----------------------------< _ n > Node: 2151 < _ _ p _ n _ / > (4) {376, 2150}
----------------------------< r > Node: 2216 < _ _ p r b p / > (6) {914}
---------------------< B > Node: 25 < _ _ B N _ >
----------------------------< Q > Node: 159 < _ _ B Q _ / _ > (6) {1920, 1921, 304, 184}
----

----------------------------< N > Node: 1666 < _ _ P N _ / P > (6) {1664, 1514, 1100, 2831, 1777, 84}
-----------------------------------< _ Q > Node: 2852 < _ _ P N _ Q _ > (4)
-----------------------------------< _ / _ > Node: 2917 < _ _ P N _ / _ > (4) {419, 231}
----------------------------< P P / > Node: 1702 < _ _ P P P / _ > (6) {185, 1957, 1567}
----------------------------< _ / p > Node: 1705 < _ _ P _ / p P > (4) {2960, 2841, 338, 1540}
----------------------------< _ _ n P > Node: 2061 < _ _ P _ _ n P > (6) {1091}
----------------------------< _ _ Q > Node: 2420 < _ _ P _ _ Q B > (4) {2236, 2237, 2239}
-----------------------------------< _ > Node: 2931 < _ _ P _ _ Q _ > (6) {169, 196}
---------------------< n > Node: 48 < _ _ n _ _ / _ > (4) {515, 7, 19, 22, 24, 1696, 677, 38, 1194, 1195, 941, 2735, 1336, 1339, 572, 702, 64, 457, 2007, 600, 2009, 87, 93, 871, 123}
----------------------------< P > Node: 116 < _ _ n P r _ _ > (6) {424, 2535}
---------------------------------

----------------------------< p b > Node: 2537 < _ B P p b _ _ > (4)
---------------------< _ P _ _ / > Node: 783 < _ B _ P _ _ / > (6) {1731, 555, 368, 20, 2649}
---------------------< _ _ _ _ P > Node: 904 < _ B _ _ _ _ P > (6) {104, 906, 902}
---------------------< _ _ _ k / > Node: 1148 < _ B _ _ _ k / > (4) {47, 24, 1146, 1147, 1149, 1150}
---------------------< _ _ _ Q / > Node: 1292 < _ B _ _ _ Q / > (6) {1290, 1291, 1293, 112, 26}
---------------------< _ _ Q > Node: 1322 < _ B _ _ Q _ / > (6) {737, 2756, 1325}
---------------------< _ _ r > Node: 1434 < _ B _ _ r _ _ > (4) {1432, 296, 1435}
---------------------< _ _ _ R > Node: 1523 < _ B _ _ _ R _ > (6) {304, 1521, 243}
---------------------< _ R > Node: 1572 < _ B _ R _ _ _ > (6) {737, 1573, 7, 456, 177}
----------------------------< _ r > Node: 2899 < _ B _ R _ r _ > (6) {2075, 1340}
---------------------< N _ _ _ _ > Node: 1744 < _ B N _ _ _ _ > (6) {2962, 2850, 421}
---------------------< _ _ _ _ / > Node: 1894 < _ B _ _

----------------------------< B P _ > Node: 1886 < P _ / B P _ _ > (6) {1885, 982}
----------------------------< _ _ R > Node: 2432 < P _ / _ _ R _ > (4) {857, 196, 197, 2431}
---------------------< _ / _ _ > Node: 217 < P _ _ / _ _ _ > (4) {7, 2444, 30, 161, 33, 165, 444, 64, 707, 80, 1625, 218, 219, 1627, 2399, 870, 104, 1385, 1384, 1386, 1011}
----------------------------< p > Node: 1211 < P _ _ / _ _ p > (6) {2273, 2274, 910, 1456, 213, 1209, 1210, 1212, 1213, 1214}
---------------------< P / > Node: 229 < P _ P / _ _ _ > (4) {1056, 2082, 867, 1464, 7, 328, 1194, 2602, 568, 46, 435, 2682, 823, 1368, 922, 152, 639}
----------------------------< R > Node: 1165 < P _ P / R _ _ > (4)
----------------------------< _ _ P > Node: 1612 < P _ P / _ _ P > (6) {16, 1354, 2826, 2140}
----------------------------< _ P > Node: 2448 < P _ P / _ P P > (6)
----------------------------< _ K > Node: 2456 < P _ P / _ K _ > (6) {2457, 1693, 247}
---------------------< _ Q > Node: 261 < P _ _ Q _ P _ > 

--------------< P _ > Node: 269 < n P _ p _ _ / > (4) {2467, 1163}
---------------------< _ _ / _ > Node: 2467 < n P _ _ _ / _ > (6) {268, 269}
--------------< _ b > Node: 300 < n _ b _ / _ _ > (6) {136, 244}
--------------< _ _ _ P > Node: 320 < n _ _ _ P _ / > (6) {322, 318, 2487}
--------------< _ _ n P > Node: 392 < n _ _ n P b _ > (4) {393, 113}
--------------< p _ _ / _ > Node: 418 < n p _ _ / _ _ > (6) {1012, 1013, 1015}
---------------------< p > Node: 2533 < n p _ _ / _ p > (4)
--------------< _ _ _ / > Node: 485 < n _ _ _ / p _ > (4) {532, 535, 1670, 534}
---------------------< _ _ > Node: 559 < n _ _ _ / _ _ > (6) {131, 484, 104, 558, 272, 561, 210}
--------------< q > Node: 522 < n q p _ p / b > (4)
--------------< B > Node: 535 < n B _ P p / _ > (6) {485}
--------------< _ _ / _ > Node: 549 < n _ _ / _ _ P > (6) {1216, 561, 1218, 1696}
---------------------< Q > Node: 715 < n _ _ / _ Q n > (4)
---------------------< _ _ > Node: 1350 < n _ _ / _ _ _ > (4) {1352, 1353, 204, 

## chess test

In [29]:
#Testing
test_folder = 'chess tests/test_c'
inp_test = list(get_label_pattern_pairs(test_folder))
inp_test

[('sicilian$',
  ' _ _ r _ _ q _ k / _ _ _ _ _ _ b p / p _ _ p _ _ p _ / _ _ _ N _ _ _ _ / _ p _ N P _ Q P / _ _ P n _ _ _ _ / P P _ _ _ _ _ _ / _ K _ _ _ _ R _;'),
 ('french$',
  ' _ n r _ _ _ k _ / r _ _ _ _ p _ p / q _ _ _ p _ p P / _ _ _ p P _ _ _ / _ p _ _ _ P _ _ / _ N _ _ _ _ _ R / P K P _ _ Q P _ / R _ _ _ _ _ _ _;'),
 ('sicilian$',
  ' r _ b q _ _ k _ / _ _ _ _ _ r _ _ / p _ _ p _ _ p _ / _ p _ N n _ _ p / _ _ _ N P _ _ b / _ _ _ _ _ _ _ B / P P P _ Q _ _ _ / _ K _ R _ _ _ R;'),
 ('french$',
  ' _ _ k _ _ _ r _ / p p _ b _ _ _ _ / _ _ q _ _ _ r _ / _ _ _ _ _ p _ _ / _ _ _ p _ B _ _ / P _ p _ _ P _ Q / _ _ P _ _ _ P P / R _ _ _ R K _ _;'),
 ('sicilian$',
  ' _ _ _ q r _ k _ / _ _ _ _ _ _ _ p / p _ _ p _ b p _ / _ p _ N n _ _ P / _ _ r N P _ _ _ / _ _ _ _ _ _ Q _ / P P P _ _ _ _ _ / _ K _ R _ _ R _;'),
 ('sicilian$',
  ' _ q _ _ _ r _ _ / r _ _ _ _ _ k _ / p _ _ p _ _ n p / _ p _ N _ _ b _ / _ _ _ _ P _ _ _ / Q _ _ _ _ _ _ _ / P P P _ B _ _ _ / _ K _ _ _ _ R R;'),
 ('sicilian$',

In [30]:
def chess_categ (label_pat_pairs):
    Num_correct_guesses=0
    Incorrect_guesses=0
    window_size = 7 #attention window size
    result_pairs={}
    for categ in inp_test:
        result_pairs[categ[0][:-1]] = 0
    mistakes = result_pairs
    
    

    print('\n-------- MAX IMAGE METHOD --------\n') 

    for pair in label_pat_pairs: #for each of the test patterns
        true_label, test = pair
        true_label = word_tokenize(true_label)[:-1]
        test = word_tokenize(test)
        print('test=',test)
        print('test_pattern:', ' '.join(test))
        label = None
        winner = None 
        big_chunks ={}
        for window_start in range(len(test)-7): #for each attention window start points from 0 to length of pattern-2
            max_image = [] #initialise the largest chunk activation
            if window_start + window_size < len(test):   #if the attention window size fits inside the pattern                 
                window_end = window_start + window_size
            else:                                        #if the attention window size does not fit inside the pattern
                window_end=None

            attention_window = test[window_start:window_end] #
            #print('Attention window:', ' '.join(attention_window))
            #print('Attention window:', attention_window)

            node = cogact.visual.recognise(attention_window)

            if node.label: #if label is not None
                if len(node.image) > len(max_image): #if the size of the current chunk is bigger than the largest chunk
                    max_image = node.image   #set the current chunk to be the largest chunk
                    label = node.label       #take the naming link of the current chunk to be the label
                    big_chunks[node.label] = big_chunks.get(node.label, 0) + 1
                    print("big_chunks=", big_chunks)
                    #winner = cogact.nodes[label].image
        #print("Max Image =", max_image)
        #print('recognises', true_label,'AS', 'label:', cogact.nodes[label].image)
        lab_sum = sum(big_chunks.values())
        cogact_normalised = [(k, v / lab_sum) for k, v in big_chunks.items()]
        top = sorted(cogact_normalised, key=lambda x: -x[1])
        for x in top:
            print('%s (prob = %s)' % (''.join(cogact.nodes[x[0]].image), round(x[1], 2)))
        winner = ''.join(cogact.nodes[top[0][0]].image) if len(top) > 0 else None
        print('correct:', winner == ''.join(true_label))

        print('final big_chunks scores', big_chunks)

        print('recognises', ''.join(true_label),'AS', 'label:', winner)
        
        if winner == ''.join(true_label):
            Num_correct_guesses+=1
            print("CORRECT!")
        else:
            Incorrect_guesses += 1
            mistakes[('').join(true_label)]+=1
            print('WRONG!')
        print("-"*20) 

            
    print("Num_correct_guesses=", Num_correct_guesses)
    print("Incorrect_guesses = ", Incorrect_guesses)
    print('perc correct', Num_correct_guesses/ (Num_correct_guesses + Incorrect_guesses + 0.0001))
    print("Mistakes=", mistakes)
       

In [35]:
chess_categ(inp_test)


-------- MAX IMAGE METHOD --------

test= ['_', '_', 'r', '_', '_', 'q', '_', 'k', '/', '_', '_', '_', '_', '_', '_', 'b', 'p', '/', 'p', '_', '_', 'p', '_', '_', 'p', '_', '/', '_', '_', '_', 'N', '_', '_', '_', '_', '/', '_', 'p', '_', 'N', 'P', '_', 'Q', 'P', '/', '_', '_', 'P', 'n', '_', '_', '_', '_', '/', 'P', 'P', '_', '_', '_', '_', '_', '_', '/', '_', 'K', '_', '_', '_', '_', 'R', '_', ';']
test_pattern: _ _ r _ _ q _ k / _ _ _ _ _ _ b p / p _ _ p _ _ p _ / _ _ _ N _ _ _ _ / _ p _ N P _ Q P / _ _ P n _ _ _ _ / P P _ _ _ _ _ _ / _ K _ _ _ _ R _ ;
big_chunks= {4: 1}
big_chunks= {4: 1, 7: 1}
big_chunks= {4: 1, 7: 2}
big_chunks= {4: 2, 7: 2}
big_chunks= {4: 3, 7: 2}
big_chunks= {4: 4, 7: 2}
big_chunks= {4: 4, 7: 3}
big_chunks= {4: 5, 7: 3}
big_chunks= {4: 6, 7: 3}
big_chunks= {4: 7, 7: 3}
big_chunks= {4: 8, 7: 3}
big_chunks= {4: 8, 7: 4}
big_chunks= {4: 9, 7: 4}
big_chunks= {4: 10, 7: 4}
big_chunks= {4: 11, 7: 4}
big_chunks= {4: 12, 7: 4}
big_chunks= {4: 13, 7: 4}
big_chunks= {4:

big_chunks= {4: 7, 7: 6}
big_chunks= {4: 7, 7: 7}
big_chunks= {4: 7, 7: 8}
big_chunks= {4: 7, 7: 9}
big_chunks= {4: 7, 7: 10}
big_chunks= {4: 7, 7: 11}
big_chunks= {4: 7, 7: 12}
big_chunks= {4: 7, 7: 13}
big_chunks= {4: 7, 7: 14}
big_chunks= {4: 8, 7: 14}
big_chunks= {4: 9, 7: 14}
big_chunks= {4: 10, 7: 14}
big_chunks= {4: 11, 7: 14}
big_chunks= {4: 12, 7: 14}
big_chunks= {4: 12, 7: 15}
big_chunks= {4: 12, 7: 16}
big_chunks= {4: 12, 7: 17}
big_chunks= {4: 12, 7: 18}
big_chunks= {4: 12, 7: 19}
big_chunks= {4: 12, 7: 20}
big_chunks= {4: 12, 7: 21}
big_chunks= {4: 12, 7: 22}
big_chunks= {4: 12, 7: 23}
big_chunks= {4: 12, 7: 24}
big_chunks= {4: 13, 7: 24}
big_chunks= {4: 13, 7: 25}
big_chunks= {4: 14, 7: 25}
big_chunks= {4: 14, 7: 26}
big_chunks= {4: 14, 7: 27}
big_chunks= {4: 15, 7: 27}
big_chunks= {4: 16, 7: 27}
big_chunks= {4: 16, 7: 28}
big_chunks= {4: 16, 7: 29}
big_chunks= {4: 17, 7: 29}
big_chunks= {4: 17, 7: 30}
big_chunks= {4: 18, 7: 30}
big_chunks= {4: 18, 7: 31}
big_chunks= {4: 

big_chunks= {4: 13, 7: 5}
big_chunks= {4: 14, 7: 5}
big_chunks= {4: 14, 7: 6}
big_chunks= {4: 15, 7: 6}
big_chunks= {4: 16, 7: 6}
big_chunks= {4: 17, 7: 6}
big_chunks= {4: 18, 7: 6}
big_chunks= {4: 19, 7: 6}
big_chunks= {4: 20, 7: 6}
big_chunks= {4: 21, 7: 6}
big_chunks= {4: 22, 7: 6}
big_chunks= {4: 22, 7: 7}
big_chunks= {4: 22, 7: 8}
big_chunks= {4: 23, 7: 8}
big_chunks= {4: 24, 7: 8}
big_chunks= {4: 25, 7: 8}
big_chunks= {4: 25, 7: 9}
big_chunks= {4: 26, 7: 9}
big_chunks= {4: 26, 7: 10}
big_chunks= {4: 27, 7: 10}
big_chunks= {4: 27, 7: 11}
big_chunks= {4: 27, 7: 12}
big_chunks= {4: 27, 7: 13}
big_chunks= {4: 27, 7: 14}
big_chunks= {4: 28, 7: 14}
big_chunks= {4: 29, 7: 14}
big_chunks= {4: 30, 7: 14}
big_chunks= {4: 31, 7: 14}
big_chunks= {4: 32, 7: 14}
big_chunks= {4: 32, 7: 15}
big_chunks= {4: 33, 7: 15}
big_chunks= {4: 34, 7: 15}
big_chunks= {4: 34, 7: 16}
big_chunks= {4: 35, 7: 16}
big_chunks= {4: 35, 7: 17}
big_chunks= {4: 35, 7: 18}
big_chunks= {4: 36, 7: 18}
big_chunks= {4: 37,

big_chunks= {7: 9, 4: 9}
big_chunks= {7: 9, 4: 10}
big_chunks= {7: 10, 4: 10}
big_chunks= {7: 10, 4: 11}
big_chunks= {7: 11, 4: 11}
big_chunks= {7: 11, 4: 12}
big_chunks= {7: 11, 4: 13}
big_chunks= {7: 11, 4: 14}
big_chunks= {7: 11, 4: 15}
big_chunks= {7: 11, 4: 16}
big_chunks= {7: 11, 4: 17}
big_chunks= {7: 12, 4: 17}
big_chunks= {7: 12, 4: 18}
big_chunks= {7: 12, 4: 19}
big_chunks= {7: 13, 4: 19}
big_chunks= {7: 14, 4: 19}
big_chunks= {7: 14, 4: 20}
big_chunks= {7: 14, 4: 21}
big_chunks= {7: 14, 4: 22}
big_chunks= {7: 15, 4: 22}
big_chunks= {7: 15, 4: 23}
big_chunks= {7: 15, 4: 24}
big_chunks= {7: 16, 4: 24}
big_chunks= {7: 16, 4: 25}
big_chunks= {7: 17, 4: 25}
big_chunks= {7: 18, 4: 25}
big_chunks= {7: 19, 4: 25}
big_chunks= {7: 20, 4: 25}
big_chunks= {7: 21, 4: 25}
big_chunks= {7: 22, 4: 25}
big_chunks= {7: 22, 4: 26}
big_chunks= {7: 22, 4: 27}
big_chunks= {7: 22, 4: 28}
big_chunks= {7: 22, 4: 29}
big_chunks= {7: 22, 4: 30}
big_chunks= {7: 23, 4: 30}
big_chunks= {7: 23, 4: 31}
big_

# Music and Literature

# FILE OPS ================================================

In [36]:
"""To generate music experiment data, convert the .midi files to .musicxml (using something like Musescore) and then
convert the .musicxml to .json using code with comment
#CONVERT INDIVIDUAL FILES TO JSON
or
#CONVERT MUSIC DIRECTORY TO JSON 
This code cell is at the very bottom of the FILE OPS section
"""

'To generate music experiment data, convert the .midi files to .musicxml (using something like Musescore) and then\nconvert the .musicxml to .json using code with comment\n#CONVERT INDIVIDUAL FILES TO JSON\nor\n#CONVERT MUSIC DIRECTORY TO JSON \nThis code cell is at the very bottom of the FILE OPS section\n'

In [38]:
note_codes = { #coding music notes
    'C': 0,
    'D': 2,
    'E': 4,
    'F': 5,
    'G': 7,
    'A': 9,
    'B': 11
}

In [39]:
def sort_chord(a):
    if len(a) < 2:
        return 0
    octave = int(a[1])
    note = note_codes[a[0]]
    if len(a) > 2:
        if a[2] == 'M':
            note -= 1
        else:
            note += 1
    return -(octave * 12 + note)

In [40]:
print(sort_chord('A1'))
print(sort_chord('B6'))
print(sort_chord('C5'))
print(sort_chord('C5M'))
print(sort_chord('B4'))
print(sorted(['A1','B4','C5'], key = sort_chord))

-21
-83
-60
-59
-59
['C5', 'B4', 'A1']


In [41]:
def get_patterns_musicxml(filepath):
    root = ET.parse(filepath).getroot()
    pattern1 = []
    pattern2 = []
    pattern3 = []
    
    
    for msr_first in root.findall('part[@id="P1"]/measure'):
        notes = {}
        types = {}
        times = {}
        for part in root.findall('part'):
            part_id = part.attrib['id']
            measure = part.find('measure[@number="' + msr_first.attrib['number'] + '"]')
            time = 0
            for elem in measure.getchildren():
                if elem.tag == 'note':
                    note = elem
                    if note.find('grace') is not None:       #*************************************GRACE NOTE KOSTYL'
                        continue
                    voice = part_id + '-' + note.find('voice').text
                    print('voice', voice)
                    print('notes', notes)
                    pitch = note.find('pitch')
                    if note.find('rest') is not None:
                        s = 'P'
                    else:
                        s = pitch.find('step').text + pitch.find('octave').text
                        alter = pitch.find('alter')
                        if alter is not None:            #***********************to do: double flat, double sharp?
                            s += 'Z' if alter.text == '1' else 'M'
                    print('s',s)
                    duration = int(note.find('duration').text)
                    if note.find('chord') is not None:
                        notes[voice][-1].append(s)
                    else:
                        if voice not in notes:
                            notes[voice] = []
                            types[voice] = []
                            times[voice] = []
                        notes[voice].append([s])
                        times[voice].append(time)
                        note_type = note.find('type')
                        if note_type is not None:
                            nt = note_type.text
                            if note.find('dot') is not None:
                                nt += ' dot'
                        else:
                            nt = '.'
                        types[voice].append(nt)
                        time += duration
                elif elem.tag == 'backup':
                    time -= int(elem.find('duration').text)
#         print('-------------------------------------------')
#         print('notes=', notes)
#         print('types=', types)
#         print('times=', times)
        
        result_notes = []
        result_times = []
        result_types = []
        
        idx = {voice: 0 for voice in notes}
        while True:
            min_time = 9999999
            for voice in times:
                i = idx[voice]
                t = times[voice]
                if i < len(t) and t[i] < min_time:
                    min_time = t[i]
            step = []
            time = []
            type_ = []
            for voice in times:
                i = idx[voice]
                t = times[voice]
                if i < len(t) and t[i] == min_time:
                    step.extend(notes[voice][i])
                    time = t[i]
                    type_ = types[voice][i]
                    idx[voice] += 1
            if step:
                while 'P' in step:
                    step.remove('P')
                if step:
                    result_notes.append(step)
                    result_times.append(time)
                    result_types.append(type_)
            else:
                break

        variant1 = []
        variant2 = []
        variant3 = []
        for i in range(len(result_notes)):
            notes = ''.join(sorted(set(result_notes[i]), key=sort_chord))
            variant1.append(notes)
            variant2.append(notes + '-' + str(result_times[i]))
            variant3.append(notes + '-' + result_types[i])
        pattern1.append(variant1)
        pattern2.append(variant2)
        pattern3.append(variant3)
    return pattern1, pattern2, pattern3 #returns the same pattern in 3 versions

In [None]:
def get_all_files(folder, end='.musicxml'):
    return tuple(os.path.join(folder, name) for name in os.listdir(folder) if name.endswith(end))

In [None]:
#CONVERT INDIVIDUAL FILES TO JSON
# files = ('full_pieces/2chop.musicxml', 'full_pieces/Elochka2.musicxml')
# save_names = ('full_pieces/2chop.json', 'full_pieces/Elochka2.json')

#CONVERT MUSIC DIRECTORY TO JSON
files = get_all_files('music/music_xmls/schumann')
#('full_pieces/Elochka3.musicxml','full_pieces/Nocturne_Op.9_No.2_Frederic_Chopin.musicxml')
save_names = tuple('music/jsons/schumann\\' + name.split('\\')[-1][:-8] + 'json' for name in files)
#('full_pieces/Elochka3.json','full_pieces/Nocturne_Op.9_No.2_Frederic_Chopin.json')

variant = 0 # 0=pure pitch; varian1=pitch+duration integer, 2=pitch+type(e.g.quarter) 
for file, save_filename in zip(files, save_names):
    print(file)
    patterns = get_patterns_musicxml(file)[variant] 
    with open(save_filename, 'w') as f:
        f.write(json.dumps(patterns))

In [None]:
files = get_all_files('tests')
save_names = tuple('tests_json\\' + name.split('\\')[-1][:-8] + 'json' for name in files)

variant = 0 # 0=pure pitch; varian1=pitch+duration integer, 2=pitch+type(e.g.quarter) 
for file, save_filename in zip(files, save_names):
    print(file)
    patterns = get_patterns_musicxml(file)[variant] 
    with open(save_filename, 'w') as f:
        f.write(json.dumps(patterns))

# Music and Literature TRAINING ================================================

In [42]:
sep_sent = '\.|!|\?|\.\.\.|…'
sep_phrases = '\.|!|\?|\.\.\.|…|,|:|;|-|—|"|\'|&'
sep_number = 20

In [43]:
def get_literature_patterns(text, separator, split_by_letters=None):
    patterns=[]
    if isinstance(separator, int):
        words = [word for word in re.split('[^a-zа-я]+', text.lower())]
        for i in range(0, len(words), separator):
            patterns.append(words[i:i+separator] +['$'])
    return patterns

In [44]:
def get_sentences(text, sep, min_len=3): #number of words?
    patterns = []
    
    if isinstance(sep, int): #take sep_number words otherwise split by punctuation
        words = [word for word in re.split('[^a-zа-я]+', text.lower())] # if len(word) >= 3] #option to remove short words
        for i in range(0, len(words), sep):
            patterns.append(words[i:i + sep] + ['$'])
    else:
        for sent in re.split(sep, text.lower()):
            sent = list(filter(lambda word : len(word) >= 3, re.split('[^a-zа-я]+', sent)))
            if len(sent) >= min_len:
                patterns.append(sent + ['$'])
    return patterns

In [45]:
#(\r\n){2,}[^\r\n]*(\r\n){2,} -> \r\n\r\n\r\n #clean poetry text from titles and one line rubbish in Notepad+

In [46]:
def train(folder, get_patterns=None): #Same as above, easier interface for multiple categories
    inp = []

    for label in os.listdir(folder):
        p = os.path.join(folder, label)
        if os.path.isdir(p):
            for file in os.listdir(p):
                file = os.path.join(p, file)
                with open(file, 'r',  encoding='utf-8', errors='ignore') as f:
                    patterns = get_patterns(f.read())
                inp += list(zip((list(p) + ['$'] for p in patterns), [[*label, '$']] * len(patterns)))
                print('inp', inp)
    
    #random.shuffle(inp) #shuffle input patterns
    cogact = CogAct() #comment out to learn on top of previous knowledge, uncomment to train from blank slate
    for i in range(3): #default 300
        errors = 0
#         if i//10==0:
        print(i, '/', 5000)
        i = 0
        n = len(inp)
        for pattern in inp:
            # print('handle pattern', i, '/', n)
            i += 1
            vis, verb = pattern
            t = cogact.clock
            node1 = cogact.visual.recognise_and_learn(vis)
            node2 = cogact.verbal.recognise_and_learn(verb)
            if cogact.clock != t:
                errors += 1
                continue
            node1.label = node2.idx
            node2.label = node1.idx
        if errors == 0:
            break
    return cogact

In [None]:
folder = 'testing2024/txt_simple/train_texts'
get_pats = lambda text: get_sentences(text, sep_number)

input_test = []
for label in os.listdir(folder):
    p = os.path.join(folder, label)
#     print(p, os.path.isdir(p))
    
    if os.path.isdir(p):
            for file in os.listdir(p):
                file = os.path.join(p, file)
                with open(file, 'r',  encoding='utf-8', errors='ignore') as f:
                    patterns = get_pats(f.read())
                input_test += list(zip((list(p) for p in patterns), [[*label, '$']] * len(patterns)))
print('input_test=', input_test[:2],'\n')

In [None]:
start = time.time()
def train_poets(folder, sep): #categorise WITHOUT using similarity links
    return train(folder, lambda text: get_sentences(text, sep))
end = time.time()
cogact = train_poets('testing2024/txt_simple/train_texts', sep_number)
print(end - start)

In [None]:
start = time.time()
def train_music(folder): #categorise WITHOUT using similarity links
    return train(folder, json.loads)

cogact = train_music('testing2024/music_exp/train_json')
end = time.time()
print(end - start)

# Music and Literature CATEGORISATION =========================================

In [None]:
def cat_file(file, get_patterns, correct_label):
    with open(file, 'r', errors='ignore') as f:
        tests = get_patterns(f.read())

    print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n')
    print('--', file, '\n')
    print('-------- MAX IMAGE METHOD -------------------------------------\n')
    t_start = time.time()
    labels = {}
    for test in tests:
        max_image = []
        label = None
        for offset in range(len(test) - 1):
            node = cogact.visual.recognise(test[offset:])
            if node.label is not None and len(node.image) > len(max_image):
                max_image = node.image
                # label = cogact.nodes[node.label].image
                labels[node.label] = labels.get(node.label, 0) + 1 #labels value = labels value (0 if none) + 1
        # print('recognises', test, 'as', max_image or None, 'label:', label, sep='\n', end='\n\n')
    
    
    

    lab_sum = sum(labels.values())
    cogact_normalised = [(k, v / lab_sum) for k, v in labels.items()]
    top = sorted(cogact_normalised, key=lambda x: -x[1])
    for x in top:
        print('%s (prob = %s)' % (''.join(cogact.nodes[x[0]].image), round(x[1], 2)))
    winner = ''.join(cogact.nodes[top[0][0]].image) if len(top) > 0 else None
    print('correct:', winner == correct_label)
    
    print('final labels scores', labels)
    print('ended in', time.time() - t_start, 'sec')

    print('-------- MAX FREQ METHOD --------\n')
    t_start = time.time()

    labels_full = {}  #scores for all labels
    for test in tests:
        labels = {}
        max_freq = 0
        max_freq_image = None
        label = None
        for offset in range(len(test) - 1):
            node = cogact.visual.recognise(test[offset:])
            if node.label:
                freq = labels.get(node.label, 0) + 1
                labels[node.label] = freq
                if freq > max_freq:
                    max_freq = freq
                    max_freq_image = node.image
                    label = node.label
        #print('recognises', test, 'as', max_freq_image, 'labels:', labels, sep='\n', end='\n\n')
        if label:
            labels_full[label] = labels_full.get(label, 0) + 1
    print('final labels scores', labels_full)

    lab_sum = sum(labels_full.values())
    cogact_normalised = [(k, v / lab_sum) for k, v in labels_full.items()]
    top = sorted(cogact_normalised, key=lambda x: -x[1])
    for x in top:
        print('%s (prob = %s)' % (''.join(cogact.nodes[x[0]].image), round(x[1], 2)))
    winnerF = ''.join(cogact.nodes[top[0][0]].image) if len(top) > 0 else None
    print('correct:', winnerF == correct_label)
    print('ended in', time.time() - t_start, 'sec')
    return winner == correct_label # winner =return winner of maxImage, winnerF = return winner of maxFreq

In [None]:
def cat(folder, get_patterns): #categ per folder
    correct_count = 0
    total_count = 0
    for label in os.listdir(folder):
        p = os.path.join(folder, label)
        if os.path.isdir(p):
            for file in os.listdir(p):
                total_count += 1
                correct_count += cat_file(  #use cat_file function to categorise
                    file=os.path.join(p, file),
                    get_patterns=get_patterns,
                    correct_label=label
                )
    print('-' * 70)
    print('TOTAL CORRECT:', correct_count, '/', total_count)

In [None]:
def cat_poets(folder, sep): #categorise WITHOUT using similarity links
    return cat(folder, lambda text: get_sentences(text, sep))

cat_poets('testing2023/lit_exp/predict_texts', sep_number)

In [None]:
def cat_music(folder): #categorise WITHOUT using similarity links
    return cat(folder, json.loads)

cat_music('testing2023\music_exp\predict_json_60')

# CATEGORISATION TESTS=====================================

In [9]:
def cat_music(folder): 
    return cat(folder, json.loads)

result = cat_music('testing2024/music_exp/predict_json_60')


+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

-- testing2024/music_exp/predict_json_60\bach\WTCII09A.json 

-------- MAX IMAGE METHOD -------------------------------------

labels {31: 144, 4: 74, 6: 26, 8: 28}
cogact_normalised [(31, 0.5294117647058824), (4, 0.27205882352941174), (6, 0.09558823529411764), (8, 0.10294117647058823)]
top [(31, 0.5294117647058824), (4, 0.27205882352941174), (8, 0.10294117647058823), (6, 0.09558823529411764)]
bach (prob = 0.53)
bthvn (prob = 0.27)
chopin (prob = 0.1)
mozart (prob = 0.1)
correct: True
final labels scores {31: 144, 4: 74, 6: 26, 8: 28}
ended in 1.162891149520874 sec
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

-- testing2024/music_exp/predict_json_60\bach\Wtcii09b.json 

-------- MAX IMAGE METHOD -------------------------------------

labels {4: 28, 8: 12, 31: 56, 6: 4}
cogact_normalised [(4, 0.28), (8, 0.12), (31, 0.56), (6, 0.04)]
top [(31, 0.56), (4, 0.28), (8, 0.12), (6, 0.04)]
bach (prob = 0.56)
bt

labels {31: 118, 4: 44, 6: 21, 8: 24}
cogact_normalised [(31, 0.5700483091787439), (4, 0.21256038647342995), (6, 0.10144927536231885), (8, 0.11594202898550725)]
top [(31, 0.5700483091787439), (4, 0.21256038647342995), (8, 0.11594202898550725), (6, 0.10144927536231885)]
bach (prob = 0.57)
bthvn (prob = 0.21)
chopin (prob = 0.12)
mozart (prob = 0.1)
correct: True
final labels scores {31: 118, 4: 44, 6: 21, 8: 24}
ended in 1.423194169998169 sec
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

-- testing2024/music_exp/predict_json_60\bach\WTCII23A.json 

-------- MAX IMAGE METHOD -------------------------------------

labels {4: 49, 31: 88, 6: 13, 8: 15}
cogact_normalised [(4, 0.296969696969697), (31, 0.5333333333333333), (6, 0.07878787878787878), (8, 0.09090909090909091)]
top [(31, 0.5333333333333333), (4, 0.296969696969697), (8, 0.09090909090909091), (6, 0.07878787878787878)]
bach (prob = 0.53)
bthvn (prob = 0.3)
chopin (prob = 0.09)
mozart (prob = 0.08)
correct: True
fin

labels {31: 188, 6: 24, 4: 231, 8: 81}
cogact_normalised [(31, 0.35877862595419846), (6, 0.04580152671755725), (4, 0.44083969465648853), (8, 0.15458015267175573)]
top [(4, 0.44083969465648853), (31, 0.35877862595419846), (8, 0.15458015267175573), (6, 0.04580152671755725)]
bthvn (prob = 0.44)
bach (prob = 0.36)
chopin (prob = 0.15)
mozart (prob = 0.05)
correct: True
final labels scores {31: 188, 6: 24, 4: 231, 8: 81}
ended in 1.7972168922424316 sec
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

-- testing2024/music_exp/predict_json_60\bthvn\Piano Sonata n18 The Hunt_1.json 

-------- MAX IMAGE METHOD -------------------------------------

labels {6: 26, 4: 78, 31: 54, 8: 41}
cogact_normalised [(6, 0.1306532663316583), (4, 0.39195979899497485), (31, 0.271356783919598), (8, 0.20603015075376885)]
top [(4, 0.39195979899497485), (31, 0.271356783919598), (8, 0.20603015075376885), (6, 0.1306532663316583)]
bthvn (prob = 0.39)
bach (prob = 0.27)
chopin (prob = 0.21)
mozart (pro

labels {31: 26, 6: 80, 8: 79, 4: 34}
cogact_normalised [(31, 0.1187214611872146), (6, 0.365296803652968), (8, 0.3607305936073059), (4, 0.1552511415525114)]
top [(6, 0.365296803652968), (8, 0.3607305936073059), (4, 0.1552511415525114), (31, 0.1187214611872146)]
mozart (prob = 0.37)
chopin (prob = 0.36)
bthvn (prob = 0.16)
bach (prob = 0.12)
correct: False
final labels scores {31: 26, 6: 80, 8: 79, 4: 34}
ended in 2.137310028076172 sec
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

-- testing2024/music_exp/predict_json_60\chopin\Sonata op35 n4 .json 

-------- MAX IMAGE METHOD -------------------------------------

labels {4: 43, 6: 64, 31: 5, 8: 55}
cogact_normalised [(4, 0.25748502994011974), (6, 0.38323353293413176), (31, 0.029940119760479042), (8, 0.32934131736526945)]
top [(6, 0.38323353293413176), (8, 0.32934131736526945), (4, 0.25748502994011974), (31, 0.029940119760479042)]
mozart (prob = 0.38)
chopin (prob = 0.33)
bthvn (prob = 0.26)
bach (prob = 0.03)
correct:

labels {6: 630, 31: 34, 8: 152, 4: 30}
cogact_normalised [(6, 0.7446808510638298), (31, 0.04018912529550828), (8, 0.17966903073286053), (4, 0.03546099290780142)]
top [(6, 0.7446808510638298), (8, 0.17966903073286053), (31, 0.04018912529550828), (4, 0.03546099290780142)]
mozart (prob = 0.74)
chopin (prob = 0.18)
bach (prob = 0.04)
bthvn (prob = 0.04)
correct: True
final labels scores {6: 630, 31: 34, 8: 152, 4: 30}
ended in 2.243997812271118 sec
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

-- testing2024/music_exp/predict_json_60\mozart\K333 Piano Sonata n13 1mov.json 

-------- MAX IMAGE METHOD -------------------------------------

labels {4: 86, 6: 370, 31: 58, 8: 234}
cogact_normalised [(4, 0.11497326203208556), (6, 0.4946524064171123), (31, 0.07754010695187166), (8, 0.31283422459893045)]
top [(6, 0.4946524064171123), (8, 0.31283422459893045), (4, 0.11497326203208556), (31, 0.07754010695187166)]
mozart (prob = 0.49)
chopin (prob = 0.31)
bthvn (prob = 0.11)
bach (

In [37]:
def cat_poets(folder, sep): 
    return cat(folder, lambda text: get_sentences(text, sep))

# result = cat_poets('testing2024/lit_exp/predict_texts_60', sep_number)
result = cat_poets('testing2024/txt_simple/train_texts', sep_number)


+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

-- testing2024/txt_simple/train_texts\homer\homer_train.txt 

-------- MAX IMAGE METHOD -------------------------------------

labels {6: 156}
cogact_normalised [(6, 1.0)]
top [(6, 1.0)]
homer (prob = 1.0)
correct: True
final labels scores {6: 156}
ended in 0.5674898624420166 sec
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

-- testing2024/txt_simple/train_texts\joyce\joyce_train.txt 

-------- MAX IMAGE METHOD -------------------------------------

labels {9: 17}
cogact_normalised [(9, 1.0)]
top [(9, 1.0)]
joyce (prob = 1.0)
correct: True
final labels scores {9: 17}
ended in 0.10073065757751465 sec
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

-- testing2024/txt_simple/train_texts\shakespeare\mid_summer_60k.txt 

-------- MAX IMAGE METHOD -------------------------------------

labels {4: 14}
cogact_normalised [(4, 1.0)]
top [(4, 1.0)]
shakespeare (prob = 1.0)
correct: True
final labe

In [61]:
len(cogact.nodes)

20044

In [161]:
cogact.verbal.print_tree()

< ROOT NODE > Node: 1 <  >
-------< w > Node: 4 < w s c o t t > (12483)
-------< j > Node: 6 < j o y c e > (18870)
-------< h > Node: 8 < h o m e r > (18869)
-------< c > Node: 11 < c h a u c e r > (12476)
-------< s > Node: 13 < s h a k e s p e a r e > (3967)
-------< d > Node: 15 < d i c k e n s > (12484)
-------< a > Node: 24 <  >
-------< o > Node: 26 <  >
-------< i > Node: 31 <  >
-------< t > Node: 34 <  >
-------< y > Node: 37 <  >
-------< k > Node: 39 <  >
-------< m > Node: 41 <  >
-------< e > Node: 51 <  >
-------< n > Node: 57 <  >
-------< u > Node: 60 <  >
-------< r > Node: 69 <  >
-------< p > Node: 72 <  >
