#### Abstract Runner
Abstract runner that uses a configuration to process the input data and caches the results.

In [1]:
import os
import json

class CachedRunner:
    """Apply a configuration to exported chat data and cache the results on disk.

    Every file found below ``data_dir`` gets a cache file named
    ``<configuration name>_cache_<file name>`` at the mirrored location below
    ``data_out``.  Files that contain a ``"messages"`` list are folded into a
    layer with ``configuration.apply``; any other file (for example
    ``guilds.json`` or ``channels.json``) is copied into its cache file
    unchanged, which is what the ``get_guilds`` / ``get_channels_in_guild``
    getters read back.
    """

    def __init__(self, configuration):
        """Bind the runner to *configuration*.

        configuration: object providing ``name``, ``get_empty``, ``serialize``,
        ``deserialize``, ``apply`` and ``combine``.
        """
        self.conf = configuration
        self.name = self.conf.name
        self.get_empty = self.conf.get_empty
        self.serialize = self.conf.serialize
        self.deserialize = self.conf.deserialize
        self.apply = self.conf.apply
        self.data_dir = "/analyzer/input"
        self.data_out = "/analyzer/tmp"

    def get_reduced_matrixes(self):
        """Reduce the Year, Flat and Category matrices.

        Returns a list with one ``[names, combined_layer]`` entry per matrix,
        where ``names`` lists every element that was folded in and
        ``combined_layer`` is the result of ``configuration.combine`` over all
        of them, starting from an empty layer.
        """
        reduced_matrixes = []
        for name, matrix in [
            ("Year", self.get_year_matrix()),
            ("Flat", self.get_flattened_matrix()),
            ("Category", self.get_category_matrix())
        ]:
            reduced_matrix = [[], self.conf.get_empty()]
            print("Reducing: " + name)
            for grouping, group_values in matrix.items():
                print("\n\nFound group: " + grouping, end=" with elements: ")
                for group_name, group_value in group_values:
                    print(group_name, end=" ")
                    reduced_matrix[0].append(group_name)
                    reduced_matrix[1] = self.conf.combine(reduced_matrix[1], group_value)
            print("\n")
            reduced_matrixes.append(reduced_matrix)
        return reduced_matrixes

    ### MATRIXES
    # Matrixes must have the type of Map<grouping, List<Tuple<name, Configuration.empty>>>

    def get_year_matrix(self):
        """Group the cached channel layers by guild name."""
        matrix = dict()
        for guild in self.get_guilds():
            matrix[guild["name"]] = [
                (channel["name"], self.get_channel(guild, channel))
                for channel in self.get_channels_in_guild(guild)
            ]
        return matrix

    def get_flattened_matrix(self):
        """Put every cached channel layer into its own group, keyed by channel name."""
        matrix = dict()
        for guild in self.get_guilds():
            for channel in self.get_channels_in_guild(guild):
                channel_name = channel["name"]
                matrix[channel_name] = [(channel_name, self.get_channel(guild, channel))]
        return matrix

    def get_category_matrix(self):
        """Group the cached channel layers by the part of the channel name before "/".

        A channel name without "/" forms its own group; only the first "/"
        separates the category, so names with several slashes are accepted.
        """
        matrix = dict()
        for guild in self.get_guilds():
            for channel in self.get_channels_in_guild(guild):
                channel_name = channel["name"]
                # Split once: the original two-value unpacking raised
                # ValueError unless the name contained exactly one "/".
                grouping = channel_name.split("/", 1)[0].strip()
                layer = self.get_channel(guild, channel)
                matrix.setdefault(grouping, []).append((channel_name, layer))
        return matrix

    ### FILE GETTERS

    @staticmethod
    def _read_json(path):
        """Return the parsed JSON content of *path*."""
        with open(path, "r") as handle:
            return json.load(handle)

    @staticmethod
    def _write_json(path, payload):
        """Write *payload* to *path* as JSON, replacing any previous content."""
        with open(path, "w") as handle:
            json.dump(payload, handle)

    def get_guilds(self):
        """Return the cached guild list, or ``[]`` when the cache is missing or unreadable."""
        try:
            path = os.path.join(self.data_out, self.conf.name + "_cache_guilds.json")
            return self._read_json(path)["guilds"]
        except (OSError, ValueError, KeyError, TypeError):
            # Best effort: no usable cache simply means "no guilds".
            return []

    def get_channels_in_guild(self, guild):
        """Return the cached channel list of *guild*, or ``[]`` when unavailable."""
        try:
            path = os.path.join(
                self.data_out, str(guild["id"]), self.conf.name + "_cache_channels.json"
            )
            return self._read_json(path)["channels"]
        except (OSError, ValueError, KeyError, TypeError):
            return []

    def get_channel(self, guild, channel):
        """Return the deserialized cached layer of *channel*, or an empty layer on failure."""
        try:
            channel_path = os.path.join(
                self.data_out,
                str(guild["id"]),
                str(channel["id"]),
                self.conf.name + "_cache_channel.json",
            )
            # Use the runner's own configuration; the previous code reached
            # for a module-level ``configuration`` variable instead.
            return self.deserialize(self._read_json(channel_path))
        except Exception:
            # deserialize is configuration-defined code, so any failure is
            # treated as "no usable cache" and answered with an empty layer.
            return self.get_empty()

    ### UPDATE CACHE

    def _load_layer(self, cache_location, clean):
        """Return the layer stored at *cache_location*, (re)creating the file when needed.

        A fresh empty cache is written when the file is missing or *clean* is
        true.  A cache that cannot be read or deserialized is reset on disk
        and an empty layer is returned.
        """
        if clean or not os.path.isfile(cache_location):
            self._write_json(cache_location, self.serialize(self.get_empty()))
        try:
            return self.deserialize(self._read_json(cache_location))
        except Exception:
            self._write_json(cache_location, self.serialize(self.get_empty()))
            return self.get_empty()

    def invoke(self, directory="/analyzer/input", clean=False):
        """
        Update the cache for every file below *directory*, creating it when missing.

        directory: folder to walk; subfolders are processed recursively.
        clean: when true, every cache file is reset before it is updated
        (this now also holds for files in subfolders).

        preconditions:
        term count of apply must be 2: (given_structure, discord_message)
        term count of serializer must be 1
        term count of deserializer must be 1

        next functions must make inplace mutations:
        serializer, deserializer, function_to_apply

        postcondition:
        every processed file has an up-to-date cache file below ``data_out``;
        nothing is returned.
        """
        cache_file_prefix = self.name + "_cache_"
        cache_dir = directory.replace(self.data_dir, self.data_out)

        for f in os.listdir(directory):
            cur_path = os.path.join(directory, f)

            if not os.path.isfile(cur_path):
                self.invoke(cur_path, clean)
                continue

            cache_location = os.path.join(cache_dir, cache_file_prefix + f)
            print("processing: " + cur_path + " cache: " + cache_location)
            os.makedirs(os.path.dirname(cache_location), exist_ok=True)

            cur_layer = self._load_layer(cache_location, clean)
            raw_json = self._read_json(cur_path)

            try:
                for message in raw_json["messages"]:
                    self.apply(cur_layer, message)

                # Save updated version
                self._write_json(cache_location, self.serialize(cur_layer))
            except Exception:
                # Files without a usable "messages" list (guild and channel
                # listings) end up here and are cached as a verbatim copy,
                # which is what the file getters above rely on.
                self._write_json(cache_location, raw_json)

#### Configuration
Configuration template to be used in the Abstract Runner

In [2]:
from abc import ABC, ABCMeta, abstractmethod, abstractproperty

class Configuration(ABC):
    """Interface every runner configuration has to implement.

    Deriving from ``ABC`` makes the abstract members binding: a subclass can
    only be instantiated once it overrides all of them.  (The former
    ``__metaclass__ = ABCMeta`` attribute is Python 2 syntax and is ignored by
    Python 3, so nothing was enforced.)  The default bodies remain available
    to subclasses through ``super()``.
    """

    @property
    @abstractmethod
    def name(self):
        """Short identifier of the configuration."""
        return "generic"

    @abstractmethod
    def combine(self, first, second):
        """Return the combination of the layers *first* and *second*."""
        return first

    @abstractmethod
    def get_empty(self):
        """Return a new, empty layer."""
        return {
        }

    @abstractmethod
    def apply(self, layer, message):
        """Fold *message* into *layer*."""
        return {}

    @abstractmethod
    def serialize(self, layer):
        """Return *layer* in the form that is written to the cache."""
        return layer

    @abstractmethod
    def deserialize(self, layer):
        """Return the working form of a *layer* read from the cache."""
        return layer


#### Count configuration
Configuration implementation that counts sequences of one to four neighbouring word lemmas, both overall and per author

In [3]:
from estnltk import Text
from datetime import datetime
from collections import Counter
from collections import deque
import html
import re

class CountConfiguration(Configuration):
    """Count sequences of one to four neighbouring word lemmas.

    A layer is a dict with the keys ``last_timestamp``, ``counter_1`` ..
    ``counter_4`` (sequence -> count over all authors) and ``people``
    (author name -> counter key -> sequence -> count).
    """

    # Counter keys, ordered by sequence length (1 to 4 lemmas).
    _COUNTER_KEYS = ("counter_1", "counter_2", "counter_3", "counter_4")

    @property
    def name(self):
        return "count"

    def combine(self, first, second):
        """Return a new layer holding the summed counts of *first* and *second*.

        Neither input is modified: the per-author counters are copied before
        they are merged.  (Previously ``first["people"]`` was updated in place
        and could end up sharing nested counters with *second*.)
        ``last_timestamp`` is the larger of the two inputs' values, treating a
        missing value as 0.
        """
        combined = {
            "last_timestamp": max(
                first.get("last_timestamp", 0), second.get("last_timestamp", 0)
            ),
        }
        for key in self._COUNTER_KEYS:
            combined[key] = first[key] + second[key]

        people = {
            person: {key: Counter(counts) for key, counts in counters.items()}
            for person, counters in first["people"].items()
        }
        for person, counters in second["people"].items():
            merged = people.setdefault(person, {})
            for key, counts in counters.items():
                if merged.get(key):
                    merged[key] += counts
                else:
                    merged[key] = Counter(counts)

        combined["people"] = people
        return combined

    def get_empty(self):
        """Return a layer with no counts and ``last_timestamp`` 0."""
        return {
            "last_timestamp": 0,
            "counter_1": Counter(),
            "counter_2": Counter(),
            "counter_3": Counter(),
            "counter_4": Counter(),
            "people": dict()
        }

    def apply(self, layer, message):
        """Add the lemma sequences of *message* to *layer* in place.

        The message is only counted when its timestamp is strictly newer than
        ``layer["last_timestamp"]``, which is then advanced.
        """
        # Drop fractional seconds and the UTC offset so strptime can parse it.
        # NOTE(review): the parsed value is naive, so .timestamp() interprets
        # it in the local timezone, and with sub-second precision removed a
        # message in the same second as the previous one is skipped; confirm
        # both are acceptable.
        timestamp = re.sub(r"(\.\d+)?[\+\-]\d+:\d+", "", message["timestamp"])
        new_time = datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S').timestamp()
        if new_time <= layer["last_timestamp"]:
            return
        layer["last_timestamp"] = new_time

        parsed_text = Text(html.unescape(message["content"])).tag_layer(["morph_analysis"])
        parsed_user = html.unescape(message["author"]["name"])

        # One sliding window per sequence length; a window is counted every
        # time it is full.
        windows = [
            (key, deque(maxlen=length))
            for length, key in enumerate(self._COUNTER_KEYS, start=1)
        ]

        for word in parsed_text.words:
            for key, window in windows:
                window.append(word.lemma)
                if len(window) != window.maxlen:
                    continue
                sequence = " ".join(window)
                user_counters = layer["people"].setdefault(parsed_user, dict())
                user_counters.setdefault(key, Counter())[sequence] += 1
                layer[key][sequence] += 1

    def serialize(self, layer):
        """Convert every Counter in *layer* to a plain dict, in place, and return it."""
        for key in self._COUNTER_KEYS:
            layer[key] = dict(layer[key])
        for counters in layer["people"].values():
            for key in list(counters):
                counters[key] = dict(counters[key])
        return layer

    def deserialize(self, layer):
        """Convert every count mapping in *layer* to a Counter, in place, and return it."""
        for key in self._COUNTER_KEYS:
            layer[key] = Counter(layer[key])
        for counters in layer["people"].values():
            for key in list(counters):
                counters[key] = Counter(counters[key])
        return layer

In [4]:
# Build the word-count configuration and a runner that uses it.
configuration = CountConfiguration()
count_runner = CachedRunner(configuration)
# Rebuild the caches from the input directory, then reduce the cached layers
# into the Year, Flat and Category matrices.
count_runner.invoke(clean=True)
matrices = count_runner.get_reduced_matrixes()

processing: /analyzer/input/523436468908064768/545193393672814593/channel.json cache: /analyzer/tmp/523436468908064768/545193393672814593/count_cache_channel.json
processing: /analyzer/input/523436468908064768/557379696472358912/channel.json cache: /analyzer/tmp/523436468908064768/557379696472358912/count_cache_channel.json
processing: /analyzer/input/523436468908064768/550823329544667149/channel.json cache: /analyzer/tmp/523436468908064768/550823329544667149/count_cache_channel.json
processing: /analyzer/input/523436468908064768/538662191998697472/channel.json cache: /analyzer/tmp/523436468908064768/538662191998697472/count_cache_channel.json
processing: /analyzer/input/523436468908064768/559695674384318464/channel.json cache: /analyzer/tmp/523436468908064768/559695674384318464/count_cache_channel.json
processing: /analyzer/input/523436468908064768/542288295682375681/channel.json cache: /analyzer/tmp/523436468908064768/542288295682375681/count_cache_channel.json
processing: /analyzer/

processing: /analyzer/input/523436468908064768/557456044629164033/channel.json cache: /analyzer/tmp/523436468908064768/557456044629164033/count_cache_channel.json
processing: /analyzer/input/523436468908064768/567348322562736148/channel.json cache: /analyzer/tmp/523436468908064768/567348322562736148/count_cache_channel.json
processing: /analyzer/input/523436468908064768/538330246349324289/channel.json cache: /analyzer/tmp/523436468908064768/538330246349324289/count_cache_channel.json
processing: /analyzer/input/523436468908064768/538332551496859650/channel.json cache: /analyzer/tmp/523436468908064768/538332551496859650/count_cache_channel.json
processing: /analyzer/input/523436468908064768/552516649291612161/channel.json cache: /analyzer/tmp/523436468908064768/552516649291612161/count_cache_channel.json
processing: /analyzer/input/523436468908064768/574729588597587968/channel.json cache: /analyzer/tmp/523436468908064768/574729588597587968/count_cache_channel.json
processing: /analyzer/

processing: /analyzer/input/742745493570060302/751336247733256232/channel.json cache: /analyzer/tmp/742745493570060302/751336247733256232/count_cache_channel.json
processing: /analyzer/input/742745493570060302/753932659461455942/channel.json cache: /analyzer/tmp/742745493570060302/753932659461455942/count_cache_channel.json
processing: /analyzer/input/742745493570060302/760206996296564745/channel.json cache: /analyzer/tmp/742745493570060302/760206996296564745/count_cache_channel.json
processing: /analyzer/input/742745493570060302/754941297760010270/channel.json cache: /analyzer/tmp/742745493570060302/754941297760010270/count_cache_channel.json
processing: /analyzer/input/742745493570060302/755506573639418026/channel.json cache: /analyzer/tmp/742745493570060302/755506573639418026/count_cache_channel.json
processing: /analyzer/input/742745493570060302/770058103877664778/channel.json cache: /analyzer/tmp/742745493570060302/770058103877664778/count_cache_channel.json
processing: /analyzer/

processing: /analyzer/input/610782839805444126/620694223901032449/channel.json cache: /analyzer/tmp/610782839805444126/620694223901032449/count_cache_channel.json
processing: /analyzer/input/610782839805444126/618441963372544026/channel.json cache: /analyzer/tmp/610782839805444126/618441963372544026/count_cache_channel.json
processing: /analyzer/input/610782839805444126/640838601290154014/channel.json cache: /analyzer/tmp/610782839805444126/640838601290154014/count_cache_channel.json
processing: /analyzer/input/610782839805444126/625029505471217682/channel.json cache: /analyzer/tmp/610782839805444126/625029505471217682/count_cache_channel.json
processing: /analyzer/input/610782839805444126/639264565644754981/channel.json cache: /analyzer/tmp/610782839805444126/639264565644754981/count_cache_channel.json
processing: /analyzer/input/610782839805444126/625576955684585482/channel.json cache: /analyzer/tmp/610782839805444126/625576955684585482/count_cache_channel.json
processing: /analyzer/

processing: /analyzer/input/610782839805444126/640852795154759680/channel.json cache: /analyzer/tmp/610782839805444126/640852795154759680/count_cache_channel.json
processing: /analyzer/input/610782839805444126/614571380201095238/channel.json cache: /analyzer/tmp/610782839805444126/614571380201095238/count_cache_channel.json
processing: /analyzer/input/610782839805444126/613840014270398476/channel.json cache: /analyzer/tmp/610782839805444126/613840014270398476/count_cache_channel.json
processing: /analyzer/input/610782839805444126/646326463167201280/channel.json cache: /analyzer/tmp/610782839805444126/646326463167201280/count_cache_channel.json
processing: /analyzer/input/610782839805444126/641783762769936417/channel.json cache: /analyzer/tmp/610782839805444126/641783762769936417/count_cache_channel.json
processing: /analyzer/input/610782839805444126/633439569496375307/channel.json cache: /analyzer/tmp/610782839805444126/633439569496375307/count_cache_channel.json
processing: /analyzer/

processing: /analyzer/input/664729755991408660/695984939132256326/channel.json cache: /analyzer/tmp/664729755991408660/695984939132256326/count_cache_channel.json
processing: /analyzer/input/664729755991408660/668262905204178979/channel.json cache: /analyzer/tmp/664729755991408660/668262905204178979/count_cache_channel.json
processing: /analyzer/input/664729755991408660/680104705707606100/channel.json cache: /analyzer/tmp/664729755991408660/680104705707606100/count_cache_channel.json
processing: /analyzer/input/664729755991408660/688126455187767320/channel.json cache: /analyzer/tmp/664729755991408660/688126455187767320/count_cache_channel.json
processing: /analyzer/input/664729755991408660/695985003548508220/channel.json cache: /analyzer/tmp/664729755991408660/695985003548508220/count_cache_channel.json
processing: /analyzer/input/guilds.json cache: /analyzer/tmp/count_cache_guilds.json
Reducing: Year


Found group: Python 2020 with elements: EX / ex01_cashier EX / ex02_cypher EX / ex0

Found group: ex / ex09-typegame with elements: ex / ex09-typegame 

Found group: tugigrupid / kaugõpe-3 with elements: tugigrupid / kaugõpe-3 

Found group: TUGIGRUPID / risto-tugigrupp with elements: TUGIGRUPID / risto-tugigrupp 

Found group: TUGIGRUPID / timo-tugigrupp with elements: TUGIGRUPID / timo-tugigrupp 

Found group: täiendusõpe / t09 with elements: täiendusõpe / t09 

Found group: tugigrupid / k14-1 with elements: tugigrupid / k14-1 

Found group: EX / ex09_recursive_calories with elements: EX / ex09_recursive_calories 

Found group: TUGIGRUPID / yes-you-can with elements: TUGIGRUPID / yes-you-can 

Found group: fun / arvaära with elements: fun / arvaära 

Found group: EX / ex13_board_games with elements: EX / ex13_board_games 

Found group: PR / pr15_lambda with elements: PR / pr15_lambda 

Found group: Text Channels / stat with elements: Text Channels / stat 

Found group: PR / pr13-singleton with elements: PR / pr13-singleton 

Found group: PR / pr08-geometry with eleme

Found group: EX / ex11_order with elements: EX / ex11_order 

Found group: PR / pr02-datastructures with elements: PR / pr02-datastructures 

Found group: EX / ex02-cpu with elements: EX / ex02-cpu 

Found group: xp / xp07_stargate with elements: xp / xp07_stargate 

Found group: EX / ex04_hobbies with elements: EX / ex04_hobbies 

Found group: EX / ex07_minesweeper with elements: EX / ex07_minesweeper 

Found group: xp / xp08_investor with elements: xp / xp08_investor 

Found group: EX / ex06-files with elements: EX / ex06-files 

Found group: Text Channels / gomoku with elements: Text Channels / gomoku 

Found group: Text Channels / tk with elements: Text Channels / tk 

Found group: PR / pr10-recursion with elements: PR / pr10-recursion 

Found group: EX / ex01 with elements: EX / ex01 

Found group: FUN / anime with elements: FUN / anime 

Found group: Kalmo / java with elements: Kalmo / java 

Found group: tugigrupid / r4 with elements: tugigrupid / r4 

Found group: general / eks

Found group: EX with elements: EX / ex01-id-code EX / ex02-cpu EX / ex03-social-network EX / ex04-bank EX / ex05-parking EX / ex06-files EX / ex07-typegame EX / ex08-cookie-clicker EX / ex09-computer-shop EX / ex11-birdwatching EX / ex12-sum100 EX / ex13-weather EX / ex14-furniture EX / ex15-casino EX / ex01 EX / ex02 EX / ex03_idcode EX / ex04_cipher EX / ex05_pies EX / ex06_schedule EX / ex07_minesweeper EX / ex08_solution_and_test EX / ex09_recursive_calories EX / ex11_order EX / ex12_adventure EX / ex13_blackjack EX / ex14_pokemon EX / ex15_magic EX / ex01_cashier EX / ex02_cypher EX / ex03_booksortation EX / ex04_hobbies EX / ex05_oee EX / ex06_messenger EX / ex07_train_station EX / ex08_formula_one EX / ex09_meta EX / ex11_bank EX / ex12_trees EX / ex13_board_games EX / ex14_line_following EX / ex15_santas_workshop 

Found group: TÄIENDUSÕPE with elements: TÄIENDUSÕPE / t01 TÄIENDUSÕPE / t02 TÄIENDUSÕPE / t03 TÄIENDUSÕPE / t04 TÄIENDUSÕPE / t05 TÄIENDUSÕPE / t06 TÄIENDUSÕPE / t07

In [5]:
# One reduced matrix per grouping: Year, Flat and Category.
print(len(matrices))

3
