conda install -c estnltk -c conda-forge estnltk=1.6.7b

#### Abstract Runner
A generic runner that uses a configuration object to process data files and caches the results per file.

In [1]:
from os import listdir
from os.path import isfile, join
import json

# Root directory handed to CachedRunner.invoke in the last cell; every file in
# it (and in its subfolders) is processed.
# NOTE(review): presumably holds exported Discord chat JSON files - confirm.
data_dir = "data/discord"

class CachedRunner:
    """
    Apply a configuration to every data file under a directory, keeping a
    per-file cache.

    For a data file ``<file>`` the cache lives next to it under the name
    ``<configuration.name>_cache_<file>``. The cache is created on first use,
    then loaded, updated with the messages of the data file and written back
    on every run.
    """

    def __init__(self, configuration):
        """
        configuration: object supplying ``name``, ``get_empty``, ``serialize``,
        ``deserialize``, ``apply`` and ``set_position``.
        """
        self.conf = configuration
        self.name = self.conf.name
        self.get_empty = self.conf.get_empty
        self.serialize = self.conf.serialize
        self.deserialize = self.conf.deserialize
        self.apply = self.conf.apply
        self.set_position = self.conf.set_position

    def invoke(self, directory):
        """
        Update the caches (creating them where missing) and return the results.

        directory: all files in the given folder and its subfolders are used;
        entries whose name contains "_cache_" are skipped.

        preconditions:
        term count of apply must be 2: (given_structure, discord_message)
        term count of serializer must be 1
        term count of deserializer must be 1
        every data file is JSON with a top-level "messages" list

        next functions must make inplace mutations:
        serializer, deserializer, function_to_apply

        postcondition:
        a list of filled versions of default_structures is returned,
        one per processed data file
        """
        # Loop-invariant, so build it once instead of once per entry.
        cache_file_prefix = self.name + "_cache_"
        all_processed_data = []

        for f in listdir(directory):
            # Never treat a cache file (of any configuration) as input data.
            if "_cache_" in f:
                continue

            cur_path = join(directory, f)
            # Tell the configuration which entry is being handled so that
            # get_empty() can record it.
            self.set_position(cur_path)

            if not isfile(cur_path):
                # Anything that is not a regular file is walked as a folder.
                all_processed_data += self.invoke(cur_path)
                continue

            cache_location = join(directory, cache_file_prefix + f)
            all_processed_data.append(
                self._process_file(cur_path, cache_location)
            )

        return all_processed_data

    def _process_file(self, data_path, cache_location):
        """Load (or create) the cache of one data file, apply its messages, save it."""
        # Create empty cache if needed. The payload is encoded *before* the
        # file is opened for writing, so a failing serializer cannot leave a
        # truncated, unreadable cache behind.
        if not isfile(cache_location):
            empty_payload = json.dumps(self.serialize(self.get_empty()))
            with open(cache_location, "w") as cache:
                cache.write(empty_payload)

        # Read current version
        with open(cache_location, "r") as cache:
            cur_layer = self.deserialize(json.load(cache))

        # Update cache; the data file is closed as soon as it has been parsed.
        with open(data_path, "r") as discord_data:
            raw_json = json.load(discord_data)

        for message in raw_json["messages"]:
            self.apply(cur_layer, message)

        # Save updated version (encode first, see above).
        payload = json.dumps(self.serialize(cur_layer))
        with open(cache_location, "w") as cache:
            cache.write(payload)

        return cur_layer

#### Configuration
Abstract configuration template that concrete configurations implement and that the runner (`CachedRunner`) consumes.

In [2]:
from abc import ABC, ABCMeta, abstractmethod, abstractproperty

class Configuration(ABC):
    """
    Template for the configuration objects consumed by the runner.

    Subclasses must provide ``name``, ``apply``, ``serialize`` and
    ``deserialize``; ``get_empty`` may be overridden to extend the default
    layer.

    The class derives from ``ABC`` so that the abstract members are actually
    enforced: Python 3 ignores a ``__metaclass__`` class attribute, which
    would leave the decorators below without any effect.
    """

    def __init__(self):
        # Path of the entry currently being processed; updated by the runner
        # through set_position().
        self.pos = "unknown"

    @property
    @abstractmethod
    def name(self):
        """Short identifier of the configuration (used in cache file names)."""
        return "generic"

    def get_empty(self):
        """Return a fresh layer that only records the current position."""
        return {
            "position": self.pos,
        }

    def get_position(self):
        """Return the position most recently stored with set_position()."""
        return self.pos

    def set_position(self, pos):
        """Remember ``pos`` as the entry currently being processed."""
        self.pos = pos

    @abstractmethod
    def apply(self, layer, message):
        """Fold one ``message`` into ``layer``."""
        return {}

    @abstractmethod
    def serialize(self, layer):
        """Turn ``layer`` into something ``json`` can dump."""
        return layer

    @abstractmethod
    def deserialize(self, layer):
        """Rebuild a layer from its JSON-loaded form."""
        return layer


#### Count configuration
Configuration implementation that counts how often each word n-gram (a run of 1 to 4 neighbouring words) occurs in the messages.

In [3]:
from estnltk import Text
from datetime import datetime
from collections import Counter
from collections import deque
import html
import re

class CountConfiguration(Configuration):
    """Configuration that tallies word n-grams (n = 1..4) of every message."""

    @property
    def name(self):
        """Identifier of this configuration."""
        return "count"

    def get_empty(self):
        """Return a fresh layer: position, newest timestamp seen, four empty counters."""
        layer = {
            "position": self.get_position(),
            "last_timestamp": 0,
        }
        for size in range(1, 5):
            layer[f"counter_{size}"] = Counter()
        return layer

    def apply(self, layer, message):
        """
        Add the n-grams of ``message["content"]`` to the counters of ``layer``.

        The message is ignored unless its timestamp is newer than
        ``layer["last_timestamp"]``, which is then advanced. ``layer`` is
        mutated in place; nothing is returned.
        """
        # NOTE(review): the optional fractional seconds and the UTC offset are
        # stripped, so the value is parsed as a naive datetime with
        # whole-second precision; a message that does not advance it is
        # skipped.
        stamp = re.sub(r"(\.\d+)?[\+\-]\d+:\d+", "", message["timestamp"])
        moment = datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S').timestamp()
        if moment <= layer["last_timestamp"]:
            return

        layer["last_timestamp"] = moment

        tagged = Text(html.unescape(message["content"])).tag_layer(["words"])

        # One sliding window per n-gram size; a full window is one n-gram.
        windows = {
            f"counter_{size}": deque(maxlen=size) for size in range(1, 5)
        }

        for word in tagged.words:
            token = word.text
            for key, window in windows.items():
                window.append(token)
                if len(window) == window.maxlen:
                    layer[key][" ".join(window)] += 1

    def serialize(self, layer):
        """Replace each counter of ``layer`` by a plain dict (in place) and return it."""
        for key in (f"counter_{size}" for size in range(1, 5)):
            layer[key] = dict(layer[key])
        return layer

    def deserialize(self, layer):
        """Replace each counter mapping of ``layer`` by a Counter (in place) and return it."""
        for key in (f"counter_{size}" for size in range(1, 5)):
            layer[key] = Counter(layer[key])
        return layer

In [4]:
# Process every file below data_dir with the counting configuration and keep
# the resulting layers (one per data file).
count_runner = CachedRunner(CountConfiguration()).invoke(data_dir)

# Display, per layer, the file it belongs to and the newest timestamp recorded.
[(layer["position"], layer["last_timestamp"]) for layer in count_runner]

[('data\\2020\\general\\Python 2020 - Text Channels - general.json',
  1601387696.0)]