conda install -c estnltk -c conda-forge estnltk=1.6.7b

In [41]:
from os import listdir
from os.path import isfile, join
import json

# Directory handed to the runner's invoke() call further down in this file.
data_dir = "data"

class AbstractRunner:
    """Run a per-message function over JSON files in a directory tree, keeping one cache file per input."""

    def __init__(self, cache_prefix, default_structure, function_to_apply, serializer, deserializer):
        """
        cache_prefix: prepended (followed by "_cache_") to an input file's name
            to form the name of its cache file.
        default_structure: initial value written to a cache file that does not
            exist yet (after being passed through serializer).
        function_to_apply: called as function_to_apply(structure, message) for
            every message of every input file.
        serializer: turns a structure into something json.dump can write.
        deserializer: turns the loaded JSON back into a structure.
        """
        self.cache_prefix = cache_prefix
        self.default_structure = default_structure
        self.function_to_apply = function_to_apply
        self.serializer = serializer
        self.deserializer = deserializer

    def invoke(self, directory):
        """
        Update the caches (creating them when missing) for every file in the
        given directory and its subdirectories, and return the results.

        preconditions:
        function_to_apply must take 2 arguments: (given_structure, message)
        serializer must take 1 argument
        deserializer must take 1 argument

        the following functions must mutate their argument in place:
        serializer, deserializer, function_to_apply

        every input file must be JSON with a top-level "messages" list

        postcondition:
        a list with one filled version of default_structure per input file is
        returned; each entry is the object that was handed to serializer when
        the cache was saved
        """
        # Loop-invariant, so compute it once per call.
        cache_file_prefix = self.cache_prefix + "_cache_"
        all_processed_data = []

        for f in listdir(directory):
            # Any entry with "_cache_" in its name is treated as a cache
            # (of this or another runner) and is never used as input.
            if "_cache_" in f:
                continue

            cur_path = join(directory, f)

            if not isfile(cur_path):
                # Not a regular file: descend into it as a subdirectory.
                all_processed_data += self.invoke(cur_path)
                continue

            cache_location = join(directory, cache_file_prefix + f)

            # Create empty cache if needed
            if not isfile(cache_location):
                with open(cache_location, "w", encoding="utf-8") as cache:
                    json.dump(self.serializer(self.default_structure), cache)

            # Read current version
            with open(cache_location, "r", encoding="utf-8") as cache:
                cur_layer = self.deserializer(json.load(cache))

            # Update cache. The encoding is explicit so that non-ASCII message
            # text does not depend on the platform's default encoding.
            with open(cur_path, "r", encoding="utf-8") as discord_data:
                raw_json = json.load(discord_data)

            for message in raw_json["messages"]:
                self.function_to_apply(cur_layer, message)

            # Save updated version
            with open(cache_location, "w", encoding="utf-8") as cache:
                json.dump(self.serializer(cur_layer), cache)

            all_processed_data.append(cur_layer)

        return all_processed_data

In [32]:
from estnltk import Text
from datetime import datetime
from collections import Counter
from collections import deque
import html
import re

# Empty n-gram statistics: the newest timestamp seen so far plus one Counter
# per window size (1 to 4 words).
count_repetitions_empty = {
    "last_timestamp": 0,
    **{f"counter_{size}": Counter() for size in range(1, 5)},
}

def count_serializer(layer):
    """Replace counter_1..counter_4 in layer with plain dicts, in place, and return layer."""
    for key in (f"counter_{size}" for size in range(1, 5)):
        layer[key] = dict(layer[key])
    return layer

def count_deserializer(layer):
    """Replace counter_1..counter_4 in layer with Counter objects, in place, and return layer."""
    for key in (f"counter_{size}" for size in range(1, 5)):
        layer[key] = Counter(layer[key])
    return layer

def update_count_repetitions(layer, message):
    """
    Add the 1- to 4-word sequences of one message to the counters in layer.

    layer: dict with "last_timestamp" and "counter_1".."counter_4"; mutated in place.
    message: dict with a "timestamp" string and a "content" string.

    A message is only counted when its timestamp is strictly newer than
    layer["last_timestamp"]; that value is then advanced to the message's time.
    """
    # Strip the optional fractional seconds and the numeric UTC offset, then
    # parse what is left as a naive datetime.
    bare_stamp = re.sub(r"(\.\d+)?[\+\-]\d+:\d+", "", message["timestamp"])
    message_time = datetime.strptime(bare_stamp, '%Y-%m-%dT%H:%M:%S').timestamp()

    # NOTE(review): times are compared at whole-second resolution with a strict
    # ">", so a message in the same second as the previous one is skipped.
    if not message_time > layer["last_timestamp"]:
        return

    layer["last_timestamp"] = message_time
    tagged = Text(html.unescape(message["content"])).tag_layer(["words"])

    # One sliding window per n-gram size, keyed by the counter it feeds.
    windows = {f"counter_{size}": deque(maxlen=size) for size in range(1, 5)}

    for word in tagged.words:
        for counter_key, window in windows.items():
            window.append(word.text)
            # Only count once the window holds a full n-gram.
            if len(window) == window.maxlen:
                layer[counter_key][" ".join(window)] += 1


In [42]:
# Run the n-gram counting over every file under data_dir; despite its name,
# count_runner holds the list returned by invoke(), not the runner object.
count_runner = AbstractRunner(
    cache_prefix="count",
    default_structure=count_repetitions_empty,
    function_to_apply=update_count_repetitions,
    serializer=count_serializer,
    deserializer=count_deserializer,
).invoke(data_dir)

count_runner

[{'last_timestamp': 1601387696.0,
  'counter_1': {'Tere': 24,
   '!': 86,
   '👋': 5,
   'hei': 10,
   'Hei': 33,
   'kõigele': 1,
   '`': 73,
   'py': 4,
   'print': 18,
   '(': 210,
   "'": 38,
   'Hello': 2,
   'World': 2,
   ')': 246,
   '😉': 14,
   '>': 153,
   'pm': 2,
   'viis': 5,
   'käes': 3,
   ':': 149,
   'carmsHi': 1,
   '"': 268,
   'on': 1189,
   'vist': 117,
   'kõige': 24,
   'populaarsem': 1,
   '.': 1204,
   'Kõige': 1,
   'ilusama': 1,
   'kõlaga': 1,
   '\\:': 6,
   'D': 8,
   'monkahmm': 1,
   'lõpuks': 17,
   'peaks': 106,
   'siin': 94,
   'u': 7,
   '500': 11,
   'kasutajat': 2,
   'olema': 87,
   'saame': 17,
   'hei"-d': 2,
   '?': 918,
   '🙂': 119,
   '👋🏻': 1,
   'Heia': 1,
   '😊🤟': 1,
   'Heihoo': 1,
   '😊': 2,
   '😄': 170,
   '@': 296,
   'Ago': 102,
   'See': 76,
   'siis': 478,
   'üks': 36,
   'esimesi': 1,
   'grupiülesandeid': 1,
   ',': 2182,
   'kus': 77,
   'meeskonna': 1,
   'tööd': 16,
   'hinnata': 1,
   'HEI': 1,
   '👋🏼': 1,
   'TriinuAdelaide'