In [None]:
%load_ext autoreload
%autoreload 2

import os
import sys
import json
from functools import reduce

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer


# Put the parent directory on the import path (presumably so project-local
# modules are importable from this notebook's folder).
sys.path.extend(["../"])

# spaCy pipeline used by the cells below for tokens, sentences and entities.
nlp = spacy.load("nl_core_news_sm")

In [None]:
# Load every JSON file in the data directory and keep only the sources that
# contain at least one document with non-empty text.
sources = []
# os.scandir yields entries in arbitrary order; sorting by file name keeps the
# order of `sources` (and everything derived from it) stable between runs.
for entry in sorted(os.scandir("../data/stimuleringsregeling/"), key=lambda item: item.name):
    if not entry.is_file():
        continue
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(entry.path, encoding="utf-8") as source_file:
        source = json.load(source_file)
    # `or []` also covers a "documents" key that is present but null.
    if any(doc.get("text") for doc in source.get("documents") or []):
        sources.append(source)

# A source is kept when *any* of its documents has text, so individual
# documents may still lack it; skip those here to avoid KeyError / None values
# reaching spaCy and the TF-IDF vectorizer.
source_documents = [
    doc["text"]
    for source in sources
    for doc in source["documents"]
    if doc.get("text")
]

In [None]:
# Parse the source texts with spaCy. Texts at or above the pipeline's own
# length limit (nlp.max_length) are skipped, because spaCy refuses to process
# texts longer than that limit.
parsable_texts = [text for text in source_documents if len(text) < nlp.max_length]
# nlp.pipe processes the texts as a stream, which is the batch equivalent of
# calling nlp(text) on each one.
documents = list(nlp.pipe(parsable_texts))

documents_count = len(documents)
token_count = sum(len(doc) for doc in documents)
sentence_count = sum(1 for doc in documents for _ in doc.sents)
# spaCy sometimes yields entities whose text is only whitespace; those are
# not counted.
entity_count = sum(
    1
    for doc in documents
    for ent in doc.ents
    if ent.text.strip()
)

# NOTE(review): the label says "sources" but the value is the number of parsed
# documents; left unchanged so the printed output stays the same.
print("Total sources: {}".format(documents_count))
if documents_count:
    print("Average document length: {}".format(token_count / documents_count))
    print("Average sentences: {}".format(sentence_count / documents_count))
    print("Average entities: {}".format(entity_count / documents_count))
else:
    # Without this guard the averages would raise ZeroDivisionError.
    print("No documents were parsed; averages are undefined")

In [None]:
# Eyeball the first parsed document: its sentences, then its entities
# (entities whose text is only whitespace are left out).
doc = documents[0]
print([*doc.sents])
print([entity for entity in doc.ents if entity.text.strip()])

In [None]:
# Fit a TF-IDF vectorizer (default settings) on all source texts.
# fit() returns the vectorizer itself, so construction and fitting can be chained.
tfidf_vectorizer = TfidfVectorizer().fit(source_documents)
# Keep the fitted estimator as the cell's displayed value.
tfidf_vectorizer

In [None]:
# For a source, sum the TF-IDF vectors of its documents and look up the summed
# weight of every word that occurs in the source's keywords.
for source in sources:
    # Use a dedicated name: rebinding `documents` here would clobber the list
    # of parsed spaCy documents built in an earlier cell.
    source_texts = [
        doc["text"]
        for doc in source.get("documents") or []
        if doc.get("text")  # documents without text cannot be vectorized
    ]
    vectors = tfidf_vectorizer.transform(source_texts)
    source_vector = vectors.sum(axis=0)  # one row: summed weight per vocabulary term
    vocabulary = tfidf_vectorizer.vocabulary_

    keyword_tfidf_values = {}
    # `or []` also covers a "keywords" key that is present but null.
    for keyword in source.get("keywords") or []:
        # split() without an argument drops the empty strings that
        # split(" ") produces for repeated spaces.
        for word in keyword.lower().split():
            column = vocabulary.get(word)
            # Words the vectorizer never saw have no column; report them with
            # weight 0.0 instead of raising KeyError.
            keyword_tfidf_values[word] = (
                float(source_vector[0, column]) if column is not None else 0.0
            )
    print(keyword_tfidf_values)
    # Only the first source is inspected (presumably a debugging aid).
    break