In [None]:
%load_ext autoreload
%autoreload 2

import os
import sys
import json
from functools import reduce

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer


# Add the parent directory to the module search path, presumably so project-local
# modules can be imported from this notebook.
# NOTE(review): no project-local import is visible in the cells shown — confirm it is still needed.
sys.path.append("../")
# Load the spaCy pipeline used for all parsing in this notebook
# (`nl_core_news_sm` is spaCy's small Dutch model).
nlp = spacy.load("nl_core_news_sm")

In [None]:
# Load every source JSON file from the data directory, keeping only the sources
# that contain at least one document with non-empty text.
sources = []
# os.scandir yields entries in arbitrary order; sorting by file name keeps the order
# of `sources` (and everything derived from it) reproducible between runs.
for entry in sorted(os.scandir("../data/stimuleringsregeling/"), key=lambda item: item.name):
    if not entry.is_file():
        continue
    # Read as UTF-8 explicitly instead of relying on the platform's default encoding.
    with open(entry.path, encoding="utf-8") as source_file:
        source = json.load(source_file)
    if any(doc.get("text") for doc in source.get("documents", [])):
        sources.append(source)

# A kept source only needs ONE document with text, so its other documents may still
# lack text. Skip those here instead of raising KeyError or handing None to spaCy
# and the vectorizer further down.
source_documents = [
    doc["text"]
    for source in sources
    for doc in source["documents"]
    if doc.get("text")
]

In [None]:
# Parse the source texts with spaCy and print corpus-level statistics.

# spaCy refuses texts longer than `nlp.max_length` characters (1,000,000 by default),
# so longer texts are left out of the parsed corpus.
parsable_texts = [text for text in source_documents if len(text) < nlp.max_length]
# nlp.pipe processes the texts as a batched stream, which is faster than one nlp() call per text.
documents = list(nlp.pipe(parsable_texts))

documents_count = len(documents)
token_count = sum(len(doc) for doc in documents)
sentence_count = sum(1 for doc in documents for _ in doc.sents)
entity_count = sum(
    1
    for doc in documents
    for ent in doc.ents
    if ent.text.strip()  # spaCy can return whitespace-only entities; don't count them
)

# The count is the number of parsed documents (not sources), and the averages are per document.
print("Total documents: {}".format(documents_count))
if documents_count:
    print("Average document length: {}".format(token_count / documents_count))
    print("Average sentences: {}".format(sentence_count / documents_count))
    print("Average entities: {}".format(entity_count / documents_count))
else:
    # Avoid a ZeroDivisionError when nothing could be parsed.
    print("No documents were parsed; averages are undefined.")

In [None]:
# Inspect the first parsed document: its sentences and its non-blank entities.
doc = documents[0]
first_doc_sentences = [sentence for sentence in doc.sents]
# Entities whose text is only whitespace are left out.
first_doc_entities = [entity for entity in doc.ents if entity.text.strip()]
print(first_doc_sentences)
print(first_doc_entities)

In [None]:
# Fit a unigram TF-IDF model (default settings) on all source texts.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(source_documents)
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of `get_feature_names_out()`. Prefer the new method and fall back for old versions.
# `.tolist()` turns the returned ndarray into a plain list of Python strings, matching
# what the old method returned.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

In [None]:
# Score every source by the mean TF-IDF weight of its own keywords within its
# documents, then keep the 20 best-scoring sources as (score, source) tuples.
sources_by_keywords_tfidf = []
vocabulary = tfidf_vectorizer.vocabulary_

for source in sources:

    keywords = source.get("keywords", [])
    if not keywords:
        continue

    # Deliberately not named `documents`: that name holds the parsed spaCy docs from
    # the statistics cell and must not be overwritten by this loop.
    # Documents without text are skipped so the vectorizer never receives None.
    source_texts = [doc["text"] for doc in source.get("documents", []) if doc.get("text")]
    vectors = tfidf_vectorizer.transform(source_texts)
    source_vector = vectors.sum(axis=0).A1  # summed TF-IDF weight per vocabulary term

    # Keywords are split on whitespace because the vectorizer only knows unigrams.
    # Lower-case BEFORE the vocabulary check: TfidfVectorizer lower-cases its input by
    # default, so testing the original casing silently dropped capitalised keywords.
    lowered_words = (word.lower() for keyword in keywords for word in keyword.split())
    source["keywords_tfidf"] = {
        word: source_vector[vocabulary[word]]  # weight at the index for the word
        for word in lowered_words
        # Words the tokenizer never emits are skipped. The default token pattern keeps
        # only runs of two or more word characters, so a hyphenated word such as
        # 'e-module' is split up and never appears in the vocabulary as a whole.
        if word in vocabulary
    }

    keywords_tfidf_values = list(source["keywords_tfidf"].values())
    if not keywords_tfidf_values:  # all keywords of this source are outside the vocabulary
        continue
    keywords_tfidf_score = sum(keywords_tfidf_values) / len(keywords_tfidf_values)
    sources_by_keywords_tfidf.append((keywords_tfidf_score, source))

# Highest mean keyword weight first; only the top 20 sources are analysed further.
sources_by_keywords_tfidf.sort(key=lambda element: element[0], reverse=True)
target_sources = sources_by_keywords_tfidf[:20]

In [None]:
# For every selected source, store the (up to) ten nouns with the highest summed
# TF-IDF weight across its documents under source["noun_tfidf"].

# Not needed by the loop below: the former `id not in target_sources_ids` check could
# never be true, because the set is built from the very list being iterated.
# NOTE(review): kept only in case other cells read this name — remove if none do.
target_sources_ids = {source["id"] for _, source in target_sources}

for _, source in target_sources:

    # Deliberately not named `documents`: that name holds the parsed spaCy docs from
    # the statistics cell. Skip documents without text, and texts that spaCy would
    # reject for exceeding `nlp.max_length` (the same limit the statistics cell applies).
    source_texts = [
        doc["text"]
        for doc in source.get("documents", [])
        if doc.get("text") and len(doc["text"]) < nlp.max_length
    ]
    # Reduce each document to its nouns, so that only nouns receive TF-IDF weight.
    noun_documents = [
        " ".join(
            token.text for token in parsed_doc
            if token.pos == spacy.parts_of_speech.NOUN
        )
        for parsed_doc in nlp.pipe(source_texts)
    ]
    if not noun_documents:
        # Nothing parsable for this source; the vectorizer cannot transform zero samples.
        source["noun_tfidf"] = {}
        continue

    vectors = tfidf_vectorizer.transform(noun_documents)
    source_vector = vectors.sum(axis=0).A1  # summed TF-IDF weight per vocabulary term
    highest_tfidf_indexes = source_vector.argsort()[-10:]  # ascending; last is the strongest

    source["noun_tfidf"] = {
        feature_names[ix]: source_vector[ix]
        for ix in highest_tfidf_indexes
        # With fewer than ten distinct in-vocabulary nouns, the top ten would otherwise
        # be padded with arbitrary terms of weight 0 that never occur in this source.
        if source_vector[ix] > 0
    }


In [None]:
# Persist the selected sources. Each element of `target_sources` is a (score, source)
# tuple, which JSON stores as a two-element array.
results_path = "../results/tfidf-keywords.json"
# Create the output directory if needed, so the notebook doesn't fail on its last
# cell after all the expensive parsing has already been done.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
with open(results_path, "w", encoding="utf-8") as json_file:
    json.dump(target_sources, json_file, indent=4)