In [None]:
import os

# Work from the directory two levels above the notebook (presumably the
# project root -- later cells read "assets/..." relative to it).
# The starting directory is remembered the first time this cell runs, so
# re-executing the cell always lands in the same place instead of climbing
# two more levels on every run.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, '../..'))

In [None]:
import spacy
import srsly

In [None]:
import pandas as pd

In [None]:
# Load the concept and country source indexes; both files are tab-separated.
tsv_kwargs = {"sep": "\t"}
concepts = pd.read_csv("assets/concepts-source-index.tsv", **tsv_kwargs)
countries = pd.read_csv("assets/countries-source-index.tsv", **tsv_kwargs)

In [None]:
# Display the concepts table loaded from the source index.
concepts

In [None]:
# Display the countries table loaded from the source index.
countries

In [None]:
# Load the spaCy pipeline that every later cell uses through `nlp`.
SPACY_MODEL = 'en_core_web_lg'
nlp = spacy.load(SPACY_MODEL)

In [None]:
import re

# Lower-cased concept labels, computed once and reused for both outputs below.
concept_labels = concepts.label.str.lower()


def _concept_patterns(labels):
    """Yield one CONCEPT token-pattern entry per tokenized label."""
    for label_doc in nlp.tokenizer.pipe(labels):
        token_specs = [{"LOWER": token.lower_} for token in label_doc]
        yield {"label": "CONCEPT", "pattern": token_specs}


# NOTE(review): the two outputs use differently rooted relative paths
# ("data/..." vs "../assets/...") -- confirm both resolve from the working
# directory this notebook runs in.
srsly.write_jsonl('data/linkedsdg_concepts.jsonl', _concept_patterns(concept_labels))
concept_labels.to_csv("../assets/linkedsdg_concepts.txt", index=False, header=False)

In [None]:
from spacy import displacy

# Show the entity visualization inline. displacy.serve() would start a web
# server and block the kernel until interrupted; displacy.render() is the
# notebook-friendly equivalent and produces the same markup.
displacy.render(nlp("I live in south of Europe."), style='ent')

In [None]:
from spacy.matcher import PhraseMatcher

# Phrase matcher sharing the pipeline's vocab; attr="LOWER" makes it compare
# the lower-cased form of each token, so matching is case-insensitive.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

In [None]:
# Tokenize every lower-cased concept label; each resulting Doc is one phrase
# pattern for the matcher.
lowercase_labels = concepts.label.str.lower()
patterns = [label_doc for label_doc in nlp.tokenizer.pipe(lowercase_labels)]

In [None]:
# Register all concept patterns under the single match key "Concepts".
# NOTE(review): re-running this cell adds the same patterns to the existing
# matcher again -- presumably harmless, but recreate the matcher if in doubt.
matcher.add("Concepts", patterns)

In [None]:
# Sample sentence run through the full pipeline; used by the lemma, matcher
# and visualization cells that follow.
doc = nlp("I'm not sure about the 02 Crimea questions...")

In [None]:
# Inspect the lemma of the token at index 6 of the sample doc
# (presumably the "02" token -- verify against the tokenization).
doc[6].lemma_

In [None]:
# Print the span of `doc` covered by every phrase match.
for _match_id, span_start, span_end in matcher(doc):
    matched_span = doc[span_start:span_end]
    print("Matched based on 02 lowercase token text:", matched_span)

In [None]:
from spacy import displacy

# Show the dependency parse inline ('dep' is the style displacy.serve() used
# by default). displacy.serve() would start a web server and block the kernel
# until interrupted; displacy.render() is the notebook-friendly equivalent.
displacy.render(doc, style='dep')

In [None]:
def clean_nounchunk(nc):
    """Return a normalized, lower-cased string for a noun chunk.

    Tokens tagged as determiner, numeral or punctuation are dropped. The text
    of each remaining token is lower-cased and stripped, and the pieces are
    joined with single spaces. Tokens whose text is empty after stripping
    (for example whitespace or newline tokens) are skipped, so they cannot
    leave double spaces inside the result.

    Args:
        nc: Iterable of token-like objects exposing ``text`` and ``pos_``.

    Returns:
        The cleaned chunk as a string; empty if no token survives.
    """
    skipped_pos = {'DET', 'NUM', 'PUNCT'}
    words = (tok.text.lower().strip() for tok in nc if tok.pos_ not in skipped_pos)
    return ' '.join(word for word in words if word)

In [None]:
# Start the corpus: one entry per sentence, each entry being the list of
# cleaned noun chunks found in that sentence of the raw annotation texts.
# The texts are streamed through nlp.pipe(), which processes them in batches
# instead of one call per item, and a dedicated loop variable keeps the
# global `doc` from the earlier demo cells intact.
texts = []
raw_annotation_texts = (item['text'] for item in srsly.read_jsonl('../assets/annotations_raw.jsonl'))
for annotation_doc in nlp.pipe(raw_annotation_texts):
    texts.extend(
        [clean_nounchunk(nc) for nc in sent.noun_chunks]
        for sent in annotation_doc.sents
    )


In [None]:
# Append the sentences of the document collection to `texts`, in the same
# shape: one list of cleaned noun chunks per sentence.
# The texts are streamed through nlp.pipe(), which processes them in batches
# instead of one call per item, and a dedicated loop variable keeps the
# global `doc` from the earlier demo cells intact.
# NOTE(review): this cell only appends -- re-running it without first
# re-running the cell that resets `texts` duplicates these sentences.
document_texts = (item['text'] for item in srsly.read_jsonl('../assets/documents.jsonl'))
for document_doc in nlp.pipe(document_texts):
    texts.extend(
        [clean_nounchunk(nc) for nc in sent.noun_chunks]
        for sent in document_doc.sents
    )


In [None]:
# 0.1% of the number of collected sentences (`texts` holds one entry per
# sentence). NOTE(review): presumably a sanity check for the document-frequency
# thresholds used by the vectorizer -- confirm.
len(texts)*0.001

In [None]:
# Stop words = spaCy's defaults plus every lower-cased country and concept
# label. The defaults are copied first: `nlp.Defaults.stop_words` is a set
# shared through the language defaults, and updating it in place with `|=`
# would silently add the labels to spaCy's own stop-word list (and keep
# growing it on every re-run of this cell).
stop_words = set(nlp.Defaults.stop_words)
stop_words |= set(countries.label.str.lower())
stop_words |= set(concepts.label.str.lower())

In [None]:
import re

from sklearn.feature_extraction.text import TfidfVectorizer


def _noun_chunk_analyzer(chunks):
    """Return the lower-cased noun chunks of one sentence kept as features.

    A chunk is dropped when its lower-cased form is in ``stop_words`` or when
    it matches the digit pattern.
    """
    return [
        chunk.lower()
        for chunk in chunks
        if chunk.lower() not in stop_words and not re.match(r'.*\d+', chunk)
    ]


# Each "document" passed to the vectorizer is one sentence, already split into
# noun chunks, so the analyzer is a callable that just filters those chunks.
# `stop_words=` and `ngram_range=` are intentionally not passed: scikit-learn
# ignores both when `analyzer` is a callable (and warns about it), so the
# stop-word filtering is done inside the analyzer itself.
vectorizer = TfidfVectorizer(min_df=3, max_df=0.005, analyzer=_noun_chunk_analyzer)
X = vectorizer.fit_transform(texts)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
feature_names

In [None]:
# Read sdgio.json directly into a DataFrame for a first look at its layout.
# NOTE(review): the file is decoded as cp1252 -- confirm that matches how the
# file was written.
sdgio = pd.read_json(
    '../../sdgio/sdgio.json',
    encoding="cp1252",
)

In [None]:
# Display the DataFrame that pandas built from sdgio.json.
sdgio

In [None]:
import json

# Re-read the same file with the json module to keep the nested structure.
# Note that `sdgio` is rebound here from a DataFrame to the parsed JSON object.
with open('../../sdgio/sdgio.json', encoding="cp1252") as sdgio_file:
    sdgio = json.load(sdgio_file)

# The node list of the first graph in the file.
first_graph = sdgio['graphs'][0]
nodes = first_graph['nodes']

In [None]:
# Tabulate the graph nodes and preview the first ten rows.
df = pd.DataFrame(nodes)
df.head(10)

In [None]:
# Last ten nodes whose type is INDIVIDUAL.
is_individual = df["type"] == 'INDIVIDUAL'
df.loc[is_individual].tail(10)

In [None]:
# Distinct id prefixes: the part of each id before the first literal ".org".
# The dot is escaped because str.split treats a multi-character pattern as a
# regular expression, where a bare "." matches any character (so "/org" or
# "-org" would also have been split points).
# .str[0] replaces apply(lambda x: x[0]) so that missing ids stay NaN instead
# of raising a TypeError.
df.id.str.split(r'\.org').str[0].unique()