In [1]:
import srsly
import os
# Switch the working directory to the parent folder; the relative 'data/...'
# paths used in later cells are resolved against it.
# NOTE(review): this is not idempotent - re-running the cell without restarting
# the kernel moves up one more level each time.
os.chdir('..')

In [2]:
def lines_to_text(lines, title_field, text_field):
    """Yield one plain-text string per record, built as "<title>. <text>".

    Parameters
    ----------
    lines : iterable
        Records that can be indexed by ``title_field`` and ``text_field``
        (e.g. the dicts produced by ``srsly.read_jsonl``).
    title_field : hashable
        Key of the title value inside each record.
    text_field : hashable
        Key of the body-text value inside each record.

    Yields
    ------
    str
        The stripped title followed by the stripped text. A non-empty title
        is terminated with ". " (or just " " when it already ends with a
        period). An empty title contributes nothing. When the text is
        missing or is not a string, a warning is emitted and only the title
        part is yielded, so one bad record never stalls the whole run.

    Raises
    ------
    KeyError, AttributeError
        If the title field is missing or is not a string (unchanged from the
        previous behaviour).
    """
    import warnings  # local import so this cell does not depend on another cell

    for line in lines:
        title = line[title_field].strip()
        if not title:
            result = ''
        elif title.endswith('.'):
            # Title already closes its sentence: only a separating space is needed.
            result = title + ' '
        else:
            result = title + '. '

        try:
            result += line[text_field].strip()
        except (KeyError, AttributeError, TypeError) as err:
            # Missing key, None, or a non-string value: keep the title-only
            # text instead of dropping into an interactive debugger.
            warnings.warn(
                f"record has no usable {text_field!r} value ({err!r}); "
                f"using the title only: {line!r}"
            )
        yield result


In [21]:
# Lazily turn each metrics record into a single "<title>. <text>" string.
metrics = lines_to_text(
    lines=srsly.read_jsonl('data/metrics.jsonl'),
    title_field="title",
    text_field="text",
)

In [22]:
# Lazily turn each ENVO record into a single "<title>. <text>" string.
envo = lines_to_text(
    lines=srsly.read_jsonl('data/envo.jsonl'),
    title_field="title",
    text_field="text",
)

In [23]:
# EU project records are used as-is: only their 'text' value is kept (lazy generator).
eu_projects = (
    record['text']
    for record in srsly.read_jsonl('data/eu_projects.jsonl')
)

In [24]:
from itertools import chain

# One lazy stream over the three sources, consumed in this order.
lines = chain.from_iterable((metrics, envo, eu_projects))

In [25]:
# Materialise the lazy stream; `lines` is exhausted after this cell has run.
corpus = list(lines)

In [26]:
# Number of documents in the combined corpus.
len(corpus)

56899

In [30]:
from tqdm.autonotebook import tqdm
# Imported in this cell because it is the first one that calls get_nounchunks;
# without it the notebook fails with a NameError on Restart & Run All.
from scripts.nounchunker import get_nounchunks

# Run the noun-chunk extraction over the whole corpus and keep the results in
# memory. This is the slow step of the notebook (about 15 minutes in the
# recorded run), so avoid re-running it needlessly.
# NOTE(review): presumably yields one list of chunk strings per document, as
# the `chunk_corpus[0]` output suggests - confirm against scripts/nounchunker.
chunk_corpus = list(tqdm(get_nounchunks(corpus, "en_core_web_sm")))

56899it [15:35, 60.80it/s]
0it [17:29, ?it/s]


In [31]:
# Peek at the chunks extracted for the first document as a sanity check.
chunk_corpus[0]

['ratio',
 'basic salary',
 'remuneration',
 'women',
 'men',
 'employee category',
 'significant locations',
 'operations']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scripts.nounchunker import get_nounchunks
import spacy
import re

# spaCy's English stop-word set, taken from the loaded model's language defaults.
stop_words = spacy.load('en_core_web_sm').Defaults.stop_words

# Any character other than an ASCII letter, a hyphen or whitespace disqualifies
# a chunk. `search` is used so the whole chunk is inspected, including text
# after a line break (a `.*` prefix with `match` stops at the first newline).
_DISALLOWED_CHAR = re.compile(r"[^-a-zA-Z\s]")
_WHITESPACE_RUN = re.compile(r"\s+")


def noun_chunk_analyzer(chunks):
    """Turn one document's noun chunks into the features used by the vectorizer.

    Parameters
    ----------
    chunks : iterable of str
        The noun chunks of a single document (one element of ``chunk_corpus``).

    Returns
    -------
    list of str
        Chunks longer than two characters that are not stop words (compared
        case-insensitively) and consist only of ASCII letters, hyphens and
        whitespace. Every whitespace run is collapsed to a single space;
        casing is left untouched.
    """
    return [
        _WHITESPACE_RUN.sub(' ', chunk)
        for chunk in chunks
        if len(chunk) > 2
        and chunk.lower() not in stop_words
        and not _DISALLOWED_CHAR.search(chunk)
    ]


# The documents are already lists of chunks, so a callable analyzer is used.
# With a callable analyzer scikit-learn skips its own preprocessing,
# tokenisation and stop-word removal, so `stop_words=` and `token_pattern=`
# would be ignored and are deliberately not passed.
vectorizer = TfidfVectorizer(
    min_df=50,    # keep chunks that occur in at least 50 documents
    max_df=0.1,   # ...and in at most 10% of all documents
    sublinear_tf=True,
    smooth_idf=True,
    analyzer=noun_chunk_analyzer,
)

X = vectorizer.fit_transform(chunk_corpus)

# `get_feature_names` was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old method on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

print(len(feature_names))
print(feature_names)


In [45]:
# Persist the fitted vocabulary, one feature per line.
# `get_feature_names` was removed in scikit-learn 1.2; use the replacement
# when it exists and fall back to the old method on older releases.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

# Explicit encoding so the file does not depend on the platform default.
with open('data/extracted_concepts.txt', 'w', encoding='utf-8') as f:
    # writelines expects an iterable of lines; hand it one string per feature.
    f.writelines(f"{feature}\n" for feature in feature_names)

In [68]:
import pandas as pd


def read_concepts(path):
    """Load a headerless concept list into a DataFrame of strings.

    Parameters
    ----------
    path : str or path-like
        File with one concept per line.

    Returns
    -------
    pandas.DataFrame
        Frame with integer column labels (no header row is consumed).
        Values are kept as strings and pandas' default NA parsing is
        switched off, so concept strings such as "NA", "null" or "nan" are
        kept as text instead of silently turning into NaN.
    """
    return pd.read_csv(path, header=None, dtype=str, keep_default_na=False)


envo_concepts = read_concepts('data/envo_concepts.txt')
linkedsdg_concepts = read_concepts('data/linkedsdg_concepts.txt')
extracted_concepts = read_concepts('data/extracted_concepts.txt')

In [69]:
# Stack the three concept tables row-wise; the original row indices are kept.
df = pd.concat(
    objs=[envo_concepts, linkedsdg_concepts, extracted_concepts],
    axis=0,
)

In [70]:
# Rows that repeat an earlier row exactly (first occurrences are not counted).
print('Number of duplicates')
int(df.duplicated().sum())

Number of duplicates


1032

In [71]:
# Write the combined concept table without index or header row.
# NOTE(review): `df` still contains the duplicate rows counted in the previous
# cell - confirm whether they should be dropped before writing.
df.to_csv('data/concepts.txt', index=False, header=False)

In [72]:
# Give the first (integer-labelled) column a readable name.
df = df.rename(columns={0: "concept"})

In [73]:
# Replace missing values with empty strings so the string operations below work.
df = df.fillna(value='')

In [75]:
# A blank English pipeline is enough here: only its tokenizer is used.
nlp = spacy.blank("en")

# Emit one token-level pattern per concept: each token must match on its
# lowercase form. Concepts that tokenize to nothing are skipped.
srsly.write_jsonl(
    "data/concepts.jsonl",
    (
        {
            "label": "CONCEPT",
            "pattern": [{"LOWER": token.lower_} for token in doc],
        }
        for doc in nlp.tokenizer.pipe(df["concept"].str.lower())
        if len(doc) != 0
    ),
)
