In [1]:
import os, json

import spacy
import pytextrank

from tqdm import tqdm

from textrank import Textrank

In [2]:
data_path = "../data/Inspec/"
data_path_doc = data_path + 'docsutf8/'
data_path_key = data_path + 'keys/'
data_path_train_jsonl = data_path + 'train.jsonl'
data_path_test_jsonl = data_path + 'test.jsonl'
data_path_valid_jsonl = data_path + 'valid.jsonl'

In [3]:

# dirname = os.path.abspath(os.path.dirname(__file__))
dirname = os.path.abspath("")

dataset_inspec = []

data_path_doc_abs = os.path.join(dirname, data_path_doc)
data_path_key_abs = os.path.join(dirname, data_path_key)

for root, dirs, files in os.walk(data_path_doc):
    for file_doc in sorted(files):
        file_key = file_doc.replace(".txt", ".key")
        file_key_abs = os.path.join(data_path_key_abs, file_key)
        file_doc_abs = os.path.join(data_path_doc_abs, file_doc)
        if not os.path.exists(file_doc_abs) or not os.path.exists(file_key_abs):
            continue

        document = ""
        keywords = []
        with open(file_doc_abs, "r") as f:
            document = f.read().replace("\n", "").replace("\t", " ")
        with open(file_key_abs, "r") as f:
            keywords = f.readlines()
            keywords = [keyword.strip().replace("\t", "") for keyword in keywords]

        dataset_inspec.append((document, keywords))

In [4]:
dirname = os.path.abspath("")

dataset_inspec = []

data_path_abs = os.path.join(dirname, data_path_valid_jsonl)

with open(data_path_abs, "r") as f:
    dataset_inspec = list(f)

dataset_inspec = [json.loads(json_str) for json_str in dataset_inspec]

for idx, dict_doc in enumerate(dataset_inspec):
    dataset_inspec[idx]['doc'] = " ".join([token for token in dict_doc['document'] if token not in ["-LRB-", "-RRB-", "-LSB-", "-RSB-"]])

dataset_inspec = [(dict_doc['doc'], dict_doc['extractive_keyphrases']) for dict_doc in dataset_inspec]

In [5]:
import datetime


nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")

def get_key_phrases(doc, k=10, **config):
    method = config['method']
    if method == "textrank":
        textrank = Textrank(k=k, modified=config['modified'], window_size=config['window_size'], nlp=config['nlp'])
        textrank.pipeline(doc)
        return [candidate for candidate, _ in textrank.candidates]
    elif method == "spacy":
        start = datetime.datetime.now()
        spacy_doc = nlp(doc)
        return [phrase.text for phrase in spacy_doc._.phrases[:k]]

In [6]:
keyword_sizes = [len(key) for doc, key in dataset_inspec]
sum(keyword_sizes) / len(keyword_sizes) # average keyword k ~= 6

5.952

In [7]:
def experiment(**config):
    TP = FP = TN = FN = 0

    for doc, key in tqdm(dataset_inspec):

        k = 6 # len(key)

        result = get_key_phrases(doc, k=k, **config)

        for positive in result:
            if positive not in key:
                FP += 1
            else:
                TP += 1

        for real in key:
            if real not in result:
                FN += 1

    precision = TP / (TP + FP) if (TP + FP) > 0 else -1
    recall = TP / (TP + FN) if (TP + FN) > 0 else -1
    accuracy = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else -1

    return precision, recall, accuracy

In [8]:
method = ["textrank", "textrank_mod", "spacy"]

nlp_textrank = spacy.load("en_core_web_sm")

configs = [
    {
        "method": "textrank",
        "modified": False,
        "window_size": 2,
        "nlp": None
    },
    {
        "method": "textrank",
        "modified": False,
        "window_size": 3,
        "nlp": None
    },
    {
        "method": "textrank",
        "modified": False,
        "window_size": 4,
        "nlp": None
    },
    {
        "method": "textrank",
        "modified": True,
        "window_size": 2,
        "nlp": None
    },
    {
        "method": "textrank",
        "modified": True,
        "window_size": 3,
        "nlp": None
    },
    {
        "method": "textrank",
        "modified": True,
        "window_size": 4,
        "nlp": None
    },
    {
        "method": "textrank",
        "modified": True,
        "window_size": 2,
        "nlp": nlp_textrank
    },
    {
        "method": "textrank",
        "modified": True,
        "window_size": 3,
        "nlp": nlp_textrank
    },
    {
        "method": "textrank",
        "modified": True,
        "window_size": 4,
        "nlp": nlp_textrank
    },
    {
        "method": "spacy",
    },
]

for config in configs:
    precision, recall, accuracy = experiment(**config)
    print(precision, recall, accuracy, config)

100%|██████████| 500/500 [00:05<00:00, 87.08it/s] 


0.111 0.11118530884808013 0.11109257714762301 {'method': 'textrank', 'modified': False, 'window_size': 2, 'nlp': None}


  9%|▊         | 43/500 [00:00<00:04, 100.19it/s]

In [None]:
text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types systems and systems of mixed types."


In [None]:
textrank = Textrank(k=10, modified=True, window_size=2, nlp=nlp_textrank)
textrank.pipeline(text)
# [candidate for candidate, _ in textrank.candidates]
sorted(textrank.candidates, key=lambda x: x[1], reverse=True)

[('linear diophantine equations', 0.16634255545508467),
 ('minimal generating sets', 0.16335541664182152),
 ('mixed types', 0.1611750408423712),
 ('strict inequations', 0.1488961335626997),
 ('nonstrict inequations', 0.1488961335626997),
 ('natural numbers', 0.14101651488230055),
 ('upper bounds', 0.14101651488230055),
 ('linear constraints', 0.14101651488230052),
 ('algorithms', 0.10576238616172541),
 ('a minimal supporting set', 0.09640728505395027)]