In [1]:
import os, json

import spacy
import pytextrank

from tqdm import tqdm

from textrank import Textrank

In [2]:
# Location of the Inspec dataset, relative to the notebook's working directory.
data_path = "../data/Inspec/"

# One JSON-lines file per split.
data_path_train_jsonl = os.path.join(data_path, 'train.jsonl')
data_path_test_jsonl = os.path.join(data_path, 'test.jsonl')
data_path_valid_jsonl = os.path.join(data_path, 'valid.jsonl')

In [3]:
# Resolve the validation split against the current working directory.
dirname = os.path.abspath("")
data_path_abs = os.path.join(dirname, data_path_valid_jsonl)

# Bracket placeholder tokens that are dropped before the document tokens
# are joined back into plain text.
BRACKET_TOKENS = ("-LRB-", "-RRB-", "-LSB-", "-RSB-")

# Final shape: a list of (document_text, extractive_keyphrases) tuples,
# one per non-empty line of the JSON-lines file.
dataset_inspec = []

# JSON text is UTF-8; being explicit keeps the load independent of the
# platform's default locale encoding.
with open(data_path_abs, "r", encoding="utf-8") as f:
    for json_str in f:
        # Tolerate blank lines (e.g. a trailing newline at end of file),
        # which would otherwise make json.loads raise.
        if not json_str.strip():
            continue
        dict_doc = json.loads(json_str)
        doc_text = " ".join(
            token for token in dict_doc['document'] if token not in BRACKET_TOKENS
        )
        dataset_inspec.append((doc_text, dict_doc['extractive_keyphrases']))

In [4]:
import datetime

# Module-level spaCy pipeline used by the "spacy" branch of get_key_phrases.
nlp = spacy.load("en_core_web_sm")
# Append the "textrank" component so processed docs expose `doc._.phrases`
# (read in get_key_phrases). NOTE(review): the "textrank" factory is presumably
# registered as a side effect of `import pytextrank` at the top - confirm.
nlp.add_pipe("textrank")

def get_key_phrases(textrank, doc, k=10, **config):
    """Extract keyphrases from ``doc`` using the method named in ``config``.

    Args:
        textrank: Extractor object exposing ``pipeline(doc, k)`` and a
            ``candidates`` attribute iterable as ``(candidate, score)`` pairs.
            Only used when ``config['method'] == "textrank"``.
        doc: Document text to extract keyphrases from.
        k: Number of keyphrases requested. Passed to ``textrank.pipeline`` for
            the "textrank" method; used to truncate the ranked phrase list for
            the "spacy" method.
        **config: Experiment settings. Only ``config['method']`` is read here;
            it must be ``"textrank"`` or ``"spacy"``.

    Returns:
        A list of keyphrases in the order produced by the chosen extractor.

    Raises:
        KeyError: If ``config`` has no ``'method'`` entry.
        ValueError: If ``config['method']`` is not a supported method.
    """
    method = config['method']

    if method == "textrank":
        textrank.pipeline(doc, k)
        # Candidates are (phrase, score) pairs; only the phrase is returned.
        return [candidate for candidate, _ in textrank.candidates]

    if method == "spacy":
        # Uses the module-level spaCy pipeline `nlp` (with the "textrank" pipe).
        spacy_doc = nlp(doc)
        return [phrase.text for phrase in spacy_doc._.phrases[:k]]

    # Fail loudly instead of returning None, which would only surface later
    # as an opaque TypeError when the caller iterates the result.
    raise ValueError(
        f"Unknown keyphrase extraction method: {method!r} "
        "(expected 'textrank' or 'spacy')"
    )

In [5]:
# Number of gold keyphrases attached to each document.
keyword_sizes = list(map(len, (gold for _text, gold in dataset_inspec)))
# Mean gold-keyphrase count per document (cell result); it comes out close to 6.
sum(keyword_sizes) / len(keyword_sizes)

5.952

In [6]:
def experiment(**config):
    """Score one keyphrase-extraction configuration on ``dataset_inspec``.

    For every ``(doc, key)`` pair in the module-level ``dataset_inspec``, the
    configured extractor is asked for ``k`` keyphrases and the predictions are
    compared against the gold keyphrases ``key`` by exact membership. Counts
    are pooled over all documents before the metrics are computed.

    Args:
        **config: Experiment settings.
            method: Forwarded to ``get_key_phrases`` ("textrank" or "spacy").
            modified, window_size, nlp: Forwarded to the ``Textrank``
                constructor.
            k (optional): Number of keyphrases requested per document.
                Defaults to 6, roughly the average number of gold keyphrases
                per document in this dataset.

    Returns:
        tuple: ``(precision, recall, f1)``. Each value is ``-1`` when its
        denominator is not positive (i.e. the metric is undefined).
    """
    # `config` is a fresh dict built from the keyword arguments, so popping
    # here does not affect the caller. Removing 'k' also avoids passing it
    # twice to get_key_phrases below.
    k = config.pop('k', 6)

    TP = FP = FN = 0

    textrank = Textrank(modified=config['modified'], window_size=config['window_size'], nlp=config['nlp'])

    for doc, key in tqdm(dataset_inspec):
        result = get_key_phrases(textrank, doc, k=k, **config)

        # Predictions found among the gold keyphrases are true positives;
        # every other prediction is a false positive.
        hits = sum(1 for positive in result if positive in key)
        TP += hits
        FP += len(result) - hits

        # Gold keyphrases the extractor did not return are false negatives.
        FN += sum(1 for real in key if real not in result)

    precision = TP / (TP + FP) if (TP + FP) > 0 else -1
    recall = TP / (TP + FN) if (TP + FN) > 0 else -1
    # Harmonic mean of precision and recall (F1), not accuracy.
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else -1

    return precision, recall, f1

In [7]:
method = ["textrank", "textrank_mod", "spacy"]

# Separate spaCy pipeline handed to the Textrank variants that take `nlp`.
nlp_textrank = spacy.load("en_core_web_sm")

# Textrank variants as (modified, nlp) pairs; each is run with every window size.
textrank_variants = [
    (False, None),
    (True, None),
    (True, nlp_textrank),
]
window_sizes = (2, 3, 4)

# Variant-major, window-size-minor ordering; key order matches the printed dicts.
configs = [
    {
        "method": "textrank",
        "modified": is_modified,
        "window_size": size,
        "nlp": pipeline,
    }
    for is_modified, pipeline in textrank_variants
    for size in window_sizes
]

# spaCy / pytextrank baseline, run last.
configs.append(
    {
        "method": "spacy",
        "modified": False,
        "window_size": 2,
        "nlp": None,
    }
)

# Run every configuration and print its scores next to the settings used.
for config in configs:
    precision, recall, accuracy = experiment(**config)
    print(precision, recall, accuracy, config)

100%|██████████| 500/500 [00:05<00:00, 86.19it/s] 


0.111 0.11118530884808013 0.11109257714762301 {'method': 'textrank', 'modified': False, 'window_size': 2, 'nlp': None}


100%|██████████| 500/500 [00:04<00:00, 116.25it/s]


0.11466666666666667 0.11493484797861678 0.11480060070081763 {'method': 'textrank', 'modified': False, 'window_size': 3, 'nlp': None}


100%|██████████| 500/500 [00:04<00:00, 120.56it/s]


0.11366666666666667 0.11389445557782231 0.11378044711378044 {'method': 'textrank', 'modified': False, 'window_size': 4, 'nlp': None}


100%|██████████| 500/500 [00:04<00:00, 115.09it/s]


0.24404961448206502 0.24380442062960483 0.24392695593901828 {'method': 'textrank', 'modified': True, 'window_size': 2, 'nlp': None}


100%|██████████| 500/500 [00:04<00:00, 113.60it/s]


0.2500838082467315 0.24983255190890824 0.24995811693751047 {'method': 'textrank', 'modified': True, 'window_size': 3, 'nlp': None}


100%|██████████| 500/500 [00:04<00:00, 108.95it/s]


0.2484076433121019 0.24824120603015076 0.24832439678284182 {'method': 'textrank', 'modified': True, 'window_size': 4, 'nlp': None}


100%|██████████| 500/500 [00:14<00:00, 33.67it/s]


0.270497311827957 0.2701342281879195 0.27031564808596376 {'method': 'textrank', 'modified': True, 'window_size': 2, 'nlp': <spacy.lang.en.English object at 0x7ff06122d1c0>}


100%|██████████| 500/500 [00:11<00:00, 45.07it/s]


0.26411290322580644 0.263669909426367 0.26389122041295954 {'method': 'textrank', 'modified': True, 'window_size': 3, 'nlp': <spacy.lang.en.English object at 0x7ff06122d1c0>}


100%|██████████| 500/500 [00:10<00:00, 46.06it/s]


0.2654569892473118 0.26492287055667335 0.2651896609600537 {'method': 'textrank', 'modified': True, 'window_size': 4, 'nlp': <spacy.lang.en.English object at 0x7ff06122d1c0>}


100%|██████████| 500/500 [00:12<00:00, 40.62it/s]

0.197460741730705 0.19858870967741934 0.19802311945049422 {'method': 'spacy', 'modified': False, 'window_size': 2, 'nlp': None}





In [8]:
# Sample abstract used for the single-document demo; one literal per sentence,
# joined by implicit string concatenation.
text = (
    "Compatibility of systems of linear constraints over the set of natural numbers. "
    "Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. "
    "Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. "
    "These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types systems and systems of mixed types."
)


In [10]:
# Single-document demo: modified Textrank, window size 2, on the sample abstract.
textrank = Textrank(
    modified=True,
    window_size=2,
    nlp=nlp_textrank,
)
textrank.pipeline(text, k=10)

# Print the extracted candidates. For a manual ranking, `textrank.candidates`
# can instead be sorted by each entry's second element in descending order.
textrank.show_candidates()


0.166343: linear diophantine equations
0.163355: minimal generating sets
0.161175: mixed types
0.148896: strict inequations
0.148896: nonstrict inequations
0.141017: natural numbers
0.141017: upper bounds
0.141017: linear constraints
0.105762: algorithms
0.096407: a minimal supporting set
