In [1]:
import os, json

import pandas as pd
from tabulate import tabulate

import spacy
import pytextrank

from tqdm import tqdm

from textrank import Textrank

In [2]:
# Folder holding the Inspec splits; the trailing slash is needed because the
# file names are appended by plain string concatenation.
data_path = "../data/Inspec/"

# One JSON-lines file per split, in the order train / test / valid.
data_path_train_jsonl, data_path_test_jsonl, data_path_valid_jsonl = [
    f"{data_path}{split_name}.jsonl" for split_name in ("train", "test", "valid")
]

In [3]:
# Build `dataset_inspec`: one (text, gold keyphrases) tuple per record, with the
# train, test and valid splits concatenated in that order.
dirname = os.path.abspath("")

# Bracket placeholder tokens (PTB-style escapes) that are dropped when the
# token list is joined back into plain text.
bracket_tokens = ("-LRB-", "-RRB-", "-LSB-", "-RSB-")

dataset_inspec = []

for data_path_jsonl in [data_path_train_jsonl, data_path_test_jsonl, data_path_valid_jsonl]:

    data_path_abs = os.path.join(dirname, data_path_jsonl)

    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default locale encoding, and skip blank lines (e.g. a trailing newline)
    # that would otherwise make json.loads raise.
    with open(data_path_abs, "r", encoding="utf-8") as f:
        records = [json.loads(json_str) for json_str in f if json_str.strip()]

    # Each record contributes its space-joined 'document' tokens (minus the
    # bracket placeholders) and its 'extractive_keyphrases' unchanged.
    dataset_inspec_subset = [
        (
            " ".join(token for token in record['document'] if token not in bracket_tokens),
            record['extractive_keyphrases'],
        )
        for record in records
    ]

    dataset_inspec += dataset_inspec_subset

len(dataset_inspec)

2000

In [4]:
import datetime

# Shared spaCy pipeline used by the "spacy" branch of get_key_phrases (it calls
# nlp(doc) and reads doc._.phrases).
nlp = spacy.load("en_core_web_sm")
# Append the "textrank" component. NOTE(review): this factory name is presumably
# registered by the `import pytextrank` at the top of the notebook — confirm.
nlp.add_pipe("textrank")

def get_key_phrases(textrank, doc, k=10, **config):
    """Extract keyphrase strings from ``doc`` with the extractor named in ``config``.

    Parameters
    ----------
    textrank
        Extractor object used by the "textrank" method. It must provide
        ``pipeline(doc, k)`` and, afterwards, a ``candidates`` iterable of
        two-element items whose first element is the phrase. Not used by the
        "spacy" method.
    doc
        Text to extract keyphrases from.
    k : int, default 10
        Number of phrases requested. It is forwarded to ``textrank.pipeline``
        for "textrank" and used to slice ``doc._.phrases`` for "spacy".
    **config
        Must contain ``'method'`` ("textrank" or "spacy"). Any other keys are
        accepted and ignored here.

    Returns
    -------
    list of str
        The extracted phrases, in the order the extractor produced them.

    Raises
    ------
    KeyError
        If ``config`` has no ``'method'`` entry.
    ValueError
        If ``config['method']`` is not a supported method name.
    """
    method = config['method']

    if method == "textrank":
        textrank.pipeline(doc, k)
        return [candidate for candidate, _ in textrank.candidates]

    if method == "spacy":
        # Uses the module-level `nlp` pipeline (with the "textrank" pipe added).
        spacy_doc = nlp(doc)
        return [phrase.text for phrase in spacy_doc._.phrases[:k]]

    # Fail loudly here instead of returning None, which would only surface
    # later as an opaque "NoneType is not iterable" in the caller.
    raise ValueError(f"Unknown keyphrase extraction method: {method!r}")

In [5]:
# Count the gold keyphrases attached to every document, then show their mean.
keyword_sizes = [len(gold_phrases) for _text, gold_phrases in dataset_inspec]
sum(keyword_sizes) / len(keyword_sizes)  # average keyword k ~= 6

6.326

In [6]:
def experiment(**config):
    """Score one extraction configuration against every document in ``dataset_inspec``.

    Parameters
    ----------
    **config
        ``'modified'``, ``'window_size'`` and ``'nlp'`` are passed to the
        ``Textrank`` constructor; ``'method'`` is read by ``get_key_phrases``.
        An optional ``'k'`` sets how many phrases are requested per document
        (default 6, roughly the average number of gold keyphrases per document).

    Returns
    -------
    tuple
        ``(precision, recall, f1)`` pooled over all documents. A phrase counts
        as a hit only if the exact string occurs in the gold list. Each value
        is ``-1`` when its denominator is zero. The third value is the harmonic
        mean of precision and recall (F1); the notebook stores it under the
        name "accuracy".
    """
    TP = FP = FN = 0

    # Pull 'k' out of the config so it is not handed to get_key_phrases twice
    # (once explicitly and once through **config).
    extractor_config = dict(config)
    k = extractor_config.pop('k', 6)

    textrank = Textrank(modified=config['modified'], window_size=config['window_size'], nlp=config['nlp'])

    for doc, key in tqdm(dataset_inspec):

        result = get_key_phrases(textrank, doc, k=k, **extractor_config)

        # Predicted phrases: in the gold list -> true positive, else false positive.
        for positive in result:
            if positive in key:
                TP += 1
            else:
                FP += 1

        # Gold phrases that were never predicted are false negatives.
        for real in key:
            if real not in result:
                FN += 1

    precision = TP / (TP + FP) if (TP + FP) > 0 else -1
    recall = TP / (TP + FN) if (TP + FN) > 0 else -1
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else -1

    return precision, recall, f1

In [7]:
# Method labels; not referenced by any other cell visible in this notebook.
method = ["textrank", "textrank_mod", "spacy"]

# spaCy model handed to Textrank for the runs that use spaCy preprocessing.
nlp_textrank = spacy.load("en_core_web_sm")

# Experiment grid, in run order: for each (modified, nlp) variant below, window
# sizes 2, 3 and 4 are evaluated with the "textrank" method.
configs = [
    {
        "method": "textrank",
        "modified": use_modified,
        "window_size": size,
        "nlp": preprocessing_nlp,
    }
    for use_modified, preprocessing_nlp in [
        (False, None),
        (True, None),
        (True, nlp_textrank),
    ]
    for size in (2, 3, 4)
]

# Final run: the "spacy" method. Its branch in get_key_phrases does not use the
# Textrank object, so the remaining values only feed the constructor and the
# results table.
configs.append(
    {
        "method": "spacy",
        "modified": False,
        "window_size": 2,
        "nlp": None,
    }
)

# Run every configuration and collect one flat record per run. The "accuracy"
# field holds the third value returned by experiment().
results = []
for config in configs:
    precision, recall, accuracy = experiment(**config)
    results.append(
        dict(
            precision=precision,
            recall=recall,
            accuracy=accuracy,
            method=config['method'],
            modified=config['modified'],
            window_size=config['window_size'],
            # True exactly when a spaCy model was supplied for this run.
            spacy_preprocessing=bool(config['nlp']),
        )
    )

100%|██████████| 2000/2000 [00:19<00:00, 102.68it/s]
100%|██████████| 2000/2000 [00:17<00:00, 114.98it/s]
100%|██████████| 2000/2000 [00:17<00:00, 115.31it/s]
100%|██████████| 2000/2000 [00:17<00:00, 111.68it/s]
100%|██████████| 2000/2000 [00:16<00:00, 117.95it/s]
100%|██████████| 2000/2000 [00:16<00:00, 120.38it/s]
100%|██████████| 2000/2000 [00:48<00:00, 41.34it/s]
100%|██████████| 2000/2000 [00:44<00:00, 45.28it/s]
100%|██████████| 2000/2000 [00:56<00:00, 35.36it/s]
100%|██████████| 2000/2000 [00:50<00:00, 39.29it/s]


In [8]:
# Tabulate the runs best-first by the "accuracy" column, renumbering rows from 0.
df_results = pd.DataFrame(results).sort_values(
    by="accuracy",
    ascending=False,
    ignore_index=True,
)

print(df_results.to_markdown())

|    |   precision |   recall |   accuracy | method   | modified   |   window_size | spacy_preprocessing   |
|---:|------------:|---------:|-----------:|:---------|:-----------|--------------:|:----------------------|
|  0 |    0.279819 | 0.263457 |   0.271391 | textrank | True       |             2 | True                  |
|  1 |    0.276471 | 0.260304 |   0.268144 | textrank | True       |             3 | True                  |
|  2 |    0.276471 | 0.260284 |   0.268133 | textrank | True       |             4 | True                  |
|  3 |    0.250439 | 0.235961 |   0.242985 | textrank | True       |             3 | False                 |
|  4 |    0.250021 | 0.235475 |   0.24253  | textrank | True       |             2 | False                 |
|  5 |    0.247931 | 0.23358  |   0.240542 | textrank | True       |             4 | False                 |
|  6 |    0.215591 | 0.204157 |   0.209719 | spacy    | False      |             2 | False                 |
|  7 |    0.110176 

In [9]:
# Sample abstract used for the single-document demo; adjacent literals are
# concatenated into one string, one sentence per source line.
text = (
    "Compatibility of systems of linear constraints over the set of natural numbers. "
    "Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. "
    "Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. "
    "These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types systems and systems of mixed types."
)


In [10]:
# Single-document demo: modified TextRank with window size 2 and the spaCy
# model, asking for 10 candidates and printing them.
textrank = Textrank(nlp=nlp_textrank, window_size=2, modified=True)
textrank.pipeline(text, 10)
textrank.show_candidates()


0.166343: linear diophantine equations
0.163355: minimal generating sets
0.161175: mixed types
0.148896: strict inequations
0.148896: nonstrict inequations
0.141017: natural numbers
0.141017: upper bounds
0.141017: linear constraints
0.105762: algorithms
0.096407: a minimal supporting set
