# Packages

In [1]:
# Install the pinned third-party dependencies (allennlp, scikit-learn, overrides) via a shell call to pip.
!pip install allennlp==0.8.4 scikit-learn==0.22.2 overrides==3.1.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Download the spaCy model that later cells load with spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en_core_web_sm==2.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz (11.1 MB)
[K     |████████████████████████████████| 11.1 MB 527 kB/s 
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [3]:
import nltk

# Fetch the NLTK resources up front. Nothing in this notebook reads them
# directly; presumably the sifrank package needs them -- TODO confirm which.
# The value of the last call is what the cell displays.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Imports

In [4]:
# Mount Google Drive under /content/drive so the project folder and data
# referenced by later cells are reachable; a remount is requested on every run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [5]:
# Project folder on the mounted Drive; later cells build their data paths from it.
dir_path = "/content/drive/MyDrive/shared/Workspace/NUS/Lyrics/SIFRank/"

# Switch the working directory to the project folder (presumably so that
# `from sifrank import SIFRank` resolves to the local module -- confirm).
%cd $dir_path

/content/drive/MyDrive/shared/Workspace/NUS/Lyrics/SIFRank


In [6]:
import os, json

from tqdm import tqdm
import spacy

import pandas as pd

from sifrank import SIFRank



In [7]:

# Location of the Inspec splits, expressed relative to the project folder
# (the plain relative form "../data/Inspec/" was the earlier alternative).
data_path = dir_path + "../data/Inspec/"

# One JSON Lines file per split, all living directly under data_path.
data_path_train_jsonl, data_path_test_jsonl, data_path_valid_jsonl = (
    data_path + split_file
    for split_file in ('train.jsonl', 'test.jsonl', 'valid.jsonl')
)

In [9]:
dirname = os.path.abspath("")

# Tokens removed from every document before it is re-joined into plain text
# (they look like Penn Treebank bracket escapes -- TODO confirm).
SKIPPED_TOKENS = ("-LRB-", "-RRB-", "-LSB-", "-RSB-")

# All three splits are pooled into one list of
# (document_text, extractive_keyphrases) pairs.
dataset_inspec = []

for data_path_jsonl in [data_path_train_jsonl, data_path_test_jsonl, data_path_valid_jsonl]:

    # os.path.join leaves data_path_jsonl untouched when it is already absolute,
    # and anchors it at the working directory when it is relative.
    data_path_abs = os.path.join(dirname, data_path_jsonl)

    # One JSON object per line. The encoding is stated explicitly so the result
    # does not depend on the runtime's locale, and blank lines (e.g. a trailing
    # newline) are skipped instead of crashing json.loads.
    with open(data_path_abs, "r", encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]

    # Build the pairs directly instead of mutating the parsed dicts first.
    dataset_inspec += [
        (
            " ".join(token for token in record['document'] if token not in SKIPPED_TOKENS),
            record['extractive_keyphrases'],
        )
        for record in records
    ]

# Number of pooled documents (shown as the cell output).
len(dataset_inspec)

2000

# Evaluation on the Inspec dataset

In [10]:
# Number of gold keyphrases attached to each document ...
keyword_sizes = list(map(len, (key for _, key in dataset_inspec)))
# ... and their mean over the whole dataset (shown as the cell output).
sum(keyword_sizes) / len(keyword_sizes)

6.326

In [11]:
def get_key_phrases(sifrank, doc, k=10, **config):
    """Run ``sifrank`` on ``doc`` and return the phrases it stored.

    Args:
        sifrank: object exposing ``pipeline(doc, k=...)`` and, afterwards, a
            ``candidates`` attribute holding 2-item pairs.
        doc: the document handed to ``sifrank.pipeline``.
        k: forwarded to ``sifrank.pipeline`` as ``k``.
        **config: accepted so callers can splat a whole config; not used here.

    Returns:
        list: the first item of every pair in ``sifrank.candidates``, in order
        (the second item is presumably a score -- confirm against sifrank).
    """
    # The return value of pipeline() is ignored; only its side effect on
    # sifrank.candidates is read back.
    sifrank.pipeline(doc, k=k)

    phrases = []
    for phrase, _ in sifrank.candidates:
        phrases.append(phrase)
    return phrases

def experiment(**config):
    """Evaluate one SIFRank configuration on the notebook-level ``dataset_inspec``.

    A ``SIFRank`` instance is built from ``config``. For every ``(doc, key)``
    pair in ``dataset_inspec`` the phrases returned by ``get_key_phrases`` with
    ``k = 6`` are compared with the gold keyphrases ``key`` by list membership,
    i.e. exact equality.

    Args:
        **config: must contain ``doc_seg``, ``emb_align``, ``sifrank_plus`` and
            ``nlp``, which are passed to the ``SIFRank`` constructor. The whole
            mapping is also forwarded to ``get_key_phrases``.

    Returns:
        tuple: ``(precision, recall, f1)`` computed from true-positive,
        false-positive and false-negative counts pooled over all documents.
        The third value is the F1 score (harmonic mean of precision and
        recall), not an accuracy, even where a caller unpacks it into a
        variable named ``accuracy``. A metric whose denominator is not
        positive is reported as ``-1``.

    Raises:
        KeyError: if one of the four required config keys is missing.
    """
    # True negatives are not defined for this task, so only these three
    # counters are tracked.
    TP = FP = FN = 0

    sifrank = SIFRank(doc_seg=config['doc_seg'],
                      emb_align=config['emb_align'],
                      sifrank_plus=config['sifrank_plus'],
                      nlp=config['nlp'])

    for doc, key in tqdm(dataset_inspec):

        # Fixed cut-off for every document; the per-document alternative
        # len(key) is deliberately not used.
        k = 6
        result = get_key_phrases(sifrank, doc, k=k, **config)

        # Each predicted phrase is either a hit or a false positive.
        for positive in result:
            if positive in key:
                TP += 1
            else:
                FP += 1

        # Each gold phrase that was not predicted is a false negative.
        for real in key:
            if real not in result:
                FN += 1

    precision = TP / (TP + FP) if (TP + FP) > 0 else -1
    recall = TP / (TP + FN) if (TP + FN) > 0 else -1
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else -1

    return precision, recall, f1

In [12]:
nlp = spacy.load("en_core_web_sm")

# 2 x 2 grid: without / with the spaCy pipeline, and for each of those
# SIFRank / SIFRank+. Document segmentation and embedding alignment stay on.
configs = [
    {
        'doc_seg': True,
        "emb_align": True,
        "sifrank_plus": use_sifrank_plus,
        "nlp": spacy_pipeline,
    }
    for spacy_pipeline in (None, nlp)
    for use_sifrank_plus in (False, True)
]

# One result row per configuration: the three scores returned by experiment()
# followed by human-readable names for the switches that produced them.
results = []
for config in configs:
    row = dict(zip(("precision", "recall", "accuracy"), experiment(**config)))
    row.update({
        "document_segmentation": config['doc_seg'],
        "embedding_alignment": config['emb_align'],
        "SIFRank+": config['sifrank_plus'],
        "spacy_preprocessing": bool(config['nlp']),
    })
    results.append(row)

100%|██████████| 2000/2000 [04:29<00:00,  7.41it/s]
100%|██████████| 2000/2000 [04:28<00:00,  7.44it/s]
100%|██████████| 2000/2000 [05:23<00:00,  6.18it/s]
100%|██████████| 2000/2000 [05:17<00:00,  6.29it/s]


In [19]:
# Rank the configurations by the "accuracy" column, best first, and renumber
# the rows so the index reflects the ranking.
df_results = pd.DataFrame(results)
df_results = df_results.sort_values(by="accuracy", ascending=False)
df_results = df_results.reset_index(drop=True)

# Emit the ranking as a Markdown table.
print(df_results.to_markdown())

|    |   precision |   recall |   accuracy | document_segmentation   | embedding_alignment   | SIFRank+   | spacy_preprocessing   |
|---:|------------:|---------:|-----------:|:------------------------|:----------------------|:-----------|:----------------------|
|  0 |    0.255726 | 0.24178  |   0.248558 | True                    | True                  | False      | False                 |
|  1 |    0.249373 | 0.235773 |   0.242382 | True                    | True                  | True       | False                 |
|  2 |    0.19142  | 0.18092  |   0.186022 | True                    | True                  | True       | True                  |
|  3 |    0.168088 | 0.158868 |   0.163348 | True                    | True                  | False      | True                  |


# Extract keywords from the unlabelled lyrics dataset

In [12]:
lyrics_path = dir_path + "../data/lyrics_dataset.csv"

# The spaCy model is (re)loaded here, but the extractor below is created with
# nlp=None, so it is not handed to SIFRank in this cell.
nlp = spacy.load("en_core_web_sm")
sifrank = SIFRank(sifrank_plus=False, nlp=None)

df_lyrics = pd.read_csv(lyrics_path, encoding= 'unicode_escape')

# Store whatever sifrank.pipeline returns for each lyric (k=10) in a new column.
lyric_keywords = [
    sifrank.pipeline(document=lyric, k=10)
    for lyric in tqdm(df_lyrics['lyrics'])
]
df_lyrics['keywords'] = lyric_keywords

# Write next to the input: everything before the first ".csv" plus "_sifrank.csv".
df_lyrics.to_csv(lyrics_path.partition(".csv")[0] + "_sifrank.csv", index=False)

100%|██████████| 19492/19492 [52:56<00:00,  6.14it/s]
