In [23]:
from rouge import Rouge
from rouge.rouge_score import *
import nltk
import ssl

def read_list_asline(path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Args:
        path: Location of the file to open for reading.

    Returns:
        One entry per line of the file, in file order, each with leading and
        trailing whitespace (including the line terminator) stripped.
    """
    with open(path, mode='r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]


def download_nltk():
    """Make sure the NLTK "punkt" tokenizer data is available locally.

    If the resource is already installed nothing is downloaded. Otherwise it
    is fetched with ``nltk.download``. While the download runs, the default
    HTTPS context is temporarily swapped for an unverified one (when the
    running Python exposes ``ssl._create_unverified_context``); the previous
    context factory is restored afterwards so certificate verification is
    not left disabled for the rest of the process.
    """
    try:
        nltk.data.find("tokenizers/punkt")
        return
    except LookupError:
        pass  # not installed yet; fall through to the download

    unverified_context = getattr(ssl, "_create_unverified_context", None)
    if unverified_context is None:
        # This Python has no unverified-context helper; download normally.
        nltk.download("punkt")
        return

    # SECURITY: certificate verification is bypassed only for this download.
    previous_context = ssl._create_default_https_context
    ssl._create_default_https_context = unverified_context
    try:
        nltk.download("punkt")
    finally:
        ssl._create_default_https_context = previous_context


def _text_to_ngrams(text, n=1):
    """Tokenize ``text`` into words and wrap its n-grams in an ``Ngrams`` object.

    Args:
        text: String passed to ``nltk.word_tokenize``.
        n: Size of each n-gram handed to ``nltk.ngrams``; defaults to 1.

    Returns:
        An ``Ngrams`` instance constructed from the list of n-grams.
    """
    tokens = nltk.word_tokenize(text)
    ngram_list = list(nltk.ngrams(tokens, n))
    return Ngrams(ngram_list)

def _get_rouge_from_ngram(reference_ngrams: Ngrams, evaluated_ngrams: Ngrams)-> dict:
    """Score two n-gram collections against each other with ``f_r_p_rouge_n``.

    Args:
        reference_ngrams: N-grams of the reference text.
        evaluated_ngrams: N-grams of the text being evaluated.

    Returns:
        The result of ``f_r_p_rouge_n`` for the evaluated size, the reference
        size and the size of their intersection. Callers in this file read
        its ``'f'`` entry.
    """
    # N-grams present in both the evaluated and the reference collection.
    shared_ngrams = evaluated_ngrams.intersection(reference_ngrams)
    return f_r_p_rouge_n(
        len(evaluated_ngrams),
        len(reference_ngrams),
        len(shared_ngrams),
    )

# Fetch the NLTK "punkt" resource when this cell runs (see download_nltk above).
download_nltk()

In [25]:
# Directory containing the input files; edit this single constant to point
# the notebook at a different location.
DATA_DIR = '/Users/silas.rudolf/projects/School/MA/experiments/data/stage_1'

# One stripped line per entry (see read_list_asline).
data = read_list_asline(f'{DATA_DIR}/test.source')
labels = read_list_asline(f'{DATA_DIR}/test_duplicated.target')

In [30]:
# Upper bound on the number of greedy selection rounds performed by fast_rouge.
MAX_LENGTH = 10

# This function is faster than seg_based_on_rouge because it uses the ngrams to compute rouge rather than text.
def fast_rouge(sou, tar, name=None, verbose=False, max_length=None):
    """Greedily select sentences of ``tar`` that maximise ROUGE F-score against ``sou``.

    ``tar`` is split into sentences. In each round the sentence whose n-grams,
    added to the n-grams selected so far, give the highest F-score against
    ``sou`` is chosen. The choice is kept only if the F-score of the
    accumulated text strictly improves; otherwise selection stops.

    Args:
        sou: Text the selection is scored against.
        tar: Text whose sentences are candidates for selection.
        name: Identifier printed in the verbose summary line only.
        verbose: When true, print a one-line summary of the selection.
        max_length: Maximum number of selection rounds. ``None`` (the
            default) uses the module-level ``MAX_LENGTH``.

    Returns:
        A ``(best_sents, best_string)`` tuple. ``best_sents`` lists the kept
        ``(sentence, score, index)`` triples in the order they were selected,
        where ``score`` is the value returned by ``_get_rouge_from_ngram`` at
        selection time and ``index`` is the sentence position within ``tar``.
        ``best_string`` is the kept sentences joined by single spaces in
        their original ``tar`` order.
    """
    if max_length is None:
        max_length = MAX_LENGTH

    selected_text = ''
    selected_ngrams = Ngrams()
    best_score = 0
    best_sents = []

    # Represent the source and every candidate sentence by its n-grams.
    source_ngrams = _text_to_ngrams(sou)
    candidates = [
        (sentence, _text_to_ngrams(sentence), idx)
        for idx, sentence in enumerate(nltk.sent_tokenize(tar))
    ]

    tot_len = len(candidates)
    for _ in range(min(max_length, tot_len)):
        scored = [
            (sentence,
             _get_rouge_from_ngram(selected_ngrams.union(sent_ngrams), source_ngrams),
             idx)
            for sentence, sent_ngrams, idx in candidates
        ]
        # max() keeps the first candidate on ties, i.e. the earliest remaining one.
        best_seg = max(scored, key=lambda item: item[1]['f'])
        # A sentence can be picked at most once.
        candidates = [cand for cand in candidates if cand[2] != best_seg[2]]

        selected_text += ' ' + best_seg[0]
        # The accumulated text is re-tokenized here (rather than unioning the
        # chosen sentence's n-grams) so the acceptance score is computed
        # exactly as before.
        selected_ngrams = _text_to_ngrams(selected_text)
        cur_score = _get_rouge_from_ngram(selected_ngrams, source_ngrams)['f']
        if cur_score > best_score:
            best_score = cur_score
            best_sents.append(best_seg)
        else:
            # No strict improvement: discard this pick and stop.
            break

    if verbose:
        print("id:", name, "input/output:", tot_len, len(best_sents), "best:", best_score)

    # Indices in best_sents are unique, so sorting by index restores document order.
    in_document_order = sorted(best_sents, key=lambda item: item[2])
    best_string = ' '.join([item[0] for item in in_document_order])

    return best_sents, best_string

<rouge.rouge_score.Ngrams at 0x7ff74dbe6f10>

In [29]:
# Preview the (sentence, Ngrams, index) triples built from the first label.
[(x, _text_to_ngrams(x), i) for i, x in enumerate(nltk.sent_tokenize(labels[0]))]

[('This meeting was the eleventh evidence session on the Children Abolition of Defense of Reasonable Publishment Wales Bill.',
  <rouge.rouge_score.Ngrams at 0x7ff74fd13a50>,
  0),
 ('Barry Hughes was there to answer questions related to the Bill.',
  <rouge.rouge_score.Ngrams at 0x7ff74fd13050>,
  1),
 ('Six main topics that had been discussed or explained.',
  <rouge.rouge_score.Ngrams at 0x7ff74dc26590>,
  2),
 ('The first one was how the Bill protected the children in terms of prosecutions.',
  <rouge.rouge_score.Ngrams at 0x7ff74dc268d0>,
  3),
 ("The second part explained the divergence between regions and the Bill's implication on England.",
  <rouge.rouge_score.Ngrams at 0x7ff74dc26e10>,
  4),
 ('The third part explained how the law was implemented, and the fourth talked about prosecutions and potential alternatives.',
  <rouge.rouge_score.Ngrams at 0x7ff74dc26d90>,
  5),
 ('Barry Hughes then further explained our-of-court disposals and responded to a specific infrastructure fo