In [1]:
from rouge import Rouge
from rouge.rouge_score import *
import nltk
import ssl

def read_list_asline(path):
    """Read a UTF-8 text file and return its lines as a list of stripped strings.

    Args:
        path: Location of the file to open for reading.

    Returns:
        One entry per line of the file, in file order, each with leading and
        trailing whitespace (including the line terminator) removed. Blank
        lines are kept as empty strings.
    """
    with open(path, mode='r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]


def download_nltk():
    """Download the NLTK "punkt" package after relaxing the default HTTPS context.

    If this Python build exposes ``ssl._create_unverified_context``, it is
    installed as ``ssl._create_default_https_context`` before the download is
    requested. The replacement is not undone afterwards.

    NOTE(review): this presumably works around certificate errors when
    fetching NLTK data; it also appears to leave certificate verification
    disabled for later default HTTPS contexts in this process -- confirm
    that this is acceptable.
    """
    if hasattr(ssl, "_create_unverified_context"):
        # Swap the default HTTPS context factory for the non-verifying one.
        ssl._create_default_https_context = ssl._create_unverified_context

    nltk.download("punkt")


def _text_to_ngrams(text, n=1):
    """Tokenize ``text`` with NLTK and wrap its n-grams in an ``Ngrams`` object.

    Args:
        text: String to tokenize with ``nltk.word_tokenize``.
        n: Size of each n-gram; defaults to 1.

    Returns:
        An ``Ngrams`` instance built from the list produced by
        ``nltk.ngrams`` over the tokens.
    """
    tokens = nltk.word_tokenize(text)
    grams = list(nltk.ngrams(tokens, n))
    return Ngrams(grams)

def _get_rouge_from_ngram(reference_ngrams: Ngrams, evaluated_ngrams: Ngrams)-> dict:
    """Score two n-gram collections against each other via ``f_r_p_rouge_n``.

    Args:
        reference_ngrams: N-grams treated as the reference side.
        evaluated_ngrams: N-grams treated as the evaluated side.

    Returns:
        The result of ``f_r_p_rouge_n`` for the evaluated size, the reference
        size, and the size of their intersection. Callers in this file read
        its ``'f'`` entry.
    """
    n_reference, n_evaluated = len(reference_ngrams), len(evaluated_ngrams)

    # N-grams present on both sides, computed from the evaluated collection.
    shared = evaluated_ngrams.intersection(reference_ngrams)
    return f_r_p_rouge_n(n_evaluated, n_reference, len(shared))

# Run the download once when this cell executes; the "punkt" data is presumably
# what nltk.word_tokenize / nltk.sent_tokenize rely on -- confirm for your NLTK version.
download_nltk()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/silas.rudolf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Cap on the number of greedy selection rounds in fast_rouge: its loop runs
# min(MAX_LENGTH, number of candidate sentences) times.
MAX_LENGTH = 10

def fast_rouge(sou, tar, name=None, verbose=False, max_length=None):
    """Greedily pick sentences of ``tar`` that best match ``sou`` by ROUGE F-score.

    Both texts are converted to unigram ``Ngrams``. Each round scores every
    remaining sentence of ``tar`` (combined with what was already picked)
    against ``sou`` and takes the best one; the loop stops as soon as the
    running F-score no longer strictly improves.

    Original author's note: intended as a faster alternative to
    ``seg_based_on_rouge`` (not visible here) because it uses n-grams to
    compute ROUGE rather than raw text.

    Args:
        sou: Text that the selection is scored against.
        tar: Text that is split into candidate sentences with
            ``nltk.sent_tokenize``.
        name: Identifier printed in the verbose summary only.
        verbose: When true, print the candidate count, the number of kept
            sentences, and the best F-score.
        max_length: Maximum number of selection rounds. ``None`` (the
            default) uses the module-level ``MAX_LENGTH``.

    Returns:
        A tuple ``(best_sents, best_string)``. ``best_sents`` lists the kept
        ``(sentence, scores, index)`` triples in selection order, where
        ``scores`` is the dict returned by ``_get_rouge_from_ngram`` when the
        sentence was chosen and ``index`` is the sentence's position in
        ``tar``. ``best_string`` joins the kept sentences with single spaces
        in their original ``tar`` order.
    """
    # Read the global at call time so existing callers keep the old cap.
    round_limit = MAX_LENGTH if max_length is None else max_length

    selected_text = ''
    selected_ngrams = Ngrams()
    best_score = 0
    best_sents = []

    # Represent the source and every candidate sentence by its n-grams.
    source_ngrams = _text_to_ngrams(sou)
    candidates = [
        (sentence, _text_to_ngrams(sentence), position)
        for position, sentence in enumerate(nltk.sent_tokenize(tar))
    ]
    tot_len = len(candidates)

    for _ in range(min(round_limit, tot_len)):
        # NOTE(review): the candidate side is passed as `reference_ngrams` and
        # `sou` as `evaluated_ngrams`, so 'p'/'r' in the stored dicts follow
        # that orientation; only 'f' drives the selection.
        scored = [
            (
                sentence,
                _get_rouge_from_ngram(selected_ngrams.union(ngrams), source_ngrams),
                position,
            )
            for sentence, ngrams, position in candidates
        ]
        best_seg = max(scored, key=lambda item: item[1]['f'])

        # Each position occurs once, so this drops exactly the chosen sentence.
        candidates = [cand for cand in candidates if cand[2] != best_seg[2]]

        # NOTE(review): the running score is taken from re-tokenizing the
        # joined text, which may differ from the n-gram union used to rank
        # candidates above -- confirm this is intended.
        selected_text += ' ' + best_seg[0]
        selected_ngrams = _text_to_ngrams(selected_text)
        cur_score = _get_rouge_from_ngram(selected_ngrams, source_ngrams)['f']

        if not cur_score > best_score:
            break
        best_score = cur_score
        best_sents.append(best_seg)

    if verbose:
        print("id:", name, "input/output:", tot_len, len(best_sents), "best:", best_score)

    # Positions are unique, so sorting by position restores the order in `tar`.
    in_order = sorted(best_sents, key=lambda item: item[2])
    best_string = ' '.join(item[0] for item in in_order)

    return best_sents, best_string

In [29]:
# Demo inputs: `tar` is the text that fast_rouge splits into candidate sentences,
# `sou` is the text those candidates are scored against.
# NOTE(review): the '##' pieces in `tar` look like subword-tokenizer residue -- confirm the upstream source.
tar = 'on the implementation. the implementation. working on. gauge icon. input from marco. from marco. flavia .. gauge icon. and fl ##avia. fl ##avia. in development channel. development channel'
sou = 'Ivan has started working on the implementation of gauge icon yesterday. He needs some input from Marco and Flavia. Philipp will share the link in development Channel.'

# Last expression of the cell, so the returned (best_sents, best_string) tuple is displayed.
fast_rouge(sou, tar)

([('on the implementation.',
   {'f': 0.2580645138813736, 'p': 0.14814814814814814, 'r': 1.0},
   0),
  ('gauge icon.',
   {'f': 0.363636360661157, 'p': 0.2222222222222222, 'r': 1.0},
   3),
  ('input from marco.',
   {'f': 0.44444444069444444,
    'p': 0.2962962962962963,
    'r': 0.8888888888888888},
   4),
  ('in development channel.',
   {'f': 0.5128205085601578,
    'p': 0.37037037037037035,
    'r': 0.8333333333333334},
   9),
  ('working on.',
   {'f': 0.5499999956125, 'p': 0.4074074074074074, 'r': 0.8461538461538461},
   2)],
 'on the implementation. working on. gauge icon. input from marco. in development channel.')