In [1]:
import json
from generate_graphs import *
import tqdm
import pandas as pd
from generate_graphs_transformers import json_load_one2seq
from utilities.utils import *
from generate_graphs_transformers import *
from utilities.utils import load_t5_preds, load_bart_preds

def process_predictions(predictions, beam=False):
    stemmer = PorterStemmer()

    processed_predictions = []
    for beam_prediction in predictions:
        if beam:
            prediction_ = ""
            for prediction in beam_prediction:
                prediction = prediction.replace(";", "<sep>")
                prediction = prediction.split("<eos>")[0]
                if not prediction_:
                    prediction_ += prediction
                else:
                    prediction_ += ' <sep> ' + prediction
            prediction = prediction_
        else:
            beam_prediction = beam_prediction.replace(";", "<sep>")
            prediction = beam_prediction.split("<eos>")[0]

        prediction = prediction.split(",")

        stemed_prediction = []
        for kp in prediction:
            kp = kp.lower().strip()
            if kp != "" and kp != "<peos>" and kp!="," and kp != "." and kp != "<unk>":  # and "." not in kp and "," not in kp
                tokenized_kp = kp.split(" ")  # nltk.word_tokenize(kp)
                tokenized_stemed_kp = [stemmer.stem(kw).strip() for kw in tokenized_kp]
                stemed_kp = " ".join(tokenized_stemed_kp).replace("< digit >", "<digit>")
                if stemed_kp.strip() != "":
                    stemed_prediction.append(stemed_kp.strip())

        # make prediction duplicates free but preserve order for @topk

        prediction_dict = {}
        stemed_prediction_ = []
        for kp in stemed_prediction:
            if kp not in prediction_dict:
                prediction_dict[kp] = 1
                stemed_prediction_.append(kp)
        stemed_prediction = stemed_prediction_

        processed_predictions.extend(stemed_prediction)

    return processed_predictions

def process_srcs(srcs):
    stemmer = PorterStemmer()
    processed_srcs = []
    tokenized_src = srcs.split()  # Split the string into words
    tokenized_stemed_src = [stemmer.stem(token.strip().lower()).strip() for token in tokenized_src]
    stemed_src = " ".join(tokenized_stemed_src).strip()
    processed_srcs.append(stemed_src)
    return processed_srcs

In [2]:
def process_exhird_h(model,dataset):
    scores, predictions, _ = json_load(model, dataset.lower())
    return predictions 

def process_llama(dataset):
    kp_predictions, probabilities, token_predictions, src, targets, all_kpp_values = load_llama(dataset)
    token_predictions, scores = add_sep_and_kpp(token_predictions, probabilities)
    predictions = [process_predictions(tp) for tp in kp_predictions]
    context_lines = [process_srcs(ts) for ts in src]
    gt_targets = [process_predictions(tt) for tt in targets]
    return predictions, gt_targets, context_lines, kp_predictions

def process_phi(dataset):
    kp_predictions, probabilities, token_predictions, src, targets, all_kpp_values = load_phi(dataset)
    token_predictions, scores = add_sep_and_kpp(token_predictions, probabilities)
    predictions = [process_predictions(tp) for tp in kp_predictions]
    context_lines = [process_srcs(ts) for ts in src]
    gt_targets = [process_predictions(tt) for tt in targets]
    return predictions, gt_targets, context_lines, kp_predictions

def process_one2set(model,dataset):
    scores, predictions, context_lines = json_load_one2set(model, dataset.lower())
    return scores, predictions, context_lines

def process_one2seq(model,dataset):
    scores, predictions,entropies,context_lines = json_load_one2seq(model, dataset.lower())
    return scores, predictions, context_lines


In [3]:
def get_exhird_context(dataset):
    path = 'data/test_datasets/processed_' + dataset + '_testing_context.txt'
    target_file = open(path, encoding='utf-8')
    target_lines = target_file.readlines()
    targets = []
    for i, preds in enumerate(target_lines):
        preds = preds.split(';')
        for j, pred in enumerate(preds):
            preds[j] = pred.strip()
        targets.append(preds)
    return targets

In [4]:
def join_phrases_with_index(preds):
    joined_phrases_with_index = []
    for index, phrase_list in enumerate(preds):
        temp_phrase = []
        result = []
        for word in phrase_list:
            if word == '<sep>':
                if temp_phrase:
                    result.append(' '.join(temp_phrase))
                    temp_phrase = []
            else:
                temp_phrase.append(word)
        if temp_phrase:
            result.append(' '.join(temp_phrase))
        
        joined_phrases_with_index.append(result)
    return joined_phrases_with_index

In [5]:
model = 'one2seq_'  # Example model
dataset = 'kp20k'  # Example dataset
#context_lines=get_exhird_context(dataset)
#preds = process_exhird_h(model, dataset)
#src, pred, probabilities, predicted_tokens, targets = load_bart_preds(dataset)
scores, pred, src = process_one2seq(model, dataset.lower())


In [6]:

preds=[process_predictions(tp) for tp in join_phrases_with_index(pred)]


context_lines = [process_srcs(ts) for ts in src]

predictions = [process_predictions(tp) for tp in join_phrases_with_index(pred)]
predictions=[list(set(target)) for target in predictions]

In [7]:
preds[0:2]

[['feedback vertex set', 'degener graph', 'algorithm', 'graph algorithm'],
 ['data prefetch',
  'perform',
  'superscalar',
  'microprocessor',
  'memori latenc',
  'design',
  'experiment',
  'measur',
  'measur',
  'cach hit model',
  'cach perform']]

In [8]:
#only for exhird
#present_keywords_by_index = []
#absent_keywords_by_index = []
#
## Iterate through preds
#for pred in preds:
#    present_keywords = []  # Present keyphrases for the current index
#    absent_keywords = []   # Absent keyphrases for the current index
#    kp_type = None  # Track whether we are in 'present' or 'absent' state
#    current_phrase = []  # To accumulate the words of the current keyphrase
#    
#    for token in pred:
#        if token == '<p_start>':
#            kp_type = 'present'  # Switch to collecting present keyphrases
#            current_phrase = []  # Reset the current keyphrase accumulator
#        elif token == '<a_start>':
#            kp_type = 'absent'  # Switch to collecting absent keyphrases
#            current_phrase = []  # Reset the current keyphrase accumulator
#        elif token == ';':  # End of the current keyphrase
#            if kp_type == 'present' and current_phrase:
#                present_keywords.append(' '.join(current_phrase))  # Join words to form the keyphrase
#            elif kp_type == 'absent' and current_phrase:
#                absent_keywords.append(' '.join(current_phrase))
#            current_phrase = []  # Reset the keyphrase accumulator for the next phrase
#        else:
#            current_phrase.append(token)  # Accumulate the words of the current keyphrase
#    
#    # Append the present and absent keyphrases for the current index
#    present_keywords_by_index.append(present_keywords)
#    absent_keywords_by_index.append(absent_keywords)

#preds = []
#for present, absent in zip(present_keywords_by_index, absent_keywords_by_index):
#    combined = present + absent  
#    preds.append(combined)
#predictions=[list(set(target)) for target in preds]


In [9]:
def get_overall_average_keyphrase_length(predictions):
    total_keyphrases = sum(len(target) for target in (predictions))
    print(total_keyphrases)
    return total_keyphrases / len(predictions)

overall_average_length = get_overall_average_keyphrase_length(predictions)

lists_higher_than_average = 0

for keyphrase in predictions:
    if len(keyphrase) > overall_average_length:
        lists_higher_than_average += 1
        
print(f"Overall average length: {overall_average_length:.2f}")
print(f"Number of keyphrases longer than average: {lists_higher_than_average}")

115336
Overall average length: 5.77
Number of keyphrases longer than average: 10055


In [10]:
def count_keyphrases(predictions, context):
    present_count = 0
    absent_count = 0
    # Convert context to a single string
    context_string = " ".join(context).lower()
    for phrase in predictions:
        if phrase.lower() in context_string:
            present_count += 1
        else:
            absent_count += 1
    return {
        "present_count": present_count,
        "absent_count": absent_count
    }

In [11]:
results = [count_keyphrases(predictions[i], context_lines[i]) for i in range(len(predictions))]
averages = []
for entry in results:
    present_count = entry['present_count']
    absent_count = entry['absent_count']
    total_count = present_count + absent_count
    if present_count + absent_count==0:
        total_count=1
    average_present = present_count / total_count
    average_absent = absent_count / total_count
    averages.append((average_present, average_absent))

# Calculate the overall average
total_present_count = sum(entry['present_count'] for entry in results)
total_absent_count = sum(entry['absent_count'] for entry in results)
overall_average = total_absent_count / (total_present_count + total_absent_count)

print(total_present_count)
print(total_present_count/len(context_lines))
print(total_absent_count)
print(total_absent_count/len(context_lines))      

92290
4.6145
23046
1.1523


In [12]:
sum(len(target) for target in (preds))-sum(len(target) for target in (predictions))

45632

In [13]:
results = [count_keyphrases(preds[i], context_lines[i]) for i in range(len(preds))]

averages = []
for entry in results:
    present_count = entry['present_count']
    absent_count = entry['absent_count']
    total_count = present_count + absent_count
    if present_count + absent_count==0:
        total_count=1
    average_present = present_count / total_count
    average_absent = absent_count / total_count
    averages.append((average_present, average_absent))

# Calculate the overall average
total_present_count_dup = sum(entry['present_count'] for entry in results)
total_absent_count_dup = sum(entry['absent_count'] for entry in results)
overall_average_dup = total_absent_count / (total_present_count + total_absent_count)

print(total_present_count_dup-total_present_count)
print(total_absent_count_dup-total_absent_count)
    


34888
10744
