In [1]:
import json
# Load the one2seq SemEval output file and expose its four top-level entries
# as notebook-level variables.
# The encoding is given explicitly: JSON interchange text is UTF-8, while
# open() would otherwise fall back to the platform's locale encoding.
with open("graph_outputs/one2seq_semeval_all_output.json", 'r', encoding="utf-8") as f:
    dic = json.load(f)

scores = dic['scores']
predictions = dic['predictions']
entropies = dic['entropies']
context_lines = dic['context_lines']

In [50]:
from nltk.stem import PorterStemmer

In [38]:
context_lines[0:2]

['scalable grid service discovery based on uddi . efficient discovery of grid services is essential for the success of grid computing . the standardization of grids based on web services has resulted in the need for scalable web service discovery mechanisms to be deployed in grids even though uddi has been the de facto industry standard for web services discovery , imposed requirements of tight replication among registries and lack of autonomous control has severely hindered its widespread deployment and usage . with the advent of grid computing the scalability issue of uddi will become a roadblock that will prevent its deployment in grids . in this paper we present our distributed web service discovery architecture , called dude ( distributed uddi deployment engine ) . dude leverages dht ( distributed hash tables ) as a rendezvous mechanism between multiple uddi registries . dude enables consumers to query multiple registries , still at the same time allowing organizations to have aut

In [39]:
predictions[0:2]

[['grid',
  'service',
  'discovery',
  '<sep>',
  'uddi',
  '<sep>',
  'grid',
  'computing',
  '<sep>',
  'web',
  'service',
  '<sep>',
  'distributed',
  'hash',
  'tables',
  '<sep>',
  'distributed',
  'hash',
  'tables',
  '<sep>',
  'grid',
  'computing'],
 ['grid',
  'computing',
  '<sep>',
  'performance',
  'modeling',
  '<sep>',
  'performance',
  'evaluation',
  '<sep>',
  'adaptive',
  'systems']]

In [40]:
def load_llama(dataset):
    """Read the llama3 keyphrase output file for ``dataset``.

    The file ``graph_outputs/<dataset>_llama3_kpp_test.json`` is opened
    relative to the current working directory and parsed as JSON.

    Args:
        dataset: Dataset name used to build the file name (e.g. "semeval").

    Returns:
        A 6-tuple holding, in this order, the values stored under the keys
        ``dup_removed_kp_predictions``, ``probabilities``,
        ``dup_removed_token_predictions``, ``src``, ``targets`` and
        ``dup_removed_kpp``.

    Raises:
        FileNotFoundError: If the file does not exist.
        KeyError: If one of the six keys is missing from the file.
    """
    path = 'graph_outputs/' + dataset + '_llama3_kpp_test.json'
    # JSON interchange text is UTF-8; without an explicit encoding open()
    # uses the platform locale, which can mis-decode non-ASCII keyphrases.
    with open(path, encoding='utf-8') as file:
        json_data = json.load(file)

    kp_predictions = json_data['dup_removed_kp_predictions']
    probabilities = json_data['probabilities']
    token_predictions = json_data['dup_removed_token_predictions']
    src = json_data['src']
    targets = json_data['targets']
    all_kpp_values = json_data['dup_removed_kpp']

    return kp_predictions, probabilities, token_predictions, src, targets, all_kpp_values


# Load the "semeval" llama3 output file into notebook-level variables that the
# cells below read (token_predictions, probabilities, src, all_kpp_values, ...).
kp_predictions, probabilities, token_predictions, src, targets, all_kpp_values=load_llama("semeval")


In [41]:
token_predictions[0:2]

[[['Web', ' search', ' interaction'],
  [' popular', ' destinations'],
  [' search', ' results'],
  [' user', ' behavior'],
  [' query', '-level', ' granularity'],
  [' web', ' search'],
  [' search', ' interaction', ' feature'],
  [' authority', ' resources'],
  [' information', ' needs'],
  [' explor', 'atory', ' tasks'],
  [' query', ' topic']],
 [['Trading', ' Networks'],
  [' Price', '-', 'Setting', ' Agents'],
  [' Nash', ' Equ', 'ilibrium'],
  [' Game', ' Theory'],
  [' Market', ' Efficiency'],
  [' Strategic', ' Pricing'],
  [' Graph', ' Theory'],
  [' Matching', ' Market'],
  [' Social', ' Opt', 'imal', ' Allocation'],
  [' Graph', ' Structure'],
  [' Competition', ' among', ' Tr', 'aders']]]

In [42]:
def add_sep_and_kpp(token_predictions, all_kpp_values):
    """Flatten per-keyphrase token lists into one '<sep>'-delimited list per document.

    For every document the token sub-lists are concatenated with a '<sep>'
    marker between consecutive sub-lists. The matching values are taken from
    the document's flat value list, one per token, with a 0 placed at each
    '<sep>' position.

    Args:
        token_predictions: One entry per document; each entry is a list of
            token lists.
        all_kpp_values: One flat list of values per document, consumed in
            token order.

    Returns:
        A pair ``(flat_tokens, flat_values)`` with one list per document.
        Documents are paired with ``zip``, so the shorter input decides how
        many are processed.
    """
    flat_tokens = []
    flat_values = []

    for doc_phrases, doc_values in zip(token_predictions, all_kpp_values):
        doc_tokens = []
        doc_flat_values = []
        cursor = 0  # position of the next unread value in doc_values

        for position, phrase in enumerate(doc_phrases):
            if position:
                # Every phrase except the first is preceded by a separator
                # whose value is a placeholder 0.
                doc_tokens.append('<sep>')
                doc_flat_values.append(0)

            stop = cursor + len(phrase)
            doc_tokens += phrase
            doc_flat_values += doc_values[cursor:stop]
            cursor = stop

        flat_tokens.append(doc_tokens)
        flat_values.append(doc_flat_values)

    return flat_tokens, flat_values
# NOTE(review): `probabilities` is passed for the `all_kpp_values` parameter, so
# new_kpp_values holds entries taken from `probabilities` (with 0 at each '<sep>').
new_token_predictions, new_kpp_values = add_sep_and_kpp(token_predictions, probabilities)


In [44]:
new_token_predictions[0]

['Web',
 ' search',
 ' interaction',
 '<sep>',
 ' popular',
 ' destinations',
 '<sep>',
 ' search',
 ' results',
 '<sep>',
 ' user',
 ' behavior',
 '<sep>',
 ' query',
 '-level',
 ' granularity',
 '<sep>',
 ' web',
 ' search',
 '<sep>',
 ' search',
 ' interaction',
 ' feature',
 '<sep>',
 ' authority',
 ' resources',
 '<sep>',
 ' information',
 ' needs',
 '<sep>',
 ' explor',
 'atory',
 ' tasks',
 '<sep>',
 ' query',
 ' topic']

In [70]:
def process_predictions(predictions, beam=False):
    """Lower-case, stem and de-duplicate prediction strings.

    Each element of ``predictions`` is handled on its own: ';' is replaced by
    '<sep>', everything from the first '<eos>' on is dropped, the rest is
    split on ',' and every resulting phrase is lower-cased, stripped and
    stemmed word by word. Empty phrases and the markers '<peos>', ',', '.'
    and '<unk>' are discarded.

    Args:
        predictions: Iterable of strings, or of iterables of strings when
            ``beam`` is true.
        beam: When true, the cleaned strings of one element are joined with
            ' <sep> ' (leading empty strings are skipped) before splitting.

    Returns:
        One flat list with the stemmed phrases of all elements. Duplicates
        are removed within each element, keeping first occurrences in order.
    """
    porter = PorterStemmer()
    ignored = {"", "<peos>", ",", ".", "<unk>"}

    def cut_at_eos(text):
        # Normalise ';' to '<sep>' and keep only the part before '<eos>'.
        return text.replace(";", "<sep>").split("<eos>")[0]

    collected = []
    for entry in predictions:
        if not beam:
            raw = cut_at_eos(entry)
        else:
            hypotheses = [cut_at_eos(hypothesis) for hypothesis in entry]
            # Nothing is joined until the first non-empty hypothesis.
            first = 0
            while first < len(hypotheses) and not hypotheses[first]:
                first += 1
            raw = " <sep> ".join(hypotheses[first:])

        # Order-preserving de-duplication, needed for @topk.
        seen = set()
        unique_phrases = []
        for chunk in raw.split(","):
            phrase = chunk.lower().strip()
            if phrase in ignored:  # and "." not in kp and "," not in kp
                continue
            words = phrase.split(" ")  # nltk.word_tokenize(kp)
            stemmed = " ".join(porter.stem(word).strip() for word in words)
            stemmed = stemmed.replace("< digit >", "<digit>").strip()
            if stemmed and stemmed not in seen:
                seen.add(stemmed)
                unique_phrases.append(stemmed)

        collected.extend(unique_phrases)

    return collected

def process_srcs(srcs):
    """Stem a whitespace-separated source text.

    Args:
        srcs: A single string; it is split on whitespace.

    Returns:
        A one-element list holding the lower-cased, stemmed words joined by
        single spaces.
    """
    porter = PorterStemmer()
    stems = []
    for word in srcs.split():
        stems.append(porter.stem(word.strip().lower()).strip())
    return [" ".join(stems).strip()]

In [71]:
# Scratch copy of one flattened token list; the values match the
# new_token_predictions[0] output displayed earlier in this notebook.
a=['Web',
 ' search',
 ' interaction',
 '<sep>',
 ' popular',
 ' destinations',
 '<sep>',
 ' search',
 ' results',
 '<sep>',
 ' user',
 ' behavior',
 '<sep>',
 ' query',
 '-level',
 ' granularity',
 '<sep>',
 ' web',
 ' search',
 '<sep>',
 ' search',
 ' interaction',
 ' feature',
 '<sep>',
 ' authority',
 ' resources',
 '<sep>',
 ' information',
 ' needs',
 '<sep>',
 ' explor',
 'atory',
 ' tasks',
 '<sep>',
 ' query',
 ' topic']

In [75]:
def process_predictions(predictions, beam=False):
    """Lower-case, stem and de-duplicate prediction strings.

    Each element of ``predictions`` is handled on its own: ';' is replaced by
    '<sep>', everything from the first '<eos>' on is dropped, the rest is
    split on ',' and every resulting phrase is lower-cased, stripped and
    stemmed word by word. Empty phrases and the markers '<peos>', ',', '.'
    and '<unk>' are discarded.

    NOTE(review): an earlier cell of this notebook defines a function with
    the same name; running this cell rebinds it.

    Args:
        predictions: Iterable of strings, or of iterables of strings when
            ``beam`` is true.
        beam: When true, the cleaned strings of one element are joined with
            ' <sep> ' (leading empty strings are skipped) before splitting.

    Returns:
        One flat list with the stemmed phrases of all elements. Duplicates
        are removed within each element, keeping first occurrences in order.
    """
    porter = PorterStemmer()
    ignored = {"", "<peos>", ",", ".", "<unk>"}

    def cut_at_eos(text):
        # Normalise ';' to '<sep>' and keep only the part before '<eos>'.
        return text.replace(";", "<sep>").split("<eos>")[0]

    collected = []
    for entry in predictions:
        if not beam:
            raw = cut_at_eos(entry)
        else:
            hypotheses = [cut_at_eos(hypothesis) for hypothesis in entry]
            # Nothing is joined until the first non-empty hypothesis.
            first = 0
            while first < len(hypotheses) and not hypotheses[first]:
                first += 1
            raw = " <sep> ".join(hypotheses[first:])

        # Order-preserving de-duplication, needed for @topk.
        seen = set()
        unique_phrases = []
        for chunk in raw.split(","):
            phrase = chunk.lower().strip()
            if phrase in ignored:  # and "." not in kp and "," not in kp
                continue
            words = phrase.split(" ")  # nltk.word_tokenize(kp)
            stemmed = " ".join(porter.stem(word).strip() for word in words)
            stemmed = stemmed.replace("< digit >", "<digit>").strip()
            if stemmed and stemmed not in seen:
                seen.add(stemmed)
                unique_phrases.append(stemmed)

        collected.extend(unique_phrases)

    return collected

def process_srcs(srcs):
    """Stem a whitespace-separated source text.

    NOTE(review): an earlier cell of this notebook defines a function with
    the same name; running this cell rebinds it.

    Args:
        srcs: A single string; it is split on whitespace.

    Returns:
        A one-element list holding the lower-cased, stemmed words joined by
        single spaces.
    """
    porter = PorterStemmer()
    stems = []
    for word in srcs.split():
        stems.append(porter.stem(word.strip().lower()).strip())
    return [" ".join(stems).strip()]

def probab_llama_boxplots(dataset):
    """Load the llama outputs for ``dataset``, stem them and print a preview.

    Adapted for llama from line 811 of the one2seq code. In its current
    state the function draws no plot and returns nothing: it prints the
    processed predictions of the first two documents.

    Args:
        dataset: Dataset name forwarded to ``load_llama``.
    """
    # Only the probabilities, token predictions and sources are used here.
    _, probabilities, token_predictions, src, _, _ = load_llama(dataset)

    # Flatten every document into one '<sep>'-delimited token list.
    token_predictions, _ = add_sep_and_kpp(token_predictions, probabilities)

    predictions = [
        process_predictions(token_predictions[doc_index])
        for doc_index in range(len(src))
    ]
    print(predictions[0:2])
        

In [76]:
probab_llama_boxplots("semeval")

[['web', 'search', 'interact', '<sep>', 'popular', 'destin', '<sep>', 'search', 'result', '<sep>', 'user', 'behavior', '<sep>', 'queri', '-level', 'granular', '<sep>', 'web', 'search', '<sep>', 'search', 'interact', 'featur', '<sep>', 'author', 'resourc', '<sep>', 'inform', 'need', '<sep>', 'explor', 'atori', 'task', '<sep>', 'queri', 'topic'], ['trade', 'network', '<sep>', 'price', '-', 'set', 'agent', '<sep>', 'nash', 'equ', 'ilibrium', '<sep>', 'game', 'theori', '<sep>', 'market', 'effici', '<sep>', 'strateg', 'price', '<sep>', 'graph', 'theori', '<sep>', 'match', 'market', '<sep>', 'social', 'opt', 'imal', 'alloc', '<sep>', 'graph', 'structur', '<sep>', 'competit', 'among', 'tr', 'ader']]


In [None]:
# Reference for the target format: this literal matches the predictions[0:2]
# output of the one2seq file shown near the top of the notebook. It is
# evaluated and discarded; only the final expression of the cell is displayed.
[['grid',
  'service',
  'discovery',
  '<sep>',
  'uddi',
  '<sep>',
  'grid',
  'computing',
  '<sep>',
  'web',
  'service',
  '<sep>',
  'distributed',
  'hash',
  'tables',
  '<sep>',
  'distributed',
  'hash',
  'tables',
  '<sep>',
  'grid',
  'computing'],
 ['grid',
  'computing',
  '<sep>',
  'performance',
  'modeling',
  '<sep>',
  'performance',
  'evaluation',
  '<sep>',
  'adaptive',
  'systems']]
token_predictions


In [55]:
# Stem the predictions and the source text document by document.
# NOTE(review): this rebinds `predictions`, which the first cell assigned from
# the one2seq JSON file.
predictions=[]
srcs=[]

for i in range(len(src)):
    # Only element [0] of each document's token_predictions entry is processed.
    temp_predictions = process_predictions(token_predictions[i][0])
    # process_srcs returns a one-element list, so srcs becomes a list of lists.
    temp_srcs = process_srcs(src[i])
    predictions.append(temp_predictions)
    srcs.append(temp_srcs)

In [58]:
token_predictions

['Web', ' search', ' interaction']

In [23]:
# NOTE(review): combine_with_separator and combine_with_separator1 are not
# defined in any cell visible in this part of the notebook; on a fresh kernel
# these calls need their definitions to be run first.
separated_list=combine_with_separator(token_predictions)
sep_kpp=combine_with_separator1(all_kpp_values)

In [24]:
sep_kpp[0:2]

[[1.1497876701364411,
  1.1143242899866512,
  2.3891409365149947,
  1.7717787304326484,
  1.2276765139403374,
  5.770539652041531,
  1.9125475609114326,
  3.736857856164639,
  3.7247930349605762,
  1.0913318776857792,
  3.0313200339968684,
  '<sep>',
  0],
 [1.5681105111721405,
  1.0073730011965898,
  4.972799635990106,
  2.3086282395514695,
  2.849925088934097,
  6.806432206131389,
  1.832622051593755,
  1.7361748951641185,
  3.780131091318533,
  9.331080740925575,
  1.1584410684403077,
  '<sep>',
  0]]

In [13]:
separated_list[0:2]

[['Web',
  ' search',
  ' interaction',
  '<sep>',
  ' popular',
  ' destinations',
  '<sep>',
  ' search',
  ' results',
  '<sep>',
  ' user',
  ' behavior',
  '<sep>',
  ' query',
  '-level',
  ' granularity',
  '<sep>',
  ' web',
  ' search',
  '<sep>',
  ' search',
  ' interaction',
  ' feature',
  '<sep>',
  ' authority',
  ' resources',
  '<sep>',
  ' information',
  ' needs',
  '<sep>',
  ' explor',
  'atory',
  ' tasks',
  '<sep>',
  ' query',
  ' topic'],
 ['Trading',
  ' Networks',
  '<sep>',
  ' Price',
  '-',
  'Setting',
  ' Agents',
  '<sep>',
  ' Nash',
  ' Equ',
  'ilibrium',
  '<sep>',
  ' Game',
  ' Theory',
  '<sep>',
  ' Market',
  ' Efficiency',
  '<sep>',
  ' Strategic',
  ' Pricing',
  '<sep>',
  ' Graph',
  ' Theory',
  '<sep>',
  ' Matching',
  ' Market',
  '<sep>',
  ' Social',
  ' Opt',
  'imal',
  ' Allocation',
  '<sep>',
  ' Graph',
  ' Structure',
  '<sep>',
  ' Competition',
  ' among',
  ' Tr',
  'aders']]

In [64]:
# Reference for the target format: this literal matches the predictions[0:2]
# output of the one2seq file shown near the top of the notebook. It is
# evaluated and discarded; only the final expression of the cell is displayed.
[['grid',
  'service',
  'discovery',
  '<sep>',
  'uddi',
  '<sep>',
  'grid',
  'computing',
  '<sep>',
  'web',
  'service',
  '<sep>',
  'distributed',
  'hash',
  'tables',
  '<sep>',
  'distributed',
  'hash',
  'tables',
  '<sep>',
  'grid',
  'computing'],
 ['grid',
  'computing',
  '<sep>',
  'performance',
  'modeling',
  '<sep>',
  'performance',
  'evaluation',
  '<sep>',
  'adaptive',
  'systems']]
token_predictions


In [65]:
result[0]

[['Web', ' search', ' interaction'],
 [' popular', ' destinations'],
 [' search', ' results'],
 [' user', ' behavior'],
 [' query', '-level', ' granularity'],
 [' web', ' search'],
 [' search', ' interaction', ' feature'],
 [' authority', ' resources'],
 [' information', ' needs'],
 [' explor', 'atory', ' tasks'],
 [' query', ' topic']]

In [None]:
# Print the per-phrase word counts for element [0] of every entry in `result`.
# NOTE(review): `result` is not assigned in any cell visible here, and
# count_words_in_keyphrases is defined in a later cell; both must exist in the
# kernel before this cell runs.
for keyphrases in result:
    lengths = count_words_in_keyphrases(keyphrases[0])
    print(lengths)

In [52]:
def count_words_in_keyphrases(keyphrases):
    """Count the words of each comma-separated phrase in a string.

    Semicolons are deleted first, then the text is split on commas. Phrases
    that are empty after stripping whitespace are skipped.

    Args:
        keyphrases: A single string of comma-separated phrases.

    Returns:
        A list with the whitespace-separated word count of every remaining
        phrase, in order of appearance.
    """
    without_semicolons = keyphrases.replace(';', '')
    word_counts = []
    for chunk in without_semicolons.split(','):
        phrase = chunk.strip()
        if phrase:
            word_counts.append(len(phrase.split()))
    return word_counts

In [53]:
# No visible cell imports defaultdict, so on a fresh kernel this cell raised
# NameError; the import is made here to keep the cell self-contained.
from collections import defaultdict

# Histogram: number of words in a keyphrase -> how many keyphrases have it.
length_counts = defaultdict(int)
# Process each list in the result
for keyphrases in result:
    lengths = count_words_in_keyphrases(keyphrases[0])
    for length in lengths:
        length_counts[length] += 1

# Convert defaultdict to a regular dict for cleaner output
length_counts = dict(length_counts)

# Print the dictionary of counts
print(length_counts)

{1: 182, 2: 413, 3: 121, 4: 22, 5: 4}


In [54]:
# Total number of counted keyphrases, computed from the histogram itself
# instead of from numbers retyped from the printed output.
sum(length_counts.values())

742