In [1]:
!pip install --quiet tqdm 

In [2]:
# Root folders used by the rest of the notebook: inputs are read from
# input_path, produced artifacts are written below output_path.
# NOTE(review): Kaggle datasets are usually mounted under /kaggle/input
# (lowercase) - confirm that the capital "I" is intended.
input_path='/kaggle/Input/'
output_path='/kaggle/working/'

In [3]:
import pandas as pd
import numpy as np
import re
import pickle
import os

In [4]:
import nltk
from nltk import pos_tag
# Download the NLTK resources used by this notebook.
# 'stopwords' backs the stop-word set built in a later cell.
nltk.download('stopwords')
# NOTE(review): no visible cell tokenizes with punkt - confirm it is still needed.
nltk.download('punkt')
# Presumably the model data required by pos_tag (used in filter_token).
nltk.download('averaged_perceptron_tagger')



[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
from nltk.corpus import stopwords
import string
# English stop words as a set; passed to filter_token in the processing loop.
stop_words=set(stopwords.words("english"))
# Punctuation characters from string.punctuation.
# NOTE(review): not referenced by any other cell visible here.
symbols=set(string.punctuation)

In [6]:
# Model names; the selected name is used to build the input folder and the
# output file names in later cells.
MODELS={"name":["SciBERT","SciNCL","SPECTER","SciBERT-based","SciNCL-based","SPECTER-based"]}
# Index into MODELS["name"] selecting the model handled by this run.
# NOTE: this rebinds the name of Python's built-in id() for the whole notebook.
id=0
# Echo the selected model name as the cell output.
MODELS["name"][id]

'SciBERT'

In [7]:
# Read one domain name per line. Blank lines carry no name and must not end
# up in the list as empty search terms, so they are skipped.
with open(input_path+'Dictionary/domain_list.txt') as f:
    domain_list = [entry for entry in (line.strip() for line in f) if entry]

In [8]:
# Read one language name per line. Blank lines carry no name and must not end
# up in the list as empty search terms, so they are skipped.
with open(input_path+'Dictionary/language_list.txt') as f:
    language_list = [entry for entry in (line.strip() for line in f) if entry]

## Load data

In [9]:
# Folder holding the prediction files of the selected model.
folder_path=input_path+"Results/"+MODELS["name"][id]
# Read the files in sorted name order (os.listdir returns them in arbitrary
# order) and concatenate once instead of re-copying a growing frame on every
# loop iteration.
frames=[
    pd.read_parquet(os.path.join(folder_path, filename))
    for filename in sorted(os.listdir(folder_path))
]
# pd.concat rejects an empty list, so an empty folder still yields an empty frame.
data=pd.concat(frames) if frames else pd.DataFrame()
data

Unnamed: 0,id,ID,true_predictions,data_tuples
0,0,34000,"[B-MTD, I-MTD, I-MTD, I-MTD, O, O, B-TSK, I-TS...","[[FastIF:, B-MTD], [Scalable, I-MTD], [Influen..."
2,2,34000,"[O, O, B-MTD, O, O, O, O, O, O, B-MTD, I-MTD, ...","[[FASTIF,, B-MTD], [influence, B-MTD], [functi..."
3,3,34000,"[O, O, B-MTD, I-MTD, I-MTD, O, O, O, O, O, O, ...","[[k-Nearest, B-MTD], [Neighbors, I-MTD], [(kNN..."
5,5,34000,"[O, O, O, O, O, B-MTD, I-MTD, I-MTD, O, O, O, ...","[[fast, B-MTD], [influence, I-MTD], [functions..."
6,6,34000,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[[simulatability., B-MTD]]"
...,...,...,...,...
176610,176610,47999,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[[back, B-TSK], [translation,, I-TSK], [back-t..."
176611,176611,47999,"[O, O, O, O, O, O, O, O, O, O, O, O, O, B-MTD,...","[[synthetic, B-MTD], [data, I-MTD], [generatio..."
176614,176614,47999,"[O, O, O, O, O, O, O, O, B-MTD, I-MTD, I-MTD, ...","[[Adam, B-MTD], [optimizer, I-MTD], [(Kingma, ..."
176616,176616,47999,"[O, O, O, B-MTD, I-MTD, O, O, O, O, O, O, O, O...","[[fairseq, B-MTD], [10.2, I-MTD], [GPT, B-MTD]..."


In [10]:
# Drop rows whose data_tuples entry is empty.
data=data[data['data_tuples'].str.len() != 0]
# Group the rows by document. The sort must be stable: the default quicksort
# may shuffle rows that share an ID, which would make the per-document
# numbering below arbitrary and irreproducible.
data=data.sort_values(by=['ID'], kind='stable').reset_index(drop=True)
# Renumber the rows of each document as 0..n-1, in their loaded order.
data['id'] =data.groupby('ID').cumcount()
data

Unnamed: 0,id,ID,true_predictions,data_tuples
0,141,0,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-D...","[[Taiwan, B-DST], [corpus., I-DST]]"
1,77,0,"[O, O, O, B-TSK, I-TSK, I-TSK, O, O, O, O, O, ...","[[data, B-TSK], [sparseness, I-TSK], [problem,..."
2,78,0,"[O, B-MTD, I-MTD, I-MTD, O, O, O, O, O]","[[generalization, B-MTD], [process, I-MTD], [r..."
3,79,0,"[O, O, O, O, O, O, O, O, O, O, B-TSK, I-TSK, O...","[[characterization, B-TSK], [since, I-TSK]]"
4,81,0,"[O, O, O, O, O, O, O, O, O, O, B-MTD, I-MTD, O...","[[TFIDF, B-MTD], [value, I-MTD]]"
...,...,...,...,...
6041389,30920,59202,"[O, O, B-TSK, I-TSK, I-TSK, I-TSK, O, O, O, O,...","[[real, B-TSK], [world, I-TSK], [dialogue, I-T..."
6041390,30919,59202,"[O, O, O, B-MTD, I-MTD, O, O, O, O, O, O, O, B...","[[POMDP, B-MTD], [algorithms, I-MTD], [policy,..."
6041391,30918,59202,"[O, O, B-TSK, I-TSK, I-TSK, O, B-MTD, I-MTD, I...","[[speech, B-TSK], [recognition, I-TSK], [degra..."
6041392,30930,59202,"[O, O, O, O, O, B-MTD, I-MTD, O, O, O, O, O, O...","[[machine, B-MTD], [interpreter, I-MTD]]"


In [12]:
# Write the ID, id, true_predictions and data_tuples columns of `data` to a
# single parquet file in output_path, named after the selected model.
data[["ID","id","true_predictions","data_tuples"]].to_parquet(output_path+'Results_'+MODELS["name"][id]+'.parquet')

## Merged workflow for post-processing

In [24]:
import re
def remove_symbols(text):
    """Return `text` with every character removed that is not an ASCII
    letter, a whitespace character or a hyphen.

    Note that digits are stripped as well, not only punctuation.
    """
    disallowed = re.compile(r'[^a-zA-Z\s-]')
    return disallowed.sub('', text)

In [25]:
def extract_keyphrases(output):
    """Group BIO-tagged tokens into lower-cased keyphrases per entity type.

    Args:
        output: iterable of (token, tag) pairs. Tags of the form "B-<type>"
            open a phrase, "I-<type>" continue it; any other tag ends the
            phrase that is currently open.

    Returns:
        dict mapping entity type to the list of phrases (tokens joined by a
        single space) in order of appearance. "TSK", "MTD" and "DST" are
        always present; any other type is added when it is first seen.
    """
    keyphrases = {
        "TSK": [],
        "MTD": [],
        "DST": []
    }

    current_phrase = []
    current_tag = None

    for token, tag in output:
        if tag.startswith(("B-", "I-")):
            label = tag[2:]
            # An "I-" tag only continues a phrase of the same type. Without an
            # open phrase (e.g. its "B-" token was filtered out upstream) or
            # with a different type it starts a new phrase, so tokens never
            # leak into a neighbouring phrase.
            if tag.startswith("B-") or label != current_tag:
                if current_tag is not None:
                    keyphrases.setdefault(current_tag, []).append(" ".join(current_phrase))
                current_phrase = [token.lower()]
                current_tag = label
            else:
                current_phrase.append(token.lower())
        else:
            # Any other tag closes the phrase that is currently open.
            if current_tag is not None:
                keyphrases.setdefault(current_tag, []).append(" ".join(current_phrase))
            current_phrase = []
            current_tag = None

    # Add the last keyphrase if it exists
    if current_tag is not None:
        keyphrases.setdefault(current_tag, []).append(" ".join(current_phrase))

    return keyphrases

In [39]:
# filter out stop_words
def filter_token(result_list,stop_words):
    """Drop stop words and tokens whose part of speech is not a content word.

    Args:
        result_list: iterable of (token, label) pairs.
        stop_words: container of lower-cased stop words.

    Returns:
        list of (token, label) pairs, in input order, whose token is not a
        stop word and is tagged as noun, adjective, adverb or verb. A token
        the tagger cannot handle is kept unchanged.
    """
    # Penn Treebank tags that are kept
    pos_tags = {'NN', 'NNS', 'NNP', 'NNPS',  # Nouns
                'JJ', 'JJR', 'JJS',       # Adjectives
                'RB', 'RBR', 'RBS',       # Adverbs
                'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}  # Verbs
    filtered_tokens = []
    for token, label in result_list:
        # Stop words are compared case-insensitively
        if token.lower() in stop_words:
            continue
        try:
            # Each token is tagged on its own, i.e. without sentence context
            pos = pos_tag([token])[0][1]
        except IndexError:
            # If the token cannot be tagged, include it as is. An empty tag
            # marks this case and keeps the (token, label) pair intact.
            pos = ''
        if not pos or pos in pos_tags:
            filtered_tokens.append((token, label))
    return filtered_tokens

In [27]:
def remove_subsets(input_dict):
    """Drop phrases whose words are all contained in another phrase of the same key.

    Args:
        input_dict: dict mapping a key to a list of whitespace-separated phrases.

    Returns:
        A new dict with the same keys. Under each key a phrase is kept unless
        its word set is a proper subset of another phrase's word set. Of
        several different phrases made of exactly the same words only the
        first one is kept (previously all of them were dropped, because each
        counted as a subset of the other). Identical strings never eliminate
        each other. The relative order of kept phrases is preserved.
    """
    output_dict = {}
    for key, value in input_dict.items():
        phrases = list(value)
        # Split every phrase once instead of once per comparison
        word_sets = [set(phrase.split()) for phrase in phrases]
        final_values = []
        for i, phrase in enumerate(phrases):
            words = word_sets[i]
            is_subset = any(
                other != phrase
                and (words < word_sets[j] or (words == word_sets[j] and j < i))
                for j, other in enumerate(phrases)
            )
            if not is_subset:
                final_values.append(phrase)
        output_dict[key] = final_values
    return output_dict

In [28]:
def search_domain(text_list, domain_list):
    """Collect every occurrence of a domain name inside the given texts.

    Matching is case-insensitive and substring based. Domain names are
    treated as literal text, so names containing regex metacharacters
    (e.g. "C++") are matched verbatim instead of raising or mis-matching.
    Empty names are ignored, since an empty pattern matches at every position.

    Args:
        text_list: iterable of strings to search in.
        domain_list: iterable of domain names.

    Returns:
        list with one lower-cased domain name per occurrence, ordered by
        text and then by domain.
    """
    # Build the patterns once; they do not depend on the text
    patterns = [
        re.compile(re.escape(domain.lower()))
        for domain in domain_list
        if domain.strip()
    ]
    domain_key = []
    for text in text_list:
        lowered = text.lower()
        for pattern in patterns:
            domain_key.extend(pattern.findall(lowered))
    return domain_key

In [29]:
def search_language(text_list, language_list):
    """Find language names that occur as whole words in the given texts.

    Matching is case-insensitive and anchored at word boundaries. Empty
    names are ignored: their pattern would match any text containing a word
    and report an empty "language".

    Args:
        text_list: iterable of strings to search in.
        language_list: iterable of language names.

    Returns:
        list with one entry (spelled as in language_list) per text/language
        pair that matched, ordered by text and then by language. Duplicates
        are kept.
    """
    # Build the patterns once; they do not depend on the text
    patterns = [
        (language, re.compile(r'\b' + re.escape(language) + r'\b', re.IGNORECASE))
        for language in language_list
        if language.strip()
    ]
    language_key = []
    for text in text_list:
        for language, pattern in patterns:
            if pattern.search(text):
                language_key.append(language)
    return language_key

In [30]:
def calculate_precision(dict1, dict2):
    """Return, for every key present in both dicts, the ratio of the smaller
    to the larger of the two values.

    When the dict2 value is not smaller than the dict1 value and is zero, the
    ratio is reported as 1. When the dict2 value is smaller and the dict1
    value is zero, the key is left out. Keys missing from dict2 are skipped.
    """
    precision = {}
    for key, first in dict1.items():
        if key not in dict2:
            continue
        second = dict2[key]
        if second < first:
            if first != 0:
                precision[key] = float(second) / float(first)
        elif second != 0:
            precision[key] = float(first) / float(second)
        else:
            precision[key] = 1
    return precision

In [31]:
import pickle
# Load the document IDs that the processing loop iterates over.
# NOTE: pickle.load can execute arbitrary code while loading; only use it on
# files from a trusted source.
with open(input_path+"Data/100_random_ids.pkl", "rb") as file:
    sampled_ids=pickle.load(file)

In [40]:
# Post-process the sampled documents
# (processing all the ACL papers takes around 2 hours).
from tqdm import tqdm
post_doc={}
micro_precisions={}
ID_list=data.ID.unique()
# To process every document instead of the sample, iterate over ID_list below.
for ID in tqdm(sampled_ids, total=len(sampled_ids)):
    # flatten the (token, tag) pairs of all rows that belong to this document
    doc=[pair for row in data[data['ID'] == ID]['data_tuples'] for pair in row]
    # remove special remarks; tokens left empty by the cleaning are dropped
    new_output=[]
    for item in doc:
        cleaned=remove_symbols(item[0])
        if len(cleaned)>0:
            new_output.append((cleaned,item[1]))
    # filter out stopwords
    new_output=filter_token(new_output,stop_words)
    # transform into key-value pairs
    keyphrases=extract_keyphrases(new_output)
    # extract domain and language
    # Take the phrase lists BEFORE adding the DOM/LAN keys: iterating over
    # keyphrases.values() afterwards would also search the DOM/LAN hit lists
    # themselves, duplicating hits and distorting the raw counts used for
    # the precision below.
    tagged_phrase_lists=list(keyphrases.values())
    keyphrases["DOM"]=[]
    keyphrases["LAN"]=[]
    for phrases in tagged_phrase_lists:
        keyphrases["DOM"].extend(search_domain(phrases,domain_list))
        keyphrases["LAN"].extend(search_language(phrases,language_list))
    # remove the duplicates (sorted, so the next step sees a reproducible order)
    new_keyphrases={key: sorted(set(value)) for key, value in keyphrases.items()}
    # remove the subsets
    new_keyphrases = remove_subsets(new_keyphrases)
    # compare the phrase counts before and after duplicate/subset removal
    lengths1 = {key: len(value) for key, value in keyphrases.items()}
    lengths2 = {key: len(value) for key, value in new_keyphrases.items()}
    micro_precisions[ID]=calculate_precision(lengths1, lengths2)
    # special handle of language and domain: fall back to defaults when nothing was found
    if len(new_keyphrases["LAN"]) == 0:
        new_keyphrases["LAN"]=["English"]
    if len(new_keyphrases["DOM"]) == 0:
        new_keyphrases["DOM"]=["Computer Science"]
    # sorted
    new_keyphrases = {key: sorted(value) for key, value in new_keyphrases.items()}
    post_doc[ID]=new_keyphrases

100%|██████████| 100/100 [00:28<00:00,  3.53it/s]


In [33]:
import pickle
# open(..., "wb") does not create missing folders, so make sure the Eval
# folder exists before writing into it.
eval_dir=output_path+"Eval/"
os.makedirs(eval_dir, exist_ok=True)
# Keyphrases per document for the selected model
with open(eval_dir+"keyphrases_"+MODELS["name"][id]+".pkl", "wb") as file:
    pickle.dump(post_doc, file)
# Per-document precision values for the selected model
with open(eval_dir+"precisions_"+MODELS["name"][id]+".pkl", "wb") as file:
    pickle.dump(micro_precisions, file)