In [41]:
import string

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [42]:
# Load the document table from a local pickle file; later cells read its
# 'keywords', 'title' and 'abstract' columns.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
df = pd.read_pickle("df_mixed.pkl")

Split multi-word keywords into single words for simpler labeling (e.g. "optimization algorithm", "numerical optimization" -> "optimization", "algorithm", "numerical", "optimization").

In [43]:
def split_list(keywords : list) -> list:
    """Flatten a list of keyword phrases into a list of single words.

    Every phrase is split on whitespace; the resulting words keep the order in
    which they appear in ``keywords``.
    """
    single_words = []
    for phrase in keywords:
        single_words.extend(phrase.split())
    return single_words

# One flat list of single keyword words per document
df['single keys'] = df['keywords'].apply(split_list)

Add the tokenized title and abstract (with English stop words removed) for more information about the documents:

In [44]:
# English stop words from the NLTK corpus, kept in a set for fast membership tests.
stop_words   = set(stopwords.words('english'))

def split_text(title : str) -> list:
    """Tokenize free text into whitespace-separated words without stop words.

    Punctuation attached to the start or end of a token (e.g. ``"converters."``
    or ``"(MPPT)"``) is stripped, because such tokens would otherwise fail to
    match the punctuation-free rule keywords. Characters inside a token are
    kept, so terms such as ``"dc-dc"`` and ``"dc/dc"`` survive unchanged.

    Args:
        title: Text to tokenize (a title or an abstract). Non-string values
            such as ``None`` or a float ``NaN`` yield an empty list.

    Returns:
        The remaining tokens in their original order and casing.
    """
    if not isinstance(title, str):
        # Missing cells must still produce a list so the per-row merge works.
        return []

    tokens = (word.strip(string.punctuation) for word in title.split())
    # Drop tokens that were pure punctuation, then filter stop words.
    return [word for word in tokens if word and word.lower() not in stop_words]


# Tokenize both free-text columns with the same helper
for source_column, token_column in (('title', 'title split'), ('abstract', 'abstract split')):
    df[token_column] = df[source_column].apply(split_text)

In [45]:
# Function to merge lists
def merge_lists(row):
    """Concatenate the keyword, title and abstract token lists of one row.

    ``row`` must support lookup by the keys 'single keys', 'title split' and
    'abstract split'; the result keeps that order.
    """
    keyword_tokens = row['single keys']
    title_tokens = row['title split']
    abstract_tokens = row['abstract split']
    return keyword_tokens + title_tokens + abstract_tokens

# Build one combined token list per document (row-wise merge)
df['all keys'] = df.apply(merge_lists, axis='columns')

Define the rules and the stemming functions. Stemming lets a rule word also match its plural, gerund and similar forms.

The order of the detailed rules determines the prioritization (the order of the labels) when a document matches multiple labels.

In [46]:
# Paradigm rules: label -> set of trigger words.
rules_paradigm = {
    'learning': {
        'learning', 'neural', 'artificial', 'intelligence', 'regression',
        'prediction', 'ann', 'dnn', 'ffnn', 'rnn', 'agent',
    },
    'procedure': {'procedure', 'strategy', 'script', 'method', 'methodology'},
    'optimization': {'algorithm', 'optimizer', 'optimization', 'metaheuristic', 'heuristic'},
    # 'modeling': {'model'},
}

# Application-domain rules: label -> set of trigger words.
rules_application = {
    'car': {'car', 'hev', 'ev'},
    # A missing comma used to fuse 'aerial' and 'aircraft' into the single
    # string 'aerialaircraft', so neither word could ever match.
    'aerospace': {'aerospace', 'aerial', 'aircraft', 'uav'},
    'battery': {'battery', 'lithium', 'ion', 'charging', 'charger'},
    'grid': {'grid', 'microgrid'},
    'industry': {'industrial', 'industry'},
    'audio': {'audio', 'speaker', 'amplifier'},
    'renewables': {'solar', 'wind', 'photovoltaic'},
    'drives': {'rotor', 'stator', 'drive', 'motor', 'torque'},
    'inverter': {'dc-dc', 'ac-ac', 'dc-ac', 'ac-dc', 'inverter', 'converter'},
}

# Life-cycle phase rules: label -> set of trigger words.
rules_lifecycle = {
    'design': {
        'design', 'model', 'twin', 'optimization', 'procedure', 'methodology',
        'analysis',
    },
    'maintenance': {
        'maintenance', 'condition', 'monitoring', 'life', 'test',
        'verification', 'validation', 'anomaly', 'detection', 'identification',
    },
    'control': {'control', 'controller', 'modulation', 'modulator', 'tuning', 'mppt'},
}

# Component rules: label -> set of trigger words.
rules_components = {
    'capacitors': {'capacitor', 'supercapacitor'},
    'magnetics': {'inductor', 'inductance', 'magnetic', 'litz', 'transformer', 'wire', 'winding'},
    'widebandgap': {'gallium', 'nitride', 'carbide', 'sic', 'gan', 'wbg', 'gap'},
    'semiconductors': {'silicon', 'transistor', 'mosfet', 'diode', 'switch', 'multichip', 'multi-chip'},
    'machinery': {'motor', 'generator', 'rotor', 'stator'},
    # A missing comma used to fuse 'sink' and 'pipe' into the single string
    # 'sinkpipe', so neither word could ever match.
    'cooling': {'cooling', 'heat', 'sink', 'pipe', 'fan', 'fin'},
}

# Engineering-task rules: label -> set of trigger words.
rules_task = {
    'selection': {'component', 'selection', 'choice'},
    'implementation': {'layout', 'physical', 'placement', 'routing'},
    'concept': {'concept', 'topology'},
    'integration': {'integration', 'co-design'},
    'control': {'control', 'controller', 'tuning', 'modulation'},
    'modeling': {'model', 'twin', 'surrogate'},
}


Rules for more detailed classification

In [47]:
# Converter-topology rules: label -> set of trigger words.
# NOTE(review): this dictionary is not listed in `ruleset` further down, so no
# topology column is produced - confirm whether that is intentional.
rules_topology = {
    'dc-dc': {'dc-dc', 'dcdc', 'dc/dc'},
    'ac-ac': {'ac-ac', 'acac', 'ac/ac'},
    'dc-ac': {'dc-ac', 'dcac', 'dc/ac'},
    'ac-dc': {'ac-dc', 'acdc', 'ac/dc'},
    'resonant': {'resonant', 'resonance', 'llc', 'cllc'},
    'isolated': {'isolated', 'galvanic'},
    'multi-level': {'multi', 'level'},
    'interleaved': {'interleaved'},
}

# Optimization-algorithm rules: label -> set of trigger words.
rules_algorithm = {
    'genetic': {'genetic'},
    'diffEvolution': {'differential', 'evolution'},
    'pso': {'particle', 'swarm'},
    'gradient': {'gradient', 'adam', 'sgd'},
    'annealing': {'annealing'},
    'whale': {'whale'},
    'ant': {'ant'},
    'wolf': {'wolve', 'wolf'},
    'simplex': {'simplex', 'nelder-mead', 'fmin'},
    'newton': {'newton', 'bfgs'},
    'combinatorial': {'combinatorial', 'branch'},
    'bayesian': {'bayes', 'bayesian'},
}

# Machine-learning method rules: label -> set of trigger words.
rules_ml = {
    'unsupervised': {'clustering', 'pca', 'unsupervised'},
    'supervised': {'regression', 'classification', 'supervised', 'neural'},
    'reinforcement': {'reinforcement', 'agent'},
}

In [48]:
# Initialize the stemmer
stemmer = PorterStemmer()

# Memo of word -> stem. The rule vocabularies and frequent words recur on every
# row, so each distinct word only has to be stemmed once.
_stem_cache = {}


def _stem(word):
    """Return ``stemmer.stem(word)``, memoized in ``_stem_cache``."""
    try:
        return _stem_cache[word]
    except KeyError:
        stem = _stem_cache[word] = stemmer.stem(word)
        return stem


# Function to stem keywords
def stem_keywords(keywords):
    """Stem every word in ``keywords`` and return the stems as a list, in order."""
    return [_stem(word) for word in keywords]


# Function to assign labels based on rules
def assign_label(keywords, rules, extension : str):
    """Label one document by matching its stemmed words against stemmed rules.

    Args:
        keywords: Iterable of words describing the document.
        rules: Mapping of label -> collection of trigger words. A label is
            assigned as soon as one stemmed trigger word equals one stemmed
            document word.
        extension: Suffix for the fallback label.

    Returns:
        The matching labels joined by ',' in the iteration order of ``rules``,
        or ``'unknown_<extension>'`` when no rule matches.
    """
    # Stem the document once; a set makes each per-label check a cheap
    # disjointness test instead of a scan over all words.
    stemmed_keywords = set(stem_keywords(keywords))
    labels = [
        label
        for label, keywords_set in rules.items()
        if not stemmed_keywords.isdisjoint(_stem(word) for word in keywords_set)
    ]
    if labels:
        return ','.join(labels)
    return f'unknown_{extension}'


Perform the rule-based labeling

In [49]:
#df['paradigm'] = df['single keys'].apply(lambda x: assign_label(x, rules_paradigm))

In [50]:
# (output column, rule dictionary) pairs, applied in this order
ruleset = [
    ('paradigm', rules_paradigm),
    ('application', rules_application),
    ('components', rules_components),
    ('life_cycle', rules_lifecycle),
    ('task', rules_task),
    ('algorithm', rules_algorithm),
    ('ml_method', rules_ml),
]

# Label every document once per rule dictionary. The column name is passed on
# as the suffix of the fallback label for rows that match no rule.
for column, rule in ruleset:
    df[column] = df['all keys'].apply(assign_label, args=(rule, column))

In [51]:
#df.to_pickle('new_df_labled_with_tasks_and_tpe_and_algos.pkl')

In [52]:
# Persist the labelled DataFrame, including the helper token columns, as a pickle file.
df.to_pickle('df_with_abstracts.pkl')