<a href="https://colab.research.google.com/github/PhaZer1604/Keyword-Extraction/blob/main/NLP_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import spacy
from spacy.displacy.render import EntityRenderer
from IPython.core.display import display, HTML

In [None]:
# Variant of spaCy's built-in entity rendering that accepts custom labels (e.g. POS tags)
def custom_render(doc, df, column, options={}, page=False, minify=False, idx=0):
    """Show `doc` in the notebook as entity-highlighted HTML.

    The spans to highlight are produced by `parse_custom_ents`, which reads
    them from `df[column][idx]` when that column exists and otherwise falls
    back to `doc.ents`.

    Args:
        doc: Parsed document providing `.text` (and `.ents` for the fallback).
        df: DataFrame that may hold the custom entity tuples.
        column: Name of the column in `df` to read the entities from.
        options: Options passed to `EntityRenderer` (e.g. label colours).
        page: Forwarded to `EntityRenderer.render`.
        minify: Forwarded to `EntityRenderer.render`.
        idx: Row label in `df` whose entities are rendered.

    Returns:
        Whatever `display(...)` returns for the generated HTML.
    """
    entity_renderer = EntityRenderer(options=options)
    parsed_docs = [parse_custom_ents(doc, df=df, idx=idx, column=column)]
    markup = entity_renderer.render(parsed_docs, page=page, minify=minify)
    return display(HTML(markup.strip()))

# Build the dict structure the entity renderer expects, supporting labels spaCy does not define
def parse_custom_ents(doc, df, idx, column):
    """Convert stored entity tuples (or `doc.ents`) into a renderable dict.

    Args:
        doc: Object exposing `.text` and `.ents`.
        df: DataFrame that may contain `column`.
        idx: Row label used to look up `df[column][idx]`.
        column: Column holding tuples laid out as (text, start, end, label).

    Returns:
        ``{'text': ..., 'ents': [...], 'title': None}`` where each entry of
        ``ents`` has the keys ``start``, ``end`` and ``label``.
    """
    if column in df.columns:
        # custom tuples: position 1 = start, 2 = end, 3 = label
        ents = []
        for record in df[column][idx]:
            ents.append({'start': record[1], 'end': record[2], 'label': record[3]})
    else:
        # column missing: fall back to the entities spaCy attached to the doc
        ents = [
            dict(start=span.start_char, end=span.end_char, label=span.label_)
            for span in doc.ents
        ]
    return dict(text=doc.text, ents=ents, title=None)


def render_entities(idx, df, options={}, column='named_ents'):
    """Render the entities stored in `df[column]` for the row labelled `idx`.

    The row's 'text' value is parsed with the notebook-level `nlp` pipeline
    and handed to `custom_render` together with `df`, `column` and `options`.
    """
    parsed = nlp(df['text'][idx])
    custom_render(parsed, df=df, column=column, options=options, idx=idx)

In [None]:
# Highlight colour for each custom entity label used by the rendering helpers.
options = {
    'colors': {
        'COMPOUND': '#FE6BFE',
        'PROPN': '#18CFE6',
        'NOUN': '#18CFE6',
        'NP': '#1EECA6',
        'ENTITY': '#FF8800',
    }
}

# Limit how many rows Jupyter shows when a DataFrame is displayed.
pd.set_option('display.max_rows', 10)
# Silence the warning about working on a copy of a DataFrame.
pd.options.mode.chained_assignment = None

In [None]:
# Load spaCy's 'en_core_web_sm' pipeline; the helper functions in this notebook use `nlp` as a global.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Read the papers table from a hardcoded Google Drive path and preview the first rows.
# NOTE(review): no Drive-mount step is visible in this notebook — confirm the path is available before running.
df = pd.read_csv('/content/drive/MyDrive/Papers.csv')
df.head()

Unnamed: 0,Id,Title,EventType,PdfName,Abstract,PaperText
0,5677,Double or Nothing: Multiplicative Incentive Me...,Poster,5677-double-or-nothing-multiplicative-incentiv...,Crowdsourcing has gained immense popularity in...,Double or Nothing: Multiplicative\nIncentive M...
1,5941,Learning with Symmetric Label Noise: The Impor...,Spotlight,5941-learning-with-symmetric-label-noise-the-i...,Convex potential minimisation is the de facto ...,Learning with Symmetric Label Noise: The\nImpo...
2,6019,Algorithmic Stability and Uniform Generalization,Poster,6019-algorithmic-stability-and-uniform-general...,One of the central questions in statistical le...,Algorithmic Stability and Uniform Generalizati...
3,6035,Adaptive Low-Complexity Sequential Inference f...,Poster,6035-adaptive-low-complexity-sequential-infere...,We develop a sequential low-complexity inferen...,Adaptive Low-Complexity Sequential Inference f...
4,5978,Covariance-Controlled Adaptive Langevin Thermo...,Poster,5978-covariance-controlled-adaptive-langevin-t...,Monte Carlo sampling for Bayesian posterior in...,Covariance-Controlled Adaptive Langevin\nTherm...


In [None]:
def lower(x):
    """Return `x` converted to lowercase."""
    return x.lower()


# Reduce `df` to a single 'text' column holding the lowercased abstracts.
# The guard makes the cell safe to re-run: once `df` has been reduced, the
# 'Abstract' column no longer exists and an unguarded lookup raises KeyError.
if 'Abstract' in df.columns:
    df = df['Abstract'].apply(lower).to_frame(name='text')
elif 'text' not in df.columns:
    raise KeyError("df has neither an 'Abstract' nor a 'text' column")
display(df)

Unnamed: 0,text
0,crowdsourcing has gained immense popularity in...
1,convex potential minimisation is the de facto ...
2,one of the central questions in statistical le...
3,we develop a sequential low-complexity inferen...
4,monte carlo sampling for bayesian posterior in...
...,...
398,the continuous-time hidden markov model (ct-hm...
399,we propose an original particle-based implemen...
400,"in many statistical problems, a more coarse-gr..."
401,this paper proposes a distributionally robust ...


In [None]:
# Collect noun / proper-noun tokens with their start and end character offsets, using the spaCy POS tagger
def extract_nouns(text):
    """Return the NOUN and PROPN tokens of `text`.

    Args:
        text: String parsed with the notebook-level `nlp` pipeline.

    Returns:
        List of ``(token_text, start_char, end_char, pos)`` tuples, where
        ``end_char`` is ``start_char + len(token_text)``.
    """
    wanted = ['PROPN', 'NOUN']
    found = []
    for token in nlp(text):
        if token.pos_ in wanted:
            begin = token.idx
            found.append((token.text, begin, begin + len(token.text), token.pos_))
    return found

# Add a 'nouns' column to df holding the extracted nouns of each row
def add_nouns(df):
    """Apply `extract_nouns` to `df['text']` and store the result in `df['nouns']` (in place)."""
    noun_lists = df['text'].apply(extract_nouns)
    df['nouns'] = noun_lists

In [None]:
# Add the 'nouns' column to df in place, then show the updated frame.
add_nouns(df)
display(df)

Unnamed: 0,text,nouns
0,crowdsourcing has gained immense popularity in...,"[(crowdsourcing, 0, 13, NOUN), (popularity, 33..."
1,convex potential minimisation is the de facto ...,"[(minimisation, 17, 29, NOUN), (approach, 46, ..."
2,one of the central questions in statistical le...,"[(questions, 19, 28, NOUN), (learning, 44, 52,..."
3,we develop a sequential low-complexity inferen...,"[(complexity, 28, 38, NOUN), (inference, 39, 4..."
4,monte carlo sampling for bayesian posterior in...,"[(monte, 0, 5, PROPN), (carlo, 6, 11, PROPN), ..."
...,...,...
398,the continuous-time hidden markov model (ct-hm...,"[(time, 15, 19, NOUN), (markov, 27, 33, NOUN),..."
399,we propose an original particle-based implemen...,"[(particle, 23, 31, NOUN), (implementation, 38..."
400,"in many statistical problems, a more coarse-gr...","[(problems, 20, 28, NOUN), (model, 52, 57, NOU..."
401,this paper proposes a distributionally robust ...,"[(paper, 5, 10, NOUN), (approach, 46, 54, NOUN..."


In [None]:
# Highlight the extracted nouns of row 0.
column = 'nouns'
render_entities(0, df, options=options, column=column)

In [None]:
def extract_noun_phrases(text):
    """Return the noun chunks of `text` found by the notebook-level `nlp` pipeline.

    Returns:
        List of ``(chunk_text, start_char, end_char, label)`` tuples, one per
        entry of ``nlp(text).noun_chunks``.
    """
    phrases = []
    for np_chunk in nlp(text).noun_chunks:
        phrases.append((np_chunk.text, np_chunk.start_char, np_chunk.end_char, np_chunk.label_))
    return phrases

# Add a 'noun_phrases' column to the data frame holding each row's noun phrases
def add_noun_phrases(df):
    """Apply `extract_noun_phrases` to `df['text']` and store the result in `df['noun_phrases']` (in place)."""
    phrase_lists = df['text'].apply(extract_noun_phrases)
    df['noun_phrases'] = phrase_lists

In [None]:
def visualize_noun_phrases(text):
    """Render the noun phrases of a single piece of text.

    Wraps `text` in a one-row DataFrame with a 'text' column, adds the
    'noun_phrases' column to it and renders row 0 using the notebook-level
    `options`.
    """
    frame = pd.DataFrame([text])
    frame.columns = ['text']
    add_noun_phrases(frame)
    render_entities(0, frame, options=options, column='noun_phrases')

In [None]:
# Add the 'noun_phrases' column to df in place, then show the updated frame.
add_noun_phrases(df)
display(df)

Unnamed: 0,text,nouns,noun_phrases
0,crowdsourcing has gained immense popularity in...,"[(crowdsourcing, 0, 13, NOUN), (popularity, 33...","[(crowdsourcing, 0, 13, NP), (immense populari..."
1,convex potential minimisation is the de facto ...,"[(minimisation, 17, 29, NOUN), (approach, 46, ...","[(convex potential minimisation, 0, 29, NP), (..."
2,one of the central questions in statistical le...,"[(questions, 19, 28, NOUN), (learning, 44, 52,...","[(the central questions, 7, 28, NP), (statisti..."
3,we develop a sequential low-complexity inferen...,"[(complexity, 28, 38, NOUN), (inference, 39, 4...","[(we, 0, 2, NP), (a sequential low-complexity ..."
4,monte carlo sampling for bayesian posterior in...,"[(monte, 0, 5, PROPN), (carlo, 6, 11, PROPN), ...","[(bayesian posterior inference, 25, 53, NP), (..."
...,...,...,...
398,the continuous-time hidden markov model (ct-hm...,"[(time, 15, 19, NOUN), (markov, 27, 33, NOUN),...","[(the continuous-time hidden markov model, 0, ..."
399,we propose an original particle-based implemen...,"[(particle, 23, 31, NOUN), (implementation, 38...","[(we, 0, 2, NP), (an original particle-based i..."
400,"in many statistical problems, a more coarse-gr...","[(problems, 20, 28, NOUN), (model, 52, 57, NOU...","[(many statistical problems, 3, 28, NP), (a mo..."
401,this paper proposes a distributionally robust ...,"[(paper, 5, 10, NOUN), (approach, 46, 54, NOUN...","[(this paper, 0, 10, NP), (a distributionally ..."


In [None]:
# Highlight the extracted noun phrases of row 0.
column = 'noun_phrases'
render_entities(0, df, options=options, column=column)

In [None]:
# Extract compound noun phrases with their start and end character offsets
def extract_compounds(text):
    """Return the compound noun phrases found in `text`.

    A phrase is assembled from consecutive tokens whose dependency label is
    ``'compound'`` plus the token that directly follows them.

    Args:
        text: String parsed with the notebook-level `nlp` pipeline.

    Returns:
        List of ``(phrase, start_char, end_char, 'COMPOUND')`` tuples.
        ``phrase`` is the collected token texts joined by single spaces;
        ``end_char`` is the end offset of the last token of the phrase.
    """
    comp_idx = 0       # token index of the most recent 'compound' token
    tok_idx = 0        # character offset at which the current phrase starts
    compound = []      # token texts collected for the current phrase
    compound_nps = []

    for tok in nlp(text):
        if tok.dep_ == 'compound':       # .dep_ is the type of syntactic relation

            # capture hyphenated compounds by gluing the children onto this token
            children = ''.join(c.text for c in tok.children)
            if '-' in children:
                compound.append(children + tok.text)
            else:
                compound.append(tok.text)

            # the phrase starts at this token's first child when it has one,
            # otherwise at the token itself if it opens the phrase
            first_child = next(iter(tok.children), None)
            if first_child is not None:
                tok_idx = first_child.idx
            elif len(compound) == 1:
                tok_idx = tok.idx
            comp_idx = tok.i

        # the token right after a compound token is the last word of the phrase
        if tok.i - comp_idx == 1:
            compound.append(tok.text)
            if len(compound) > 1:
                phrase = ' '.join(compound)
                # Use the closing token's real end offset. Deriving the end
                # from len(phrase) is wrong whenever the tokens are not
                # separated by exactly one space in `text` (e.g. the tokens of
                # "population-" are joined as "population -").
                end_idx = tok.idx + len(tok.text)
                compound_nps.append((phrase, tok_idx, end_idx, 'COMPOUND'))

            # reset parameters for the next phrase
            tok_idx = 0
            compound = []

    return compound_nps


def add_compounds(df):
    """Apply `extract_compounds` to `df['text']` and store the result in `df['compounds']` (in place)."""
    compound_lists = df['text'].apply(extract_compounds)
    df['compounds'] = compound_lists

In [None]:
# Add the 'compounds' column to df in place, then show the updated frame.
add_compounds(df)
display(df)

Unnamed: 0,text,nouns,noun_phrases,compounds
0,crowdsourcing has gained immense popularity in...,"[(crowdsourcing, 0, 13, NOUN), (popularity, 33...","[(crowdsourcing, 0, 13, NP), (immense populari...","[(machine learning applications, 47, 76, COMPO..."
1,convex potential minimisation is the de facto ...,"[(minimisation, 17, 29, NOUN), (approach, 46, ...","[(convex potential minimisation, 0, 29, NP), (...","[(label noise, 143, 154, COMPOUND), (linear fu..."
2,one of the central questions in statistical le...,"[(questions, 19, 28, NOUN), (learning, 44, 52,...","[(the central questions, 7, 28, NP), (statisti...","[(learning theory, 32, 47, COMPOUND), (inferen..."
3,we develop a sequential low-complexity inferen...,"[(complexity, 28, 38, NOUN), (inference, 39, 4...","[(we, 0, 2, NP), (a sequential low-complexity ...","[(low-complexity inference procedure, 24, 58, ..."
4,monte carlo sampling for bayesian posterior in...,"[(monte, 0, 5, PROPN), (carlo, 6, 11, PROPN), ...","[(bayesian posterior inference, 25, 53, NP), (...","[(monte carlo sampling, 0, 20, COMPOUND), (mac..."
...,...,...,...,...
398,the continuous-time hidden markov model (ct-hm...,"[(time, 15, 19, NOUN), (markov, 27, 33, NOUN),...","[(the continuous-time hidden markov model, 0, ...","[(markov model, 27, 39, COMPOUND), (ct -, 41, ..."
399,we propose an original particle-based implemen...,"[(particle, 23, 31, NOUN), (implementation, 38...","[(we, 0, 2, NP), (an original particle-based i...","[(belief propagation, 60, 78, COMPOUND), (pair..."
400,"in many statistical problems, a more coarse-gr...","[(problems, 20, 28, NOUN), (model, 52, 57, NOU...","[(many statistical problems, 3, 28, NP), (a mo...","[(population -, 78, 90, COMPOUND), (population..."
401,this paper proposes a distributionally robust ...,"[(paper, 5, 10, NOUN), (approach, 46, 54, NOUN...","[(this paper, 0, 10, NP), (a distributionally ...","[(wasserstein distance, 90, 110, COMPOUND), (p..."


In [None]:
# Highlight the extracted compound phrases of row 0.
column = 'compounds'
render_entities(0, df, options=options, column=column)

In [None]:
# Merge the entity texts of several tuple columns into one set per row
def extract_comp_nouns(row_series, cols=[]):
    """Collect the first element of every tuple found in the given columns.

    Args:
        row_series: Row-like mapping from column name to a list of tuples.
        cols: Column names to read; with no columns the result is empty.

    Returns:
        Set of the tuples' first elements (the entity texts).
    """
    names = set()
    for col in cols:
        for noun_tuple in row_series[col]:
            names.add(noun_tuple[0])
    return names


def add_comp_nouns(df, cols=[]):
    """Store, per row, the set returned by `extract_comp_nouns` for `cols` in `df['comp_nouns']` (in place)."""
    combined = df.apply(extract_comp_nouns, axis=1, cols=cols)
    df['comp_nouns'] = combined

In [None]:
# Merge the 'nouns' and 'compounds' texts into a per-row set in the 'comp_nouns' column, then show df.
cols = ['nouns', 'compounds']
add_comp_nouns(df, cols=cols)
display(df)

Unnamed: 0,text,nouns,noun_phrases,compounds,comp_nouns
0,crowdsourcing has gained immense popularity in...,"[(crowdsourcing, 0, 13, NOUN), (popularity, 33...","[(crowdsourcing, 0, 13, NP), (immense populari...","[(machine learning applications, 47, 76, COMPO...","{incentive, machine learning applications, mac..."
1,convex potential minimisation is the de facto ...,"[(minimisation, 17, 29, NOUN), (approach, 46, ...","[(convex potential minimisation, 0, 29, NP), (...","[(label noise, 143, 154, COMPOUND), (linear fu...","{potential, label noise, solution, losses, los..."
2,one of the central questions in statistical le...,"[(questions, 19, 28, NOUN), (learning, 44, 52,...","[(the central questions, 7, 28, NP), (statisti...","[(learning theory, 32, 47, COMPOUND), (inferen...","{experience, relationship, pac, process, dimen..."
3,we develop a sequential low-complexity inferen...,"[(complexity, 28, 38, NOUN), (inference, 39, 4...","[(we, 0, 2, NP), (a sequential low-complexity ...","[(low-complexity inference procedure, 24, 58, ...","{dirichlet process mixtures, parameter, asympt..."
4,monte carlo sampling for bayesian posterior in...,"[(monte, 0, 5, PROPN), (carlo, 6, 11, PROPN), ...","[(bayesian posterior inference, 25, 53, NP), (...","[(monte carlo sampling, 0, 20, COMPOUND), (mac...","{parameter, langevin, setting, schemes, machin..."
...,...,...,...,...,...
398,the continuous-time hidden markov model (ct-hm...,"[(time, 15, 19, NOUN), (markov, 27, 33, NOUN),...","[(the continuous-time hidden markov model, 0, ...","[(markov model, 27, 39, COMPOUND), (ct -, 41, ...","{parameter, ct-hmm models, ct -, models, disea..."
399,we propose an original particle-based implemen...,"[(particle, 23, 31, NOUN), (implementation, 38...","[(we, 0, 2, NP), (an original particle-based i...","[(belief propagation, 60, 78, COMPOUND), (pair...","{expectation, scheme, ep, family, proposal, pr..."
400,"in many statistical problems, a more coarse-gr...","[(problems, 20, 28, NOUN), (model, 52, 57, NOU...","[(many statistical problems, 3, 28, NP), (a mo...","[(population -, 78, 90, COMPOUND), (population...","{populations, electricity, bayesian melding, s..."
401,this paper proposes a distributionally robust ...,"[(paper, 5, 10, NOUN), (approach, 46, 54, NOUN...","[(this paper, 0, 10, NP), (a distributionally ...","[(wasserstein distance, 90, 110, COMPOUND), (p...","{misclassification probability, wasserstein di..."


In [None]:
# Inspect the combined noun / compound set of row 0
df['comp_nouns'][0] 

{'amounts',
 'applications',
 'benefit',
 'challenge',
 'crowdsourcing',
 'data',
 'error',
 'error rates',
 'expenditure',
 'experiments',
 'form',
 'incentive',
 'learning',
 'low-quality data',
 'lunch',
 'machine',
 'machine learning applications',
 'mechanism',
 'mechanisms',
 'no-free-lunch requirement',
 'payment',
 'payment mechanism',
 'popularity',
 'problem',
 'quality',
 'questions',
 'rates',
 'reduction',
 'requirement',
 'rest',
 'simplicity',
 'spammers',
 'workers'}

In [None]:
# Read the frequent-word table from a hardcoded Google Drive path and show it.
df2 = pd.read_csv('/content/drive/MyDrive/freq_words.csv')
df2

Unnamed: 0,Rank,Word,Part of speech,Frequency,Dispersion,Unnamed: 5,Unnamed: 6
0,,,,,,,
1,1.0,the,a,22038615.0,0.98,,
2,2.0,be,v,12545825.0,0.97,,
3,3.0,and,c,10741073.0,0.99,,
4,4.0,of,i,10343885.0,0.97,,
...,...,...,...,...,...,...,...
4996,4996.0,plaintiff,n,5312.0,0.88,,
4997,4997.0,kid,v,5094.0,0.92,,
4998,4998.0,middle-class,j,5025.0,0.93,,
4999,4999.0,apology,n,4972.0,0.94,,


In [None]:
# Take the 'Word' column, skipping row 0 (all-NaN in the table displayed above).
# Note: remove_freq_words reads the CSV itself and does not use this variable.
freq_words = df2['Word'].iloc[1:]
display(freq_words)

1                the
2                 be
3                and
4                 of
5                  a
            ...     
4996       plaintiff
4997             kid
4998    middle-class
4999         apology
5000            till
Name: Word, Length: 5000, dtype: object

In [None]:
# Drop single-word entities that already appear as a word of a multi-word entity
def drop_duplicate_np_splits(ents):
    """Remove entities that are one of the space-separated words of a longer entity.

    Args:
        ents: Set of entity strings.

    Returns:
        New set without the redundant single words; `ents` is not modified.
    """
    redundant = set()
    for phrase in ents:
        words = phrase.split(' ')
        if len(words) < 2:
            continue  # only multi-word entities can make other entries redundant
        redundant.update(word for word in words if word in ents)
    return ents - redundant


# Within an entity, drop single-character words; entities left empty are removed
def drop_single_char_nps(ents):
    """Strip one-character words from every entity.

    Each entity is split on single spaces, words of length 1 are dropped and
    the rest is re-joined with single spaces. An entity with nothing left
    (e.g. ``'x'`` or ``'a b'``) is discarded instead of being kept as an
    empty string, which would otherwise be "found" at offset 0 of any text.

    Args:
        ents: Iterable of entity strings.

    Returns:
        New set of cleaned, non-empty entity strings.
    """
    cleaned = set()
    for ent in ents:
        kept = ' '.join(word for word in ent.split(' ') if len(word) != 1)
        if kept.strip():
            cleaned.add(kept)
    return cleaned


# Drop entities shorter than three characters
def drop_double_char(ents):
    """Return `ents` without the entries whose length is below 3; `ents` is not modified."""
    too_short = set()
    for ent in ents:
        if len(ent) <= 2:
            too_short.add(ent)
    return ents - too_short


# Keep only entities made of ASCII letters, hyphens and spaces
def keep_alpha(ents):
    """Return `ents` without the entries containing any other character; `ents` is not modified."""
    allowed = set('-abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ')
    rejected = set()
    for ent in ents:
        # any character outside the allowed alphabet disqualifies the entity
        if set(ent) - allowed:
            rejected.add(ent)
    return ents - rejected

In [None]:
# Drop any entities that are among the 5000 most common words in the English language
_FREQ_WORDS_CSV = '/content/drive/MyDrive/freq_words.csv'
_freq_words_cache = {}  # path -> frozenset of words, so the CSV is read only once


def _load_freq_words(path=_FREQ_WORDS_CSV):
    """Return the 'Word' column of the CSV at `path` (first row skipped) as a cached frozenset."""
    if path not in _freq_words_cache:
        words = pd.read_csv(path)['Word'].iloc[1:]
        _freq_words_cache[path] = frozenset(words.dropna())
    return _freq_words_cache[path]


def remove_freq_words(ents, freq_words=None):
    """Return the entities that are not frequent words.

    The CSV is read once and cached rather than re-read on every call, and a
    new set is returned rather than removing items from the caller's set.

    Args:
        ents: Iterable of entity strings.
        freq_words: Optional iterable of words to drop. When omitted, the
            words are loaded from the frequent-word CSV.

    Returns:
        New set containing the entities of `ents` not in the word list.
    """
    if freq_words is None:
        freq_words = _load_freq_words()
    return set(ents).difference(freq_words)


def add_clean_ents(df, funcs=[]):
    """Create `df['clean_ents']` from `df['comp_nouns']`, passing it through each cleaner in turn.

    Args:
        df: DataFrame with a 'comp_nouns' column; modified in place.
        funcs: Callables applied element-wise, in order, to the column.
    """
    target = 'clean_ents'
    df[target] = df['comp_nouns']
    for step in funcs:
        # each cleaner sees the output of the previous one
        df[target] = df[target].apply(step)

In [None]:
# Run the cleaning steps, in this order, over 'comp_nouns' to build the 'clean_ents' column, then show df.
# NOTE(review): drop_double_char runs before drop_single_char_nps, so an entity that the latter
# shortens to fewer than three characters is not filtered out — confirm this order is intended.
funcs = [drop_duplicate_np_splits, drop_double_char, keep_alpha, drop_single_char_nps, remove_freq_words]
add_clean_ents(df, funcs)
display(df)

Unnamed: 0,text,nouns,noun_phrases,compounds,comp_nouns,clean_ents
0,crowdsourcing has gained immense popularity in...,"[(crowdsourcing, 0, 13, NOUN), (popularity, 33...","[(crowdsourcing, 0, 13, NP), (immense populari...","[(machine learning applications, 47, 76, COMPO...","{incentive, machine learning applications, mac...","{machine learning applications, questions, pay..."
1,convex potential minimisation is the de facto ...,"[(minimisation, 17, 29, NOUN), (approach, 46, ...","[(convex potential minimisation, 0, 29, NP), (...","[(label noise, 143, 154, COMPOUND), (linear fu...","{potential, label noise, solution, losses, los...","{label noise, losses, modification, svm, linea..."
2,one of the central questions in statistical le...,"[(questions, 19, 28, NOUN), (learning, 44, 52,...","[(the central questions, 7, 28, NP), (statisti...","[(learning theory, 32, 47, COMPOUND), (inferen...","{experience, relationship, pac, process, dimen...","{dimensionality reduction methods, questions, ..."
3,we develop a sequential low-complexity inferen...,"[(complexity, 28, 38, NOUN), (inference, 39, 4...","[(we, 0, 2, NP), (a sequential low-complexity ...","[(low-complexity inference procedure, 24, 58, ...","{dirichlet process mixtures, parameter, asympt...","{dirichlet process mixtures, data sets, low-co..."
4,monte carlo sampling for bayesian posterior in...,"[(monte, 0, 5, PROPN), (carlo, 6, 11, PROPN), ...","[(bayesian posterior inference, 25, 53, NP), (...","[(monte carlo sampling, 0, 20, COMPOUND), (mac...","{parameter, langevin, setting, schemes, machin...","{parameter, schemes, target distribution, equa..."
...,...,...,...,...,...,...
398,the continuous-time hidden markov model (ct-hm...,"[(time, 15, 19, NOUN), (markov, 27, 33, NOUN),...","[(the continuous-time hidden markov model, 0, ...","[(markov model, 27, 39, COMPOUND), (ct -, 41, ...","{parameter, ct-hmm models, ct -, models, disea...","{parameter, ct-hmm models, computation, diseas..."
399,we propose an original particle-based implemen...,"[(particle, 23, 31, NOUN), (implementation, 38...","[(we, 0, 2, NP), (an original particle-based i...","[(belief propagation, 60, 78, COMPOUND), (pair...","{expectation, scheme, ep, family, proposal, pr...","{beliefs, particle belief propagation, expecta..."
400,"in many statistical problems, a more coarse-gr...","[(problems, 20, 28, NOUN), (model, 52, 57, NOU...","[(many statistical problems, 3, 28, NP), (a mo...","[(population -, 78, 90, COMPOUND), (population...","{populations, electricity, bayesian melding, s...","{bayesian melding, source separation problem, ..."
401,this paper proposes a distributionally robust ...,"[(paper, 5, 10, NOUN), (approach, 46, 54, NOUN...","[(this paper, 0, 10, NP), (a distributionally ...","[(wasserstein distance, 90, 110, COMPOUND), (p...","{misclassification probability, wasserstein di...","{misclassification probability, wasserstein di..."


In [None]:
def visualize_entities(df, idx=0):
    """Render the cleaned entities of one row as highlighted text.

    Each entity of `df['clean_ents'][idx]` is located in `df['text'][idx]`
    by its first occurrence. Entities that are empty or do not occur in the
    text are skipped, because they would otherwise yield a zero-width span at
    offset 0 or a bogus span starting at -1.

    Args:
        df: DataFrame with 'text' and 'clean_ents' columns.
        idx: Row label to visualise.
    """
    abstract = df['text'][idx]
    ents = []
    for ent in df['clean_ents'][idx]:
        if not ent:
            continue  # '' is "found" at offset 0 and would add an empty span
        i = abstract.find(ent)  # first occurrence of the entity in the abstract
        if i == -1:
            continue  # str.find returns -1 when the entity is absent from the text
        ents.append((ent, i, i + len(ent), 'ENTITY'))
    ents.sort(key=lambda tup: tup[1])

    # one-row frame in the layout render_entities expects
    dummy_df = pd.DataFrame({'text': [abstract], 'clean_ents': [ents]})
    render_entities(0, dummy_df, options=options, column='clean_ents')

In [None]:
# Highlight the cleaned entities of row 0.
visualize_entities(df, 0)

In [None]:
# Render the raw nouns of row 0 again, for comparison with the cleaned entities above.
column = 'nouns'
render_entities(0, df, options=options, column=column)

In [None]:
# Render spaCy's built-in named entities for a sample abstract.
# Alternative input (disabled): use the first abstract of df instead of the sample below.
# text = df.at[0, 'text']
text = "When designing Natural Language Processing applications that use Machine Learning techniques, feature extraction becomes a significant part of the development effort, whether developing a new application or attempting to reproduce results reported for existing NLP tasks. We present EDISON, a Java library of feature generation functions used in a suite of state-of-the-art NLP tools, based on a set of generic NLP data structures. These feature extractors populate simple data structures encoding the extracted features, which the package can also serialize to an intuitive JSON file format that can be easily mapped to formats used by ML packages. EDISON can also be used programmatically with JVM-based (Java/Scala) NLP software to provide the feature extractor input. The collection of feature extractors is organised hierarchically and a simple search interface is provided. In this paper we include examples that demonstrate the versatility and ease-of-use of the EDISON feature extraction suite to show that this can significantly reduce the time spent by developers on feature extraction design for NLP systems. "
doc = nlp(text)

# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from spacy import displacy

# style='ent' highlights the entities in doc.ents; jupyter=True renders inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# Run the whole extraction pipeline on a single sample abstract (same text as the previous cell).
text = "When designing Natural Language Processing applications that use Machine Learning techniques, feature extraction becomes a significant part of the development effort, whether developing a new application or attempting to reproduce results reported for existing NLP tasks. We present EDISON, a Java library of feature generation functions used in a suite of state-of-the-art NLP tools, based on a set of generic NLP data structures. These feature extractors populate simple data structures encoding the extracted features, which the package can also serialize to an intuitive JSON file format that can be easily mapped to formats used by ML packages. EDISON can also be used programmatically with JVM-based (Java/Scala) NLP software to provide the feature extractor input. The collection of feature extractors is organised hierarchically and a simple search interface is provided. In this paper we include examples that demonstrate the versatility and ease-of-use of the EDISON feature extraction suite to show that this can significantly reduce the time spent by developers on feature extraction design for NLP systems. "
# lowercase the sample, as was done for the abstracts in df
text = text.lower()

# one-row frame with the 'text' column the add_* helpers read
sample_df = pd.DataFrame({'text': [text]})

# adds the 'nouns' column
add_nouns(sample_df)

# adds the 'compounds' column
add_compounds(sample_df)

# merges both columns into the 'comp_nouns' set
cols = ['nouns', 'compounds']
add_comp_nouns(sample_df, cols=cols)

# same cleaning steps and order as used for df; result goes to 'clean_ents'
funcs = [drop_duplicate_np_splits, drop_double_char, keep_alpha, drop_single_char_nps, remove_freq_words]
add_clean_ents(sample_df, funcs)

# highlight the cleaned entities of the sample
visualize_entities(sample_df, 0)