## **Bag of words - tf-idf - keyword extraction techniques**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# load the dataset
df = pd.read_csv('/content/drive/MyDrive/ai-5/papers_with_abstract_cleaned.csv')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,source_id,year,title,abstract,full_text
0,0,0,27,1987,Bit-Serial Neural Networks,a bit serial vlsi neural network is described ...,bit serial neural networks alan f murray antho...
1,1,1,63,1987,Connectivity Versus Entropy,how does the connectivity of a neural network ...,connectivity versus entropy yaser s abu mostaf...
2,2,2,60,1987,The Hopfield Model with Multi-Level Neurons,the hopfield neural network model for associat...,the hopfield model with mul ti level neurons m...
3,3,3,59,1987,How Neural Nets Work,there is presently great interest in the abili...,alan lapedes robert farber theoretical divisio...
4,4,4,69,1987,Spatial Organization of Neural Networks: A Pro...,the aim of this paper is to explore the spatia...,spatial organization of neural nen orks a prob...


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9312 entries, 0 to 9311
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    9312 non-null   int64 
 1   Unnamed: 0.1  9312 non-null   int64 
 2   source_id     9312 non-null   int64 
 3   year          9312 non-null   int64 
 4   title         9312 non-null   object
 5   abstract      9311 non-null   object
 6   full_text     9304 non-null   object
dtypes: int64(4), object(3)
memory usage: 509.4+ KB


In [None]:
import pprint
sample = 5657
pprint.pprint("TITLE:{}".format(df['title'][sample]))
pprint.pprint("ABSTRACT:{}".format(df['abstract'][sample]))
pprint.pprint("FULL TEXT:{}".format(df['full_text'][sample][:1000]))

'TITLE:Threshold Bandits, With and Without Censored Feedback'
('ABSTRACT:we consider the emph threshold bandit setting a variant of the '
 'classical multi armed bandit problem in which the reward on each round '
 'depends on a piece of side information known as a emph threshold value the '
 'learner selects one of k actions arms this action generates a random sample '
 'from a fixed distribution and the action then receives a unit payoff in the '
 'event that this sample exceeds the threshold value we consider two versions '
 'of this problem the emph uncensored and emph censored case that determine '
 'whether the sample is always observed or only when the threshold is not met '
 'using new tools to understand the popular ucb algorithm we show that the '
 'uncensored case is essentially no more difficult than the classical multi '
 'armed bandit setting finally we show that the censored case exhibits more '
 'challenges but we give guarantees in the event that the sequence of '
 'thr

Besides the leftover index columns, this dataset contains 5 columns: source_id, year, title, abstract and full_text. We are mostly interested in full_text, which includes both the title and the abstract.

## Pre-processing

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Start from NLTK's English stopword list.
stop_words = set(stopwords.words('english'))
# Corpus-specific additions: paper boilerplate words and the number words
# "one" to "nine" ("six" was missing from the original list).
new_words = ["fig", "figure", "image", "sample", "using",
             "show", "result", "large",
             "also", "one", "two", "three",
             "four", "five", "six", "seven", "eight", "nine"]
# Kept as a list, as in the original cell, in case other cells rely on that type.
stop_words = list(stop_words.union(new_words))

def pre_process(text):
    """Normalise one raw document into a space-separated string of lemmas.

    Steps: lowercase, replace escaped tags, replace digits and non-word
    characters with spaces, drop stopwords and tokens shorter than three
    characters, lemmatize with WordNet, then drop stopwords once more so that
    inflected forms of a stopword (e.g. "figures" -> "figure") are removed too.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (such as the NaN pandas uses
        for a missing ``full_text``) is treated as an empty document.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; ``''`` for missing input.
    """
    # Missing values are not strings and have no .lower(); return an empty doc.
    if not isinstance(text, str):
        return ''

    # The module-level ``stop_words`` is a list; use a set for O(1) lookups.
    blocked = set(stop_words)

    # lowercase
    text = text.lower()

    # remove tags
    # NOTE(review): the pattern is HTML-escaped, so it only matches literal
    # "&lt;...&gt;" sequences - confirm whether raw "<...>" tags were intended.
    text = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", text)

    # remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text)

    # remove stopwords and words shorter than three letters
    tokens = [word for word in text.split()
              if word not in blocked and len(word) >= 3]

    # lemmatize
    lmtzr = WordNetLemmatizer()
    lemmas = [lmtzr.lemmatize(word) for word in tokens]

    # second stopword pass: lemmatization can turn a token into a stopword
    lemmas = [word for word in lemmas if word not in blocked]

    return ' '.join(lemmas)

In [None]:
# Clean a random sample of 100 papers. No random_state is passed, so a
# different subset is drawn on every run.
docs = df['full_text'].sample(100).apply(pre_process)

In [None]:
docs

9036    quadratic video interpolation xiangyu carnegie...
5727    active learning oracle epiphany tzu kuo huang ...
1135    competitive line linear regression vovk pepart...
8512    loaded dice trading bias variance order score ...
8650    complexity simplicity adaptive active subspace...
                              ...                        
4627    ocsvm quantile estimator high dimensional dist...
1807    information geometric decomposition spike anal...
1568    support vector novelty detection applied jet e...
8014    copula high dimensional generative model vine ...
7781    direct estimation difference causal graph yuha...
Name: full_text, Length: 100, dtype: object

## Bag of words

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Corpus-wide term frequencies: token -> number of occurrences over all docs.
word2count = {}
for document in docs:
    for token in nltk.word_tokenize(document):
        word2count[token] = word2count.get(token, 0) + 1

In [None]:
word2count

{'quadratic': 108,
 'video': 185,
 'interpolation': 82,
 'xiangyu': 9,
 'carnegie': 18,
 'mellon': 18,
 'university': 234,
 'xuxiangyu': 1,
 'gmail': 9,
 'com': 113,
 'siyao': 1,
 'sensetime': 4,
 'research': 233,
 'wenxiu': 1,
 'sun': 80,
 'lisiyao': 1,
 'sunwenxiu': 1,
 'qian': 1,
 'yin': 5,
 'beijing': 1,
 'normal': 92,
 'yinqian': 1,
 'bnu': 1,
 'edu': 144,
 'ming': 7,
 'hsuan': 3,
 'yang': 70,
 'california': 21,
 'merced': 1,
 'google': 34,
 'mhyang': 1,
 'ucmerced': 1,
 'abstract': 106,
 'important': 150,
 'problem': 1045,
 'computer': 267,
 'vision': 176,
 'help': 53,
 'overcome': 16,
 'temporal': 111,
 'limitation': 47,
 'camera': 44,
 'sensor': 34,
 'existing': 133,
 'method': 1354,
 'usually': 91,
 'assume': 228,
 'uniform': 112,
 'motion': 157,
 'consecutive': 19,
 'frame': 205,
 'use': 699,
 'linear': 544,
 'model': 2262,
 'can': 106,
 'not': 106,
 'well': 380,
 'approximate': 189,
 'complex': 139,
 'real': 242,
 'world': 92,
 'address': 65,
 'issue': 63,
 'propose': 202,
 

In [None]:
import heapq
freq_words = heapq.nlargest(100, word2count, key=word2count.get)

In [None]:
# Binary bag-of-words matrix: one row per document, one column per word in
# freq_words; a cell is 1 when the word occurs in the document, else 0.
X = []
for data in docs:
    # Tokenize each document once (not once per frequent word) and use a set
    # so every membership test is O(1).
    doc_tokens = set(nltk.word_tokenize(data))
    X.append([1 if word in doc_tokens else 0 for word in freq_words])
X = np.asarray(X)

In [None]:
X

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 0, 1],
       [1, 0, 1, ..., 1, 0, 1],
       ...,
       [1, 1, 1, ..., 0, 1, 1],
       [0, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 0, 1]])

## 1.TF-IDF and Scikit-learn

Based on the tutorial of [Kavita Ganesan](https://github.com/kavgan/nlp-in-practice/blob/master/tf-idf/Keyword%20Extraction%20with%20TF-IDF%20and%20SKlearn.ipynb)

TF-IDF stands for Term Frequency–Inverse Document Frequency. The importance of a word increases proportionally to the number of times it appears in the document (Term Frequency, TF) but is offset by how frequently the word occurs across the corpus (Inverse Document Frequency, IDF). Using the TF-IDF weighting scheme, the keywords are the words with the highest TF-IDF scores.

### 1.1 CountVectorizer to create a vocabulary and generate word counts

In [None]:
%%time
from sklearn.feature_extraction.text import CountVectorizer
#docs = docs.tolist()
# Build the vocabulary and the document-term count matrix in one pass.
cv=CountVectorizer(max_df=0.95,         # ignore terms that appear in more than 95% of documents
                   max_features=10000,  # keep at most the 10000 most frequent terms
                   ngram_range=(1,3)    # vocabulary contains single words, bigrams, trigrams
                  )
# fit_transform learns the vocabulary from docs and returns their term counts
word_count_vector=cv.fit_transform(docs)

CPU times: user 447 ms, sys: 5.94 ms, total: 453 ms
Wall time: 455 ms


### 1.2 TfidfTransformer to Compute Inverse Document Frequency (IDF)

In [None]:
%%time
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

CPU times: user 4.67 ms, sys: 20 µs, total: 4.69 ms
Wall time: 5.51 ms


TfidfTransformer()

Once we have our IDF computed, we are now ready to compute TF-IDF and extract the top keywords.

In [None]:
def sort_coo(coo_matrix):
    """Return (column, value) pairs of a COO matrix, best value first.

    Pairs are ordered by value descending; ties are broken by column index,
    also descending.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Get the feature names and tf-idf scores of the top n items.

    Parameters
    ----------
    feature_names : sequence
        Vocabulary terms, indexable by the column indices in ``sorted_items``.
    sorted_items : sequence of (int, float)
        ``(column index, score)`` pairs, already sorted best-first.
    topn : int, default 10
        Number of leading items to keep.

    Returns
    -------
    dict
        ``{feature name: score rounded to 3 decimals}`` in the order the items
        were given. If two kept items map to the same name, the later score
        overwrites the earlier one.
    """
    # Only the first topn pairs are used; each name is looked up exactly once.
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }

In [None]:
# get feature names
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when available and fall back on older releases.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

def get_keywords(idx, docs):
    """Return the 10 highest-scoring TF-IDF terms of ``docs[idx]``.

    Relies on the fitted module-level ``cv``, ``tfidf_transformer`` and
    ``feature_names``. The result is a ``{term: score}`` dict as produced by
    ``extract_topn_from_vector``.
    """
    # Term counts for this single document, then its TF-IDF weights.
    term_counts = cv.transform([docs[idx]])
    tf_idf_vector = tfidf_transformer.transform(term_counts)

    # Rank the non-zero entries by descending score and keep the best ten.
    ranked_items = sort_coo(tf_idf_vector.tocoo())
    return extract_topn_from_vector(feature_names, ranked_items, 10)

def print_results(idx, keywords, df):
    """Print the title, abstract and extracted keywords of document ``idx``.

    Parameters
    ----------
    idx : hashable
        Label used to look up ``df['title'][idx]`` and ``df['abstract'][idx]``.
    keywords : dict or iterable
        A ``{keyword: score}`` dict prints as ``keyword score`` lines. Any
        other iterable (e.g. a list of phrases) prints one item per line
        instead of raising on ``keywords[k]``.
    df : mapping-like
        Object exposing ``'title'`` and ``'abstract'`` columns.
    """
    # now print the results
    print("\n=====Title=====")
    print(df['title'][idx])
    print("\n=====Abstract=====")
    print(df['abstract'][idx])
    print("\n===Keywords===")
    if isinstance(keywords, dict):
        for k in keywords:
            print(k, keywords[k])
    else:
        for k in keywords:
            print(k)



In [None]:
idx=7429
keywords=get_keywords(idx, docs)
print_results(idx,keywords, df)


=====Title=====
Bayesian Distributed Stochastic Gradient Descent

=====Abstract=====
we introduce bayesian distributed stochastic gradient descent bdsgd a high throughput algorithm for training deep neural networks on parallel clusters this algorithm uses amortized inference in a deep generative model to perform joint posterior predictive inference of mini batch gradient computation times in a compute cluster specific manner specifically our algorithm mitigates the straggler effect in synchronous gradient based optimization by choosing an optimal cutoff beyond which mini batch gradient messages from slow workers are ignored in our experiments we show that eagerly discarding the mini batch gradient computations of stragglers not only increases throughput but actually increases the overall rate of convergence as a function of wall clock time by virtue of eliminating idleness the principal novel contribution and finding of this work goes beyond this by demonstrating that using the predic

## 2. Gensim implementation of TextRank summarization algorithm

Gensim is a free Python library designed to automatically extract semantic topics from documents. The gensim implementation is based on the popular TextRank algorithm. 

[Documentation](https://radimrehurek.com/gensim/summarization/keywords.html)

[Tutorial](https://rare-technologies.com/text-summarization-with-gensim/)

### 2.1 Small text

In [None]:
# NOTE(review): gensim.summarization is not available in gensim >= 4.0, so this
# cell presumably needs gensim 3.x - confirm the installed/pinned version.
import gensim
# Short abstract used as the "small text" example.
text = "Non-negative matrix factorization (NMF) has previously been shown to " + \
"be a useful decomposition for multivariate data. Two different multiplicative " + \
"algorithms for NMF are analyzed. They differ only slightly in the " + \
"multiplicative factor used in the update rules. One algorithm can be shown to " + \
"minimize the conventional least squares error while the other minimizes the  " + \
"generalized Kullback-Leibler divergence. The monotonic convergence of both  " + \
"algorithms can be proven using an auxiliary function analogous to that used " + \
"for proving convergence of the Expectation-Maximization algorithm. The algorithms  " + \
"can also be interpreted as diagonally rescaled gradient descent, where the  " + \
"rescaling factor is optimally chosen to ensure convergence."
# TextRank keyword extraction; the bare call is the cell's displayed output.
gensim.summarization.keywords(text, 
         ratio=0.5,               # fraction of the text to keep (used because words is None)
         words=None,              # number of keywords to return; None falls back to ratio
         split=True,              # return a list of keywords instead of one joined string
         scores=False,            # do not return the score of each keyword
         pos_filter=('NN', 'JJ'), # part-of-speech filter: nouns and adjectives
         lemmatize=True,         # If True - lemmatize words
         deacc=True)              # If True - remove accentuation

['factor',
 'convergence',
 'rescaling',
 'multiplicative',
 'function',
 'kullback',
 'gradient',
 'algorithm',
 'matrix',
 'multivariate',
 'data',
 'useful decomposition',
 'update']

In [None]:
print("SUMMARY: ", gensim.summarization.summarize(text,
                                                  ratio = 0.5,
                                                  split = True))



SUMMARY:  ['Non-negative matrix factorization (NMF) has previously been shown to be a useful decomposition for multivariate data.', 'Two different multiplicative algorithms for NMF are analyzed.', 'They differ only slightly in the multiplicative factor used in the update rules.']


### 2.2 Large text

In [None]:
def get_keywords_gensim(idx, docs):
    """Return up to 10 TextRank keywords for ``docs[idx]`` as a list.

    No part-of-speech filter is applied; keywords are lemmatized and
    de-accented, and scores are not returned.
    """
    document = docs[idx]
    return gensim.summarization.keywords(
        document,
        ratio=None,
        words=10,
        split=True,
        scores=False,
        pos_filter=None,
        lemmatize=True,
        deacc=True,
    )

def print_results_gensim(idx, keywords, df):
    """Print title, abstract and one keyword per line for document ``idx``."""
    # Each header and its value go out as two newline-terminated lines.
    print("\n=====Title=====", df['title'][idx], sep="\n")
    print("\n=====Abstract=====", df['abstract'][idx], sep="\n")
    print("\n===Keywords===")
    for keyword in keywords:
        print(keyword)

In [None]:
idx=9036
keywords=get_keywords_gensim(idx, docs)
print_results_gensim(idx,keywords, df)


=====Title=====
Quadratic Video Interpolation

=====Abstract=====
video interpolation is an important problem in computer vision which helps overcome the temporal limitation of camera sensors existing video interpolation methods usually assume uniform motion between consecutive frames and use linear models for interpolation which cannot well approximate the complex motion in the real world to address these issues we propose a quadratic video interpolation method which exploits the acceleration information in videos this method allows prediction with curvilinear trajectory and variable velocity and generates more accurate interpolation results for high quality frame synthesis we develop a flow reversal layer to estimate flow fields starting from the unknown target frame to the source frame in addition we present techniques for flow refinement extensive experiments demonstrate that our approach performs favorably against the existing linear models on a wide variety of video datasets 

=

The keywords highlight the main point, but still miss valuable information.

## 3. Python implementation of the Rapid Automatic Keyword Extraction algorithm (RAKE) using NLTK

[Documentation](https://github.com/csurfer/rake-nltk)

### Setup using pip

In [None]:
!pip install rake-nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### or directly from the repository

In [None]:
!git clone https://github.com/csurfer/rake-nltk.git
!python rake-nltk/setup.py install

fatal: destination path 'rake-nltk' already exists and is not an empty directory.
running install
running build
running build_py
error: package directory 'rake_nltk' does not exist


### 3.1 Small text

In [None]:
text = "Non-negative matrix factorization (NMF) has previously been shown to " + \
"be a useful decomposition for multivariate data. Two different multiplicative " + \
"algorithms for NMF are analyzed. They differ only slightly in the " + \
"multiplicative factor used in the update rules. One algorithm can be shown to " + \
"minimize the conventional least squares error while the other minimizes the  " + \
"generalized Kullback-Leibler divergence. The monotonic convergence of both  " + \
"algorithms can be proven using an auxiliary function analogous to that used " + \
"for proving convergence of the Expectation-Maximization algorithm. The algorithms  " + \
"can also be interpreted as diagonally rescaled gradient descent, where the  " + \
"rescaling factor is optimally chosen to ensure convergence."

In [None]:
from rake_nltk import Rake
nltk.download('punkt')
r = Rake()
r.extract_keywords_from_text(text)
r.get_ranked_phrases_with_scores()[:10]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[(16.0, 'diagonally rescaled gradient descent'),
 (16.0, 'conventional least squares error'),
 (13.5, 'two different multiplicative algorithms'),
 (9.0, 'negative matrix factorization'),
 (9.0, 'auxiliary function analogous'),
 (8.0, 'multiplicative factor used'),
 (4.5, 'rescaling factor'),
 (4.0, 'useful decomposition'),
 (4.0, 'update rules'),
 (4.0, 'proving convergence')]

Wow! We see well-interpretable machine learning terminology! But why is "diagonally rescaled gradient descent" more important than "negative matrix factorization"?

### 3.2 Large Text

In [None]:
def get_keywords_rake(idx, docs, n=10, start=1000, end=2000):
    """Return the ``n`` highest-ranked RAKE phrases from a slice of ``docs[idx]``.

    Parameters
    ----------
    idx : hashable
        Label/index of the document inside ``docs``.
    docs : indexable collection of str
        Document texts.
    n : int, default 10
        Maximum number of phrases to return.
    start, end : int or None, default 1000 and 2000
        Character window of the document that is analysed. The defaults
        reproduce the previously hard-coded ``[1000:2000]`` slice; pass
        ``start=None, end=None`` to analyse the whole document.

    Returns
    -------
    list of str
        Phrases ranked highest to lowest.
    """
    # Uses stopwords for english from NLTK, and all punctuation characters by default
    r = Rake()

    # Extraction given the selected window of the text.
    r.extract_keywords_from_text(docs[idx][start:end])

    # To get keyword phrases ranked highest to lowest.
    return r.get_ranked_phrases()[0:n]

def print_results(idx, keywords, df):
    """Print the title, abstract and keywords of document ``idx``.

    This definition replaces any earlier ``print_results`` in the kernel, so
    it handles both keyword shapes used in the notebook: a ``{keyword: score}``
    dict prints as ``keyword score`` lines, while any other iterable (a list
    of phrases or tuples) prints one item per line.

    Parameters
    ----------
    idx : hashable
        Label used to look up ``df['title'][idx]`` and ``df['abstract'][idx]``.
    keywords : dict or iterable
        Extracted keywords.
    df : mapping-like
        Object exposing ``'title'`` and ``'abstract'`` columns.
    """
    # now print the results
    print("\n=====Title=====")
    print(df['title'][idx])
    print("\n=====Abstract=====")
    print(df['abstract'][idx])
    print("\n===Keywords===")
    if isinstance(keywords, dict):
        # Keep the scores visible for dict-style results.
        for k in keywords:
            print(k, keywords[k])
    else:
        for k in keywords:
            print(k)

idx=9036
keywords = get_keywords_rake(idx, docs, n=10)
print_results(idx, keywords, df)


=====Title=====
Quadratic Video Interpolation

=====Abstract=====
video interpolation is an important problem in computer vision which helps overcome the temporal limitation of camera sensors existing video interpolation methods usually assume uniform motion between consecutive frames and use linear models for interpolation which cannot well approximate the complex motion in the real world to address these issues we propose a quadratic video interpolation method which exploits the acceleration information in videos this method allows prediction with curvilinear trajectory and variable velocity and generates more accurate interpolation results for high quality frame synthesis we develop a flow reversal layer to estimate flow fields starting from the unknown target frame to the source frame in addition we present techniques for flow refinement extensive experiments demonstrate that our approach performs favorably against the existing linear models on a wide variety of video datasets 

=

Oops! Something went wrong! The algorithm does not work on the preprocessed text, which has no punctuation. Let's use the raw text instead.

In [None]:
idx=9036
keywords = get_keywords_rake(idx, df['full_text'], n=10)
print_results(idx, keywords, df)


=====Title=====
Quadratic Video Interpolation

=====Abstract=====
video interpolation is an important problem in computer vision which helps overcome the temporal limitation of camera sensors existing video interpolation methods usually assume uniform motion between consecutive frames and use linear models for interpolation which cannot well approximate the complex motion in the real world to address these issues we propose a quadratic video interpolation method which exploits the acceleration information in videos this method allows prediction with curvilinear trajectory and variable velocity and generates more accurate interpolation results for high quality frame synthesis we develop a flow reversal layer to estimate flow fields starting from the unknown target frame to the source frame in addition we present techniques for flow refinement extensive experiments demonstrate that our approach performs favorably against the existing linear models on a wide variety of video datasets 

=

The presented implementation works well on sentences, but it is not flexible enough for large texts. However, those who are interested in RAKE can extend this code to fit their needs. We will consider other options next.

## 4. Yet Another Keyword Extractor (Yake)

[Documentation](https://github.com/LIAAD/yake)

In [None]:
!pip install git+https://github.com/LIAAD/yake

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/LIAAD/yake
  Cloning https://github.com/LIAAD/yake to /tmp/pip-req-build-r0vm6u_e
  Running command git clone -q https://github.com/LIAAD/yake /tmp/pip-req-build-r0vm6u_e
Collecting segtok
  Downloading segtok-1.5.11-py3-none-any.whl (24 kB)
Collecting jellyfish
  Downloading jellyfish-0.9.0.tar.gz (132 kB)
[K     |████████████████████████████████| 132 kB 19.0 MB/s 
Building wheels for collected packages: yake, jellyfish
  Building wheel for yake (setup.py) ... [?25l[?25hdone
  Created wheel for yake: filename=yake-0.4.8-py2.py3-none-any.whl size=62600 sha256=a42aa17daab86f1bc328ad7fc64628782aec7002ecdf2afdaea67f1f7bff4572
  Stored in directory: /tmp/pip-ephem-wheel-cache-dsy46z4a/wheels/52/79/f4/dae9309f60266aa3767a4381405002b6f2955fbcf038d804da
  Building wheel for jellyfish (setup.py) ... [?25l[?25hdone
  Created wheel for jellyfish: filename=jell

In [None]:
import yake

def get_keywords_yake(idx, docs):
    """Extract the top 10 YAKE keyphrases (up to 3-grams) from one document.

    Parameters
    ----------
    idx : hashable
        Label/index of the document inside ``docs``; ignored when ``docs`` is
        already a single string.
    docs : str or indexable collection of str
        Either the document text itself (as the existing call site passes it)
        or a collection from which ``docs[idx]`` is taken.

    Returns
    -------
    The value returned by ``KeywordExtractor.extract_keywords`` for that
    document.
    """
    # Use the requested document; previously the unrelated global ``text``
    # was analysed and both arguments were ignored.
    document = docs if isinstance(docs, str) else docs[idx]

    y = yake.KeywordExtractor(lan='en',          # language
                             n = 3,              # n-gram size
                             dedupLim = 0.9,     # deduplication threshold
                             dedupFunc = 'seqm', # deduplication algorithm
                             windowsSize = 1,
                             top = 10,           # number of keys
                             features=None)

    keywords = y.extract_keywords(document)
    return keywords

idx=9036
keywords = get_keywords_yake(idx, docs[idx])
print_results(idx, keywords, df)


=====Title=====
Quadratic Video Interpolation

=====Abstract=====
video interpolation is an important problem in computer vision which helps overcome the temporal limitation of camera sensors existing video interpolation methods usually assume uniform motion between consecutive frames and use linear models for interpolation which cannot well approximate the complex motion in the real world to address these issues we propose a quadratic video interpolation method which exploits the acceleration information in videos this method allows prediction with curvilinear trajectory and variable velocity and generates more accurate interpolation results for high quality frame synthesis we develop a flow reversal layer to estimate flow fields starting from the unknown target frame to the source frame in addition we present techniques for flow refinement extensive experiments demonstrate that our approach performs favorably against the existing linear models on a wide variety of video datasets 

=

Key phrases are repeated, and the text needs pre-processing to remove stop words

## 5. Keyphrases extraction using pke

`pke` is an open-source, Python-based keyphrase extraction toolkit. It provides an end-to-end keyphrase extraction pipeline in which each component can be easily modified or extended to develop new models.

`

In [None]:
!pip install git+https://github.com/boudinfl/pke.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/boudinfl/pke.git
  Cloning https://github.com/boudinfl/pke.git to /tmp/pip-req-build-zq57oju5
  Running command git clone -q https://github.com/boudinfl/pke.git /tmp/pip-req-build-zq57oju5
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting unidecode
  Downloading Unidecode-1.3.6-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 10.3 MB/s 
Building wheels for collected packages: pke, sklearn
  Building wheel for pke (setup.py) ... [?25l[?25hdone
  Created wheel for pke: filename=pke-2.0.0-py3-none-any.whl size=6160276 sha256=7abb6ae5576310ba54dc65f8206f39af0d8de0b4091fba0eece002628a136422
  Stored in directory: /tmp/pip-ephem-wheel-cache-fizb3r8l/wheels/fa/b3/09/612ee93bf3ee4164bcd5783e742942cdfc892a86039d3e0a33
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filenam

In [None]:
import pke

### 5  SingleRank

This model is an extension of the TextRank model that uses the number of co-occurrences to weigh edges in the graph.

In [None]:
# define the set of valid Part-of-Speeches
pos = {'NOUN', 'PROPN', 'ADJ'}

# Paper to analyse. It is chosen up front so that the keyphrases are extracted
# from the same document whose title and abstract are printed below
# (previously the unrelated global `text` was loaded).
idx = 9036

# 1. create a SingleRank extractor.
extractor = pke.unsupervised.SingleRank()

# 2. load the content of the document.
extractor.load_document(input=df['full_text'][idx],
                        language='en',
                        normalization=None)

# 3. select the longest sequences of nouns and adjectives as candidates.
extractor.candidate_selection(pos=pos)

# 4. weight the candidates using the sum of their word's scores that are
#    computed using random walk. In the graph, nodes are words of
#    certain part-of-speech (nouns and adjectives) that are connected if
#    they occur in a window of 10 words.
extractor.candidate_weighting(window=10,
                              pos=pos)

# 5. get the 10-highest scored candidates as keyphrases
keyphrases = extractor.get_n_best(n=10)

# now print the results
print("\n=====Title=====")
print(df['title'][idx])
print("\n=====Abstract=====")
print(df['abstract'][idx])
print("\n===Keywords===")
# each entry of keyphrases is indexable; element 0 is the phrase text
for k in keyphrases:
    print(k[0])

  config_value=config["nlp"][key],



=====Title=====
Learning under uncertainty: a comparison between R-W and Bayesian approach

=====Abstract=====
accurately differentiating between what are truly unpredictably random and systematic changes that occur at random can have profound effect on affect and cognition to examine the underlying computational principles that guide different learning behavior in an uncertain environment we compared an r w model and a bayesian approach in a visual search task with different volatility levels both r w model and the bayesian approach reflected an individual s estimation of the environmental volatility and there is a strong correlation between the learning rate in r w model and the belief of stationarity in the bayesian approach in different volatility conditions in a low volatility condition r w model indicates that learning rate positively correlates with lose shift rate but not choice optimality inverted u shape the bayesian approach indicates that the belief of environmental statio

# vectorization

### **Skip-gram Training Sample**

In [None]:
# Iterating `docs` yields whole documents, so each document is split into
# word tokens first; otherwise every "token" in the vocabulary would be an
# entire paper.
tokenized_docs = [doc.split() for doc in docs]

# Token -> integer id over all sampled documents.
vocab, index = {}, 1  # start indexing from 1
vocab['<pad>'] = 0  # add a padding token
for doc_tokens in tokenized_docs:
    for token in doc_tokens:
        if token not in vocab:
            vocab[token] = index
            index += 1
vocab_size = len(vocab)
# Show the size and a small preview rather than dumping the whole mapping.
print("vocab size:", vocab_size)
print(list(vocab.items())[:10])

# Integer id -> token, used to decode skip-gram pairs back into words.
inverse_vocab = {index: token for token, index in vocab.items()}
print(list(inverse_vocab.items())[:10])

# Int-encode the first document as the example "sentence" for skip-grams.
example_tokens = tokenized_docs[0] if tokenized_docs else []
example_sequence = [vocab[word] for word in example_tokens]
print(example_sequence[:20])

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]


# Skip-gram Training Sample

In [None]:
import io
import re
import string
import tensorflow as tf
import tqdm

from tensorflow.keras import Model
from tensorflow.keras.layers import Dot, Embedding, Flatten
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [None]:
window_size = 2
vocab_size = len(vocab)
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence,
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples = 0)
print(len(positive_skip_grams))

394


In [None]:
for target, context in positive_skip_grams[:5]:
    print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")

(71, 70): (metric learning temporal sequence alignment damien garreau en damien garreau en emi lajugie inria remi lajugie inria sylvain arlot cnrs sylvain arlot en francis bach inria francis bach inria abstract paper propose learn mahalanobis distance perform alignment multivariate time series learning example task time series true alignment known cast alignment problem structured prediction task propose realistic loss alignment optimization tractable provide experiment real data audio audio context learning similarity measure lead improvement performance alignment task propose use metric learning framework perform feature selection basic audio feature build combination better alignment performance introduction problem aligning temporal sequence ubiquitous application ranging bioinformat ic audio processing goal align similar time series global structure local temporal difference alignment algorithm rely similar ity measure good metric crucial especially high dimensional setting featur

In [None]:
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

In [None]:
# Get target and context words for one positive skip-gram.
target_word, context_word = positive_skip_grams[0]
print("target word: {}, context word: {}".format(inverse_vocab[target_word], inverse_vocab[context_word]))

# Set the number of negative samples per positive context.
num_ns = 4

context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  # class that should be sampled as 'positive'
    num_true=1,  # each positive skip-gram has 1 positive context class
    num_sampled=num_ns,  # number of negative context words to sample
    unique=True,  # all the negative samples should be unique
    range_max=vocab_size,  # pick index of the samples from [0, vocab_size]
    seed=SEED,  # seed for reproducibility
    name="negative_sampling"  # name of this operation
)
print(negative_sampling_candidates)
print([inverse_vocab[index.numpy()] for index in negative_sampling_candidates])

target word: metric learning temporal sequence alignment damien garreau en damien garreau en emi lajugie inria remi lajugie inria sylvain arlot cnrs sylvain arlot en francis bach inria francis bach inria abstract paper propose learn mahalanobis distance perform alignment multivariate time series learning example task time series true alignment known cast alignment problem structured prediction task propose realistic loss alignment optimization tractable provide experiment real data audio audio context learning similarity measure lead improvement performance alignment task propose use metric learning framework perform feature selection basic audio feature build combination better alignment performance introduction problem aligning temporal sequence ubiquitous application ranging bioinformat ic audio processing goal align similar time series global structure local temporal difference alignment algorithm rely similar ity measure good metric crucial especially high dimensional setting feat

In [None]:
# Add a dimension so you can use concatenation (on the next step).
negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, 1)

# Concat positive context word with negative sampled words.
context = tf.concat([context_class, negative_sampling_candidates], 0)

# Label first context word as 1 (positive) followed by num_ns 0s (negative).
label = tf.constant([1] + [0]*num_ns, dtype="int64")

# Reshape target to shape (1,) and context and label to (num_ns+1,).
target = tf.squeeze(target_word)
context = tf.squeeze(context)
label = tf.squeeze(label)

In [None]:
print(f"target_index    : {target}")
print(f"target_word     : {inverse_vocab[target_word]}")
print(f"context_indices : {context}")
print(f"context_words   : {[inverse_vocab[c.numpy()] for c in context]}")
print(f"label           : {label}")

target_index    : 71
target_word     : metric learning temporal sequence alignment damien garreau en damien garreau en emi lajugie inria remi lajugie inria sylvain arlot cnrs sylvain arlot en francis bach inria francis bach inria abstract paper propose learn mahalanobis distance perform alignment multivariate time series learning example task time series true alignment known cast alignment problem structured prediction task propose realistic loss alignment optimization tractable provide experiment real data audio audio context learning similarity measure lead improvement performance alignment task propose use metric learning framework perform feature selection basic audio feature build combination better alignment performance introduction problem aligning temporal sequence ubiquitous application ranging bioinformat ic audio processing goal align similar time series global structure local temporal difference alignment algorithm rely similar ity measure good metric crucial especially hig

In [None]:
print("target  :", target)
print("context :", context)
print("label   :", label)

target  : tf.Tensor(71, shape=(), dtype=int32)
context : tf.Tensor([70 16  4 40  8], shape=(5,), dtype=int64)
label   : tf.Tensor([1 0 0 0 0], shape=(5,), dtype=int64)


### Generate Training Data

In [None]:
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
    """Generate skip-gram training examples with negative sampling.

    For every positive (target, context) skip-gram pair found in each
    sequence, ``num_ns`` negative context candidates are drawn and one
    training example is emitted.

    Args:
        sequences: Iterable of int-encoded sequences (sentences).
        window_size: Skip-gram window size passed to ``skipgrams``.
        num_ns: Number of negative samples drawn per positive pair.
        vocab_size: Vocabulary size; used to build the sampling table and as
            ``range_max`` of the candidate sampler.
        seed: Seed forwarded to the negative-sampling candidate sampler.

    Returns:
        A tuple ``(targets, contexts, labels)`` of three equally long lists:
        ``targets`` holds the target word index of each example, ``contexts``
        holds an int64 tensor of shape ``(num_ns + 1, 1)`` (positive context
        first, then the negative candidates), and ``labels`` holds an int64
        tensor ``[1, 0, ..., 0]`` of length ``num_ns + 1``. All three lists
        are empty when ``sequences`` is empty.
    """
    # Elements of each training example are appended to these lists.
    targets, contexts, labels = [], [], []

    # Build the sampling table for vocab_size tokens.
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    # The label vector is identical for every example (positive first, then
    # num_ns negatives), so it is built once instead of once per pair.
    label = tf.constant([1] + [0] * num_ns, dtype="int64")

    # Iterate over all sequences (sentences) in dataset.
    for sequence in tqdm.tqdm(sequences):

        # Generate positive skip-gram pairs for a sequence (sentence).
        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=0)

        # Iterate over each positive skip-gram pair to produce training
        # examples with positive context word and negative samples.
        for target_word, context_word in positive_skip_grams:
            context_class = tf.expand_dims(
                tf.constant([context_word], dtype="int64"), 1)
            # Use the caller-supplied seed (previously the global SEED was
            # read here and the `seed` argument was silently ignored).
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                seed=seed,
                name="negative_sampling")

            # Build the context vector (for one target word): positive
            # context first, negative candidates after it.
            negative_sampling_candidates = tf.expand_dims(
                negative_sampling_candidates, 1)
            context = tf.concat([context_class, negative_sampling_candidates], 0)

            # Append each element from the training example to global lists.
            targets.append(target_word)
            contexts.append(context)
            labels.append(label)

    # Return only after ALL sequences have been processed (the return used to
    # sit inside the sequence loop, ending the function after the first one).
    return targets, contexts, labels


In [None]:
# Custom standardization step for the text vectorization layer.
def custom_standardization(input_data):
    """Lowercase `input_data` and delete every `string.punctuation` character."""
    # Character class matching any single punctuation character.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    lowered = tf.strings.lower(input_data)
    without_punctuation = tf.strings.regex_replace(
        lowered, punctuation_pattern, '')
    return without_punctuation


# Number of words in each output sequence, and the vocabulary size cap.
sequence_length = 10
vocab_size = 4096

# The text vectorization layer normalizes, splits, and maps strings to
# integers; output_sequence_length pads all samples to the same length.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode='int',
)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Settings for the Keras tokenizer and the padded sequences.
# NOTE: vocab_size is rebound here; the TextVectorization cell set it to 4096.
vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
oov_tok = "<OOV>"

# Fit the tokenizer on the documents and keep its word -> index mapping.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(docs)
word_index = tokenizer.word_index

# Encode every document as integers, then bring all sequences to max_length.
sequences = tokenizer.texts_to_sequences(docs)
train_seq_padd = pad_sequences(
    sequences,
    maxlen=max_length,
    truncating=trunc_type,
)



In [None]:
print(train_seq_padd[0])


[ 523  257  685 3939 2317 2318  185    1 3940  492    1 6181  187    1
  702 6181  187    1 6181  492    1 6181  492    1 5539    1  617  185
    1    1  360 4705 8478  791  185 2100    1 1486    1    1  360  539
  257  685  340   12  147  279 1015 2497  508 1139 1211 1487  396  257
  685    9  626  193  499  326 2241  218   24   44    3  685  540   83
  249  381  326  175  618  832  863  224  523  257  685    9  981 1488
   18  257    9  403  136 6182  601   96 2101 1625  923  685   19  114
  462  218 1386 1016 2498   79   88  176  967  548  271  218  325  218
  619  235  250 3729 1788   53  669   48]
