In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing

In [None]:
df = pd.read_csv('papers.csv')
df = df.iloc[:5000,:]

In [3]:
df.head(10)

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."
5,1002,1994,Using a neural net to instantiate a deformable...,,1002-using-a-neural-net-to-instantiate-a-defor...,Abstract Missing,U sing a neural net to instantiate a\ndeformab...
6,1003,1994,Plasticity-Mediated Competitive Learning,,1003-plasticity-mediated-competitive-learning.pdf,Abstract Missing,Plasticity-Mediated Competitive Learning\n\nTe...
7,1004,1994,ICEG Morphology Classification using an Analog...,,1004-iceg-morphology-classification-using-an-a...,Abstract Missing,ICEG Morphology Classification using an\nAnalo...
8,1005,1994,Real-Time Control of a Tokamak Plasma Using Ne...,,1005-real-time-control-of-a-tokamak-plasma-usi...,Abstract Missing,Real-Time Control of a Tokamak Plasma\nUsing N...
9,1006,1994,Pulsestream Synapses with Non-Volatile Analogu...,,1006-pulsestream-synapses-with-non-volatile-an...,Abstract Missing,Real-Time Control of a Tokamak Plasma\nUsing N...


In [4]:
df.shape

(5000, 7)

In [5]:
df.isnull().sum()

id               0
year             0
title            0
event_type    4335
pdf_name         0
abstract         0
paper_text       0
dtype: int64

# Preprocessing Data

# Working With "paper text"

In [6]:
df['paper_text'][0]

'767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABASE\nAND ITS APPLICATIONS\nHisashi Suzuki and Suguru Arimoto\nOsaka University, Toyonaka, Osaka 560, Japan\nABSTRACT\nAn efficient method of self-organizing associative databases is proposed together with\napplications to robot eyesight systems. The proposed databases can associate any input\nwith some output. In the first half part of discussion, an algorithm of self-organization is\nproposed. From an aspect of hardware, it produces a new style of neural network. In the\nlatter half part, an applicability to handwritten letter recognition and that to an autonomous\nmobile robot system are demonstrated.\n\nINTRODUCTION\nLet a mapping f : X -+ Y be given. Here, X is a finite or infinite set, and Y is another\nfinite or infinite set. A learning machine observes any set of pairs (x, y) sampled randomly\nfrom X x Y. (X x Y means the Cartesian product of X and Y.) And, it computes some\nestimate j : X -+ Y of f to make small, the estimation erro

# Steps to do
1 Lower case                                    
2 remove HTML tags                                     
3 remove special characters and digits                               
4 Convert to list from string                                      
5 remove stopwords                              
6 remove words less than three letters                             
7 lemmatize


In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [8]:
stop_words = set(stopwords.words('english'))
new_stop_words = ["fig","figure","image","sample","using", 
             "show", "result", "large", 
             "also", "one", "two", "three", 
             "four", "five", "seven","eight","nine"]

In [9]:
stop_words = list(stop_words.union(new_stop_words))

In [10]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

def preprocess_text(txt):
    # Lower case
    txt = txt.lower()
    # Remove HTML tags
    txt = re.sub(r"<.*?>", " ", txt)
    # Remove special characters and digits
    txt = re.sub(r"[^a-zA-Z]", " ", txt)
    # tokenization
    txt = nltk.word_tokenize(txt)
    # Remove stopwords
    txt = [word for word in txt if word not in stop_words]
    # Remove words less than three letters
    txt = [word for word in txt if len(word) >= 3]
    # Lemmatize
    lmtr = WordNetLemmatizer()
    txt = [lmtr.lemmatize(word) for word in txt]

    return " ".join(txt)


In [16]:
preprocess_text("HELO everyone I'm learning NLP from 99999 *&^ <p>This is a <b>sample</b> text with <i>HTML tags</i>.</p>")

'helo everyone learning nlp text html tag'

In [17]:
docs = df['paper_text'].apply(lambda x:preprocess_text(x))

In [18]:
docs

0       self organization associative database applica...
1       mean field theory layer visual cortex applicat...
2       storing covariance associative long term poten...
3       bayesian query construction neural network mod...
4       neural network ensemble cross validation activ...
                              ...                        
4995    low rank time frequency synthesis matthieu kow...
4996    state space model decoding auditory attentiona...
4997    efficient structured matrix rank minimization ...
4998    cient minimax signal detection graph jing qian...
4999    signal aggregate constraint additive factorial...
Name: paper_text, Length: 5000, dtype: object

# Using TF-IDF
TF-IDF stands for Text Frequency Inverse Document Frequency. The importance of each word increases in proportion to the number of times a word appears in the document (Text Frequency – TF) but is offset by the frequency of the word in the corpus (Inverse Document Frequency – IDF).

Using the tf-idf weighting scheme, the keywords are the words with the highest TF-IDF score. 

# CountVectorizer
For this task, I’ll first use the CountVectorizer method in Scikit-learn to create a vocabulary and generate the word count:

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# Reduce max_features and adjust n-gram range
cv = CountVectorizer(max_features=6000, ngram_range=(1, 2))

# Create a vocabulary and word count vectors
word_count_vectors = cv.fit_transform(docs)


In [20]:
cv

# TfidfTransformer 

In [21]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vectors)

# I will create a function for the task of Keyword Extraction with Python by using the Tf-IDF vectorization:m

In [39]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    #taking top items from vector
    sorted_items = sorted_items[:topn]
    
    score_vals = []
    feature_vals = []
    for idx, score in sorted_items:
        fname = feature_names[idx]
        score_vals.append(round(score,3))
        feature_vals.append(feature_names[idx])
    
    #create a tuples of features,score
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]  # Fix: Changed '==' to '='
    return results


# get feature names
feature_names=cv.get_feature_names_out()

def get_keywords(idx, docs):

    #generate tf-idf for the given document
    tf_idf_vector=tfidf_transformer.transform(cv.transform([docs[idx]]))

    #sort the tf-idf vectors by descending order of scores
    sorted_items=sort_coo(tf_idf_vector.tocoo())

    #extract only the top n; n here is 10
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)
    
    return keywords

def get_keywords_from_text(preprocessed_text, docs):
    # Generate TF-IDF for the given document
    tf_idf_vector = tfidf_transformer.transform(cv.transform([preprocessed_text]))

    # Sort the TF-IDF vectors by descending order of scores
    sorted_items = sort_coo(tf_idf_vector.tocoo())

    # Extract only the top n; n here is 10
    keywords = extract_topn_from_vector(feature_names, sorted_items, 20)

    return keywords


def print_results(idx,keywords, df):
    # now print the results
    print("\n=====Title=====")
    print(df['title'][idx])
    print("\n=====Abstract=====")
    print(df['abstract'][idx])
    print("\n===Keywords===")
    for k in keywords:
        print(k,keywords[k])

In [40]:
idx=590
keywords=get_keywords(idx, docs)
print_results(idx,keywords, df)


=====Title=====
What Size Net Gives Valid Generalization?

=====Abstract=====
Abstract Missing

===Keywords===
net 0.299
node 0.296
linear threshold 0.234
example 0.186
feedforward 0.18
baum 0.162
hidden layer 0.154
haussler 0.153
corollary 0.15
architecture 0.15


In [41]:
idx=456
keywords=get_keywords(idx, docs)
print_results(idx,keywords, df)


=====Title=====
A Hippocampal Model of Recognition Memory

=====Abstract=====
Abstract Missing

===Keywords===
item 0.37
memory 0.297
hippocampus 0.291
hippocampal 0.278
interference 0.248
pattern 0.208
mcclelland 0.177
model 0.168
studied 0.164
probe 0.159


In [42]:
idx=90
keywords=get_keywords(idx, docs)
print_results(idx,keywords, df)


=====Title=====
Neural Networks for Model Matching and Perceptual Organization

=====Abstract=====
Abstract Missing

===Keywords===
frame 0.521
model 0.257
match 0.237
matching 0.181
objective function 0.179
high level 0.173
perceptual 0.166
level 0.155
hopfield 0.144
rectangle 0.139


In [43]:
idx=685
keywords=get_keywords(idx, docs)
print_results(idx,keywords, df)


=====Title=====
A Polygonal Line Algorithm for Constructing Principal Curves

=====Abstract=====
Abstract Missing

===Keywords===
curve 0.604
principal 0.413
vertex 0.275
algorithm 0.233
segment 0.223
squared 0.144
generating 0.133
line 0.125
data 0.122
line segment 0.09


In [44]:
custom_text = "Bali is predominantly a Hindu country. Bali is known for its elaborate, traditional dancing. The dancing is inspired by its Hindi beliefs. Most of the dancing portrays tales of good versus evil. To watch the dancing is a breathtaking experience. Lombok has some impressive points of interest – the majestic Gunung Rinjani is an active volcano. It is the second highest peak in Indonesia. Art is a Balinese passion. Batik paintings and carved statues make popular souvenirs. Artists can be seen whittling and painting on the streets, particularly in Ubud. It is easy to appreciate each island as an attractive tourist destination. Majestic scenery; rich culture; white sands and warm, azure waters draw visitors like magnets every year. Snorkelling and diving around the nearby Gili Islands is magnificent. Marine fish, starfish, turtles and coral reef are present in abundance. Bali and Lombok are part of the Indonesian archipelago. Bali has some spectacular temples. The most significant is the Mother Temple, Besakih. The inhabitants of Lombok are mostly Muslim with a Hindu minority. Lombok remains the most understated of the two islands. Lombok has several temples worthy of a visit, though they are less prolific. Bali and Lombok are neighbouring islands."

preprocessed_text = preprocess_text(custom_text)
keywords = get_keywords_from_text(preprocessed_text, docs)
print("Keywords:", keywords)

Keywords: {'fish': 0.32, 'water': 0.255, 'visit': 0.246, 'street': 0.243, 'nearby': 0.217, 'mostly': 0.198, 'attractive': 0.198, 'peak': 0.19, 'experience': 0.19, 'rich': 0.186, 'inspired': 0.176, 'white': 0.168, 'draw': 0.164, 'belief': 0.164, 'year': 0.161, 'traditional': 0.16, 'highest': 0.159, 'versus': 0.155, 'art': 0.151, 'active': 0.15}


In [45]:
custom_text = "Many parents believe that sugar consumption causes hyperactivity in their children. Indeed, ‘sugar highs’ are often blamed for rowdiness or excitability – but is sugar the guilty party, or is it simply a case of ‘normal’ childhood behaviour? Several years ago, Channel 4, together with Jo Frost (perhaps better known as Supernanny) conducted an experiment to distinguish the truth. Forty children, aged six, were invited to a party and divided into two halves. One half was given typical sugary party foods; the other half ate sugar-free alternatives. Crucially, the parents of the children were unaware as to which group their child was in. (Incidentally, no artificial colourings or flavourings commonly found in sweets were present, since these have already been linked to hyperactivity.) Subsequently, as the children ran about and enjoyed the party, the parents were asked whether they believed their own child had been given sugar. The majority believed they had. Ironically, as the children then sat down to watch a magic show, most parents changed their minds. Basically, they could not accept that their child was capable of sitting still after consuming sugary foods. To conclude, the experiment suggested that there was no link between hyperactivity and sugar intake, but that the children were naturally excited because they were at a party."

preprocessed_text = preprocess_text(custom_text)
keywords = get_keywords_from_text(preprocessed_text, docs)
print("Keywords:", keywords)

Keywords: {'child': 0.633, 'party': 0.536, 'parent': 0.287, 'food': 0.203, 'half': 0.162, 'sat': 0.102, 'accept': 0.086, 'basically': 0.083, 'consuming': 0.083, 'linked': 0.077, 'asked': 0.077, 'behaviour': 0.074, 'mind': 0.074, 'subsequently': 0.071, 'changed': 0.069, 'conducted': 0.066, 'majority': 0.065, 'ran': 0.065, 'capable': 0.064, 'channel': 0.064}
