In [1]:
import json
import pandas as pd
import numpy as np

In [2]:
# Load the NIPS papers (JSON Lines, one record per line) into a DataFrame and
# expose the full paper text under the column name "summary".
papers_df = pd.read_json('NIP_DataSet/papers.json', lines=True).rename(
    columns={"paper_text": "summary"}
)
papers_df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,summary
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [3]:
# Report the (rows, columns) shape of the loaded frame.
print("Papers : ", papers_df.shape)

Papers :  (7241, 7)


In [4]:
# Summarise column dtypes and non-null counts.
papers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7241 entries, 0 to 7240
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          7241 non-null   int64 
 1   year        7241 non-null   int64 
 2   title       7241 non-null   object
 3   event_type  7241 non-null   object
 4   pdf_name    7241 non-null   object
 5   abstract    7241 non-null   object
 6   summary     7241 non-null   object
dtypes: int64(2), object(5)
memory usage: 396.1+ KB


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over word 1- to 3-grams. Tokens are runs of three or more word
# characters, accents are stripped, English stop words are removed, and
# min_df=3 discards terms that occur in fewer than three papers.
tfv = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{3,}',
    ngram_range=(1, 3),
    stop_words='english',
    strip_accents='unicode',
    min_df=3,
    max_features=None,
)

# Replace any missing paper text with an empty string before vectorizing.
papers_df['summary'] = papers_df['summary'].fillna('')


In [6]:
# Learn the vocabulary from the paper texts and build their TF-IDF matrix.
tfv_matrix = tfv.fit_transform(papers_df['summary'])

In [8]:
def Find_Paper_Given_By_Keyword(words_array, keyword, matrix=None, papers=None, top_n=10):
    """Rank papers by the TF-IDF weight of ``keyword``.

    Parameters
    ----------
    words_array : sequence of str
        Vocabulary whose positions match the columns of ``matrix``
        (a list or a NumPy array both work).
    keyword : str
        Term to look up in ``words_array``.
    matrix : sparse matrix, optional
        Papers-by-terms weight matrix. Defaults to the notebook-level
        ``tfv_matrix``.
    papers : pandas.DataFrame, optional
        Frame with a ``title`` column, row-aligned with ``matrix``. Defaults
        to the notebook-level ``papers_df``.
    top_n : int, optional
        Number of papers to return (default 10).

    Returns
    -------
    pandas.DataFrame
        Columns ``paper_id`` (row position), ``title`` and ``score``, sorted
        by descending score; ties keep their original row order.

    Raises
    ------
    ValueError
        If ``keyword`` is not present in ``words_array``.
    """
    if matrix is None:
        matrix = tfv_matrix
    if papers is None:
        papers = papers_df

    # Use the vocabulary that was passed in. The previous version ignored
    # this argument and silently read the notebook-global `words` instead.
    vocabulary = words_array if isinstance(words_array, list) else list(words_array)
    try:
        column = vocabulary.index(keyword)
    except ValueError:
        raise ValueError(f"keyword {keyword!r} is not in the vocabulary") from None

    # Dense 1-D vector holding the keyword's weight in every paper.
    scores = matrix[:, column].toarray().ravel()

    # A stable sort on the negated scores gives descending order while
    # keeping ties in row order, matching sorted(..., reverse=True).
    top_rows = np.argsort(-scores, kind="stable")[:top_n]

    return pd.DataFrame(
        {
            "paper_id": top_rows,
            "title": papers["title"].iloc[top_rows].to_numpy(),
            "score": scores[top_rows],
        },
        columns=["paper_id", "title", "score"],
    )

In [9]:
# Vocabulary in the column order of tfv_matrix. get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and fall back only on older releases. .tolist()
# keeps `words` a plain list so list.index() lookups keep working.
if hasattr(tfv, "get_feature_names_out"):
    words = tfv.get_feature_names_out().tolist()
else:
    words = tfv.get_feature_names()
df = Find_Paper_Given_By_Keyword(words, "algorithm")

In [10]:
# Show the top-ranked papers for the keyword query above.
df.head(10)

Unnamed: 0,paper_id,title,score
0,86,Learning Sparse Perceptrons,0.19093
1,4502,Low-rank matrix reconstruction and clustering ...,0.189494
2,213,An Apobayesian Relative of Winnow,0.188827
3,1578,Semi-Definite Programming by Perceptron Learning,0.187812
4,3713,Fast and Accurate k-means For Large Datasets,0.178971
5,691,Convergence of the Wake-Sleep Algorithm,0.176347
6,3104,Streaming k-means approximation,0.174538
7,5219,Online Gradient Boosting,0.169762
8,5063,"Streaming, Memory Limited Algorithms for Commu...",0.169273
9,2421,Online Linear Regression and Its Application t...,0.169153


In [11]:
#The following code;
#Find the most common words in the corpus(papers_df['summary'])
from sklearn.feature_extraction.text import CountVectorizer
def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent terms in ``corpus`` with their counts.

    Terms are word 1- to 3-grams built from tokens of three or more word
    characters, with English stop words removed and terms that appear in
    fewer than three documents discarded.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count terms in.
    n : int, optional
        Number of terms to return; ``None`` returns every term.

    Returns
    -------
    list of (str, int)
        ``(term, total_count)`` pairs sorted by descending count.
    """
    vec = CountVectorizer(
        min_df=3,
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{3,}',
        ngram_range=(1, 3),
        stop_words='english',
    )
    # fit_transform learns the vocabulary and counts in a single pass; the
    # previous fit(...) followed by transform(...) tokenized the corpus twice.
    bag_of_words = vec.fit_transform(corpus)

    # Column sums as a flat array: total occurrences of each term.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()

    # Plain ints keep the printed result identical across NumPy versions.
    words_freq = [(word, int(totals[idx])) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]


In [12]:
# Ten most frequent terms across all paper texts.
get_top_n_words(papers_df['summary'],10)

[('learning', 106160),
 ('model', 105236),
 ('data', 94690),
 ('algorithm', 82708),
 ('set', 74920),
 ('function', 69850),
 ('using', 68015),
 ('time', 60440),
 ('number', 53529),
 ('figure', 53419)]

In [13]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Pairwise sigmoid-kernel similarity between all papers. With Y omitted the
# kernel is evaluated between tfv_matrix and itself.
# NOTE(review): the default gamma is 1 / n_features and coef0 is 1, so with a
# very large vocabulary every score appears to collapse towards
# tanh(1) ~= 0.7616. Consider an explicit gamma/coef0 or a linear/cosine
# kernel if the score values themselves need to be interpretable.
sig = sigmoid_kernel(tfv_matrix)

In [14]:
# Kernel scores of the first paper against every paper.
sig[0]

array([0.76159449, 0.76159416, 0.76159417, ..., 0.76159417, 0.76159417,
       0.76159417])

In [15]:
# Reverse mapping from paper title to row position in papers_df.
# Series.drop_duplicates() compares the *values* (row positions, which are
# already unique), so it never removed repeated titles. De-duplicate on the
# index instead, keeping the first occurrence, so a title lookup always
# yields a single position.
indices = pd.Series(papers_df.index, index=papers_df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [20]:
def give_rec(title, sig=None, title_index=None, papers=None, top_n=10):
    """Recommend the papers most similar to the paper called ``title``.

    Parameters
    ----------
    title : str
        Exact title of the query paper.
    sig : 2-D array-like, optional
        Pairwise similarity matrix, row-aligned with ``papers``. Defaults to
        the notebook-level ``sig``, looked up when the function is called.
    title_index : pandas.Series, optional
        Maps title to row position. Defaults to the notebook-level
        ``indices``.
    papers : pandas.DataFrame, optional
        Frame with a ``title`` column. Defaults to the notebook-level
        ``papers_df``.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.DataFrame
        Columns ``paper_id`` (row position), ``title`` and ``score``, sorted
        by descending score. The query paper itself is excluded. Fewer than
        ``top_n`` rows are returned when fewer candidates exist.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``title_index``.
    """
    if sig is None:
        # The parameter shadows the notebook variable of the same name, so
        # fetch the current module-level matrix explicitly.
        sig = globals()["sig"]
    if title_index is None:
        title_index = indices
    if papers is None:
        papers = papers_df

    # Honour the requested title; the previous version overwrote it with a
    # hard-coded paper and so always returned the same recommendations.
    try:
        position = title_index[title]
    except KeyError:
        raise KeyError(f"no paper titled {title!r}") from None
    # A repeated title yields a Series of positions; use the first one.
    if isinstance(position, pd.Series):
        position = position.iloc[0]
    position = int(position)

    scores = np.asarray(sig[position]).ravel()

    # Descending order with ties kept in row order, like sorted(reverse=True).
    ranked = np.argsort(-scores, kind="stable")
    # Drop the query paper by position rather than assuming it sorts first.
    ranked = ranked[ranked != position][:top_n]

    return pd.DataFrame(
        {
            "paper_id": ranked,
            "title": papers["title"].iloc[ranked].to_numpy(),
            "score": scores[ranked],
        },
        columns=["paper_id", "title", "score"],
    )

In [21]:
# Testing our content-based recommendation system with "Semi-supervised Learning with Ladder Networks"
give_rec('Semi-supervised Learning with Ladder Networks').head(10)

Unnamed: 0,paper_id,title,score
0,6834,Recurrent Ladder Networks,0.761594
1,4807,Semi-supervised Learning with Deep Generative ...,0.761594
2,5599,Tagger: Deep Unsupervised Perceptual Grouping,0.761594
3,6404,PixelGAN Autoencoders,0.761594
4,5664,Improved Techniques for Training GANs,0.761594
5,6111,Variational Autoencoder for Deep Learning of I...,0.761594
6,6825,Learning Disentangled Representations with Sem...,0.761594
7,5895,Regularization With Stochastic Transformations...,0.761594
8,6886,Good Semi-supervised Learning That Requires a ...,0.761594
9,6322,Mean teachers are better role models: Weight-a...,0.761594
