In [12]:
import json
import pandas as pd
import numpy as np

In [13]:
#papers_df is pandas dataframe object
papers_df = pd.read_json('NIP_DataSet/papers.json',lines=True)
papers_df.rename(columns = {"paper_text" : "summary"},inplace=True)
papers_df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,summary
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [14]:
print("Papers : ", papers_df.shape)

Papers :  (7241, 7)


In [15]:
papers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7241 entries, 0 to 7240
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          7241 non-null   int64 
 1   year        7241 non-null   int64 
 2   title       7241 non-null   object
 3   event_type  7241 non-null   object
 4   pdf_name    7241 non-null   object
 5   abstract    7241 non-null   object
 6   summary     7241 non-null   object
dtypes: int64(2), object(5)
memory usage: 396.1+ KB


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3),
            stop_words = 'english')

papers_df['summary'] = papers_df['summary'].fillna('')


In [17]:
tfv_matrix = tfv.fit_transform(papers_df['summary'])

In [18]:
tfv_matrix.shape

(7241, 1666651)

In [19]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Compute the sigmoid kernel
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)

In [20]:
sig[0]

array([0.76159441, 0.76159416, 0.76159417, ..., 0.76159417, 0.76159417,
       0.76159417])

In [21]:
# Reverse mapping of indices and paper titles
indices = pd.Series(papers_df.index, index=papers_df['title']).drop_duplicates()

In [22]:
indices

title
Self-Organization of Associative Database and Its Applications                                                             0
A Mean Field Theory of Layer IV of Visual Cortex and Its Application to Artificial Neural Networks                         1
Storing Covariance by the Associative Long-Term Potentiation and Depression of Synaptic Strengths in the Hippocampus       2
Bayesian Query Construction for Neural Network Models                                                                      3
Neural Network Ensembles, Cross Validation, and Active Learning                                                            4
                                                                                                                        ... 
Single Transistor Learning Synapses                                                                                     7236
Bias, Variance and the Combination of Least Squares Estimators                                                         

In [24]:
indices['Neural Network Ensembles, Cross Validation, and Active Learning']

4

In [27]:
list(enumerate(sig[indices['Neural Network Ensembles, Cross Validation, and Active Learning']]))

[(0, 0.7615941738604206),
 (1, 0.7615941638850772),
 (2, 0.7615941610374066),
 (3, 0.7615941769442313),
 (4, 0.7615944079426235),
 (5, 0.7615941687566172),
 (6, 0.7615941664909346),
 (7, 0.7615941675520875),
 (8, 0.7615941682869648),
 (9, 0.7615941702607503),
 (10, 0.7615941672780666),
 (11, 0.7615941681502182),
 (12, 0.7615941794880099),
 (13, 0.7615941719922943),
 (14, 0.7615941637474),
 (15, 0.7615941843671479),
 (16, 0.7615941659471405),
 (17, 0.761594161628847),
 (18, 0.761594167127232),
 (19, 0.7615941673817667),
 (20, 0.7615941677652014),
 (21, 0.7615941633225791),
 (22, 0.7615941703445293),
 (23, 0.7615941667467159),
 (24, 0.7615941717827722),
 (25, 0.7615941650893155),
 (26, 0.7615941693291726),
 (27, 0.7615941686537456),
 (28, 0.7615941629939456),
 (29, 0.761594163132344),
 (30, 0.7615941613127019),
 (31, 0.7615941648813108),
 (32, 0.7615941672905563),
 (33, 0.7615941701700393),
 (34, 0.7615941811432299),
 (35, 0.7615941656935813),
 (36, 0.7615941652689239),
 (37, 0.761594164

In [28]:
sorted(list(enumerate(sig[indices['Neural Network Ensembles, Cross Validation, and Active Learning']])), key=lambda x: x[1], reverse=True)

[(4, 0.7615944079426235),
 (51, 0.7615942510125772),
 (200, 0.7615942201644355),
 (192, 0.7615942180793628),
 (307, 0.7615942120371357),
 (1765, 0.7615942046490161),
 (774, 0.761594199047537),
 (3245, 0.7615941986427222),
 (5825, 0.761594197740578),
 (7068, 0.7615941962233402),
 (7043, 0.7615941955137948),
 (6542, 0.7615941937780878),
 (2745, 0.7615941937070727),
 (5713, 0.7615941930729385),
 (215, 0.7615941925549669),
 (6875, 0.7615941925247525),
 (5460, 0.76159419238165),
 (5659, 0.7615941922627442),
 (69, 0.76159419204311),
 (7154, 0.7615941918966812),
 (7204, 0.761594191461793),
 (4533, 0.7615941913344633),
 (982, 0.7615941911794328),
 (80, 0.7615941907389747),
 (297, 0.7615941905312382),
 (4368, 0.7615941904631395),
 (4956, 0.7615941903275242),
 (2110, 0.7615941900834123),
 (549, 0.7615941899330323),
 (2708, 0.7615941898302401),
 (85, 0.7615941898049418),
 (6187, 0.7615941895698649),
 (67, 0.7615941894291155),
 (141, 0.7615941893307886),
 (2160, 0.7615941889867383),
 (5943, 0.7615

In [29]:
def give_rec(title, sig=sig):
    # Get the index corresponding to title
    idx = indices[title]

    # Get the pairwsie similarity scores 
    sig_scores = list(enumerate(sig[idx]))

    # Sort the paper 
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)

    # Scores of the 10 most similar papers
    sig_scores = sig_scores[1:11]

    # Paper indices
    paper_indices = [i[0] for i in sig_scores]

    # Top 10 most similar papers
    return papers_df['title'].iloc[paper_indices]

In [30]:
# Testing our content-based recommendation system with Genetic Algorithms and its use with back-propagation network
give_rec('Neural Network Ensembles, Cross Validation, and Active Learning')

51      Learning with ensembles: How overfitting can b...
200                 Balancing Between Bagging and Bumping
192     Generating Accurate and Diverse Members of a N...
307           Ensemble Methods for Phoneme Classification
1765    Co-Validation: Using Model Disagreement on Unl...
774                 Bayesian Averaging is Well-Temperated
3245    Chaitin-Kolmogorov Complexity and Generalizati...
5825    Stochastic Multiple Choice Learning for Traini...
7068    Discontinuous Generalization in Large Committe...
7043    Optimal Stopping and Effective Machine Complex...
Name: title, dtype: object

In [31]:
papers_df.iloc[3]

id                                                         1000
year                                                       1994
title         Bayesian Query Construction for Neural Network...
event_type                                                     
pdf_name      1000-bayesian-query-construction-for-neural-ne...
abstract                                       Abstract Missing
summary       Bayesian Query Construction for Neural\nNetwor...
Name: 3, dtype: object