In [450]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import euclidean_distances
from scipy.stats import pearsonr

In [451]:
# Load the 2K-paper subset as a pandas DataFrame. The file is read as JSON Lines
# (lines=True), and the `paper_text` column is exposed under the name `summary`.
# The rename is chained instead of done in place so that this cell builds
# `papers_df` in one expression and is safe to re-run.
papers_df = pd.read_json('NIP_DataSet/papers_2K.json', lines=True).rename(
    columns={"paper_text": "summary"}
)
papers_df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,summary
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [452]:
# Sanity check: report (rows, columns) of the loaded frame.
print("Papers : ", papers_df.shape)

Papers :  (2000, 7)


In [453]:
# Column dtypes and non-null counts, to confirm nothing is missing before vectorizing.
papers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          2000 non-null   int64 
 1   year        2000 non-null   int64 
 2   title       2000 non-null   object
 3   event_type  2000 non-null   object
 4   pdf_name    2000 non-null   object
 5   abstract    2000 non-null   object
 6   summary     2000 non-null   object
dtypes: int64(2), object(5)
memory usage: 109.5+ KB


In [454]:
# Spot-check a single record, selected by integer row position (not by the `id` column).
papers_df.iloc[757]

id                                                         1693
year                                                       1999
title         Dynamics of Supervised Learning with Restricte...
event_type                                                     
pdf_name      1693-dynamics-of-supervised-learning-with-rest...
abstract                                       Abstract Missing
summary       Dynamics of Supervised Learning with\nRestrict...
Name: 757, dtype: object

In [455]:
# Reverse mapping: paper title -> row position in papers_df.
# Series.drop_duplicates() compares the *values* (row positions, which are always
# unique), so it never removes repeated titles. De-duplicate on the index instead,
# keeping the first occurrence, so that indices[title] is always a single integer.
indices = pd.Series(papers_df.index, index=papers_df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [456]:
# Display the title -> row-position lookup (pandas truncates the long output).
indices

title
Self-Organization of Associative Database and Its Applications                                                             0
A Mean Field Theory of Layer IV of Visual Cortex and Its Application to Artificial Neural Networks                         1
Storing Covariance by the Associative Long-Term Potentiation and Depression of Synaptic Strengths in the Hippocampus       2
Bayesian Query Construction for Neural Network Models                                                                      3
Neural Network Ensembles, Cross Validation, and Active Learning                                                            4
                                                                                                                        ... 
Single Transistor Learning Synapses                                                                                     1995
Bias, Variance and the Combination of Least Squares Estimators                                                         

In [457]:
# Look up the row position of one reference paper by its exact title.
# NOTE(review): this raises KeyError if the title is not present in the loaded
# subset -- confirm the paper exists in papers_2K.json.
ind = indices['Semi-supervised Learning with Ladder Networks']

In [458]:
# TF-IDF vectorizer configuration:
#   - tokens are runs of at least three word characters
#   - accents are stripped (unicode), English stop words are removed
#   - unigrams, bigrams and trigrams are all emitted
#   - the vocabulary size is not capped
tfv = TfidfVectorizer(
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{3,}',
    ngram_range=(1, 3),
    stop_words='english',
)


In [459]:
# Text that is fed to the TF-IDF vectorizer. Only the title is used here; the
# `summary` column is not part of `all_content`.
papers_df['all_content'] =  papers_df['title']

In [460]:
# Learn the vocabulary from `all_content` and build the TF-IDF matrix in one step.
tfv_matrix = tfv.fit_transform(papers_df['all_content'])

In [461]:
# (number of papers, number of n-gram features)
tfv_matrix.shape

(2000, 18877)

In [462]:
def calc_similarity(method_name):
    """Build a pairwise paper-to-paper score matrix from the global ``tfv_matrix``.

    Parameters
    ----------
    method_name : str
        One of ``'sigmoid_kernel'``, ``'linear_kernel'``,
        ``'euclidean_distances'`` or ``'pearsons_correlation'``.

    Returns
    -------
    A square 2-D array with one row and one column per row of ``tfv_matrix``;
    entry ``[i, j]`` is the score between paper ``i`` and paper ``j``.

    Raises
    ------
    ValueError
        If ``method_name`` is not one of the supported names. (Previously an
        unknown name fell through to ``return matrix`` and raised
        ``UnboundLocalError``.)

    Notes
    -----
    * ``'euclidean_distances'`` returns *distances* (smaller means more alike),
      whereas the other methods return similarities (larger means more alike).
      Callers that rank in descending order must account for this.
    * ``'pearsons_correlation'`` now returns the full pairwise correlation
      matrix, like every other method. It used to return a 1-D list of
      correlations against a single row picked by the module-level variable
      ``ind``, which could not be indexed per paper. Rows with zero variance
      yield NaN correlations.
    """
    if method_name == 'sigmoid_kernel':
        return sigmoid_kernel(tfv_matrix, tfv_matrix, gamma=0.8, coef0=0.5)

    if method_name == 'linear_kernel':
        return linear_kernel(tfv_matrix, tfv_matrix)

    if method_name == 'euclidean_distances':
        return euclidean_distances(tfv_matrix)

    if method_name == 'pearsons_correlation':
        # np.corrcoef treats every row as one variable, so a single call gives
        # the Pearson correlation between all pairs of papers. It needs a dense
        # array, hence toarray().
        return np.corrcoef(tfv_matrix.toarray())

    raise ValueError(
        "Unknown method_name {!r}; expected 'sigmoid_kernel', 'linear_kernel', "
        "'euclidean_distances' or 'pearsons_correlation'".format(method_name)
    )

In [463]:
# Pairwise scores using the linear kernel. This global is captured as the default
# `matrix` argument of give_rec when that function is defined.
matrix = calc_similarity('linear_kernel')

In [464]:
def give_rec(title, matrix=matrix, top_n=10):
    """Return the papers whose scores against ``title`` are highest.

    Parameters
    ----------
    title : str
        Exact paper title; it is looked up in the global ``indices`` mapping.
    matrix : 2-D array-like, optional
        Pairwise score matrix in which larger means more similar. The default
        is bound to the global ``matrix`` as it existed when this function was
        defined; pass a matrix explicitly (or re-run this cell) after
        recomputing it.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.DataFrame
        Columns ``paper_id`` (row position in ``papers_df``, not the ``id``
        column), ``title`` and ``score``, ordered by descending score. Holds
        fewer than ``top_n`` rows when there are not enough other papers.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Get the row position corresponding to title.
    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series;
    # fall back to its first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Rank every other paper by score. The query paper is excluded explicitly
    # instead of assuming it sorts first, which is not guaranteed on ties or
    # for an all-zero TF-IDF row.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(matrix[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_n]  # slicing is safe even when fewer than top_n candidates exist

    rows = [[pos, papers_df['title'].iloc[pos], score] for pos, score in ranked]
    return pd.DataFrame(rows, columns=["paper_id", "title", "score"])

In [465]:
# Smoke-test the content-based recommender: list the papers most similar to
# 'Pulsestream Synapses with Non-Volatile Analogue Amorphous-Silicon Memories'.
give_rec('Pulsestream Synapses with Non-Volatile Analogue Amorphous-Silicon Memories').head(10)

Unnamed: 0,paper_id,title,score
0,1898,A Silicon Axon,0.083941
1,1708,HIGH DENSITY ASSOCIATIVE MEMORIES,0.065761
2,1995,Single Transistor Learning Synapses,0.064808
3,370,Dynamic Stochastic Synapses as Computational U...,0.056846
4,1778,Analysis of Short Term Memories for Neural Net...,0.053677
5,1735,WATTLE: A Trainable Gain Analogue VLSI Neural ...,0.050006
6,990,A Silicon Primitive for Competitive Learning,0.049418
7,754,Bifurcation Analysis of a Silicon Neuron,0.049018
8,1905,Scaling Properties of Coarse-Coded Symbol Memo...,0.048851
9,7,ICEG Morphology Classification using an Analog...,0.047688
