In [626]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import euclidean_distances
from scipy.stats import pearsonr

In [627]:
# Load the paper records (JSON Lines: one JSON object per line) into a pandas
# DataFrame and rename the `paper_text` column to `summary`.
# An alternative file, 'Papers_Metadata_6K.json', was previously loaded here.
papers_df = pd.read_json('NIP_DataSet/papers_2K.json', lines=True).rename(
    columns={"paper_text": "summary"}
)
papers_df.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,summary
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [628]:
# Report the frame's (rows, columns) shape.
print("Papers : ", papers_df.shape)

Papers :  (2000, 7)


In [629]:
# Column dtypes and non-null counts. Note that placeholder values such as
# 'Abstract Missing' (and what appear to be blank event_type strings) still
# count as non-null here.
papers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          2000 non-null   int64 
 1   year        2000 non-null   int64 
 2   title       2000 non-null   object
 3   event_type  2000 non-null   object
 4   pdf_name    2000 non-null   object
 5   abstract    2000 non-null   object
 6   summary     2000 non-null   object
dtypes: int64(2), object(5)
memory usage: 109.5+ KB


In [643]:
# Inspect a single record by row position.
# NOTE(review): this cell was executed out of order -- its output lists the
# `all_content` column, which is only created in a later cell, so it will show
# a different result on Restart & Run All.
papers_df.iloc[757]

id                                                          1693
year                                                        1999
title          Dynamics of Supervised Learning with Restricte...
event_type                                                      
pdf_name       1693-dynamics-of-supervised-learning-with-rest...
abstract                                        Abstract Missing
summary        Dynamics of Supervised Learning with\nRestrict...
all_content    Dynamics of Supervised Learning with Restricte...
Name: 757, dtype: object

In [630]:
# Reverse mapping: paper title -> row position in papers_df.
# Series.drop_duplicates() compares the *values* (the row positions, which are
# always unique), so it never removed repeated titles. De-duplicate on the
# index instead, keeping the first occurrence, so that indices[title] always
# yields a single integer rather than a Series.
indices = pd.Series(papers_df.index, index=papers_df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [631]:
# Display the title -> row-position mapping (pandas elides the middle rows).
indices

title
Self-Organization of Associative Database and Its Applications                                                             0
A Mean Field Theory of Layer IV of Visual Cortex and Its Application to Artificial Neural Networks                         1
Storing Covariance by the Associative Long-Term Potentiation and Depression of Synaptic Strengths in the Hippocampus       2
Bayesian Query Construction for Neural Network Models                                                                      3
Neural Network Ensembles, Cross Validation, and Active Learning                                                            4
                                                                                                                        ... 
Single Transistor Learning Synapses                                                                                     1995
Bias, Variance and the Combination of Least Squares Estimators                                                         

In [632]:
# Row position of the query paper. calc_similarity reads this module-level
# name in its 'pearsons_correlation' branch.
ind = indices.loc['Semi-supervised Learning with Ladder Networks']

In [633]:
# TF-IDF vectorizer over word 1- to 3-grams. A term must appear in at least
# three documents to enter the vocabulary; accents are stripped (unicode mode)
# and English stop words are removed.
tfv = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
    strip_accents='unicode',
    min_df=3,
    max_features=None,
)


In [634]:
# Text that is fed to the vectorizer. Despite the column name, only the title
# is used; `abstract` and `summary` are not included.
# NOTE: this adds a column to papers_df in place, after earlier cells have
# already displayed the frame.
papers_df['all_content'] =  papers_df['title']

In [635]:
# Learn the vocabulary from `all_content` and build the sparse TF-IDF matrix
# (one row per paper, one column per retained term).
tfv_matrix = tfv.fit_transform(papers_df['all_content'])

In [636]:
# (number of papers, number of vocabulary terms)
tfv_matrix.shape

(2000, 1270)

In [637]:
def calc_similarity(method_name):
    """Compute similarity (or distance) scores from the global ``tfv_matrix``.

    Parameters
    ----------
    method_name : str
        Which measure to compute:

        * ``'sigmoid_kernel'`` / ``'linear_kernel'`` -- pairwise kernel between
          every pair of rows of ``tfv_matrix``.
        * ``'euclidean_distances'`` -- pairwise Euclidean distances. Unlike
          the other options this is a *distance*: smaller means more similar.
        * ``'pearsons_correlation'`` -- Pearson correlation between the row of
          the query paper (module-level ``ind``) and every row.

    Returns
    -------
    A 2-D pairwise array for the kernel/distance methods, or a flat list with
    one correlation per paper for ``'pearsons_correlation'``. Callers must be
    prepared for either shape.

    Raises
    ------
    ValueError
        If ``method_name`` is not one of the names above. (Previously this
        fell through to ``return matrix`` and failed with an unhelpful
        ``UnboundLocalError``.)
    """
    if method_name == 'sigmoid_kernel':
        return sigmoid_kernel(tfv_matrix, tfv_matrix)

    if method_name == 'linear_kernel':
        return linear_kernel(tfv_matrix, tfv_matrix)

    if method_name == 'euclidean_distances':
        return euclidean_distances(tfv_matrix)

    if method_name == 'pearsons_correlation':
        # pearsonr needs dense 1-D vectors, so densify once up front.
        tfv_array = tfv_matrix.toarray()
        # The query row never changes inside the loop; look it up once.
        query_row = tfv_array[ind]
        return [pearsonr(query_row, row)[0] for row in tfv_array]

    raise ValueError(
        f"Unknown method_name {method_name!r}; expected one of "
        "'sigmoid_kernel', 'linear_kernel', 'euclidean_distances', "
        "'pearsons_correlation'"
    )

In [638]:
# Pearson scores of every paper against the query paper `ind`. For this method
# the result is a flat list (one score per paper), not a pairwise matrix.
matrix = calc_similarity('pearsons_correlation')



In [639]:
def give_rec(title, matrix=matrix):
    """Return up to ten papers most similar to ``title``, best match first.

    Parameters
    ----------
    title : str
        Title of the query paper, looked up in the module-level ``indices``.
    matrix : array-like, optional
        Either a 1-D sequence holding one score per paper, or a 2-D pairwise
        matrix, in which case the row belonging to ``title`` is used. For a
        1-D input the scores are taken as given (they are relative to whatever
        paper they were computed for); ``title`` then only determines which
        row is excluded from the result. The default is the module-level
        ``matrix`` as it existed when this function was defined, so re-run the
        defining cell after recomputing ``matrix``.

    Returns
    -------
    pandas.DataFrame
        Columns ``paper_id`` (row position in ``papers_df``), ``title`` and
        ``score``, sorted by descending score. Contains fewer than ten rows
        when fewer candidates are available.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.

    Notes
    -----
    Scores are ranked highest-first, so passing a distance matrix (where
    smaller means closer) would rank the least similar papers first.
    """
    # A unique title yields a scalar, a repeated title yields several
    # positions; settle on the first occurrence in both cases.
    idx = int(np.atleast_1d(indices[title])[0])

    scores = np.asarray(matrix, dtype=float)
    if scores.ndim == 2:
        # Pairwise matrix: keep only the query paper's row.
        scores = scores[idx]

    # Stable argsort of the negated scores gives descending order while
    # keeping tied scores in row order; NaN entries sort to the end.
    order = np.argsort(-scores, kind='stable')

    # Drop the query paper explicitly (rather than assuming it ranks first)
    # and discard undefined (NaN) scores, then keep the ten best.
    keep = (order != idx) & ~np.isnan(scores[order])
    top = order[keep][:10]

    return pd.DataFrame(
        {
            "paper_id": top,
            "title": papers_df['title'].iloc[top].to_numpy(),
            "score": scores[top],
        },
        columns=["paper_id", "title", "score"],
    )

In [640]:
# Test the content-based recommender using
# 'Semi-supervised Learning with Ladder Networks' as the query paper.
give_rec('Semi-supervised Learning with Ladder Networks').head(10)

Unnamed: 0,paper_id,title,score
0,636,Semi-Supervised Support Vector Machines,0.448527
1,1317,ExtremeWeather: A large-scale climate dataset ...,0.425064
2,108,Using Unlabeled Data for Supervised Learning,0.353135
3,1281,A Fast Stochastic Error-Descent Algorithm for ...,0.327352
4,631,Dynamics of Supervised Learning with Restricte...,0.22908
5,757,Dynamics of Supervised Learning with Restricte...,0.220103
6,247,Learning from Demonstration,0.170683
7,643,Perceiving without Learning: From Spirals to I...,0.170683
8,650,Learning to Find Pictures of People,0.170683
9,765,Song Learning in Birds,0.170683
