In [19]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
import time

In [20]:
# papers_df is a pandas DataFrame; the commented-out lines below are alternative JSON-file loaders.
#papers_df = pd.read_json('Papers_Metadata_6K.json')

#papers_df.head()

#papers_df = pd.read_json('NIP_DataSet/papers_2K.json',lines=True)
#papers_df.rename(columns = {"paper_text" : "summary"},inplace=True)
#papers_df.head()

In [21]:
# Load the papers metadata from the local MongoDB instance into a DataFrame.

from pymongo import MongoClient
client = MongoClient('mongodb://localhost:27017')
db = client['Paper']
collection = db['Papers']
# Exclude MongoDB's internal `_id` field in the query projection instead of
# dropping the column afterwards: the ObjectId is never transferred, and an
# empty collection no longer raises KeyError (an empty DataFrame has no `_id`
# column to drop).
data = collection.find({}, {'_id': 0})
data_list = list(data)
papers_df = pd.DataFrame(data_list)

In [22]:
# Sanity check: (rows, columns) of the loaded metadata.
print("Papers : ", papers_df.shape)

Papers :  (6000, 9)


In [23]:
# Column names, non-null counts and dtypes of the loaded metadata.
papers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   author   6000 non-null   object
 1   day      6000 non-null   int64 
 2   id       6000 non-null   object
 3   link     6000 non-null   object
 4   month    6000 non-null   int64 
 5   summary  6000 non-null   object
 6   tag      6000 non-null   object
 7   title    6000 non-null   object
 8   year     6000 non-null   int64 
dtypes: int64(3), object(6)
memory usage: 422.0+ KB


In [24]:
# Spot check: identifier stored in the `id` column for the paper at row position 757.
papers_df['id'].iloc[757]

'1711.07459v1'

In [25]:
# Reverse mapping of indices and paper titles
indices = pd.Series(papers_df.index, index=papers_df['title']).drop_duplicates()

In [26]:
# Display the title -> row-position lookup built in the previous cell.
indices

title
Multiresolution Recurrent Neural Networks: An Application To Dialogue Response Generation       0
Sequential Short-Text Classification With Recurrent And Convolutional Neural Networks           1
A Deep Reinforcement Learning Chatbot (Short Version)                                           2
Generating Sentences by Editing Prototypes                                                      3
Dual Recurrent Attention Units for Visual Question Answering                                    4
                                                                                             ... 
Subspace Learning with Partial Information                                                   5995
Avoiding pathologies in very deep networks                                                   5996
Variational Particle Approximations                                                          5997
Predictive Interval Models for Non-parametric Regression                                     5998
Manifold Gauss

In [27]:
# Start of the wall-clock measurement; the matching `end` is taken in a later cell.
start = time.time()

In [28]:
# Row position in papers_df of the reference paper, looked up by its title.
ind = indices['Semi-Supervised Learning with Ladder Networks']

In [29]:
# TF-IDF vectorizer configuration:
#   - analyzer / token_pattern: word-level tokens made of 3 or more word characters
#   - strip_accents:            accents are normalised with the 'unicode' method
#   - stop_words:               the built-in English stop-word list is used
#   - ngram_range:              unigrams, bigrams and trigrams are all extracted
#   - max_features:             None, i.e. the vocabulary size is not capped
tfv = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{3,}',
    strip_accents='unicode',
    stop_words='english',
    ngram_range=(1, 3),
    max_features=None,
)


In [30]:
# Text that is vectorized below. Only the title is copied here; the `summary` column is not included.
papers_df['all_content'] =  papers_df['title']

In [31]:
# Fit the vectorizer on the text column and encode every paper as one TF-IDF row.
tfv_matrix = tfv.fit_transform(papers_df['all_content'])

In [32]:
# (number of papers, number of n-gram features in the fitted vocabulary)
tfv_matrix.shape

(6000, 55633)

In [33]:
def calc_similarity(method_name):
    """Build the pairwise paper-similarity matrix from the global ``tfv_matrix``.

    Every supported method returns a square array with one row and one column
    per row of ``tfv_matrix``. Larger values always mean "more similar", so
    row ``i`` can be ranked in descending order to find the papers closest to
    paper ``i`` (this is how ``give_rec`` consumes the result).

    Parameters
    ----------
    method_name : str
        One of ``'sigmoid_kernel'``, ``'linear_kernel'``,
        ``'cosine_similarity'``, ``'euclidean_distances'`` or
        ``'pearsons_correlation'``.

    Returns
    -------
    numpy.ndarray
        Pairwise similarity scores.

    Raises
    ------
    ValueError
        If ``method_name`` is not one of the supported names. (Previously an
        unknown name fell through to ``return matrix`` and surfaced as an
        ``UnboundLocalError``.)
    """
    if method_name == 'sigmoid_kernel':
        return sigmoid_kernel(tfv_matrix, tfv_matrix, gamma=0.8, coef0=0.5)

    if method_name == 'linear_kernel':
        return linear_kernel(tfv_matrix, tfv_matrix)

    if method_name == 'cosine_similarity':
        return cosine_similarity(tfv_matrix, tfv_matrix)

    if method_name == 'euclidean_distances':
        # Raw distances are "smaller is closer", the opposite of every other
        # branch, so a descending ranking would return the *least* similar
        # papers. Map distance d to 1 / (1 + d): identical rows score 1.0 and
        # the score decreases monotonically as the distance grows.
        return 1.0 / (1.0 + euclidean_distances(tfv_matrix))

    if method_name == 'pearsons_correlation':
        # Correlate every pair of rows in one vectorised call. The previous
        # loop returned a 1-D list of correlations against the global `ind`
        # only, which is not indexable as matrix[idx] like the other results.
        # NOTE: this densifies the TF-IDF matrix, which can be memory-hungry.
        return np.corrcoef(tfv_matrix.toarray())

    raise ValueError(
        "Unknown similarity method %r; expected one of 'sigmoid_kernel', "
        "'linear_kernel', 'cosine_similarity', 'euclidean_distances', "
        "'pearsons_correlation'." % (method_name,)
    )

In [34]:
# Pairwise score matrix that give_rec picks up as its default `matrix` argument.
matrix = calc_similarity('sigmoid_kernel')

In [35]:
def give_rec(title, matrix=matrix):
    """Return up to 10 papers most similar to the paper called ``title``.

    Parameters
    ----------
    title : str
        Exact title of the query paper; it is looked up in the global
        ``indices`` mapping (title -> row position in ``papers_df``).
    matrix : array-like, optional
        Pairwise similarity matrix where ``matrix[i]`` holds the scores of
        paper ``i`` against every paper (larger = more similar). The default
        is the global ``matrix`` *as it was when this cell was executed*;
        re-run this cell or pass ``matrix`` explicitly after recomputing it.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``title`` and ``score``, ordered from most to least
        similar. Contains fewer than 10 rows when fewer candidates exist.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Get the row position corresponding to the title
    idx = indices[title]
    if np.ndim(idx) > 0:
        # A repeated title makes the lookup return several positions;
        # use the first occurrence.
        idx = np.asarray(idx)[0]

    # Pairwise similarity scores of the query paper against every paper
    scores = np.asarray(matrix[idx], dtype=float)

    # Descending order; the stable sort keeps ties in original row order,
    # matching sorted(..., reverse=True).
    order = np.argsort(-scores, kind='stable')

    # Drop the query paper by position instead of assuming it is ranked
    # first (ties could otherwise push it into the recommendations), then
    # keep at most 10 neighbours so small corpora do not raise IndexError.
    top = order[order != idx][:10]

    return pd.DataFrame(
        {
            "id": papers_df['id'].iloc[top].to_numpy(),
            "title": papers_df['title'].iloc[top].to_numpy(),
            "score": scores[top],
        },
        columns=["id", "title", "score"],
    )

In [36]:
# Test the content-based recommendation system on a sample query title.

# Alternative query title:
# 'Multiresolution Recurrent Neural Networks: An Application To Dialogue Response Generation'
give_rec('Semi-Supervised Learning with Ladder Networks').head(10)

Unnamed: 0,id,title,score
0,1711.07476v2,Virtual Adversarial Ladder Networks For Semi-s...,0.679026
1,1707.09219v4,Recurrent Ladder Networks,0.635579
2,1611.02320v1,Adversarial Ladder Networks,0.63485
3,1406.5298v2,Semi-Supervised Learning with Deep Generative ...,0.631932
4,1612.01756v3,Video Ladder Networks,0.629081
5,1706.02124v2,Semi-Supervised Phoneme Recognition with Recur...,0.619887
6,1803.01216v1,Deep Bayesian Active Semi-Supervised Learning,0.61618
7,1509.01168v1,Semi-described and semi-supervised learning wi...,0.611418
8,1202.3702v1,Semi-supervised Learning with Density Based Di...,0.606573
9,1206.5240v1,Analysis of Semi-Supervised Learning with the ...,0.605357


In [37]:
# End of the wall-clock measurement that began when `start` was set.
end = time.time()

In [38]:
# Elapsed seconds between the `start` and `end` cells (includes any idle time between running them).
print(end - start)

0.8944108486175537
