In [264]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
import time
import string
import nltk

In [265]:
#papers_df is pandas dataframe object
#papers_df = pd.read_json('Papers_Metadata_6K.json')

#papers_df.head()

#papers_df = pd.read_json('NIP_DataSet/papers_2K.json',lines=True)
#papers_df.rename(columns = {"paper_text" : "summary"},inplace=True)
#papers_df.head()

In [266]:
#Get papers metadata from mongodb

from pymongo import MongoClient
client = MongoClient('mongodb://localhost:27017')
db = client['Paper']
collection = db['Papers']
data = collection.find({})
data_list = list(data)
papers_df = pd.DataFrame(data_list)
papers_df.drop(columns=['_id'],inplace=True)

In [267]:
papers_df['summary'] = papers_df['summary'].str.lower()

In [268]:
papers_df['title'][4]

'Dual Recurrent Attention Units for Visual Question Answering'

In [269]:
papers_df['summary'] = papers_df['summary'].str.replace('[{}]'.format(string.punctuation), '')

In [270]:
papers_df['summary'] = papers_df['summary'].str.replace('\d+|\n+','')

In [271]:
papers_df['summary'] = papers_df['summary'].str.strip()

In [272]:
#w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
#englishStemmer=SnowballStemmer("english") #define stemming dict

In [273]:
#def stemm_texts(text):
#   return [englishStemmer.stem(w) for w in w_tokenizer.tokenize(text)]

In [274]:
#papers_df['summary'] = papers_df.summary.apply(stemm_texts)

In [275]:
papers_df['summary'][4]

'we propose an architecture for vqa which utilizes recurrent layers togenerate visual and textual attention the memory characteristic of theproposed recurrent attention units offers a rich joint embedding of visual andtextual features and enables the model to reason relations between severalparts of the image and question our single model outperforms the first placewinner on the vqa  dataset performs within margin to the currentstateoftheart ensemble model we also experiment with replacing attentionmechanisms in other stateoftheart models with our implementation and showincreased accuracy in both cases our recurrent attention mechanism improvesperformance in tasks requiring sequential or relational reasoning on the vqadataset'

In [276]:
print("Papers : ", papers_df.shape)

Papers :  (6000, 9)


In [277]:
papers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   author   6000 non-null   object
 1   day      6000 non-null   int64 
 2   id       6000 non-null   object
 3   link     6000 non-null   object
 4   month    6000 non-null   int64 
 5   summary  6000 non-null   object
 6   tag      6000 non-null   object
 7   title    6000 non-null   object
 8   year     6000 non-null   int64 
dtypes: int64(3), object(6)
memory usage: 422.0+ KB


In [278]:
papers_df.iloc[757].id

'1711.07459v1'

In [279]:
# Reverse mapping of indices and paper titles
indices = pd.Series(papers_df.index, index=papers_df['title']).drop_duplicates()

In [280]:
indices

title
Multiresolution Recurrent Neural Networks: An Application To Dialogue Response Generation       0
Sequential Short-Text Classification With Recurrent And Convolutional Neural Networks           1
A Deep Reinforcement Learning Chatbot (Short Version)                                           2
Generating Sentences by Editing Prototypes                                                      3
Dual Recurrent Attention Units for Visual Question Answering                                    4
                                                                                             ... 
Subspace Learning with Partial Information                                                   5995
Avoiding pathologies in very deep networks                                                   5996
Variational Particle Approximations                                                          5997
Predictive Interval Models for Non-parametric Regression                                     5998
Manifold Gauss

In [281]:
start = time.time()

In [282]:
ind = indices['Semi-Supervised Learning with Ladder Networks']

In [283]:
tfv = TfidfVectorizer(max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{3,}',
            ngram_range=(1, 3),
            stop_words = 'english')


In [284]:
papers_df['all_content'] =  papers_df['summary']

In [285]:
tfv_matrix = tfv.fit_transform(papers_df['all_content'])

In [286]:
tfv_matrix.shape

(6000, 906840)

In [287]:
#This function helps to find the most similar papers to specified paper.
def calc_similarity(method_name):
    
    if method_name == 'sigmoid_kernel':
        matrix = sigmoid_kernel(tfv_matrix, tfv_matrix,gamma = 0.8, coef0=0.5)
    elif method_name == 'linear_kernel':
        matrix = linear_kernel(tfv_matrix, tfv_matrix)
    elif method_name == 'euclidean_distances':
        matrix = euclidean_distances(tfv_matrix)
    elif method_name == 'cosine_similarity':
        matrix = cosine_similarity(tfv_matrix,tfv_matrix)
    elif method_name == 'pearsons_correlation':
        tfv_array = tfv_matrix.toarray()
        matrix = []
        for i in range(len(tfv_array)):
             matrix.append(pearsonr(tfv_array[ind], tfv_array[i])[0])
        
    
    return matrix

In [288]:
matrix = calc_similarity('sigmoid_kernel')

In [289]:
def give_rec(title, matrix=matrix):
    # Get the index corresponding to title
    idx = indices[title]

    # Get the pairwsie similarity scores 
    sig_scores = list(enumerate(matrix[idx]))

    # Sort the paper 
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)

    # Scores of the 10 most similar papers
    sig_scores = sig_scores[1:11]
    
    data = []
    count = 0
    while count<10:
        data.append([(papers_df.iloc[sig_scores[count][0]].id),(papers_df['title'].iloc[sig_scores[count][0]]), sig_scores[count][1]])
        count=count+1

    df = pd.DataFrame(data, columns = ["id", "title", "score"])
    return df

In [290]:
# Testing our content-based recommendation system with Genetic Algorithms and its use with back-propagation network

#str = 'Multiresolution Recurrent Neural Networks: An Application To Dialogue Response Generation'
give_rec('A Unified Deep Neural Network for Speaker and Language Recognition').head(10)

Unnamed: 0,id,title,score
0,1505.06427v1,Deep Speaker Vectors for Semi Text-independent...,0.503852
1,1503.05471v1,Shared latent subspace modelling within Gaussi...,0.500789
2,1506.08349v1,Improved Deep Speaker Feature Learning for Tex...,0.495004
3,1504.01483v1,Transferring Knowledge from a RNN to a DNN,0.48786
4,1607.00410v1,Domain Adaptation for Neural Networks by Param...,0.487703
5,1508.01746v2,Using Deep Learning for Detecting Spoofing Att...,0.48762
6,1710.10467v2,Generalized End-to-End Loss for Speaker Verifi...,0.485437
7,1603.09643v4,Multi-task Recurrent Model for Speech and Spea...,0.48334
8,1504.01482v1,Deep Recurrent Neural Networks for Acoustic Mo...,0.482402
9,1711.02074v1,End-to-End Abnormality Detection in Medical Im...,0.481828


In [291]:
end = time.time()

In [292]:
print(end - start)

7.056185960769653
