In [33]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import euclidean_distances
from scipy.stats import pearsonr

In [34]:
# Load the paper metadata from JSON into a pandas DataFrame.
papers_df = pd.read_json('Papers_Metadata_6K.json')

# Preview the first rows.
papers_df.head()

Unnamed: 0,author,day,id,link,month,summary,tag,title,year
0,"[{'name': 'Ahmed Osman'}, {'name': 'Wojciech S...",1,1802.00209v1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",2,We propose an architecture for VQA which utili...,"[{'term': 'cs.AI', 'scheme': 'http://arxiv.org...",Dual Recurrent Attention Units for Visual Ques...,2018
1,"[{'name': 'Ji Young Lee'}, {'name': 'Franck De...",12,1603.03827v1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",3,Recent approaches based on artificial neural n...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",Sequential Short-Text Classification with Recu...,2016
2,"[{'name': 'Iulian Vlad Serban'}, {'name': 'Tim...",2,1606.00776v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",6,We introduce the multiresolution recurrent neu...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",Multiresolution Recurrent Neural Networks: An ...,2016
3,"[{'name': 'Sebastian Ruder'}, {'name': 'Joachi...",23,1705.08142v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",5,Multi-task learning is motivated by the observ...,"[{'term': 'stat.ML', 'scheme': 'http://arxiv.o...",Learning what to share between loosely related...,2017
4,"[{'name': 'Iulian V. Serban'}, {'name': 'Chinn...",7,1709.02349v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",9,We present MILABOT: a deep reinforcement learn...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",A Deep Reinforcement Learning Chatbot,2017


In [35]:
# Report the (rows, columns) shape of the loaded frame.
print("Papers : ", papers_df.shape)

Papers :  (6000, 9)


In [36]:
# Summarise each column's dtype and non-null count.
papers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   author   6000 non-null   object
 1   day      6000 non-null   int64 
 2   id       6000 non-null   object
 3   link     6000 non-null   object
 4   month    6000 non-null   int64 
 5   summary  6000 non-null   object
 6   tag      6000 non-null   object
 7   title    6000 non-null   object
 8   year     6000 non-null   int64 
dtypes: int64(3), object(6)
memory usage: 422.0+ KB


In [37]:
# Reverse lookup from paper title to row index.
# Series.drop_duplicates() compares the values (the row indices, which are
# already unique), not the titles, so repeated titles are removed through the
# index instead; the first occurrence of each title is kept.
indices = pd.Series(papers_df.index, index=papers_df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [38]:
# Inspect the title -> row index lookup.
indices

title
Dual Recurrent Attention Units for Visual Question Answering                                       0
Sequential Short-Text Classification with Recurrent and Convolutional\n  Neural Networks           1
Multiresolution Recurrent Neural Networks: An Application to Dialogue\n  Response Generation       2
Learning what to share between loosely related tasks                                               3
A Deep Reinforcement Learning Chatbot                                                              4
                                                                                                ... 
Learning the Parameters of Determinantal Point Process Kernels                                  5995
Variational Particle Approximations                                                             5996
Avoiding pathologies in very deep networks                                                      5997
Predictive Interval Models for Non-parametric Regression                             

In [39]:
# Look up the row index of one example paper by its exact title.
ind = indices['Learning what to share between loosely related tasks']

In [40]:
# TF-IDF vectoriser for the combined paper text.
tfv = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',   # any run of one or more word characters is a token
    ngram_range=(1, 3),        # unigrams, bigrams and trigrams
    stop_words='english',
    strip_accents='unicode',
    min_df=3,                  # drop terms found in fewer than 3 documents
    max_features=None,         # no cap on vocabulary size
)


In [41]:
# Combine title, authors and abstract into one text field per paper.
# Each part is cast to str so a non-string value cannot break the join, and
# the parts are separated by a space so the last word of one field never
# fuses with the first word of the next.
papers_df['all_content'] = (
    papers_df['title'].astype(str)
    + ' ' + papers_df['author'].astype(str)
    + ' ' + papers_df['summary'].astype(str)
)

In [42]:
# Fit the vectoriser on the combined text and build the document-term matrix.
tfv_matrix = tfv.fit_transform(papers_df['all_content'])

In [43]:
# (number of papers, number of terms kept in the vocabulary)
tfv_matrix.shape

(6000, 47152)

In [44]:
def calc_similarity(method_name):
    """Compute a pairwise score matrix between all papers in ``tfv_matrix``.

    Parameters
    ----------
    method_name : str
        One of ``'sigmoid_kernel'``, ``'linear_kernel'``,
        ``'euclidean_distances'`` or ``'pearsons_correlation'``.

    Returns
    -------
    numpy.ndarray
        Square array with one row and one column per paper; entry ``[i, j]``
        is the score between paper ``i`` and paper ``j``.

        NOTE: ``'euclidean_distances'`` yields distances (smaller means more
        alike), whereas the other three methods yield similarities (larger
        means more alike). Rank accordingly.

    Raises
    ------
    ValueError
        If ``method_name`` is not one of the supported names.
    """
    if method_name == 'sigmoid_kernel':
        return sigmoid_kernel(tfv_matrix, tfv_matrix)
    if method_name == 'linear_kernel':
        return linear_kernel(tfv_matrix, tfv_matrix)
    if method_name == 'euclidean_distances':
        return euclidean_distances(tfv_matrix)
    if method_name == 'pearsons_correlation':
        # np.corrcoef treats each row as one variable, so this gives the full
        # paper-by-paper correlation matrix, the same shape as the other
        # branches. It needs the dense array, which can be large.
        return np.corrcoef(tfv_matrix.toarray())

    raise ValueError(
        "Unknown method_name {!r}; expected 'sigmoid_kernel', 'linear_kernel', "
        "'euclidean_distances' or 'pearsons_correlation'".format(method_name)
    )

In [45]:
# Pairwise sigmoid-kernel scores; give_rec uses this as its default `matrix`.
matrix = calc_similarity('sigmoid_kernel')

In [46]:
def give_rec(index, matrix=matrix, top_n=10, descending=True):
    """Return the titles of the papers that best match a given paper.

    Parameters
    ----------
    index : str
        Exact title of the query paper, used as a key into the global
        ``indices`` Series.
    matrix : array-like, optional
        Pairwise score matrix; row ``i`` holds the scores of paper ``i``
        against every paper. The default is the global ``matrix`` as it
        existed when this function was defined, so re-run this cell after
        recomputing ``matrix``.
    top_n : int, optional
        Number of recommendations to return (default 10).
    descending : bool, optional
        True (default) ranks larger scores first, as for a similarity matrix.
        Pass False for a distance matrix, where smaller means closer.

    Returns
    -------
    pandas.Series
        Titles of the recommended papers, best match first, taken from
        ``papers_df['title']``.

    Raises
    ------
    KeyError
        If ``index`` is not a title present in ``indices``.
    """
    # Get the row index corresponding to the title.
    idx = indices[index]
    # A title that occurs more than once yields a Series; use its first row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Scores of the query paper against every paper.
    scores = np.asarray(matrix[idx]).ravel()

    # Stable sort so that tied scores keep their original row order.
    order = np.argsort(-scores if descending else scores, kind='stable')

    # Remove the query paper itself by row index rather than assuming it is
    # ranked first, then keep the best `top_n` of what remains.
    order = order[order != idx][:top_n]

    return papers_df['title'].iloc[order]

In [47]:
# Test the content-based recommender on "Learning what to share between loosely related tasks"
give_rec('Learning what to share between loosely related tasks')

2337    A Convex Formulation for Learning Task Relatio...
1454    Multi-Task Learning of Keyphrase Boundary Clas...
5828    Learning Task Grouping and Overlap in Multi-ta...
507     Multi-task Learning of Pairwise Sequence Class...
2690    An Overview of Multi-Task Learning in Deep Neu...
5763    A Convex Feature Learning Formulation for Late...
5054    Multi-task learning of time series and its app...
2856    DiGrad: Multi-Task Reinforcement Learning with...
5454    Transductive Learning for Multi-Task Copula Pr...
4916    Efficient Multi-task Feature and Relationship ...
Name: title, dtype: object

In [48]:
# Show the full record at row 3, the paper queried above.
papers_df.iloc[3]

author         [{'name': 'Sebastian Ruder'}, {'name': 'Joachi...
day                                                           23
id                                                  1705.08142v2
link           [{'rel': 'alternate', 'href': 'http://arxiv.or...
month                                                          5
summary        Multi-task learning is motivated by the observ...
tag            [{'term': 'stat.ML', 'scheme': 'http://arxiv.o...
title          Learning what to share between loosely related...
year                                                        2017
all_content    Learning what to share between loosely related...
Name: 3, dtype: object