In [100]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import euclidean_distances
from scipy.stats import pearsonr

In [101]:
# Load the paper metadata from the JSON file into a pandas DataFrame.
papers_df = pd.read_json('Papers_Metadata_6K.json')

# Preview the first few rows to check the columns that were read.
papers_df.head()

Unnamed: 0,author,day,id,link,month,summary,tag,title,year
0,"[{'name': 'Ahmed Osman'}, {'name': 'Wojciech S...",1,1802.00209v1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",2,We propose an architecture for VQA which utili...,"[{'term': 'cs.AI', 'scheme': 'http://arxiv.org...",Dual Recurrent Attention Units for Visual Ques...,2018
1,"[{'name': 'Ji Young Lee'}, {'name': 'Franck De...",12,1603.03827v1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",3,Recent approaches based on artificial neural n...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",Sequential Short-Text Classification with Recu...,2016
2,"[{'name': 'Iulian Vlad Serban'}, {'name': 'Tim...",2,1606.00776v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",6,We introduce the multiresolution recurrent neu...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",Multiresolution Recurrent Neural Networks: An ...,2016
3,"[{'name': 'Sebastian Ruder'}, {'name': 'Joachi...",23,1705.08142v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",5,Multi-task learning is motivated by the observ...,"[{'term': 'stat.ML', 'scheme': 'http://arxiv.o...",Learning what to share between loosely related...,2017
4,"[{'name': 'Iulian V. Serban'}, {'name': 'Chinn...",7,1709.02349v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",9,We present MILABOT: a deep reinforcement learn...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",A Deep Reinforcement Learning Chatbot,2017


In [102]:
# Report the DataFrame shape as (rows, columns).
print("Papers : ", papers_df.shape)

Papers :  (6000, 9)


In [103]:
# Summarise each column's dtype and non-null count.
papers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   author   6000 non-null   object
 1   day      6000 non-null   int64 
 2   id       6000 non-null   object
 3   link     6000 non-null   object
 4   month    6000 non-null   int64 
 5   summary  6000 non-null   object
 6   tag      6000 non-null   object
 7   title    6000 non-null   object
 8   year     6000 non-null   int64 
dtypes: int64(3), object(6)
memory usage: 422.0+ KB


In [104]:
# Reverse mapping: paper title -> row index in papers_df.
# Titles are used verbatim as keys, including any embedded line breaks.
indices = pd.Series(papers_df.index, index=papers_df['title'])

# Series.drop_duplicates() compares the *values*, which here are the already
# unique row indices, so it never removed repeated titles. De-duplicate on the
# title index instead (keeping the first occurrence) so that indices[title]
# always yields a single integer rather than a Series.
indices = indices[~indices.index.duplicated(keep='first')]

In [105]:
# Inspect the title -> row index lookup.
indices

title
Dual Recurrent Attention Units for Visual Question Answering                                       0
Sequential Short-Text Classification with Recurrent and Convolutional\n  Neural Networks           1
Multiresolution Recurrent Neural Networks: An Application to Dialogue\n  Response Generation       2
Learning what to share between loosely related tasks                                               3
A Deep Reinforcement Learning Chatbot                                                              4
                                                                                                ... 
Learning the Parameters of Determinantal Point Process Kernels                                  5995
Variational Particle Approximations                                                             5996
Avoiding pathologies in very deep networks                                                      5997
Predictive Interval Models for Non-parametric Regression                             

In [106]:
# Row index of the query paper. calc_similarity's 'pearsons_correlation'
# branch reads this global as the reference row.
ind = indices['Learning what to share between loosely related tasks']

In [107]:
# Replace missing abstracts with empty strings before they are vectorised.
papers_df['summary'] = papers_df['summary'].fillna('')

# TF-IDF vectoriser over word-level 1- to 3-grams, with accents stripped and
# the built-in English stop-word list applied.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)


In [108]:
# Fit the vectoriser on every summary and build the document-term matrix
# (one row per paper).
tfv_matrix = tfv.fit_transform(papers_df['summary'])

In [109]:
# (number of papers, number of n-gram features kept by the vectoriser)
tfv_matrix.shape

(6000, 38728)

In [88]:
def calc_similarity(method_name, features=None, ref_index=None):
    """Score the similarity between papers with the requested measure.

    Parameters
    ----------
    method_name : str
        One of 'sigmoid_kernel', 'linear_kernel', 'euclidean_distances' or
        'pearsons_correlation'.
    features : array-like or sparse matrix, optional
        Feature matrix with one row per item. Defaults to the notebook-level
        ``tfv_matrix`` (looked up when the function is called).
    ref_index : int, optional
        Row of the reference item. Only used by 'pearsons_correlation'.
        Defaults to the notebook-level ``ind``.

    Returns
    -------
    For the three pairwise measures: the result of the corresponding
    function applied to ``features`` against itself. Note that
    'euclidean_distances' yields distances, so smaller means closer.
    For 'pearsons_correlation': a flat list with one correlation coefficient
    per row, each measured against row ``ref_index`` -- not a full matrix.

    Raises
    ------
    ValueError
        If ``method_name`` is not one of the supported names. Previously this
        fell through to ``return matrix`` and failed with UnboundLocalError.
    """
    if features is None:
        features = tfv_matrix

    if method_name == 'sigmoid_kernel':
        return sigmoid_kernel(features, features)
    if method_name == 'linear_kernel':
        return linear_kernel(features, features)
    if method_name == 'euclidean_distances':
        return euclidean_distances(features)
    if method_name == 'pearsons_correlation':
        if ref_index is None:
            ref_index = ind
        # pearsonr needs dense 1-D vectors. This densifies the whole matrix,
        # which can be memory-hungry for a large vocabulary.
        if hasattr(features, 'toarray'):
            dense = features.toarray()
        else:
            dense = np.asarray(features)
        reference = dense[ref_index]
        return [pearsonr(reference, row)[0] for row in dense]

    raise ValueError(
        f"Unknown similarity method {method_name!r}; expected one of "
        "'sigmoid_kernel', 'linear_kernel', 'euclidean_distances', "
        "'pearsons_correlation'."
    )

In [89]:
# Despite the name, for 'pearsons_correlation' this is a flat list holding one
# correlation score per paper, each measured against the paper at row `ind`.
matrix = calc_similarity('pearsons_correlation')

In [90]:
# Score between the query paper and the paper at row 0.
matrix[0]

0.009652453031044998

In [92]:
# Pair every score with the row index of the paper it belongs to.
[(row, score) for row, score in enumerate(matrix)]

[(0, 0.009652453031044998),
 (1, 0.022481550595569593),
 (2, 0.03275045540284626),
 (3, 1.0),
 (4, 0.030372527458996264),
 (5, 0.0037340640213985982),
 (6, 0.028897507742922227),
 (7, 0.0003618653483319724),
 (8, 0.02332311420436821),
 (9, 0.012330486630079882),
 (10, 0.04721292952808815),
 (11, 0.04778177958108683),
 (12, 0.04467023052830753),
 (13, 0.03786318776866164),
 (14, 0.02254844828320242),
 (15, 0.021601578078557095),
 (16, 0.016239438039249866),
 (17, 0.03352558320624851),
 (18, 0.053595655366389834),
 (19, 0.026746895333300356),
 (20, 0.033041991414369934),
 (21, 0.012835969994258217),
 (22, 0.05611542184473659),
 (23, 0.10838204487041384),
 (24, 0.020510199516659704),
 (25, 0.0251508155932394),
 (26, 0.024244656706685488),
 (27, 0.03786111779995415),
 (28, 0.0271279164990922),
 (29, 0.017245019449138976),
 (30, 0.00666898325253309),
 (31, 0.029891164736444303),
 (32, 0.013691566448581938),
 (33, 0.024010859723678384),
 (34, 0.027479916726348093),
 (35, 0.002075831085994497

In [93]:
# Rank the (row index, score) pairs from the highest score to the lowest.
sorted(enumerate(matrix), key=lambda pair: pair[1], reverse=True)

[(3, 1.0),
 (2337, 0.306385243808494),
 (5828, 0.25549488722723845),
 (1454, 0.1938042203910404),
 (5054, 0.18179121257701553),
 (2856, 0.17338088017319872),
 (5763, 0.17084117140251692),
 (5384, 0.16684821352899534),
 (5454, 0.16192624382249504),
 (507, 0.16057907111641007),
 (1495, 0.1522137539366885),
 (4916, 0.15128258514476745),
 (275, 0.130575783035883),
 (2690, 0.12825157767258247),
 (5560, 0.12549764646684844),
 (5598, 0.12128362629069017),
 (2806, 0.11912277757896803),
 (5135, 0.11898191985947756),
 (2841, 0.11853382562381629),
 (1265, 0.11555615556150353),
 (623, 0.11261229891869538),
 (5685, 0.11068980557666926),
 (3865, 0.1098566314347251),
 (5409, 0.10887597869426473),
 (311, 0.1084818842736917),
 (5601, 0.10843691558272328),
 (23, 0.10838204487041384),
 (1156, 0.10695410982992862),
 (4406, 0.10429030361664089),
 (3370, 0.10426210962591792),
 (2994, 0.10267200229379081),
 (182, 0.10215419106193468),
 (5680, 0.10189387679885616),
 (1282, 0.10168838518238811),
 (2575, 0.0979

In [97]:
def give_rec(title, matrix=None, top_n=10, descending=True):
    """Return the titles of the papers most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title of the query paper, as stored in ``indices``.
    matrix : array-like, optional
        Either a 2-D pairwise matrix (row ``i`` holds the scores of paper
        ``i`` against every paper) or a 1-D sequence of scores for the query
        paper. When omitted, the notebook-level ``matrix`` is looked up at
        call time, so a recomputed ``matrix`` is picked up without redefining
        this function.
        NOTE: a 1-D sequence must have been computed for the same paper as
        ``title``; this function cannot verify that.
    top_n : int, optional
        Number of recommendations to return (default 10).
    descending : bool, optional
        True (default) when larger scores mean "more similar". Pass False for
        distance measures, where smaller is closer.

    Returns
    -------
    pandas.Series
        Titles of the recommended papers, best match first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    NameError
        If ``matrix`` is omitted and no notebook-level ``matrix`` exists.
    ValueError
        If ``matrix`` is neither 1-D nor 2-D.
    """
    if matrix is None:
        # The parameter shadows the notebook-level variable of the same name,
        # so fetch that one explicitly from the module namespace.
        try:
            matrix = globals()['matrix']
        except KeyError:
            raise NameError(
                "no similarity scores given and no global 'matrix' defined"
            ) from None

    # Row index of the query paper. A repeated title makes the lookup return
    # several rows; use the first one.
    idx = int(np.atleast_1d(indices[title])[0])

    scores = np.asarray(matrix, dtype=float)
    if scores.ndim == 2:
        scores = scores[idx]
    elif scores.ndim != 1:
        raise ValueError(
            f"matrix must be 1-D or 2-D, got {scores.ndim} dimensions"
        )

    # Drop the query paper by index rather than assuming it sorts first, and
    # skip NaN scores, which would otherwise corrupt the sort order.
    candidates = [
        (row, score)
        for row, score in enumerate(scores)
        if row != idx and not np.isnan(score)
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=descending)

    paper_indices = [row for row, _ in candidates[:top_n]]
    return papers_df['title'].iloc[paper_indices]

In [98]:
# Try the content-based recommender on the multi-task learning paper used as the query
give_rec('Learning what to share between loosely related tasks')

2337    A Convex Formulation for Learning Task Relatio...
5828    Learning Task Grouping and Overlap in Multi-ta...
1454    Multi-Task Learning of Keyphrase Boundary Clas...
5054    Multi-task learning of time series and its app...
2856    DiGrad: Multi-Task Reinforcement Learning with...
5763    A Convex Feature Learning Formulation for Late...
5384    Stability of Multi-Task Kernel Regression Algo...
5454    Transductive Learning for Multi-Task Copula Pr...
507     Multi-task Learning of Pairwise Sequence Class...
1495    Multi-Task Video Captioning with Video and Ent...
Name: title, dtype: object

In [99]:
# Row 3 is the query paper itself, shown for comparison with the recommendations.
papers_df.iloc[3]

author     [{'name': 'Sebastian Ruder'}, {'name': 'Joachi...
day                                                       23
id                                              1705.08142v2
link       [{'rel': 'alternate', 'href': 'http://arxiv.or...
month                                                      5
summary    Multi-task learning is motivated by the observ...
tag        [{'term': 'stat.ML', 'scheme': 'http://arxiv.o...
title      Learning what to share between loosely related...
year                                                    2017
Name: 3, dtype: object