In [1]:
import json
import pandas as pd
import numpy as np

In [2]:
#papers_df is pandas dataframe object
papers_df = pd.read_json('Papers_Metadata_12K.json')

papers_df.head()

Unnamed: 0,author,day,id,link,month,summary,tag,title,year
0,"[{'name': 'Ahmed Osman'}, {'name': 'Wojciech S...",1,1802.00209v1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",2,We propose an architecture for VQA which utili...,"[{'term': 'cs.AI', 'scheme': 'http://arxiv.org...",Dual Recurrent Attention Units for Visual Ques...,2018
1,"[{'name': 'Ji Young Lee'}, {'name': 'Franck De...",12,1603.03827v1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",3,Recent approaches based on artificial neural n...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",Sequential Short-Text Classification with Recu...,2016
2,"[{'name': 'Iulian Vlad Serban'}, {'name': 'Tim...",2,1606.00776v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",6,We introduce the multiresolution recurrent neu...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",Multiresolution Recurrent Neural Networks: An ...,2016
3,"[{'name': 'Sebastian Ruder'}, {'name': 'Joachi...",23,1705.08142v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",5,Multi-task learning is motivated by the observ...,"[{'term': 'stat.ML', 'scheme': 'http://arxiv.o...",Learning what to share between loosely related...,2017
4,"[{'name': 'Iulian V. Serban'}, {'name': 'Chinn...",7,1709.02349v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",9,We present MILABOT: a deep reinforcement learn...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",A Deep Reinforcement Learning Chatbot,2017


In [3]:
print("Papers : ", papers_df.shape)

Papers :  (12000, 9)


In [4]:
papers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   author   12000 non-null  object
 1   day      12000 non-null  int64 
 2   id       12000 non-null  object
 3   link     12000 non-null  object
 4   month    12000 non-null  int64 
 5   summary  12000 non-null  object
 6   tag      12000 non-null  object
 7   title    12000 non-null  object
 8   year     12000 non-null  int64 
dtypes: int64(3), object(6)
memory usage: 843.9+ KB


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3),
            stop_words = 'english')

papers_df['summary'] = papers_df['summary'].fillna('')


In [6]:
tfv_matrix = tfv.fit_transform(papers_df['summary'])

In [7]:
tfv_matrix.shape

(12000, 78715)

In [8]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Compute the sigmoid kernel
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)

In [9]:
sig[0]

array([0.76159949, 0.76159428, 0.76159442, ..., 0.76159434, 0.76159438,
       0.76159426])

In [10]:
# Reverse mapping of indices and paper titles
indices = pd.Series(papers_df.index, index=papers_df['title']).drop_duplicates()

In [13]:
indices

title
Dual Recurrent Attention Units for Visual Question Answering                                        0
Sequential Short-Text Classification with Recurrent and Convolutional\n  Neural Networks            1
Multiresolution Recurrent Neural Networks: An Application to Dialogue\n  Response Generation        2
Learning what to share between loosely related tasks                                                3
A Deep Reinforcement Learning Chatbot                                                               4
                                                                                                ...  
Ranking medical jargon in electronic health record notes by adapted\n  distant supervision      11995
Multi-view Recurrent Neural Acoustic Word Embeddings                                            11996
Knowledge Enhanced Hybrid Neural Network for Text Matching                                      11997
A Neural Architecture Mimicking Humans End-to-End for Natural Language\n  In

In [14]:
indices['Learning what to share between loosely related tasks']

3

In [15]:
list(enumerate(sig[indices['Learning what to share between loosely related tasks']]))

[(0, 0.761594213240059),
 (1, 0.7615942864356731),
 (2, 0.7615943283038135),
 (3, 0.7615994912829356),
 (4, 0.7615943070475254),
 (5, 0.761594182228807),
 (6, 0.7615943044013012),
 (7, 0.7615941678991974),
 (8, 0.7615942792204127),
 (9, 0.7615942220140292),
 (10, 0.7615944067202736),
 (11, 0.761594401061399),
 (12, 0.7615943981324459),
 (13, 0.7615943565945192),
 (14, 0.7615942794080393),
 (15, 0.7615942717521165),
 (16, 0.7615942495118515),
 (17, 0.7615943336498023),
 (18, 0.7615944332095954),
 (19, 0.7615943017194179),
 (20, 0.7615943388129167),
 (21, 0.7615942344308473),
 (22, 0.7615944498016055),
 (23, 0.7615946856311828),
 (24, 0.7615942673476139),
 (25, 0.7615942926111361),
 (26, 0.7615942877937439),
 (27, 0.761594358226892),
 (28, 0.7615943000347322),
 (29, 0.7615942595949758),
 (30, 0.7615941972262863),
 (31, 0.7615943162987854),
 (32, 0.7615942394529044),
 (33, 0.7615942838407164),
 (34, 0.7615943092731166),
 (35, 0.7615941740672403),
 (36, 0.7615943654201864),
 (37, 0.7615944

In [16]:
sorted(list(enumerate(sig[indices['Learning what to share between loosely related tasks']])), key=lambda x: x[1], reverse=True)

[(3, 0.7615994912829356),
 (7532, 0.7615957952387301),
 (2337, 0.7615956575812346),
 (5828, 0.7615953382779497),
 (8140, 0.7615951489026813),
 (6725, 0.7615951319489471),
 (9680, 0.7615951211542757),
 (9612, 0.7615950888778155),
 (6022, 0.7615950765395152),
 (6760, 0.7615950415987205),
 (8480, 0.761595027520187),
 (7574, 0.7615950206110412),
 (5054, 0.761594999971865),
 (2856, 0.761594995923456),
 (9438, 0.7615949902526472),
 (1454, 0.7615949847701029),
 (5763, 0.7615949838451745),
 (5384, 0.7615949524589236),
 (5454, 0.7615949435597438),
 (6758, 0.761594925183793),
 (1495, 0.7615949169899491),
 (507, 0.7615949077420917),
 (4916, 0.7615948883633313),
 (6785, 0.7615948500080895),
 (275, 0.7615948274053783),
 (11921, 0.761594802309453),
 (2690, 0.7615947996975885),
 (7792, 0.7615947791973715),
 (6904, 0.761594774146227),
 (8719, 0.7615947716576947),
 (5598, 0.7615947684588613),
 (6900, 0.7615947619532161),
 (5560, 0.7615947599933892),
 (8556, 0.761594757727937),
 (2841, 0.761594742420704

In [18]:
def give_rec(title, sig=sig):
    # Get the index corresponding to title
    idx = indices[title]

    # Get the pairwsie similarity scores 
    sig_scores = list(enumerate(sig[idx]))

    # Sort the paper 
    sig_scores = sorted(sig_scores, key=lambda x: x[1], reverse=True)

    # Scores of the 10 most similar papers
    sig_scores = sig_scores[1:11]

    # Paper indices
    paper_indices = [i[0] for i in sig_scores]

    # Top 10 most similar papers
    return papers_df['title'].iloc[paper_indices]

In [19]:
# Testing our content-based recommendation system with Genetic Algorithms and its use with back-propagation network
give_rec('Learning what to share between loosely related tasks')

7532           Multi-task Learning for Continuous Control
2337    A Convex Formulation for Learning Task Relatio...
5828    Learning Task Grouping and Overlap in Multi-ta...
8140        Cross-stitch Networks for Multi-task Learning
6725           Multi-Task Learning for Contextual Bandits
9680    Multi-Task Learning with Group-Specific Featur...
9612    Learning Multiple Tasks with Multilinear Relat...
6022         Bounds for Vector-Valued Function Estimation
6760    Joint auto-encoders: a flexible multi-task lea...
8480    Fully-adaptive Feature Sharing in Multi-Task N...
Name: title, dtype: object

In [20]:
papers_df.iloc[3]

author     [{'name': 'Sebastian Ruder'}, {'name': 'Joachi...
day                                                       23
id                                              1705.08142v2
link       [{'rel': 'alternate', 'href': 'http://arxiv.or...
month                                                      5
summary    Multi-task learning is motivated by the observ...
tag        [{'term': 'stat.ML', 'scheme': 'http://arxiv.o...
title      Learning what to share between loosely related...
year                                                    2017
Name: 3, dtype: object