In [1]:
import numpy as np
import pandas as pd
from scipy.stats import sem, t
from scipy import mean
import seaborn as sns
from scipy.spatial.distance import hamming

In [5]:
papers_df = pd.read_json('Papers_Metadata_6K.json')
users_df = pd.read_json('users.json')
ratings_df = pd.read_json('User_Ratings.json')

In [6]:
papers_df.head()

Unnamed: 0,author,day,id,link,month,summary,tag,title,year
0,"[{'name': 'Ahmed Osman'}, {'name': 'Wojciech S...",1,1802.00209v1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",2,We propose an architecture for VQA which utili...,"[{'term': 'cs.AI', 'scheme': 'http://arxiv.org...",Dual Recurrent Attention Units for Visual Ques...,2018
1,"[{'name': 'Ji Young Lee'}, {'name': 'Franck De...",12,1603.03827v1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",3,Recent approaches based on artificial neural n...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",Sequential Short-Text Classification with Recu...,2016
2,"[{'name': 'Iulian Vlad Serban'}, {'name': 'Tim...",2,1606.00776v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",6,We introduce the multiresolution recurrent neu...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",Multiresolution Recurrent Neural Networks: An ...,2016
3,"[{'name': 'Sebastian Ruder'}, {'name': 'Joachi...",23,1705.08142v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",5,Multi-task learning is motivated by the observ...,"[{'term': 'stat.ML', 'scheme': 'http://arxiv.o...",Learning what to share between loosely related...,2017
4,"[{'name': 'Iulian V. Serban'}, {'name': 'Chinn...",7,1709.02349v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",9,We present MILABOT: a deep reinforcement learn...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",A Deep Reinforcement Learning Chatbot,2017


In [7]:
papers_df.set_index('id',inplace=True)

In [8]:
papers_df.index

Index(['1802.00209v1', '1603.03827v1', '1606.00776v2', '1705.08142v2',
       '1709.02349v2', '1709.08878v1', '1801.06700v1', '1609.06492v1',
       '1610.01076v1', '1705.07962v2',
       ...
       '1402.4293v1', '1402.4304v3', '1402.4354v1', '1402.4512v2',
       '1402.4844v2', '1402.4862v1', '1402.5715v3', '1402.5836v3',
       '1402.5874v2', '1402.5876v4'],
      dtype='object', name='id', length=6000)

In [9]:
users_df.head()

Unnamed: 0,userID,Location,Age
0,1,nyc,
1,2,stockton,18.0
2,3,moscow,
3,4,porto,17.0
4,5,farnborough,


In [10]:
ratings_df.head()

Unnamed: 0,userID,id,paperRating
0,112,1802.00209v1,6
1,21,1603.03827v1,9
2,12,1606.00776v2,6
3,92,1705.08142v2,2
4,52,1709.02349v2,7


In [11]:
merge_df = pd.merge(ratings_df, users_df, on='userID')

In [12]:
merge_df.head()

Unnamed: 0,userID,id,paperRating,Location,Age
0,112,1802.00209v1,6,mexico city,32
1,112,1612.01589v1,8,mexico city,32
2,112,1705.06820v4,8,mexico city,32
3,112,1703.10722v3,5,mexico city,32
4,21,1603.03827v1,9,ferrol / spain,46


In [13]:
merge_df = pd.merge(merge_df, papers_df, on='id')

In [14]:
merge_df.head()

Unnamed: 0,userID,id,paperRating,Location,Age,author,day,link,month,summary,tag,title,year
0,112,1802.00209v1,6,mexico city,32,"[{'name': 'Ahmed Osman'}, {'name': 'Wojciech S...",1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",2,We propose an architecture for VQA which utili...,"[{'term': 'cs.AI', 'scheme': 'http://arxiv.org...",Dual Recurrent Attention Units for Visual Ques...,2018
1,21,1802.00209v1,8,ferrol / spain,46,"[{'name': 'Ahmed Osman'}, {'name': 'Wojciech S...",1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",2,We propose an architecture for VQA which utili...,"[{'term': 'cs.AI', 'scheme': 'http://arxiv.org...",Dual Recurrent Attention Units for Visual Ques...,2018
2,21,1802.00209v1,9,ferrol / spain,46,"[{'name': 'Ahmed Osman'}, {'name': 'Wojciech S...",1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",2,We propose an architecture for VQA which utili...,"[{'term': 'cs.AI', 'scheme': 'http://arxiv.org...",Dual Recurrent Attention Units for Visual Ques...,2018
3,21,1802.00209v1,10,ferrol / spain,46,"[{'name': 'Ahmed Osman'}, {'name': 'Wojciech S...",1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",2,We propose an architecture for VQA which utili...,"[{'term': 'cs.AI', 'scheme': 'http://arxiv.org...",Dual Recurrent Attention Units for Visual Ques...,2018
4,21,1802.00209v1,6,ferrol / spain,46,"[{'name': 'Ahmed Osman'}, {'name': 'Wojciech S...",1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",2,We propose an architecture for VQA which utili...,"[{'term': 'cs.AI', 'scheme': 'http://arxiv.org...",Dual Recurrent Attention Units for Visual Ques...,2018


In [15]:
merge_df.shape

(1999, 13)

In [16]:
# drop out the user id that is null
merge_df = merge_df[pd.notnull(merge_df['userID'])]

# drop out the paper id that is null
merge_df = merge_df[pd.notnull(merge_df['id'])]

In [17]:
merge_df.shape

(1999, 13)

In [18]:
merge_df = merge_df.drop(['Location', 'Age', 'author', 'day',
                   'link', 'month', 'summary', 'tag',
                   'title', 'year'], axis=1)

In [19]:
merge_df.head()

Unnamed: 0,userID,id,paperRating
0,112,1802.00209v1,6
1,21,1802.00209v1,8
2,21,1802.00209v1,9
3,21,1802.00209v1,10
4,21,1802.00209v1,6


In [20]:
merge_df[(merge_df.userID == 10) & (merge_df.paperRating > 5)]

Unnamed: 0,userID,id,paperRating
40,10,1802.00209v1,6
181,10,1603.03827v1,10
475,10,1801.06700v1,9
498,10,1705.08142v2,6
1082,10,1802.07426v1,6
1285,10,1510.08983v2,10
1287,10,1505.01809v3,6


In [21]:
userItemRatingMatrix = pd.pivot_table(merge_df, values='paperRating',
                                    index=['userID'], columns=['id'])

In [23]:
userItemRatingMatrix.head()

id,0812.0743v2,0911.5372v1,1004.4965v1,1006.1346v2,1007.2449v1,1008.1566v5,1008.1643v2,1010.3460v2,1102.2739v1,1103.4487v1,...,1802.09914v1,1803.00094v1,1803.01686v1,1803.02544v2,1803.03232v1,1803.03692v1,1803.05407v1,1803.06959v1,1803.07679v1,1803.08240v1
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,6.0,...,,,,,,,,,,
4,,2.0,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [24]:
userItemRatingMatrix.index

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            191, 192, 193, 194, 195, 196, 197, 198, 199, 200],
           dtype='int64', name='userID', length=200)

In [25]:
def distance(userID1,userID2):
        try:
            user1Ratings = userItemRatingMatrix.transpose()[userID1]
            user2Ratings = userItemRatingMatrix.transpose()[userID2]
            distance = hamming(user1Ratings,user2Ratings)
        except: 
            distance = np.NaN
        return distance 
def nearestNeighbors(userID,K=10):
    allUsers = pd.DataFrame(userItemRatingMatrix.index)
    #Remove only UserID given as parameter
    allUsers = allUsers[allUsers.userID!=userID]
    #Calculate hammer distance between parameter UserID and others 
    allUsers["distance"] = allUsers["userID"].apply(lambda x: distance(userID,x))
    #Sorted calculated distance by ascending order 
    KnearestUsers = allUsers.sort_values(["distance"],ascending=True)["userID"][:K]
    return KnearestUsers

In [26]:
def paperMeta(paperID):
    title = papers_df.at[paperID,"title"]
    author = papers_df.at[paperID,"author"]
    return title, author

def GetPaperByUserID(userID,N):
    userRatings = merge_df[merge_df["userID"]==userID]
    sortedRatings = pd.DataFrame.sort_values(userRatings,['paperRating'],ascending=[0])[:N] 
    sortedRatings["title"] = sortedRatings["id"].apply(paperMeta)
    return sortedRatings

def GetUserByPaperID(paperID,N):
    paperRatings = merge_df[merge_df["id"]==paperID]
    sortedRatings = pd.DataFrame.sort_values(paperRatings,['paperRating'],ascending=[0])[:N] 
    sortedRatings["title"] = sortedRatings["id"].apply(paperMeta)
    return sortedRatings

def topN(user,N=3):
    
    KnearestUsers = nearestNeighbors(user)
    
    #Assign KnearestUsers that has papers to NNRatings
    NNRatings = userItemRatingMatrix[userItemRatingMatrix.index.isin(KnearestUsers)]
    
    #Drop non values paper
    avgRating = NNRatings.apply(np.nanmean).dropna()
    
    # Transpose userItemRatingMatrix by user parameter. row : paperId, column : userId, values : rating
    papersAlreadyRead = userItemRatingMatrix.transpose()[user].dropna().index
    
    #Remove papers that is read by user
    avgRating = avgRating[~avgRating.index.isin(papersAlreadyRead)]
    
    #Sorted by descending rating
    topNpaperId = avgRating.sort_values(ascending=False).index[:N]
    
    return pd.Series(topNpaperId).apply(paperMeta)

In [27]:
paperMeta('1802.00209v1')

('Dual Recurrent Attention Units for Visual Question Answering',
 "[{'name': 'Ahmed Osman'}, {'name': 'Wojciech Samek'}]")

In [28]:
GetPaperByUserID(100, 10)

Unnamed: 0,userID,id,paperRating,title
167,100,1603.03827v1,8,(Sequential Short-Text Classification with Rec...
1104,100,1707.09219v4,8,"(Recurrent Ladder Networks, [{'name': 'Isabeau..."
24,100,1802.00209v1,7,(Dual Recurrent Attention Units for Visual Que...
356,100,1606.00776v2,6,(Multiresolution Recurrent Neural Networks: An...
694,100,1604.00289v3,6,(Building Machines That Learn and Think Like P...
1102,100,1305.1027v2,6,(Regret Bounds for Reinforcement Learning with...
1101,100,1402.0929v3,5,(Input Warping for Bayesian Optimization of No...
355,100,1606.00776v2,4,(Multiresolution Recurrent Neural Networks: An...
1095,100,1611.00454v1,1,(Collaborative Recurrent Autoencoder: Recommen...
1103,100,1206.6434v1,1,(A Generative Process for Sampling Contractive...


In [29]:
GetUserByPaperID('1609.06492v1', 10)

Unnamed: 0,userID,id,paperRating,title
623,9,1609.06492v1,10,(Document Image Coding and Clustering for Scri...
631,24,1609.06492v1,9,(Document Image Coding and Clustering for Scri...
630,68,1609.06492v1,9,(Document Image Coding and Clustering for Scri...
617,27,1609.06492v1,8,(Document Image Coding and Clustering for Scri...
618,32,1609.06492v1,7,(Document Image Coding and Clustering for Scri...
616,52,1609.06492v1,6,(Document Image Coding and Clustering for Scri...
632,93,1609.06492v1,6,(Document Image Coding and Clustering for Scri...
621,45,1609.06492v1,5,(Document Image Coding and Clustering for Scri...
622,42,1609.06492v1,5,(Document Image Coding and Clustering for Scri...
627,3,1609.06492v1,4,(Document Image Coding and Clustering for Scri...


In [30]:
topN(100,10)

  result = libreduction.compute_reduction(


0    (Variational Inference of Disentangled Latent ...
1    (Visualizing and Understanding Curriculum Lear...
2    (Detecting and Correcting for Label Shift with...
3    (Aligned Image-Word Representations Improve In...
4    (Semi-Supervised Phoneme Recognition with Recu...
5    (An ensemble-based system for automatic screen...
6    (Tutorial on Answering Questions about Images ...
7    (Option Discovery in Hierarchical Reinforcemen...
8    (A Factorization Machine Framework for Testing...
9    (Improving the Performance of Neural Machine T...
Name: id, dtype: object