In [126]:
import numpy as np
import pandas as pd
from scipy.stats import sem, t
from scipy import mean
import seaborn as sns
from scipy.spatial.distance import hamming

In [127]:
papers_df = pd.read_json('Papers_Metadata_6K.json')
users_df = pd.read_json('Users.json')
ratings_df = pd.read_json('User_Ratings.json')

In [128]:
papers_df.head()

Unnamed: 0,author,day,id,link,month,summary,tag,title,year
0,"[{'name': 'Ahmed Osman'}, {'name': 'Wojciech S...",1,1802.00209v1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",2,We propose an architecture for VQA which utili...,"[{'term': 'cs.AI', 'scheme': 'http://arxiv.org...",Dual Recurrent Attention Units for Visual Ques...,2018
1,"[{'name': 'Ji Young Lee'}, {'name': 'Franck De...",12,1603.03827v1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",3,Recent approaches based on artificial neural n...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",Sequential Short-Text Classification with Recu...,2016
2,"[{'name': 'Iulian Vlad Serban'}, {'name': 'Tim...",2,1606.00776v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",6,We introduce the multiresolution recurrent neu...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",Multiresolution Recurrent Neural Networks: An ...,2016
3,"[{'name': 'Sebastian Ruder'}, {'name': 'Joachi...",23,1705.08142v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",5,Multi-task learning is motivated by the observ...,"[{'term': 'stat.ML', 'scheme': 'http://arxiv.o...",Learning what to share between loosely related...,2017
4,"[{'name': 'Iulian V. Serban'}, {'name': 'Chinn...",7,1709.02349v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",9,We present MILABOT: a deep reinforcement learn...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",A Deep Reinforcement Learning Chatbot,2017


In [129]:
papers_df.set_index('id',inplace=True)

In [130]:
papers_df.index

Index(['1802.00209v1', '1603.03827v1', '1606.00776v2', '1705.08142v2',
       '1709.02349v2', '1709.08878v1', '1801.06700v1', '1609.06492v1',
       '1610.01076v1', '1705.07962v2',
       ...
       '1402.4293v1', '1402.4304v3', '1402.4354v1', '1402.4512v2',
       '1402.4844v2', '1402.4862v1', '1402.5715v3', '1402.5836v3',
       '1402.5874v2', '1402.5876v4'],
      dtype='object', name='id', length=6000)

In [131]:
users_df.head()

Unnamed: 0,userID,Location,Age
0,1,nyc,
1,2,stockton,18.0
2,3,moscow,
3,4,porto,17.0
4,5,farnborough,


In [132]:
ratings_df.head()

Unnamed: 0,userID,id,paperRating
0,112,1802.00209v1,6
1,21,1603.03827v1,9
2,12,1606.00776v2,6
3,92,1705.08142v2,2
4,52,1709.02349v2,7


In [133]:
merge_df = pd.merge(ratings_df, users_df, on='userID')

In [134]:
merge_df.head()

Unnamed: 0,userID,id,paperRating,Location,Age
0,112,1802.00209v1,6,mexico city,32
1,112,1612.01589v1,8,mexico city,32
2,112,1705.06820v4,8,mexico city,32
3,112,1703.10722v3,5,mexico city,32
4,21,1603.03827v1,9,ferrol / spain,46


In [135]:
merge_df = pd.merge(merge_df, papers_df, on='id')

In [136]:
merge_df.head()

Unnamed: 0,userID,id,paperRating,Location,Age,author,day,link,month,summary,tag,title,year
0,112,1802.00209v1,6,mexico city,32,"[{'name': 'Ahmed Osman'}, {'name': 'Wojciech S...",1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",2,We propose an architecture for VQA which utili...,"[{'term': 'cs.AI', 'scheme': 'http://arxiv.org...",Dual Recurrent Attention Units for Visual Ques...,2018
1,112,1612.01589v1,8,mexico city,32,[{'name': 'Konrad Zolna'}],5,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",12,The method presented extends a given regressio...,"[{'term': 'cs.LG', 'scheme': 'http://arxiv.org...",Improving the Performance of Neural Networks i...,2016
2,112,1705.06820v4,8,mexico city,32,"[{'name': 'Hongyang Gao'}, {'name': 'Hao Yuan'...",18,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",5,Deconvolutional layers have been widely used i...,"[{'term': 'cs.LG', 'scheme': 'http://arxiv.org...",Pixel Deconvolutional Networks,2017
3,112,1703.10722v3,5,mexico city,32,"[{'name': 'Oleksii Kuchaiev'}, {'name': 'Boris...",31,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",3,We present two simple ways of reducing the num...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",Factorization tricks for LSTM networks,2017
4,21,1603.03827v1,9,ferrol / spain,46,"[{'name': 'Ji Young Lee'}, {'name': 'Franck De...",12,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",3,Recent approaches based on artificial neural n...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",Sequential Short-Text Classification with Recu...,2016


In [137]:
merge_df.shape

(998, 13)

In [138]:
# drop out the user id that is null
merge_df = merge_df[pd.notnull(merge_df['userID'])]

# drop out the paper id that is null
merge_df = merge_df[pd.notnull(merge_df['id'])]

In [139]:
merge_df.shape

(998, 13)

In [140]:
merge_df = merge_df.drop(['Location', 'Age', 'author', 'day',
                   'link', 'month', 'summary', 'tag',
                   'title', 'year'], axis=1)

In [141]:
merge_df.head()

Unnamed: 0,userID,id,paperRating
0,112,1802.00209v1,6
1,112,1612.01589v1,8
2,112,1705.06820v4,8
3,112,1703.10722v3,5
4,21,1603.03827v1,9


In [142]:
merge_df[(merge_df.userID == 10) & (merge_df.paperRating > 5)]

Unnamed: 0,userID,id,paperRating
321,10,1510.08983v2,10
322,10,1802.07426v1,6
323,10,1505.01809v3,6


In [143]:
userItemRatingMatrix = pd.pivot_table(merge_df, values='paperRating',
                                    index=['userID'], columns=['id'])

In [144]:
def distance(user1,user2):
        try:
            user1Ratings = userItemRatingMatrix.transpose()[userID1]
            user2Ratings = userItemRatingMatrix.transpose()[userID2]
            distance = hamming(user1Ratings,user2Ratings)
        except: 
            distance = np.NaN
        return distance 
def nearestNeighbors(userID,K=10):
    allUsers = pd.DataFrame(userItemRatingMatrix.index)
    allUsers = allUsers[allUsers.userID!=userID]
    allUsers["distance"] = allUsers["userID"].apply(lambda x: distance(userID,x))
    KnearestUsers = allUsers.sort_values(["distance"],ascending=True)["userID"][:K]
    return KnearestUsers

In [145]:
def paperMeta(paperID):
    title = papers_df.at[paperID,"title"]
    author = papers_df.at[paperID,"author"]
    return title, author

def GetPaperByUserID(userID,N):
    userRatings = merge_df[merge_df["userID"]==userID]
    sortedRatings = pd.DataFrame.sort_values(userRatings,['paperRating'],ascending=[0])[:N] 
    sortedRatings["title"] = sortedRatings["id"].apply(paperMeta)
    return sortedRatings

def GetUserByPaperID(paperID,N):
    paperRatings = merge_df[merge_df["id"]==paperID]
    sortedRatings = pd.DataFrame.sort_values(paperRatings,['paperRating'],ascending=[0])[:N] 
    sortedRatings["title"] = sortedRatings["id"].apply(paperMeta)
    return sortedRatings

def topN(user,N=3):
    KnearestUsers = nearestNeighbors(user)
    NNRatings = userItemRatingMatrix[userItemRatingMatrix.index.isin(KnearestUsers)]
    avgRating = NNRatings.apply(np.nanmean).dropna()
    papersAlreadyRead = userItemRatingMatrix.transpose()[user].dropna().index
    avgRating = avgRating[~avgRating.index.isin(papersAlreadyRead)]
    topNpaperId = avgRating.sort_values(ascending=False).index[:N]
    return pd.Series(topNpaperId).apply(paperMeta)

In [146]:
userItemRatingMatrix.shape

(199, 996)

In [147]:
paperMeta('1802.00209v1')

('Dual Recurrent Attention Units for Visual Question Answering',
 "[{'name': 'Ahmed Osman'}, {'name': 'Wojciech Samek'}]")

In [148]:
GetPaperByUserID(100, 10)

Unnamed: 0,userID,id,paperRating,title
190,100,1305.1027v2,6,(Regret Bounds for Reinforcement Learning with...
189,100,1402.0929v3,5,(Input Warping for Bayesian Optimization of No...
188,100,1611.00454v1,1,(Collaborative Recurrent Autoencoder: Recommen...
191,100,1206.6434v1,1,(A Generative Process for Sampling Contractive...


In [149]:
GetUserByPaperID('1609.06492v1', 10)

Unnamed: 0,userID,id,paperRating,title
40,76,1609.06492v1,8,(Document Image Coding and Clustering for Scri...
39,32,1609.06492v1,7,(Document Image Coding and Clustering for Scri...


In [150]:
topN(100,10)

  values, self.f, axis=self.axis, dummy=dummy, labels=labels


0    (Density estimation using Real NVP, [{'name': ...
1    (Domain Adaptive Neural Networks for Object Re...
2    (Highway Long Short-Term Memory RNNs for Dista...
3    (Pointing the Unknown Words, [{'name': 'Caglar...
4    (Learning values across many orders of magnitu...
5    (Predicting the Severity of Breast Masses with...
6    (Hybrid Linear Modeling via Local Best-fit Fla...
7    (Generating Factoid Questions With Recurrent N...
8    (Differentiable Scheduled Sampling for Credit ...
9    (Discriminative Learning via Semidefinite Prob...
Name: id, dtype: object