In [58]:
import numpy as np
import pandas as pd
from scipy.stats import sem, t
from scipy import mean
import seaborn as sns
from scipy.spatial.distance import hamming

In [43]:
papers_df = pd.read_json('Papers_Metadata_6K.json')
users_df = pd.read_json('Users.json')
ratings_df = pd.read_json('User_Ratings.json')

In [44]:
papers_df.head()

Unnamed: 0,author,day,id,link,month,summary,tag,title,year
0,"[{'name': 'Ahmed Osman'}, {'name': 'Wojciech S...",1,1802.00209v1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",2,We propose an architecture for VQA which utili...,"[{'term': 'cs.AI', 'scheme': 'http://arxiv.org...",Dual Recurrent Attention Units for Visual Ques...,2018
1,"[{'name': 'Ji Young Lee'}, {'name': 'Franck De...",12,1603.03827v1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",3,Recent approaches based on artificial neural n...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",Sequential Short-Text Classification with Recu...,2016
2,"[{'name': 'Iulian Vlad Serban'}, {'name': 'Tim...",2,1606.00776v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",6,We introduce the multiresolution recurrent neu...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",Multiresolution Recurrent Neural Networks: An ...,2016
3,"[{'name': 'Sebastian Ruder'}, {'name': 'Joachi...",23,1705.08142v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",5,Multi-task learning is motivated by the observ...,"[{'term': 'stat.ML', 'scheme': 'http://arxiv.o...",Learning what to share between loosely related...,2017
4,"[{'name': 'Iulian V. Serban'}, {'name': 'Chinn...",7,1709.02349v2,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",9,We present MILABOT: a deep reinforcement learn...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",A Deep Reinforcement Learning Chatbot,2017


In [45]:
users_df.head()

Unnamed: 0,userID,Location,Age
0,1,nyc,
1,2,stockton,18.0
2,3,moscow,
3,4,porto,17.0
4,5,farnborough,


In [46]:
ratings_df.head()

Unnamed: 0,userID,id,paperRating
0,112,1802.00209v1,6
1,21,1603.03827v1,9
2,12,1606.00776v2,6
3,92,1705.08142v2,2
4,52,1709.02349v2,7


In [47]:
merge_df = pd.merge(ratings_df, users_df, on='userID')

In [48]:
merge_df.head()

Unnamed: 0,userID,id,paperRating,Location,Age
0,112,1802.00209v1,6,mexico city,32
1,112,1612.01589v1,8,mexico city,32
2,112,1705.06820v4,8,mexico city,32
3,112,1703.10722v3,5,mexico city,32
4,21,1603.03827v1,9,ferrol / spain,46


In [49]:
merge_df = pd.merge(merge_df, papers_df, on='id')

In [50]:
merge_df.head()

Unnamed: 0,userID,id,paperRating,Location,Age,author,day,link,month,summary,tag,title,year
0,112,1802.00209v1,6,mexico city,32,"[{'name': 'Ahmed Osman'}, {'name': 'Wojciech S...",1,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",2,We propose an architecture for VQA which utili...,"[{'term': 'cs.AI', 'scheme': 'http://arxiv.org...",Dual Recurrent Attention Units for Visual Ques...,2018
1,112,1612.01589v1,8,mexico city,32,[{'name': 'Konrad Zolna'}],5,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",12,The method presented extends a given regressio...,"[{'term': 'cs.LG', 'scheme': 'http://arxiv.org...",Improving the Performance of Neural Networks i...,2016
2,112,1705.06820v4,8,mexico city,32,"[{'name': 'Hongyang Gao'}, {'name': 'Hao Yuan'...",18,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",5,Deconvolutional layers have been widely used i...,"[{'term': 'cs.LG', 'scheme': 'http://arxiv.org...",Pixel Deconvolutional Networks,2017
3,112,1703.10722v3,5,mexico city,32,"[{'name': 'Oleksii Kuchaiev'}, {'name': 'Boris...",31,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",3,We present two simple ways of reducing the num...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",Factorization tricks for LSTM networks,2017
4,21,1603.03827v1,9,ferrol / spain,46,"[{'name': 'Ji Young Lee'}, {'name': 'Franck De...",12,"[{'rel': 'alternate', 'href': 'http://arxiv.or...",3,Recent approaches based on artificial neural n...,"[{'term': 'cs.CL', 'scheme': 'http://arxiv.org...",Sequential Short-Text Classification with Recu...,2016


In [51]:
merge_df.shape

(999, 13)

In [52]:
# drop out the user id that is null
merge_df = merge_df[pd.notnull(merge_df['userID'])]

# drop out the paper id that is null
merge_df = merge_df[pd.notnull(merge_df['id'])]

In [53]:
merge_df.shape

(999, 13)

In [54]:
merge_df = merge_df.drop(['Location', 'Age', 'author', 'day',
                   'link', 'month', 'summary', 'tag',
                   'title', 'year'], axis=1)

In [55]:
merge_df.head()

Unnamed: 0,userID,id,paperRating
0,112,1802.00209v1,6
1,112,1612.01589v1,8
2,112,1705.06820v4,8
3,112,1703.10722v3,5
4,21,1603.03827v1,9


In [56]:
merge_df[(merge_df.userID == 10) & (merge_df.paperRating > 5)]

Unnamed: 0,userID,id,paperRating
321,10,1510.08983v2,10
322,10,1802.07426v1,6
323,10,1505.01809v3,6


In [57]:
userItemRatingMatrix = pd.pivot_table(merge_df, values='paperRating',
                                    index=['userID'], columns=['id'])

In [59]:
confidence = 0.95
data = ratings_per_isbn['count']

n = len(data)
m = mean(data)
std_err = sem(data)
h = std_err * t.ppf((1 + confidence) / 2, n - 1)

start = m - h
print (start)
ax = sns.distplot(ratings_per_isbn['count'])
ax2 = ax.twinx()
sns.boxplot(x=ratings_per_isbn['count'], ax=ax2)
ax2.set(ylim=(-0.5, 10))

NameError: name 'ratings_per_isbn' is not defined