In [3]:
import ast
from itertools import chain

import numpy as np
import pandas as pd

In [4]:
# Load the users table. The 'liked' column holds each user's liked artworks
# serialized as text (presumably a Python list literal -- confirm against
# users.csv). Parse it with ast.literal_eval rather than eval: literal_eval
# only accepts literals, so a malicious or corrupted CSV cell cannot execute
# arbitrary code.
users = pd.read_csv('users.csv', converters={'liked': ast.literal_eval})
liked = users['liked']

# Build a frame with one row per user: a positional user_id (0..n-1) and that
# user's liked artworks. (Layout inherited from an earlier apriori experiment.)
users_list = list(liked)

# Size the id range from the data instead of hard-coding 400 rows, so the
# cell keeps working when users.csv grows or shrinks.
df = pd.DataFrame({'user_id': np.arange(len(users_list))})
df['artworks'] = users_list

# Build a binary user x artwork matrix: 1.0 where the user liked the artwork,
# 0.0 otherwise.
# X: one set of liked artworks per user (same order as df rows).
# Y: every distinct artwork id that appears in at least one user's likes.
X = [set(art) for art in df.artworks]
Y = list(set(chain.from_iterable(X)))

# One sparse record per user. Membership is tested against the per-user set
# in X (O(1) per lookup) instead of the raw list, and artworks are visited in
# Y order so the resulting column order is unchanged.
liked_rows = []
for user_id, liked_set in zip(df.user_id, X):
    row = {"user_id": user_id}
    row.update((each_art, 1.0) for each_art in Y if each_art in liked_set)
    liked_rows.append(row)

# Artworks a user did not like are missing from their record and become NaN in
# the frame; fill those with 0. Then transpose so artworks are the rows, which
# is the layout needed to compute Jaccard distance between artworks.
liked_matrix = (
    pd.DataFrame(liked_rows)
    .set_index('user_id')
    .fillna(0)
    .transpose()
)
liked_matrix


user_id,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
9,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
11,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
510,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


In [5]:
# Pairwise Jaccard distance between artworks (the rows of liked_matrix).
# Jaccard suits this binary liked / not-liked data: 0 means two artworks were
# liked by exactly the same users, 1 means they share no users at all.
import scipy.spatial

artwork_ids = liked_matrix.index.values
jaccard = scipy.spatial.distance.cdist(liked_matrix, liked_matrix, metric='jaccard')

# Label both axes with the artwork ids so distances can be looked up by id.
artwork_distance = pd.DataFrame(jaccard, index=artwork_ids, columns=artwork_ids)
artwork_distance

Unnamed: 0,7,9,11,16,17,20,24,37,39,56,...,183,68,326,405,134,260,116,510,277,25
7,0.000000,0.982759,0.966102,0.982143,0.968750,0.983871,0.969697,0.928571,0.962963,0.984375,...,0.772727,0.983051,0.979592,0.932203,1.000000,0.880000,0.985294,0.960000,0.812500,0.929825
9,0.982759,0.000000,0.983051,0.697674,0.818182,0.784314,0.759259,0.964912,0.829787,0.792453,...,0.980769,1.000000,1.000000,0.983607,1.000000,0.981481,0.969697,0.980000,0.981818,0.983051
11,0.966102,0.983051,0.000000,0.945455,0.953125,0.950820,0.985294,0.983333,0.944444,0.952381,...,0.962264,0.826923,0.756098,0.984127,0.760870,0.963636,1.000000,0.795455,0.964286,1.000000
16,0.982143,0.697674,0.945455,0.000000,0.740000,0.775510,0.796296,0.963636,0.738095,0.760000,...,1.000000,1.000000,0.978261,0.983051,1.000000,1.000000,1.000000,1.000000,0.981132,0.982456
17,0.968750,0.818182,0.953125,0.740000,0.000000,0.767857,0.786885,0.984615,0.784314,0.854839,...,0.983051,0.968750,0.962963,1.000000,0.983607,0.983607,0.986486,0.982456,0.967213,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260,0.880000,0.981481,0.963636,1.000000,0.983607,1.000000,0.950820,1.000000,0.980392,0.983333,...,0.780488,1.000000,0.977778,0.964912,0.980392,0.000000,0.984375,1.000000,0.707317,0.982143
116,0.985294,0.969697,1.000000,1.000000,0.986486,0.985915,0.973333,0.698113,0.968254,1.000000,...,0.983871,0.985294,1.000000,0.985915,1.000000,0.984375,0.000000,1.000000,0.952381,1.000000
510,0.960000,0.980000,0.795455,1.000000,0.982456,0.981481,1.000000,1.000000,1.000000,1.000000,...,1.000000,0.818182,0.800000,0.942308,0.769231,1.000000,1.000000,0.000000,1.000000,0.960784
277,0.812500,0.981818,0.964286,0.981132,0.967213,1.000000,0.984375,0.982143,1.000000,1.000000,...,0.756098,0.982143,1.000000,0.983051,0.980769,0.707317,0.952381,1.000000,0.000000,0.982456


In [11]:
# For every artwork, list all other artworks ordered from most to least
# similar (ascending Jaccard distance).
import json

n_artworks = len(artwork_distance)

# nsmallest(n_artworks) returns the whole distance column sorted ascending;
# the artwork itself is filtered out of its own ranking.
artwork_rankings = {
    artwork: [
        other
        for other in artwork_distance[artwork].nsmallest(n_artworks).index
        if other != artwork
    ]
    for artwork in artwork_distance.columns
}

# Persist the rankings for the API. Note that JSON object keys are always
# strings, so the artwork ids used as keys are written as strings.
with open("liked_rankings.json", "w") as rankings_file:
    json.dump(artwork_rankings, rankings_file)

In [7]:
# Look up the artworks most similar to a given artwork.
def similar_artworks(artwork_id, n=5):
    """Return the ``n`` most similar artworks to ``artwork_id``.

    Reads the module-level ``artwork_rankings`` dict, whose values are lists
    of artwork ids ordered from most to least similar.

    Parameters
    ----------
    artwork_id
        Key into ``artwork_rankings``.
    n : int, default 5
        Maximum number of artworks to return; must be non-negative.

    Returns
    -------
    list
        A new list with the first ``n`` ranked artworks. It is shorter than
        ``n`` when fewer ranked artworks are available.

    Raises
    ------
    KeyError
        If ``artwork_id`` is not present in ``artwork_rankings``.
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")
    # Slicing copies the list and tolerates rankings shorter than n, where
    # indexing position by position would raise IndexError.
    return artwork_rankings[artwork_id][:n]

# Example: the five artworks ranked closest to artwork 25.
similar_artworks(25)

[235, 76, 434, 155, 302]