In [1]:
import pandas as pd
import numpy as np
from itertools import chain

In [80]:
# Load the users table. The 'liked' column is parsed with eval into a Python
# object (presumably a list of artwork ids -- confirm against users.csv).
# NOTE(review): eval executes whatever code the CSV cell contains. If
# users.csv can come from an untrusted source, switch the converter to
# ast.literal_eval.
users = pd.read_csv('users.csv', converters={'liked': eval})
liked = users['liked']

# One row per user: a positional user_id plus that user's liked artworks.
# The id range is sized from the data, so this works for any number of users
# (it used to be hard-coded to 400 and failed on any other row count).
users_list = list(liked)
df = pd.DataFrame({'user_id': np.arange(len(users_list))})
df['artworks'] = users_list

# X: the set of artworks liked by each user; Y: every distinct artwork.
X = [set(art) for art in df.artworks]
Y = list(set(chain.from_iterable(X)))

# One record per user holding 1.0 for every artwork that user liked.
# Artworks the user did not like are simply absent and become NaN in the
# frame. Membership is tested against the per-user set from X, which gives
# the same answer as scanning the original list, only faster.
liked_records = [
    {"user_id": user_id,
     **{each_art: 1.0 for each_art in Y if each_art in liked_set}}
    for user_id, liked_set in zip(df.user_id, X)
]

# Index by user, turn the missing likes into 0, then transpose so that
# artworks are the rows (needed to compute distances between artworks).
liked_matrix = (
    pd.DataFrame(liked_records)
    .set_index('user_id')
    .fillna(0)
    .transpose()
)
liked_matrix


user_id,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
25,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
38,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [81]:
# Pairwise Jaccard distance between artworks (the rows of liked_matrix).
# Jaccard was picked as a metric for the binary liked / not-liked data;
# the original author marked that choice as an open question (?).
import scipy.spatial

artwork_ids = liked_matrix.index.values
jaccard = scipy.spatial.distance.cdist(
    liked_matrix, liked_matrix, metric='jaccard'
)

# Label both axes with the artwork ids so distances can be looked up by id.
artwork_distance = pd.DataFrame(jaccard, index=artwork_ids, columns=artwork_ids)
artwork_distance

Unnamed: 0,25,38,52,66,83,84,91,118,123,131,...,481,272,478,101,111,485,416,166,233,347
25,0.000000,0.820000,0.725490,0.622222,0.983607,0.951613,0.673077,0.758621,0.972973,0.796296,...,0.969231,1.000000,0.983051,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.983607
38,0.820000,0.000000,0.872727,0.795918,0.964912,0.966667,0.754717,0.830508,0.971831,0.760000,...,1.000000,1.000000,0.963636,1.000000,0.984848,0.983871,0.982456,0.983051,0.940000,0.946429
52,0.725490,0.872727,0.000000,0.700000,0.951613,0.937500,0.690909,0.684211,0.946667,0.716981,...,0.985507,0.982456,1.000000,1.000000,1.000000,1.000000,1.000000,0.968750,0.982759,1.000000
66,0.622222,0.795918,0.700000,0.000000,0.983607,0.951613,0.673077,0.736842,0.986667,0.725490,...,0.984848,1.000000,1.000000,1.000000,0.970588,0.984615,1.000000,1.000000,1.000000,1.000000
83,0.983607,0.964912,0.951613,0.983607,0.000000,0.951613,0.985294,0.956522,0.754098,0.984375,...,0.953125,0.962264,1.000000,0.955224,0.985507,0.984615,0.983333,0.833333,0.981818,0.983607
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485,1.000000,0.983871,1.000000,0.984615,0.984615,1.000000,0.986111,0.986667,0.947368,1.000000,...,0.985714,0.964912,0.967742,0.603774,0.549020,0.000000,0.586957,0.969231,1.000000,0.968750
416,1.000000,0.982456,1.000000,1.000000,0.983333,1.000000,1.000000,0.985714,0.958333,1.000000,...,0.984615,0.981132,0.982759,0.620000,0.647059,0.586957,0.000000,0.983607,0.981481,0.966102
166,1.000000,0.983051,0.968750,1.000000,0.833333,1.000000,0.970588,0.986111,0.716667,0.984615,...,0.969697,0.943396,0.966102,0.955882,0.985714,0.969231,0.983607,0.000000,0.982143,0.967213
233,1.000000,0.940000,0.982759,1.000000,0.981818,0.982759,1.000000,1.000000,0.985507,1.000000,...,0.966102,0.804878,0.714286,1.000000,1.000000,1.000000,0.981481,0.982143,0.000000,0.697674


In [82]:
# Map every artwork id to the ids of all other artworks, ordered from the
# smallest distance to the largest. The artwork itself is left out of its
# own ranking.
n_artworks = len(artwork_distance)
artwork_rankings = {
    artwork: [
        other
        for other in artwork_distance[artwork].nsmallest(n_artworks).index
        if other != artwork
    ]
    for artwork in artwork_distance.columns
}
print(artwork_rankings)

{25: [66, 91, 504, 136, 107, 12, 317, 498, 362, 150, 480, 303, 361, 52, 144, 178, 378, 522, 328, 253, 226, 366, 121, 117, 559, 319, 142, 283, 69, 268, 118, 419, 331, 263, 258, 459, 170, 211, 200, 148, 182, 384, 249, 219, 398, 305, 465, 401, 430, 225, 552, 131, 531, 282, 129, 215, 57, 183, 229, 93, 433, 38, 483, 149, 280, 438, 418, 407, 356, 167, 207, 37, 535, 273, 318, 550, 545, 561, 0, 519, 415, 512, 218, 309, 217, 526, 274, 9, 155, 73, 368, 236, 514, 179, 489, 293, 446, 300, 231, 365, 442, 466, 22, 523, 553, 59, 141, 248, 322, 84, 546, 330, 238, 194, 82, 51, 296, 185, 2, 269, 404, 383, 255, 10, 292, 244, 382, 494, 321, 206, 534, 251, 11, 4, 501, 70, 191, 541, 475, 566, 421, 338, 450, 473, 247, 342, 453, 364, 402, 290, 260, 36, 289, 164, 72, 32, 511, 58, 432, 160, 153, 23, 327, 87, 35, 422, 390, 103, 325, 471, 288, 126, 312, 405, 518, 130, 92, 440, 157, 145, 195, 353, 304, 134, 570, 165, 89, 563, 460, 187, 412, 476, 275, 19, 234, 479, 337, 467, 352, 310, 98, 7, 34, 481, 108, 360, 375,

In [83]:
# Return the artworks most similar to the artwork_id entered (5 by default)
def similar_artworks(artwork_id, n=5):
    """Return the ids of the `n` artworks ranked closest to `artwork_id`.

    The ranking is read from the module-level `artwork_rankings` dict, whose
    lists are already ordered from most to least similar.

    Parameters
    ----------
    artwork_id : hashable
        Key of the artwork in `artwork_rankings`.
    n : int, default 5
        Maximum number of similar artworks to return.

    Returns
    -------
    list
        A new list with at most `n` artwork ids. It is shorter than `n` when
        fewer ranked artworks are available.

    Raises
    ------
    KeyError
        If `artwork_id` is not present in `artwork_rankings`.
    ValueError
        If `n` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")
    # Slicing copies the list and tolerates rankings shorter than n, where
    # indexing position by position would raise IndexError.
    return list(artwork_rankings[artwork_id][:n])

# Example: look up the five artworks ranked closest to artwork 25
similar_artworks(25)

[66, 91, 504, 136, 107]