In [11]:
import pandas as pd

In [12]:
thresholds = [10, 20, 50, 100, 200]

In [13]:
AFP = {}
AFP_normalized = {}
for t in thresholds:
    df = pd.read_csv(f'../LFM-1b_UGP/AFP/LFM-1b_AFP_{t}.txt', sep='\t')
    df.set_index('artist_id', inplace=True)
    df = df.loc[(df.sum(axis=1) != 0)]
    index_to_artist_id = {index: artist_id for index, artist_id in enumerate(df.index.tolist())}
    artist_id_to_index = {artist_id: index for index, artist_id in enumerate(df.index.tolist())}
    AFP[t] = (df, index_to_artist_id, artist_id_to_index)

    df = pd.read_csv(f'../LFM-1b_UGP/AFP/LFM-1b_AFP_{t}_normalized.txt', sep='\t')
    df.set_index('artist_id', inplace=True)
    df = df.loc[(df.sum(axis=1) != 0)]
    index_to_artist_id = {index: artist_id for index, artist_id in enumerate(df.index.tolist())}
    artist_id_to_index = {artist_id: index for index, artist_id in enumerate(df.index.tolist())}
    AFP_normalized[t] = (df, index_to_artist_id, artist_id_to_index)

In [14]:
from collections import defaultdict
import heapq
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.model_selection import train_test_split

In [15]:
df_plays_test = pd.read_csv('../testing/test_playcounts.txt', sep='\t')
user_ids = df_plays_test['user_id'].unique()
liked_artists = {}
for t in thresholds:
    mask = df_plays_test['artist_id'].isin(AFP[t][2]) & (df_plays_test['playcount'] >= t)
    liked_artists[t] = df_plays_test[mask].groupby('user_id')['artist_id'].apply(list).to_dict(into=defaultdict(list))

In [16]:
def recommend_and_evaluate(threshold: int, normalized: bool):
    if normalized:
        data = AFP[threshold]
    else:
        data = AFP_normalized[threshold]

    precisions_cosine = []
    precisions_kNN = []
    distances = []
    for i, user_id in enumerate(user_ids):
        # print(f"User {i}")
        artist_ids = liked_artists[threshold][user_id]
        if len(artist_ids) < 2:
            continue

        if len(artist_ids) < 5:
            train_artists, test_artists = train_test_split(artist_ids, train_size=0.75, random_state=42)
        else:
            train_artists, test_artists = train_test_split(artist_ids, test_size=0.2, random_state=42)

        k = 100
        top_k_kNN = []
        top_k_cosine = []
        for artist_id in train_artists:
            index = data[2][artist_id]
            pw_distances = pairwise_distances(data[0].values[[index]], data[0].values)
            cosine_sim = cosine_similarity(data[0].values[[index]], data[0].values)
            pw_distances[0][index] = float('inf')
            cosine_sim[0][index] = 0
            for idx, distance in enumerate(pw_distances[0]):
                if len(top_k_kNN) < k:
                    heapq.heappush(top_k_kNN, (distance, idx))
                else:
                    if distance < top_k_kNN[0][0]:
                        heapq.heapreplace(top_k_kNN, (distance, idx))

            for idx, sim_value in enumerate(cosine_sim[0]):
                if len(top_k_cosine) < k:
                    heapq.heappush(top_k_cosine, (sim_value, idx))
                else:
                    if sim_value > top_k_cosine[0][0]:
                        heapq.heapreplace(top_k_cosine, (sim_value, idx))

        positive_kNN = 0
        for distance, index in top_k_kNN:
            artist_id = data[1][index]
            if artist_id in test_artists:
                # print(f"GOAL!       {distance}")
                positive_kNN += 1
                distances.append(distance)

        precisions_kNN.append(positive_kNN / min(k, len(test_artists)))

        positive_cosine = 0
        for sim_value, index in top_k_cosine:
            artist_id = data[1][index]
            if artist_id in test_artists:
                # print(f"GOAL!       {sim_value}")
                positive_cosine += 1

        precisions_cosine.append(positive_cosine / min(k, len(test_artists)))

    print(f'precision (cosine): {sum(precisions_cosine) / len(precisions_cosine)}')
    print(f'precision (kNN): {sum(precisions_kNN) / len(precisions_kNN)}')
    # print('distances')
    # print(distances)

In [17]:
for t in thresholds:
    print(f"threshold:  {t}")
    recommend_and_evaluate(t, False)
    print(f"threshold:  {t} normalized")
    recommend_and_evaluate(t, True)
    print("--------------------------------------------------------------")

threshold:  10
precision (cosine): 0.03907858330823271
precision (kNN): 0.04353365065786911
threshold:  10 normalized
precision (cosine): 0.041246266526715925
precision (kNN): 0.04433735446421348
--------------------------------------------------------------
threshold:  20
precision (cosine): 0.07886709662219867
precision (kNN): 0.05167313926214331
threshold:  20 normalized
precision (cosine): 0.08001819964663166
precision (kNN): 0.050347423878695216
--------------------------------------------------------------
threshold:  50
precision (cosine): 0.09126445169384806
precision (kNN): 0.0687559873420582
threshold:  50 normalized
precision (cosine): 0.09077371323812604
precision (kNN): 0.06646528674174906
--------------------------------------------------------------
threshold:  100
precision (cosine): 0.13088976613205686
precision (kNN): 0.06617845769828147
threshold:  100 normalized
precision (cosine): 0.13000696073383297
precision (kNN): 0.06373107639627461
----------------------------