In [1]:
import pandas as pd

In [2]:
thresholds = [10, 20, 50, 100, 200]

In [3]:
df_plays = pd.read_csv('../LFM-1b/LFM-1b_LEs_PC.txt', sep='\t')
grouped_plays = df_plays.groupby('user_id')

In [4]:
AAP = {}
AAP_normalized = {}
for t in thresholds:
    df = pd.read_csv(f'../LFM-1b_UGP/AAP/LFM-1b_AAP_{t}.txt', sep='\t')
    df.set_index('artist_id', inplace=True)
    df = df.loc[(df.sum(axis=1) != 0)]
    index_to_artist_id = {index: artist_id for index, artist_id in enumerate(df.index.tolist())}
    artist_id_to_index = {artist_id: index for index, artist_id in enumerate(df.index.tolist())}
    AAP[t] = (df, index_to_artist_id, artist_id_to_index)

    df = pd.read_csv(f'../LFM-1b_UGP/AAP/LFM-1b_AAP_{t}_normalized.txt', sep='\t')
    df.set_index('artist_id', inplace=True)
    df = df.loc[(df.sum(axis=1) != 0)]
    index_to_artist_id = {index: artist_id for index, artist_id in enumerate(df.index.tolist())}
    artist_id_to_index = {artist_id: index for index, artist_id in enumerate(df.index.tolist())}
    AAP_normalized[t] = (df, index_to_artist_id, artist_id_to_index)

In [5]:
from collections import defaultdict
import heapq
import skfuzzy as fuzz
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [6]:
df_plays_test = pd.read_csv('../testing/test_playcounts.txt', sep='\t')
user_ids = df_plays_test['user_id'].unique()
liked_artists = {}
for t in thresholds:
    mask = df_plays_test['artist_id'].isin(AAP[t][2]) & (df_plays_test['playcount'] >= t)
    liked_artists[t] = df_plays_test[mask].groupby('user_id')['artist_id'].apply(list).to_dict(into=defaultdict(list))

In [7]:
def train_and_test(threshold: int, normalized: bool, n_clusters: int, m_value: float = 2.0):
    if normalized:
        data = AAP[threshold]
    else:
        data = AAP_normalized[threshold]

    _, u, _, _, jm, _, fpc = fuzz.cmeans(data[0].values.T, n_clusters, m_value, error=0.005, maxiter=2000)
    if len(jm) == 2000 and abs(jm[-2] - jm[-1]) > 0.005:
        print(f"The algorithm didn't converge.  {m_value} {n_clusters}")

    return u.T

In [8]:
def recommend_and_evaluate(fuzzy_c_partitioned_matrix, threshold: int, normalized: bool):
    if normalized:
        data = AAP[threshold]
    else:
        data = AAP_normalized[threshold]

    precisions = []
    similarities = []
    for i, user_id in enumerate(user_ids):
        # print(f"User {i}")
        artist_ids = liked_artists[threshold][user_id]
        if len(artist_ids) < 2:
            continue

        if len(artist_ids) < 5:
            train_artists, test_artists = train_test_split(artist_ids, train_size=0.75, random_state=42)
        else:
            train_artists, test_artists = train_test_split(artist_ids, test_size=0.2, random_state=42)

        k = 100
        top_k = []
        for artist_id in train_artists:
            index = data[2][artist_id]
            cosine_sim = cosine_similarity(fuzzy_c_partitioned_matrix[[index]], fuzzy_c_partitioned_matrix)
            cosine_sim[0][index] = 0
            for idx, sim_value in enumerate(cosine_sim[0]):
                # if sim_value > 0.8:
                #     continue

                if len(top_k) < k:
                    heapq.heappush(top_k, (sim_value, idx))
                else:
                    if sim_value > top_k[0][0]:
                        heapq.heapreplace(top_k, (sim_value, idx))

        top_k.sort(key=lambda x: x[0], reverse=True)
        positive = 0
        for sim_value, index in top_k:
            artist_id = data[1][index]
            if artist_id in test_artists:
                # print(f"GOAL!       {sim_value}")
                positive += 1
                similarities.append(sim_value)

        precisions.append(positive / min(k, len(test_artists)))

    print(f'precision: {sum(precisions) / len(precisions)}')
    print('similarities')
    print(similarities)

In [9]:
u_t10_c10 = train_and_test(threshold=10, normalized=False, n_clusters=10)

In [10]:
recommend_and_evaluate(fuzzy_c_partitioned_matrix=u_t10_c10, threshold=10, normalized=False)

precision: 0.00859016851142932
similarities
[1.0, 0.999861455089224, 1.0, 0.9999545191829152, 0.9999380822056155, 0.9991728786302317, 0.9998805137835383, 0.9998804103517286, 0.9998508345863114, 0.9998480109753289, 0.9995245042807319, 0.9986448389090917, 1.0000000000000002, 0.9999585225975169, 0.9999683063674756, 0.9999999350144388, 1.0, 1.0000000000000002]


In [11]:
u_t20_c10 = train_and_test(threshold=20, normalized=False, n_clusters=10)

In [12]:
recommend_and_evaluate(fuzzy_c_partitioned_matrix=u_t20_c10, threshold=20, normalized=False)

precision: 0.005805887272650279
similarities
[0.9999585233462435, 0.9999742854760714, 0.9999999979930168, 0.9999943531241384, 0.9998795112703558, 0.9964932493255426, 0.9995166665396936, 0.9999418672356526]


In [13]:
u_t50_c10 = train_and_test(threshold=50, normalized=False, n_clusters=10)

In [14]:
recommend_and_evaluate(fuzzy_c_partitioned_matrix=u_t50_c10, threshold=50, normalized=False)

precision: 0.005767012687427912
similarities
[1.0, 0.999993808985878, 0.9998578082134144]


In [15]:
u_t100_c10 = train_and_test(threshold=100, normalized=False, n_clusters=10)

In [16]:
recommend_and_evaluate(fuzzy_c_partitioned_matrix=u_t100_c10, threshold=100, normalized=False)

precision: 0.013947990543735224
similarities
[0.9999580700719541, 0.9998054239659703, 0.9992074499425417, 0.9999767233041006, 0.9985829065622455, 0.9997810450473583]


In [17]:
u_t200_c10 = train_and_test(threshold=200, normalized=False, n_clusters=10)

In [18]:
recommend_and_evaluate(fuzzy_c_partitioned_matrix=u_t200_c10, threshold=200, normalized=False)

precision: 0.0009469696969696969
similarities
[0.9999998982923699]


In [19]:
u_t10_c10_normalized = train_and_test(threshold=10, normalized=True, n_clusters=10)

In [20]:
recommend_and_evaluate(fuzzy_c_partitioned_matrix=u_t10_c10_normalized, threshold=10, normalized=True)

precision: 0.00172183628686611
similarities
[0.9997544606657949, 1.0000000000000002, 1.0000000000000002, 0.999999975705366, 0.9999997701127484]


In [21]:
u_t20_c10_normalized = train_and_test(threshold=20, normalized=True, n_clusters=10)
recommend_and_evaluate(fuzzy_c_partitioned_matrix=u_t20_c10_normalized, threshold=20, normalized=True)

precision: 0.0006318344468633486
similarities
[0.999999999999906, 1.0000000000000002, 1.0000000000000002]


In [22]:
u_t50_c10_normalized = train_and_test(threshold=50, normalized=True, n_clusters=10)
recommend_and_evaluate(fuzzy_c_partitioned_matrix=u_t50_c10_normalized, threshold=50, normalized=True)

precision: 0.01124567474048443
similarities
[0.9928147520750756, 0.2878952516383318, 0.9999941683539816, 0.9921742745343122]


In [23]:
u_t100_c10_normalized = train_and_test(threshold=100, normalized=True, n_clusters=10)
recommend_and_evaluate(fuzzy_c_partitioned_matrix=u_t100_c10_normalized, threshold=100, normalized=True)

precision: 0.015425531914893617
similarities
[0.9994854514202852, 0.9999943120692364, 0.9998506784172371, 0.9999220855283236, 0.9998627306987357, 0.9987138019313977]


In [24]:
u_t200_c10_normalized = train_and_test(threshold=200, normalized=True, n_clusters=10)
recommend_and_evaluate(fuzzy_c_partitioned_matrix=u_t200_c10_normalized, threshold=200, normalized=True)

precision: 0.025757575757575757
similarities
[0.9999951324278644, 0.99829115871012, 0.9975436865096408, 0.9966836968225543, 0.2638177361658134, 0.9994453326363784, 0.9999696460490249, 0.9884660258461484]


In [25]:
for n_clusters in [15, 20, 25]:
    for t in thresholds:
        print(f"threshold:  {t} n_clusters: {n_clusters}")
        recommend_and_evaluate(train_and_test(t, False, n_clusters), t, False)
        print(f"threshold:  {t} n_clusters: {n_clusters}    normalized")
        recommend_and_evaluate(train_and_test(t, True, n_clusters), t, True)
        print("--------------------------------------------------------------")

threshold:  10 n_clusters: 15
precision: 0.006332763944866163
similarities
[1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 0.9999899950530095, 0.9999034745172852, 0.9999003365383617, 0.9998864796009805, 0.9995402933826243, 0.9999457985714117, 0.9996085692301581, 0.9989445634742403, 0.9992644820928182]
threshold:  10 n_clusters: 15    normalized
precision: 0.001991327625495011
similarities
[1.0000000000000002, 0.999965236800302, 0.998459146597457, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002]
--------------------------------------------------------------
threshold:  20 n_clusters: 15
precision: 0.0019611890999174236
similarities
[0.999914967137978, 0.9999035034700924, 0.9999350037415098, 0.9998502912870395]
threshold:  20 n_clusters: 15    normalized
precision: 0.0016968821159572603
similarities
[1.0000000000000004, 0.9993024375345224, 0.9999998277921024]
--------------------------------------------------------------
threshold:  50 n_clusters: 15
precision: 0.