In [1]:
import pandas as pd

In [2]:
thresholds = [10, 20, 50, 100, 200]

In [3]:
AAP = {}
AAP_normalized = {}
for t in thresholds:
    df = pd.read_csv(f'../LFM-1b_UGP/AAP/LFM-1b_AAP_{t}.txt', sep='\t')
    df.set_index('artist_id', inplace=True)
    df = df.loc[(df.sum(axis=1) != 0)]
    index_to_artist_id = {index: artist_id for index, artist_id in enumerate(df.index.tolist())}
    artist_id_to_index = {artist_id: index for index, artist_id in enumerate(df.index.tolist())}
    AAP[t] = (df, index_to_artist_id, artist_id_to_index)

    df = pd.read_csv(f'../LFM-1b_UGP/AAP/LFM-1b_AAP_{t}_normalized.txt', sep='\t')
    df.set_index('artist_id', inplace=True)
    df = df.loc[(df.sum(axis=1) != 0)]
    index_to_artist_id = {index: artist_id for index, artist_id in enumerate(df.index.tolist())}
    artist_id_to_index = {artist_id: index for index, artist_id in enumerate(df.index.tolist())}
    AAP_normalized[t] = (df, index_to_artist_id, artist_id_to_index)

In [4]:
from collections import defaultdict
import heapq
import skfuzzy as fuzz
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import train_test_split

In [5]:
df_plays_test = pd.read_csv('../testing/test_playcounts.txt', sep='\t')
user_ids = df_plays_test['user_id'].unique()
liked_artists = {}
for t in thresholds:
    mask = df_plays_test['artist_id'].isin(AAP[t][2]) & (df_plays_test['playcount'] >= t)
    liked_artists[t] = df_plays_test[mask].groupby('user_id')['artist_id'].apply(list).to_dict(into=defaultdict(list))

In [6]:
def train_and_test(threshold: int, normalized: bool, n_clusters: int, m_value: float = 2.0):
    if normalized:
        data = AAP[threshold]
    else:
        data = AAP_normalized[threshold]

    _, u, _, _, jm, _, fpc = fuzz.cmeans(data[0].values.T, n_clusters, m_value, error=0.005, maxiter=2000)
    if len(jm) == 2000 and abs(jm[-2] - jm[-1]) > 0.005:
        print(f"The algorithm didn't converge.  {m_value} {n_clusters}")

    return u.T

In [7]:
def recommend_and_evaluate(fuzzy_c_partitioned_matrix, threshold: int, normalized: bool):
    if normalized:
        data = AAP[threshold]
    else:
        data = AAP_normalized[threshold]

    precisions = []
    distances = []
    for i, user_id in enumerate(user_ids):
        # print(f"User {i}")
        artist_ids = liked_artists[threshold][user_id]
        if len(artist_ids) < 2:
            continue

        if len(artist_ids) < 5:
            train_artists, test_artists = train_test_split(artist_ids, train_size=0.75, random_state=42)
        else:
            train_artists, test_artists = train_test_split(artist_ids, test_size=0.2, random_state=42)

        k = 100
        top_k = []
        for artist_id in train_artists:
            index = data[2][artist_id]
            pw_distances = pairwise_distances(fuzzy_c_partitioned_matrix[[index]], fuzzy_c_partitioned_matrix)
            pw_distances[0][index] = float('inf')
            for idx, distance in enumerate(pw_distances[0]):
                # if sim_value > 0.8:
                #     continue

                if len(top_k) < k:
                    heapq.heappush(top_k, (distance, idx))
                else:
                    if distance < top_k[0][0]:
                        heapq.heapreplace(top_k, (distance, idx))

        positive = 0
        for distance, index in top_k:
            artist_id = data[1][index]
            if artist_id in test_artists:
                # print(f"GOAL!       {distance}")
                positive += 1
                distances.append(distance)

        precisions.append(positive / min(k, len(test_artists)))

    print(f'precision: {sum(precisions) / len(precisions)}')
    # print('distances')
    # print(distances)

In [8]:
models = defaultdict(dict)
models_normalized = defaultdict(dict)

In [9]:
for n_clusters in [10, 15, 20, 25]:
    for t in thresholds:
        print(f"threshold:  {t} n_clusters: {n_clusters}")
        if t in models and n_clusters in models[t]:
            model = models[t][n_clusters]
            print("Found")
        else:
            model = train_and_test(t, False, n_clusters)
            models[t][n_clusters] = model

        recommend_and_evaluate(model, t, False)

        print(f"threshold:  {t} n_clusters: {n_clusters}    normalized")
        if t in models_normalized and n_clusters in models_normalized[t]:
            model_normalized = models_normalized[t][n_clusters]
            print("Found")
        else:
            model_normalized = train_and_test(t, True, n_clusters)
            models_normalized[t][n_clusters] = model_normalized

        recommend_and_evaluate(model_normalized, t, True)
        print("--------------------------------------------------------------")

threshold:  10 n_clusters: 10
precision: 0.042934975141200864
threshold:  10 n_clusters: 10    normalized
precision: 0.04340409315130248
--------------------------------------------------------------
threshold:  20 n_clusters: 10
precision: 0.046227113240847675
threshold:  20 n_clusters: 10    normalized
precision: 0.046116633009549324
--------------------------------------------------------------
threshold:  50 n_clusters: 10
precision: 0.052501717245090083
threshold:  50 n_clusters: 10    normalized
precision: 0.05198934034863014
--------------------------------------------------------------
threshold:  100 n_clusters: 10
precision: 0.0537843175609133
threshold:  100 n_clusters: 10    normalized
precision: 0.05359089396323438
--------------------------------------------------------------
threshold:  200 n_clusters: 10
precision: 0.06699503148366785
threshold:  200 n_clusters: 10    normalized
precision: 0.06813139512003148
-------------------------------------------------------------