In [1]:
import pandas as pd

In [2]:
thresholds = [10, 20, 50, 100, 200]

In [3]:
df_plays = pd.read_csv('../LFM-1b/LFM-1b_LEs_PC.txt', sep='\t')
grouped_plays = df_plays.groupby('user_id')

In [4]:
AFP = {}
AFP_normalized = {}
for t in thresholds:
    df = pd.read_csv(f'../LFM-1b_UGP/AFP/LFM-1b_AFP_{t}.txt', sep='\t')
    df.set_index('artist_id', inplace=True)
    df = df.loc[(df.sum(axis=1) != 0)]
    index_to_artist_id = {index: artist_id for index, artist_id in enumerate(df.index.tolist())}
    artist_id_to_index = {artist_id: index for index, artist_id in enumerate(df.index.tolist())}
    AFP[t] = (df, index_to_artist_id, artist_id_to_index)

    df = pd.read_csv(f'../LFM-1b_UGP/AFP/LFM-1b_AFP_{t}_normalized.txt', sep='\t')
    df.set_index('artist_id', inplace=True)
    df = df.loc[(df.sum(axis=1) != 0)]
    index_to_artist_id = {index: artist_id for index, artist_id in enumerate(df.index.tolist())}
    artist_id_to_index = {artist_id: index for index, artist_id in enumerate(df.index.tolist())}
    AFP_normalized[t] = (df, index_to_artist_id, artist_id_to_index)

In [5]:
from collections import defaultdict
import heapq
import skfuzzy as fuzz
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [6]:
df_plays_test = pd.read_csv('../testing/test_playcounts.txt', sep='\t')
user_ids = df_plays_test['user_id'].unique()
liked_artists = {}
for t in thresholds:
    mask = df_plays_test['artist_id'].isin(AFP[t][2]) & (df_plays_test['playcount'] >= t)
    liked_artists[t] = df_plays_test[mask].groupby('user_id')['artist_id'].apply(list).to_dict(into=defaultdict(list))

In [7]:
def train_and_test(threshold: int, normalized: bool, n_clusters: int, m_value: float = 2.0):
    if normalized:
        data = AFP[threshold]
    else:
        data = AFP_normalized[threshold]

    _, u, _, _, jm, _, fpc = fuzz.cmeans(data[0].values.T, n_clusters, m_value, error=0.005, maxiter=2000)
    if len(jm) == 2000 and abs(jm[-2] - jm[-1]) > 0.005:
        print(f"The algorithm didn't converge.  {m_value} {n_clusters}")

    return u.T

In [8]:
def recommend_and_evaluate(fuzzy_c_partitioned_matrix, threshold: int, normalized: bool):
    if normalized:
        data = AFP[threshold]
    else:
        data = AFP_normalized[threshold]

    precisions = []
    similarities = []
    for i, user_id in enumerate(user_ids):
        # print(f"User {i}")
        artist_ids = liked_artists[threshold][user_id]
        if len(artist_ids) < 2:
            continue

        if len(artist_ids) < 5:
            train_artists, test_artists = train_test_split(artist_ids, train_size=0.75, random_state=42)
        else:
            train_artists, test_artists = train_test_split(artist_ids, test_size=0.2, random_state=42)

        k = 100
        top_k = []
        for artist_id in train_artists:
            index = data[2][artist_id]
            cosine_sim = cosine_similarity(fuzzy_c_partitioned_matrix[[index]], fuzzy_c_partitioned_matrix)
            cosine_sim[0][index] = 0
            for idx, sim_value in enumerate(cosine_sim[0]):
                # if sim_value > 0.8:
                #     continue

                if len(top_k) < k:
                    heapq.heappush(top_k, (sim_value, idx))
                else:
                    if sim_value > top_k[0][0]:
                        heapq.heapreplace(top_k, (sim_value, idx))

        top_k.sort(key=lambda x: x[0], reverse=True)
        positive = 0
        for sim_value, index in top_k:
            artist_id = data[1][index]
            if artist_id in test_artists:
                # print(f"GOAL!       {sim_value}")
                positive += 1
                similarities.append(sim_value)

        precisions.append(positive / min(k, len(test_artists)))

    print(f'precision: {sum(precisions) / len(precisions)}')
    print('similarities')
    print(similarities)

In [9]:
u_t10_c10 = train_and_test(threshold=10, normalized=False, n_clusters=10)

In [10]:
recommend_and_evaluate(fuzzy_c_partitioned_matrix=u_t10_c10, threshold=10, normalized=False)

precision: 0.009354369252452144
similarities
[1.0000000000000004, 1.0000000000000002, 1.0, 1.0000000000000004, 1.0000000000000002, 1.0000000000000002, 1.0000000000000004, 1.0, 1.0000000000000004, 0.999959162353752, 1.0, 1.0, 1.0000000000000004, 1.0000000000000002, 1.0000000000000002, 0.9999599054936534, 0.999807531306792, 0.999959162353752, 0.9999203033180503, 0.9999049383939878, 1.0000000000000004, 1.0, 0.9998069223690574, 0.9996192759676505, 0.9996372213361178, 1.0000000000000004, 1.0, 0.9998387886064677, 0.9998604324302249, 0.9997795937394393, 1.0, 1.0000000000000004, 1.0000000000000004, 1.0000000000000004, 1.0000000000000002]


In [11]:
u_t20_c10 = train_and_test(threshold=20, normalized=False, n_clusters=10)
recommend_and_evaluate(fuzzy_c_partitioned_matrix=u_t20_c10, threshold=20, normalized=False)

precision: 0.0224895456017905
similarities
[1.0000000000000002, 1.0000000000000002, 0.9997686440224643, 0.9999220343282775, 0.999897925316418, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 0.9999696950259527, 1.0000000000000002, 0.9999077527116162, 0.999918481844055, 0.9999526725108656, 0.9998834447760168, 0.9999730383567396, 0.999968155364636, 0.9999599779623196, 1.0000000000000002, 0.9999843008532623, 1.0, 0.9999638222850361, 0.9998269525857144, 0.9998328957878476, 1.0000000000000002]


In [12]:
u_t50_c10 = train_and_test(threshold=50, normalized=False, n_clusters=10)
recommend_and_evaluate(fuzzy_c_partitioned_matrix=u_t50_c10, threshold=50, normalized=False)

precision: 0.02956599020904243
similarities
[1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 0.999999160536118, 0.9999910210539111, 1.0, 0.9999999999999999, 0.9999403709051233, 0.9999929271622379, 0.9999892224604758, 1.0000000000000002, 0.9999934688981781, 0.9999841158285532, 0.9999999999999999, 1.0000000000000002, 0.9999999999999999, 1.0000000000000002, 0.9999285404233482, 1.0000000000000002, 0.9999999999999999, 0.9999648231136307, 1.0000000000000002, 1.0000000000000002, 0.999995928284253, 0.9999762716592682, 0.9999990381178712, 0.9999987445542871, 0.999994260460252, 0.9999965160049646, 0.9999677247724669, 1.0, 1.0000000000000002, 0.9999925626590512]


In [13]:
u_t100_c10 = train_and_test(threshold=100, normalized=False, n_clusters=10)
recommend_and_evaluate(fuzzy_c_partitioned_matrix=u_t100_c10, threshold=100, normalized=False)

precision: 0.037459236798443846
similarities
[1.0000000000000002, 0.9999983514667721, 0.9999944310006261, 0.9999764568879175, 1.0, 1.0, 1.0, 0.999961978023052, 0.9999998456376068, 1.0000000000000002, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0000000000000002, 0.9999661806755424, 1.0, 1.0, 0.9999888394247738, 0.9998552448241768, 0.9999999999999999, 1.0000000000000002, 1.0000000000000002]


In [14]:
u_t200_c10 = train_and_test(threshold=200, normalized=False, n_clusters=10)
recommend_and_evaluate(fuzzy_c_partitioned_matrix=u_t200_c10, threshold=200, normalized=False)

precision: 0.07895833333333332
similarities
[0.999947888244712, 1.0, 0.9999928071802604, 0.9999999999999999, 0.9999270757927614, 1.0000000000000002, 1.0, 1.0, 0.9999808379820996, 0.9999446474177616, 0.9999926572648583, 0.9999619752951048, 0.999994879707877, 0.9999862715389763, 0.9999999407720279, 0.9998275425306786, 0.9999104313712253, 1.0000000000000002, 0.9993050888096501, 0.9999999999999998, 1.0, 1.0, 0.9999932897634549, 0.9999796855942228, 0.9997015442756779, 0.999956406308981, 0.9998947700278769]


In [15]:
u_t10_c10_normalized = train_and_test(threshold=10, normalized=True, n_clusters=10)
recommend_and_evaluate(fuzzy_c_partitioned_matrix=u_t10_c10_normalized, threshold=10, normalized=True)

precision: 0.005299517675234771
similarities
[1.0000000000000002, 1.0000000000000002, 1.0, 1.0000000000000002, 1.0000000000000002, 1.0, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 1.0, 0.9999883568948575, 1.0000000000000002, 1.0000000000000002]


In [16]:
u_t20_c10_normalized = train_and_test(threshold=20, normalized=True, n_clusters=10)
recommend_and_evaluate(fuzzy_c_partitioned_matrix=u_t20_c10_normalized, threshold=20, normalized=True)

precision: 0.010632428386652548
similarities
[1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 1.0, 1.0000000000000002, 0.9999998629569146, 1.0000000000000002, 0.9995984281818644, 1.0000000000000002, 1.0000000000000002, 1.0, 1.0000000000000002, 1.0000000000000002, 1.0, 1.0, 1.0000000000000002, 1.0000000000000002, 0.9999880827100682, 1.0000000000000002, 0.9999999999999999, 0.9999889258064336, 0.9999262852518898, 0.9999880827100682, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002]


In [17]:
u_t50_c10_normalized = train_and_test(threshold=50, normalized=True, n_clusters=10)
recommend_and_evaluate(fuzzy_c_partitioned_matrix=u_t50_c10_normalized, threshold=50, normalized=True)

precision: 0.020658285960777064
similarities
[1.0000000000000002, 0.9999958355855099, 0.9998599173214576, 0.9999999999999999, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 0.9999157961149191, 1.0000000000000002, 1.0000000000000002, 0.9999332226621566, 1.0000000000000002, 1.0000000000000002, 0.9999901561077984, 0.013659270145521062, 0.9999891484095527, 1.0]


In [18]:
u_t100_c10_normalized = train_and_test(threshold=100, normalized=True, n_clusters=10)
recommend_and_evaluate(fuzzy_c_partitioned_matrix=u_t100_c10_normalized, threshold=100, normalized=True)

precision: 0.022691038770333925
similarities
[1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 0.9999643983812383, 1.0, 0.9999999999999999, 0.9999999999999998, 1.0000000000000002, 1.0, 1.0, 0.9999999999999998, 1.0, 0.9999999999999999, 0.9999406100829213, 0.9999358165089874, 1.0000000000000002]


In [19]:
u_t200_c10_normalized = train_and_test(threshold=200, normalized=True, n_clusters=10)
recommend_and_evaluate(fuzzy_c_partitioned_matrix=u_t200_c10_normalized, threshold=200, normalized=True)

precision: 0.07255514705882353
similarities
[0.9999999999999998, 0.9999999999999998, 0.9999999999999998, 0.9998780355104803, 1.0, 1.0, 0.9998599804519542, 1.0000000000000002, 0.9968019751999588, 0.9990196798335642, 0.9864933596290958, 1.0, 0.9999999999999999, 1.0000000000000004, 0.99564114400281, 0.3910175215833397, 0.9912159105324037, 0.9624396148807552, 0.9992528910779208, 1.0000000000000004]


In [20]:
for n_clusters in [15, 20, 25]:
    for t in thresholds:
        print(f"threshold:  {t} n_clusters: {n_clusters}")
        recommend_and_evaluate(train_and_test(t, False, n_clusters), t, False)
        print(f"threshold:  {t} n_clusters: {n_clusters}    normalized")
        recommend_and_evaluate(train_and_test(t, True, n_clusters), t, True)
        print("--------------------------------------------------------------")

threshold:  10 n_clusters: 15
precision: 0.009100567426705748
similarities
[1.0000000000000004, 1.0000000000000004, 1.0000000000000002, 1.0000000000000002, 1.0000000000000002, 1.0, 1.0, 1.0000000000000002, 1.0000000000000002, 0.9976136922750296, 0.9999349145047087, 0.999846757012726, 0.9998617848219324, 0.9998191295965877, 0.999766780131592, 0.9999999999999998, 0.9997777814882125, 0.9998439318594532, 0.9996480158922223, 0.9998870030386208, 0.9999421265265713, 0.9999016918974136, 0.9998021012343768, 0.999776324577601]
threshold:  10 n_clusters: 15    normalized
precision: 0.004114924646963353
similarities
[1.0000000000000004, 1.0000000000000004, 1.0, 1.0000000000000002, 1.0000000000000004, 1.0000000000000004, 1.0000000000000004, 0.9999792675803725, 1.0, 1.0000000000000002, 0.9997612661044925, 1.0000000000000004]
--------------------------------------------------------------
threshold:  20 n_clusters: 15
precision: 0.02171641321958592
similarities
[1.0000000000000002, 1.0000000000000002,