In [1]:
import numpy as np
import matplotlib
import sys
import os 
import matplotlib.pyplot as plt
import scipy
import scipy.sparse

import reachability as re
import pywFM as fm

import pandas as pd

import seaborn as sns

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib notebook


In [3]:
# import matplotlib.font_manager as font_manager
# for font in  font_manager.findSystemFonts():
#     print(font)
# # https://dallascard.github.io/changing-the-font-in-matplotlib.html

In [4]:
    
from os.path import expanduser
import matplotlib as mpl
import matplotlib.font_manager as font_manager
fontpath = expanduser('/usr/share/fonts/opentype/linux-libertine/LinLibertine_R.otf')
prop = font_manager.FontProperties(fname=fontpath)
mpl.rcParams['font.family'] = prop.get_name()
mpl.rcParams['text.usetex'] = True

In [5]:
# matplotlib.rcParams['text.usetex'] = True
sns.set(style="ticks")
plt.rc('font', family='serif')
plt.rc('font', serif=prop.get_name())


SMALL_SIZE = 12
MEDIUM_SIZE = 14
BIGGER_SIZE = 16

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title


outfilestem = 'temp_plots/'  


# Plotting Over Multiple k

In [6]:

datapaths =['./ml-10M100K/',
            './lastfm-dataset-1K/']
filenames = ['ml', 'fm'] 


In [7]:
latent_dims = [16, 32, 64, 128, 256, 512]
cmap = plt.cm.autumn
color_list_unreach = [plt.cm.autumn(i) for i in range(0,cmap.N,int(cmap.N/len(latent_dims)))]
color_list_reach = [plt.cm.summer(i) for i in range(0,cmap.N,int(cmap.N/len(latent_dims)))]


In [8]:
df = pd.read_csv('./ml-10M100K/' + 'ratings.dat', sep='::', header=None)
df

  """Entry point for launching an IPython kernel.


FileNotFoundError: [Errno 2] No such file or directory: './ml-10M100K/ratings.dat'

In [None]:
## Dataset Distribution


plt.figure(figsize=(6,3))
plt.subplot(1,2,1)
ax = df[2].hist()
ax.set_yticklabels([int(y / 1000000) for y in ax.get_yticks()])
plt.ylabel('millions of datapoints')
plt.xlabel('ratings')
plt.title('MovieLens')


## Dataset Distribution

df2 = pd.read_csv('./lastfm-dataset-1K/' + 'lfm1k-log-play-counts.csv', header=None)
plt.subplot(1,2,2)
ax = df2[2].hist()
ax.set_yticklabels([int(y / 100000) for y in ax.get_yticks()])
plt.ylabel('100,000 datapoints')
plt.xlabel('log-listens')
plt.title('LastFM')

plt.tight_layout()
plt.savefig(outfilestem+'data-dist.pdf', bbox_inches='tight')

In [None]:
res_data = {}
for datapath, filename in zip(datapaths, filenames):

    data = {}
    for latent_dim in latent_dims:
        savefile = os.path.join(datapath, filename+'_reachability_k={}.npz'.format(latent_dim))
        data[latent_dim] = np.load(savefile, allow_pickle=True)
        
    res_data[filename] = data


In [None]:
[key for key in res_data['ml'][16].keys()]

In [None]:
## Model Accuracy


for datapath, filename in zip(datapaths, filenames):
    data =res_data [filename]

    RMSEs = []
    for j,latent_dim in enumerate(latent_dims):
        RMSEs.append(data[latent_dim]['RMSE'])


    plt.figure(figsize=(6,3))
    plt.plot(latent_dims, RMSEs, '-o', color='black')
#     plt.ylim([0.76, 0.79])
    plt.ylabel('RMSE')
    plt.xlabel('latent dimension')
    plt.title('Predictive Accuracy')
    plt.tight_layout()
    plt.savefig(outfilestem+'model-acc.pdf', bbox_inches='tight')

In [None]:
data[latent_dim]

In [None]:
data

In [None]:
## Item reachability vs. N with lines for multiple k
for key, data in res_data.items():
    plt.figure(figsize=(6,4))
    for j,latent_dim in enumerate(latent_dims):
        plt.plot(data[latent_dim]['Ns'],[len(reach) for reach in data[latent_dim]['reachable_items']], '-o', 
                 label='d={}'.format(latent_dim), color=color_list_reach[j],alpha=0.9)
    plt.title('Top-$N$ Availability')
    plt.ylabel('number of items'); plt.xlabel('$N$'); plt.legend(loc='lower right')
    plt.tight_layout()

    plt.savefig(outfilestem+'item-reachable-vs-n.pdf', bbox_inches='tight')

In [None]:
df_item_reach = {}
statfiles = ['movie_genres_stats.csv', 'artist_genres_stats.csv']

for filename, datapath, statfile in zip(filenames, datapaths, statfiles):
    print(filename, datapath)
    data = res_data[filename]
    df = pd.read_csv(datapath+statfile)

    for k in latent_dims:
        for n,reach in zip(data[latent_dim]['Ns'], data[k]['reachable_items']):
            print(n, len(reach))
            df['top-{} reachable,{}'.format(n,k)] = np.zeros(df.shape[0])
            for i in reach:
                df.loc[df['ordered_id']==i,'top-{} reachable,{}'.format(n,k)] = 1
        
    df_item_reach[filename] = df        

In [None]:
filenames


In [None]:
df_item_reach['fm']

In [None]:
# df_item_reach = df
center_measures = ['average', 'median']
for filename, cm in zip(filenames, center_measures):
    print(filename)
    df = df_item_reach[filename]
    n=5

    plt.figure(figsize=(6,4))
    from pylab import gcf
    fig = gcf()
    # plt.figure()
    # bins = np.linspace(0,5,50)

    bins = 10**(np.linspace(0,4,50))
    ax = plt.subplot(121)
    plt.hist(df['n_ratings_train'],
                 bins=bins, alpha=0.75,  histtype='step',
                               cumulative=True,  density=True, color='black',linewidth=1)
    ax.set_xscale('log')
    for j,k in enumerate(latent_dims[:4]):


    #     plot = [df[df['top-{} reachable,{}'.format(n,k)]==i]['average_ratings_train'] for i in [0,1]]
        plot = [df[df['top-{} reachable,{}'.format(n,k)]==i]['n_ratings_train'] for i in [0,1]] #*df[df['top-{} reachable'.format(n)]==0]['average_ratings_train']
        sns.distplot(plot[1], bins=bins, color=color_list_reach[j],
                     kde=False, hist_kws=dict(cumulative=True,density=True,histtype='step',linewidth=2))
        sns.distplot(plot[0], bins=bins, label='unreachable, d={}'.format(k), color=color_list_unreach[j],
                     kde=False, hist_kws=dict(cumulative=True,density=True,histtype='step',linewidth=2))
        plt.xlabel('number of ratings');

        plt.ylabel('cumulative frequency'); 

    ax.margins(x=0)

    bins = np.linspace(0,5,50)
    ax = plt.subplot(122)
    plt.hist(df['{}_ratings_train'.format(cm)],
                 bins=bins, alpha=0.75, label='population',  histtype='step',
                               cumulative=True,  density=True, color='black',linewidth=1)
    for j,k in enumerate(latent_dims[:4]):
        plot = [df[df['top-{} reachable,{}'.format(n,k)]==i]['{}_ratings_train'.format(cm)] for i in [0,1]] #*df[df['top-{} reachable'.format(n)]==0]['average_ratings_train']
        sns.distplot(plot[1], bins=bins, label='reachable, d={}'.format(k), color=color_list_reach[j],
                     kde=False, hist_kws=dict(cumulative=True,density=True,histtype='step', linewidth=2))
        sns.distplot(plot[0], bins=bins, color=color_list_unreach[j],
                     kde=False, hist_kws=dict(cumulative=True,density=True,histtype='step', linewidth=2))
        plt.xlabel('{} ratings'.format(cm)); 
        plt.ylabel('cumulative frequency'); 
    ax.margins(x=0)
    st = fig.suptitle("Popularity of Unavailable Items")

    plt.tight_layout()
    st.set_y(0.95)
    fig.subplots_adjust(top=0.85)
    plt.savefig(outfilestem+'ratings_distributions_reach_unreach.pdf', bbox_inches='tight')


In [None]:
## Mutable History user reachability vs. history length for multple n, new plots for new k

hist_cutoff = 400

for key, data in res_data.items():
    


    ## summary of something interest (one N/all k)
    plt.figure(figsize=(6,5))
    for i in [0,1]:
        ax = plt.subplot(1,2,i+1)

        for j,latent_dim in enumerate([16,32,64,256]):#latent_dims):
            translate = {0:0, 1:1, 2:2, 3:4}

            d = data[latent_dim]
            n_items = len(d['item_hist_n'])
    #         inds = d['user_hist_n'] <= hist_cutoff
            yval = (d['aligned_user_reach'][:,0,i]) / (n_items-d['user_hist_n'])
            ind_ord = np.argsort(yval)
        #     plt.plot(d['user_hist_n'][ind_ord],yval[ind_ord], 'o', 
        #              label='k={}'.format(latent_dim), color=color_list_reach[j])
            sns.regplot(d['user_hist_n'],yval,
                           label='d={}'.format(latent_dim), color=color_list_reach[translate[j]], lowess=True, ci=None, truncate=True)
        plt.title('Top-{} Recommended'.format(d['Ns'][i]), fontsize=MEDIUM_SIZE)
        plt.ylabel('reachable percent of unseen items'); plt.xlabel('history length'); 
#         plt.ylim([0,1.1])
#         plt.xlim([0,300])

    plt.legend()

    fig=gcf()

    plt.tight_layout()
    st = fig.suptitle("Amount of Recourse via History Edits")
    st.set_y(0.95)
    fig.subplots_adjust(top=0.85)
    plt.savefig(outfilestem+'user_reach_vs_history.pdf', bbox_inches='tight')


In [None]:
## Next N/Random N as Box and Whisker Plots for different N/k

for key, data in res_data.items():


    rectype = ['next','random']
    plt.figure(figsize=(6,3))
    for i in range(1,len(data[16]['Ns'])-2):
        k_for_df = []; percent_reach_for_df = []; rectype_for_df = []

        for j in [1,2]:
            for k in latent_dims:
                n_items = len(data[k]['item_hist_n'])
                k_for_df += [k] * data[k]['aligned_user_reach'].shape[0]
                rectype_name = "top-{}".format(data[k]['Ns'][i]) if rectype[j-1] == "next" else "random"
                rectype_for_df += [rectype_name] * data[k]['aligned_user_reach'].shape[0]
                percent_reach_for_df += list((data[k]['aligned_user_reach'][:,j,i]) / (n_items-data[k]['user_hist_n']))

        df = pd.DataFrame({"latent dimension": k_for_df, "percent reachable": percent_reach_for_df, "recommendation": rectype_for_df})

        plt.subplot(1,1,1)
        ax = sns.boxplot(x="latent dimension", y="percent reachable", hue="recommendation", data=df, palette="Paired")
        plt.title('Amount of Recourse via Reactions')
    plt.tight_layout()

    plt.savefig(outfilestem+'rec_reachable_vs_k.pdf', bbox_inches='tight')



In [None]:
# Effect of History

for key, data in res_data.items():


    colors = sns.color_palette("Paired", 2) # ['red','blue']
    labels = ['top-5','random']
    i=1
    plt.figure(figsize=(6,2))
    for j in [1,2]: # range(1,3):
        for l,k in enumerate([latent_dims[0], latent_dims[-1]]):
            plt.subplot(1,2,l+1)
            d = data[k]
            inds = d['user_hist_n'] <= 800 
            plt.title('$d={}$'.format(k), fontsize=MEDIUM_SIZE)
            sns.regplot(d['user_hist_n'][inds],(d['aligned_user_reach'][inds,j,i]) / (n_items-d['user_hist_n'][inds]),
                       label=labels[j-1], color=colors[j-1], truncate=True,scatter_kws={'alpha':0.5})
            plt.ylabel('percent reachable'); plt.xlabel('history length'); 

    # plt.legend()
    plt.tight_layout()
    plt.savefig(outfilestem+'rec_reachable_vs_history.pdf', bbox_inches='tight')



In [None]:
# DIFFICULTY!!!!
for filename, datapath in zip(filenames, datapaths):

    #TODO!
    
    if filename == 'fm':
        break

    latent_dims = [16, 32, 64, 128, 256, 512]

    data = {}
    for latent_dim in latent_dims:
        savefile = os.path.join(datapath, filename+'_reachability_difficulty_k={}.npz'.format(latent_dim))
        data[latent_dim] = np.load(savefile, allow_pickle=True)
    res_data[filename] = data

# 'user_difficulty' 'target_item_factor' 'user_factor' 'user_hist_n'

In [None]:

## Next N/Random N as Box and Whisker Plots for different N/k

for data in [res_data['ml']]:


    rectype = ['next','random']
    plt.figure(figsize=(6,3))

    k_for_df = []; difficulty = []; rectype_for_df = []
    amount_reachable = []

    for j in [1,2]:
        for k in latent_dims:
            k_for_df += [k] * data[k]['user_difficulty'].shape[0]
            rectype_name = "top-20" if rectype[j-1] == "next" else "random"

            rectype_for_df += [rectype_name] * data[k]['user_difficulty'].shape[0]
            difficulty += list(data[k]['user_difficulty'][:,j].flatten())
            amount_reachable.append(sum(data[k]['user_difficulty'][:,j].flatten()<np.inf))

    df = pd.DataFrame({"latent dimension": k_for_df, "cost": difficulty, "recommendation": rectype_for_df})

    plt.subplot(1,1,1)
    ax = sns.boxplot(x="latent dimension", y="cost", hue="recommendation", data=df[df['cost']< np.inf], palette="Paired")
    plt.title('Difficulty of Recourse')
    plt.tight_layout()

    plt.savefig(outfilestem+'rec_recourse_cost_vs_k.pdf', bbox_inches='tight')
