In this notebook, we sample from the LastFM data by keeping only the artists that have more than a minimum number of listeners, so that all the algorithms can be trained on the resulting, smaller dataset.

# Libraries

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
import matplotlib.pyplot as plt
import random as rd
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
#from analyze_data import calculate_gini_coefficient
#from plots import plot_Lorenz

from collections import defaultdict
from scipy import stats
from numpy.linalg import norm
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
# set plot style: grey grid in the background:
sns.set(style="darkgrid")
pd.set_option("display.precision", 8)

# Read data

In [None]:
top_fraction = 0.2 # fraction (0.2 = top 20%) of items, ranked by number of listeners, that are labelled "popular"

In [None]:
# Raw listening events: a tab-separated file without a header row, so the
# column names are supplied explicitly.
item_col = "artist"
cols = ["user", item_col, "album", "track", "timestamp"]
user_events_file = 'https://media.githubusercontent.com/media/SavvinaDaniil/UnfairnessOfPopularityBias/main/data/ratings_music.txt'

df_events_music = pd.read_csv(user_events_file, sep="\t", names=cols)

print(f"No. of user events: {len(df_events_music)}")

In [None]:
# create user-item matrix
# Collapse repeated events: one row per (user, artist) pair, with the number
# of events for that pair stored in the 'rating' column.
df_events_music = (
    df_events_music
    .groupby(['user', item_col])
    .size()
    .reset_index(name='rating')
)
print(f"No. user-item interactions: {len(df_events_music)}")

In [None]:
# Cast every column of the interaction table (user, artist, rating) to integer dtype.
df_events_music = df_events_music.astype("int")

# Analyze data

In [None]:
def users_and_items(df_events, user_col, item_col):
    """Print the size of an interaction table.

    Reports the number of rows of ``df_events`` and the number of distinct
    values found in its ``item_col`` and ``user_col`` columns, followed by
    a blank separator. Nothing is returned.
    """
    print(f"No. user events: {len(df_events)}")
    for label, column in (("items", item_col), ("users", user_col)):
        # dropna=False keeps the count equal to len(Series.unique()).
        print(f"No. {label}: {df_events[column].nunique(dropna=False)}")
    print("\n")
def user_distribution(df_events, user_col, item_col):
    """Summarise how many rows each user has in ``df_events``.

    Prints the mean, minimum and maximum number of rows per user (rounded
    to one decimal) and returns a tuple ``(counts, n_users)`` where
    ``counts`` is the ``value_counts()`` Series of ``user_col`` and
    ``n_users`` is its length. ``item_col`` is only used to label the output.
    """
    per_user = df_events[user_col].value_counts()
    statistics = (("Mean", per_user.mean), ("Min", per_user.min), ("Max", per_user.max))
    for label, compute in statistics:
        print(f"{label} {item_col}s per user: {np.round(compute(), 1)!s}")
    print("\n")
    return per_user, len(per_user)
def item_distribution(df_events, user_col, item_col):
    """Summarise how many rows each item has in ``df_events``.

    Prints the mean, minimum and maximum number of rows per item (rounded
    to one decimal) and returns a tuple ``(counts, n_items)`` where
    ``counts`` is the ``value_counts()`` Series of ``item_col`` and
    ``n_items`` is its length. ``user_col`` is accepted for signature
    symmetry with ``user_distribution`` but is not used.
    """
    per_item = df_events[item_col].value_counts()
    statistics = (("Mean", per_item.mean), ("Min", per_item.min), ("Max", per_item.max))
    for label, compute in statistics:
        print(f"{label} users per {item_col}: {np.round(compute(), 1)!s}")
    print("\n")
    return per_item, len(per_item)

In [None]:
# Print size statistics for the full interaction table and keep the per-user
# and per-artist row counts (and how many users/artists there are) for later cells.
users_and_items(df_events_music, "user", item_col)
user_dist_music, num_users_music = user_distribution(df_events_music, "user", item_col)
item_dist_music, num_items_music = item_distribution(df_events_music, "user", item_col)

In [None]:
# item_dist_music comes from value_counts(), which orders artists from most
# to fewest listeners, so its first rows are the "popular" artists.
num_top_music = int(top_fraction * num_items_music)
top_item_dist_music = item_dist_music.iloc[:num_top_music]
print(f"No. top artists: {len(top_item_dist_music)}")

In [None]:
def calculate_popularity_for_music(df_events, top_item_dist, item_dist, num_users, user_col, item_col, low_users, medium_users, high_users):
    """Compute per-user and per-group popularity statistics.

    For every user in ``df_events`` four values are derived: the number of
    distinct items in the profile, how many of them appear in
    ``top_item_dist``, that number as a fraction of the profile, and the
    average popularity of the profile's items (``item_dist`` count divided
    by ``num_users``). Profile size, average popularity and top-item
    fraction are then averaged per user group.

    Parameters
    ----------
    df_events : DataFrame with at least the columns ``user_col`` and ``item_col``.
    top_item_dist : Series whose index holds the items regarded as popular.
    item_dist : Series mapping every item (index) to its count.
    num_users : number used to normalise the item counts.
    user_col, item_col : column names in ``df_events``.
    low_users, medium_users, high_users : objects indexed by user id; only
        their ``index`` and ``len()`` are used.

    Returns
    -------
    A 13-tuple: ``pop_count, user_hist, pop_fraq, pop_item_fraq`` (lists with
    one entry per user, in ``groupby`` order) followed by the low/medium/high
    average profile size, the low/medium/high average item popularity and
    the low/medium/high average top-item fraction. A group average is NaN
    when the corresponding group is empty.

    Notes
    -----
    Progress is printed every 1000 users.
    NOTE(review): a user found in neither ``low_users`` nor ``medium_users``
    is counted as "high" even if absent from ``high_users``, and every
    average divides by the size of the group frame, not by the number of
    group members seen in ``df_events`` - confirm the groups partition the
    users of ``df_events``.
    """
    pop_count = []  # number of top items per user
    user_hist = []  # user history sizes
    pop_fraq = []  # relative number of top items per user
    pop_item_fraq = []  # average popularity of items in user profiles

    # Loop invariants: build the popular-item set once instead of per user.
    top_items = set(top_item_dist.index)
    low_index = low_users.index
    medium_index = medium_users.index

    # Per-group running sums: [profile size, average item popularity, top-item fraction].
    totals = {"low": [0, 0, 0], "medium": [0, 0, 0], "high": [0, 0, 0]}

    for i, (u, df) in enumerate(df_events.groupby(user_col), start=1):
        user_items = set(df[item_col])
        no_user_items = len(user_items)
        no_user_pop_items = len(user_items & top_items)
        user_pop_ratio = no_user_pop_items / no_user_items
        # Explicit label-based lookup of each profile item's count.
        user_pop_item_fraq = sum(item_dist.loc[df[item_col]] / num_users) / no_user_items

        pop_count.append(no_user_pop_items)
        user_hist.append(no_user_items)
        pop_fraq.append(user_pop_ratio)
        pop_item_fraq.append(user_pop_item_fraq)

        # get user group-specific values
        if u in low_index:
            group_totals = totals["low"]
        elif u in medium_index:
            group_totals = totals["medium"]
        else:
            group_totals = totals["high"]
        group_totals[0] += no_user_items
        group_totals[1] += user_pop_item_fraq
        group_totals[2] += user_pop_ratio

        if i % 1000 == 0:
            print(i)

    def _group_mean(total, group_frame):
        # The mean over an empty group is undefined: report NaN instead of
        # raising ZeroDivisionError.
        size = len(group_frame)
        return total / size if size else float("nan")

    groups = (("low", low_users), ("medium", medium_users), ("high", high_users))
    profile_sizes = [_group_mean(totals[name][0], frame) for name, frame in groups]
    gaps = [_group_mean(totals[name][1], frame) for name, frame in groups]
    ratios = [_group_mean(totals[name][2], frame) for name, frame in groups]

    return (pop_count, user_hist, pop_fraq, pop_item_fraq, *profile_sizes, *gaps, *ratios)

In [None]:
# Locations of the three user-group files (low / medium / high); they are
# loaded by read() below, which expects a 'user_id' column in each.
low_user_file = "https://media.githubusercontent.com/media/SavvinaDaniil/UnfairnessOfPopularityBias/main/data/low_main_users.txt"
medium_user_file = "https://media.githubusercontent.com/media/SavvinaDaniil/UnfairnessOfPopularityBias/main/data/medium_main_users.txt"
high_user_file = "https://media.githubusercontent.com/media/SavvinaDaniil/UnfairnessOfPopularityBias/main/data/high_main_users.txt"

In [None]:
def read(low_user_file, medium_user_file, high_user_file):
    """Load the three user-group files and report basic statistics.

    Each file is read as a comma-separated table and indexed by its
    ``user_id`` column. The total number of users and the mean of the
    ``M_global_R_APC`` column of every group are printed.

    Returns ``(no_users, low_users, medium_users, high_users)`` where
    ``no_users`` is the summed length of the three tables.
    """
    sources = (low_user_file, medium_user_file, high_user_file)
    groups = [pd.read_csv(source, sep=',').set_index('user_id') for source in sources]
    low_users, medium_users, high_users = groups

    no_users = sum(len(group) for group in groups)
    print(f"No. of users: {no_users}")

    mainstreaminess = "M_global_R_APC"
    for tag, group in zip(("low", "med", "high"), groups):
        print(f"Average mainstreaminess per user for {tag}: {group[mainstreaminess].mean()!s}")
    return no_users, low_users, medium_users, high_users

In [None]:
# Load the user groups. Note that num_users_music is overwritten here with the
# total number of users listed in the three group files.
num_users_music, low_music, med_music, high_music= read(low_user_file, medium_user_file, high_user_file)

In [None]:
# Per-user popularity lists followed by the low/medium/high group averages
# (profile size, average item popularity, top-item ratio) on the full data.
(
    pop_count_music,
    user_hist_music,
    pop_fraq_music,
    pop_item_fraq_music,
    low_profile_size_music,
    med_profile_size_music,
    high_profile_size_music,
    low_GAP_music,
    med_GAP_music,
    high_GAP_music,
    low_ratio_music,
    med_ratio_music,
    high_ratio_music,
) = calculate_popularity_for_music(
    df_events_music,
    top_item_dist_music,
    item_dist_music,
    num_users_music,
    "user",
    "artist",
    low_music,
    med_music,
    high_music,
)

# Plot distribution

In [None]:
def plot_data_distribution(item_dist, item_col):
    """Plot the values of ``item_dist`` as a line, in the order given.

    The x axis is labelled with ``item_col`` and the y axis with
    'Number of users'. The figure uses a light-blue background with a
    white grid and white spines, hides the tick marks, and is shown with a
    blocking ``plt.show``.
    """
    plt.figure()
    axes = plt.axes()

    # White frame on all four sides; the left spine is pushed to the back.
    for side in ('bottom', 'top', 'right', 'left'):
        axes.spines[side].set_color('w')
    axes.spines['left'].set_zorder(0)

    # No tick marks on either axis, and at most five intervals on x.
    for axis in (axes.xaxis, axes.yaxis):
        axis.set_ticks_position('none')
    axes.xaxis.set_major_locator(plt.MaxNLocator(5))

    axes.set_facecolor("aliceblue")
    plt.grid(color="w", linewidth=2)

    plt.plot(item_dist.values)

    plt.xticks(fontsize='13')
    plt.yticks(fontsize='13')
    plt.xlabel(item_col, fontsize='20')
    plt.ylabel('Number of users', fontsize='20')
    plt.show(block=True)

In [None]:
# Listener counts per artist on the full data, in value_counts() order.
plot_data_distribution(item_dist_music, item_col)

In [None]:
# Notebook display only: positions 0..n-1, one per artist; the array is not stored.
np.arange(len(item_dist_music.values))

# Sampled data

In [None]:
limit = 20 # listener threshold: an artist needs strictly more than this many listeners to be included in the sampled dataset

In [None]:
# Artists whose listener count is strictly greater than `limit`.
sampled_items = item_dist_music.loc[item_dist_music > limit].index.tolist()

In [None]:
# Notebook display only: number of artists that pass the threshold.
len(sampled_items)

In [None]:
# Keep only the interaction rows whose artist is one of the sampled artists.
sampled_df_events_music = df_events_music.loc[df_events_music["artist"].isin(sampled_items)]

In [None]:
# Notebook display only: number of user-artist rows left after sampling.
len(sampled_df_events_music)

# Analyze sampled data

In [None]:
# Same statistics as for the full data, now on the sampled table. This
# overwrites user_dist_music, num_users_music, item_dist_music and
# num_items_music with the sampled-data values.
users_and_items(sampled_df_events_music, "user", item_col)
user_dist_music, num_users_music = user_distribution(sampled_df_events_music, "user", item_col)
item_dist_music, num_items_music = item_distribution(sampled_df_events_music, "user", item_col)

In [None]:
# Listener counts per artist after sampling (item_dist_music was recomputed in the previous cell).
plot_data_distribution(item_dist_music, item_col)

# Save sampled data

In [None]:
# Persist the sampled interactions. The listener threshold is taken from
# `limit` so the file name always matches the data that was written (with
# limit = 20 this is the same path as before). The output directory is created
# first because to_csv does not create missing directories.
# The DataFrame index is written as the first, unnamed column (to_csv default).
os.makedirs("data", exist_ok=True)
sampled_df_events_music.to_csv(f"data/relevant_music_data_{limit}.csv")