# Implementation of the Topic-Specific PageRank algorithm 

In [12]:
import os
import json
import time
import pandas as pd
import numpy as np
import scipy
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix
from numpy import linalg

In [13]:
#Function used for import of data in json format
def data_iterator(path):
    """Yield one reduced record for every ``.json`` file found under *path*.

    The directory tree rooted at *path* is walked recursively with
    ``os.walk``; files whose name does not end in ``.json`` are skipped.

    Args:
        path: Root directory of the dataset.

    Yields:
        A dict with the keys ``"similars"``, ``"track_id"`` and ``"tags"``
        copied from the parsed JSON document.

    Raises:
        KeyError: If a JSON document lacks one of the three keys.
        json.JSONDecodeError: If a ``.json`` file is not valid JSON.
    """
    print("Importing data. Please wait.\n")

    for root, _dirs, files in os.walk(path):
        for name in files:
            if not name.endswith('.json'):
                continue

            # Binary mode: json.load then detects UTF-8 (with or without BOM),
            # UTF-16 and UTF-32 itself instead of depending on the locale's
            # default text encoding.
            with open(os.path.join(root, name), "rb") as handle:
                data = json.load(handle)

            yield {
                "similars": data["similars"],
                "track_id": data["track_id"],
                "tags": data["tags"],
            }


In [14]:
#function will return adjacency matrix A of csr format
#element Aij of matrix A is 1/sum_in_row if the edge between nodes i and j exists or 0 otherwise
def create_adjacency_matrix(t = 0):
    """Build the row-normalised song similarity matrix in CSR format.

    Reads the module-level ``df``, ``track_id`` and ``similars`` and fills
    the module-level ``nodes`` (song id -> index) and ``all_songs``
    (index -> song id) as new songs are encountered. A song listed only as
    a "similar" entry still receives an index, even when its weight does
    not pass the threshold.

    Args:
        t: Edge threshold; an edge is kept only when ``weight > t``.

    Returns:
        An N x N ``csr_matrix`` (N = number of known songs) in which every
        kept edge of row i holds 1 / (number of kept edges in row i).
    """
    def index_of(song):
        # Hand out the next free index the first time a song is seen.
        if song not in nodes:
            nodes[song] = len(nodes)
            all_songs.append(song)
        return nodes[song]

    edge_rows, edge_cols = [], []
    edges_in_row = {}  # row index -> number of edges kept for that row

    for idx in range(len(df)):
        source = index_of(track_id[idx])

        for entry in similars[idx]:
            neighbour, weight = entry[0], entry[1]
            target = index_of(neighbour)

            # Only edges strictly above the threshold enter the matrix.
            if weight > t:
                edge_rows.append(source)
                edge_cols.append(target)
                edges_in_row[source] = edges_in_row.get(source, 0) + 1

    # Every edge of a row gets the same share, so each non-empty row sums to 1.
    weights = [1.0 / edges_in_row[source] for source in edge_rows]

    size = len(nodes)
    as_coo = scipy.sparse.coo_matrix(
        (weights, (edge_rows, edge_cols)), shape=(size, size)
    )
    return scipy.sparse.csr_matrix(as_coo)
                
                      

In [15]:
#function returns an array of songs that have all tags that user wants
def get_relevants(user_tags, tag_matrix):
    """Return the row indices of songs that carry every known user tag.

    Tags missing from the module-level ``tags_unique`` are ignored.

    Args:
        user_tags: Iterable of tag names requested by the user.
        tag_matrix: Sparse song x tag matrix; a nonzero entry marks a song
            as related to a tag.

    Returns:
        An empty list when no requested tag is known, the index array of
        the tag's column when exactly one tag is known, otherwise a list
        holding the intersection of the index sets of all known tags.
    """
    matched = None  # stays None until the first known tag has been handled

    for wanted in user_tags:
        if wanted in tags_unique:
            column = tag_matrix.getcol(tags_unique[wanted])
            songs_with_tag = column.nonzero()[0]  # row indices = songs

            if matched is None:
                # First known tag: nothing to intersect with yet.
                matched = songs_with_tag
            else:
                matched = list(set(matched) & set(songs_with_tag))

    if matched is None:
        return []
    return matched

In [16]:
#function will return the matrix that has nonzero elements if song is related to tag
#first coordinate of matrix represents songs and the second coordinate represents tags
def create_tag_matrix(g = 50):
    """Build the sparse song x tag matrix in CSC format.

    Reads the module-level ``track_id``, ``tags``, ``nodes`` and ``A`` and
    fills the module-level ``tags_unique`` (tag -> column index). Every tag
    seen receives a column, even when its count does not pass the threshold.

    A tag whose count exceeds *g* is recorded for the song itself and for
    every song that has a nonzero entry in the song's row of ``A``.

    Args:
        g: Tag threshold; a tag is used only when ``int(count) > g``.

    Returns:
        An N x M ``csc_matrix`` (N = ``len(nodes)``, M = ``len(tags_unique)``)
        with a nonzero entry wherever a song is related to a tag. Entries
        recorded more than once are summed by the sparse conversion, so
        values may exceed 1.

    Raises:
        KeyError: If a track id is missing from ``nodes`` (the adjacency
            matrix must have been built first).
    """
    rows, cols, values = [], [], []

    for song, song_tags in zip(track_id, tags):
        song_row = nodes[song]

        # The neighbours depend only on the song, so they are looked up at
        # most once per song (and not at all if no tag passes the threshold).
        neighbours = None

        for entry in song_tags:
            tag = entry[0]
            count = int(entry[1])

            if tag not in tags_unique:
                tags_unique[tag] = len(tags_unique)
            tag_col = tags_unique[tag]

            if count > g:
                if neighbours is None:
                    neighbours = A.getrow(song_row).nonzero()[1]

                # The song itself first, then each of its neighbours.
                rows.append(song_row)
                rows.extend(neighbours)
                marked = len(neighbours) + 1
                cols.extend([tag_col] * marked)
                values.extend([1] * marked)

    N = len(nodes)
    M = len(tags_unique)
    tag_matrix = scipy.sparse.coo_matrix((values, (rows, cols)), shape=(N, M))
    return scipy.sparse.csc_matrix(tag_matrix)

In [17]:
#The most important function 
#Implementation of the Topic-Specific PageRank algorithm
def get_rank_vector(N, S, beta = 0.2, max_iter = 30, tol = 0.001):
    """Iterate the topic-specific PageRank update until it converges.

    Starting from the uniform vector, each step computes
    ``r = beta * A * r + (1 - beta) / len(S) * e_S`` where ``A`` is the
    module-level adjacency matrix and ``e_S`` is 1 on the indices in *S*
    and 0 elsewhere.

    NOTE(review): ``beta`` weights the link-following term here, so the
    teleport share is ``1 - beta``; the interactive prompt in
    ``get_parameters`` labels this value "Teleport probability" - confirm
    which meaning is intended.

    Args:
        N: Number of nodes (length of the rank vector).
        S: Indices of the relevant songs (list or index array).
        beta: Weight of the link-following term.
        max_iter: Maximum number of update steps.
        tol: Iteration stops once the L1 distance between two successive
            vectors drops below this value.

    Returns:
        An ``(N, 1)`` ndarray of ranks, or an empty list when ``N`` is 0 or
        *S* is empty (an error message is printed in that case).
    """
    if N == 0:
        print("Error: No relevant songs")
        return []

    if len(S) == 0:
        print("Error: No relevant tags")
        return []

    rank = np.full((N, 1), 1 / N)  # uniform start

    # Constant teleport term: the (1 - beta) mass is split evenly over S.
    teleport = np.zeros((N, 1))
    teleport[S] = (1 - beta) * (1.0 / len(S))

    for iteration in range(max_iter):
        previous = rank
        # One sparse mat-vec plus a vectorised add; scaling the product
        # avoids rebuilding a scaled copy of A on every step.
        rank = beta * (A @ previous) + teleport

        # For an (N, 1) array the matrix 1-norm is the sum of absolute values.
        if linalg.norm(previous - rank, 1) < tol:
            print("converged in step number ", iteration)
            break

    return rank

In [18]:
#Function returns the top N songs (highest rank first)
def get_top_n(r, n):
    """Return the ids of the *n* highest-ranked songs, best first.

    Song ids are looked up in the module-level ``all_songs`` by index.

    Args:
        r: Rank vector, one entry per song (an ``(N, 1)`` array as produced
            by ``get_rank_vector``).
        n: Number of songs wanted.

    Returns:
        A list of at most *n* song ids ordered from highest to lowest rank.
        When *n* exceeds the number of ranked songs, all songs are returned;
        when *n* is not positive the list is empty.
    """
    if n <= 0:
        return []

    scores = np.ravel(r)

    # argsort is ascending: keep the last n positions and flip them so the
    # best song comes first. The slice never runs past the array, so asking
    # for more songs than exist no longer raises IndexError.
    best_first = scores.argsort()[-n:][::-1]

    return [all_songs[idx] for idx in best_first]

In [19]:
#User input function
def get_tags():
    """Ask the user how many tags they want and read them one per line.

    Returns:
        The list of entered tag strings, in the order they were typed.

    Raises:
        ValueError: If the tag count typed by the user is not an integer.
    """
    wanted = int(input("How many tags do you want? "))
    print("Insert tags that you want (if tag does not exist it will be ignored)")

    chosen = []
    for position in range(1, wanted + 1):
        # Label each prompt with its 1-based position.
        print(position, "tag: ")
        entered = input("")
        chosen.append(entered)

    print ("\nChosen tags: ", chosen, "\n\n")
    return chosen

In [20]:
#User input function
def _parse_threshold(text):
    """Parse *text* as an int when possible, otherwise as a float."""
    try:
        return int(text)
    except ValueError:
        return float(text)


def get_parameters():
    """Ask the user for the algorithm parameters or fall back to defaults.

    Answering exactly ``y`` selects the defaults ``t = 0``, ``g = 50``,
    ``beta = 0.2`` and ``n = 5``; any other answer prompts for each value.

    The edge threshold is later compared with the similarity weights
    (``weight > t`` in ``create_adjacency_matrix``). It is parsed as an int
    when the text is an integer and as a float otherwise, so a fractional
    threshold such as ``0.5`` is accepted instead of raising ``ValueError``.

    Returns:
        The tuple ``(t, g, beta, n)``.

    Raises:
        ValueError: If one of the typed values is not a number of the
            expected kind.
    """
    answer = input("\nDo you want to use default parameters? y/n ")

    if answer == "y":
        return 0, 50, 0.2, 5

    t = _parse_threshold(input("Edge threshold: "))
    g = int(input("Tag threshold: "))
    beta = float(input("Teleport probability: "))
    n = int(input("Number of best songs: "))

    return t, g, beta, n

In [21]:
# Driver script: load the dataset, read the user's choices, build the
# matrices, run the ranking and print the best songs.

#path = r"C:\Users\Maki\Desktop\MMDS\lastfm_subset\lastfm_subset"
path = input("Insert path to dataset: ")
# The generator is fully consumed here, one DataFrame row per JSON file.
df = pd.DataFrame(data_iterator(path))

#GLOBAL VARIABLES:
# These names are read and filled by the functions defined above.
nodes = dict()        # song id -> index; filled by create_adjacency_matrix
tags_unique = dict()  # tag -> column index; filled by create_tag_matrix
all_songs = []        # index -> song id; filled by create_adjacency_matrix
track_id = df["track_id"]
similars = df["similars"]
tags = df["tags"]

#PARAMETERS:
t, g, beta, n = get_parameters()
print("Chosen parameters: t =", t, "  g =", g, "  beta =", beta, "  n =", n, "\n")
user_tags = get_tags()

#RUN PAGERANK ALGORITHM
# The timer starts after loading and user input, so only the matrix
# construction and the ranking are measured.
start_time = time.time()

print("\n--------------------------------------------------------")
print("Topic-Specific PageRang algorithm launched. Please wait.\n")

# Order matters: create_tag_matrix reads both `nodes` and `A`, which only
# exist once the adjacency matrix has been built.
A = create_adjacency_matrix(t)
tag_matrix = create_tag_matrix(g)

S = get_relevants(user_tags, tag_matrix)
r = get_rank_vector(len(nodes), S, beta)

# get_rank_vector returns an empty list after printing an error message.
if len(r) > 0:
    print("\nTop", n ,"songs: ")
    topN = get_top_n(r, n)
    for i, s in enumerate(topN):
        print(i+1, s)

print('\nTime: ', round(time.time() - start_time, 3))

Insert path to dataset: C:\Users\Maki\Desktop\MMDS\lastfm_subset\lastfm_subset
Importing data. Please wait.


Do you want to use default parameters? y/n y
Chosen parameters: t = 0   g = 50   beta = 0.2   n = 5 

How many tags do you want? 2
Insert tags that you want (if tag does not exist it will be ignored)
1 tag: 
rock
2 tag: 
pop

Chosen tags:  ['rock', 'pop'] 



--------------------------------------------------------
Topic-Specific PageRang algorithm launched. Please wait.

converged in step number  2

Top 5 songs: 
1 TRAMCJI128F93335AF
2 TRALDUE128F9312AF2
3 TRBAMHJ128F9302A08
4 TRAZOSB128F9302A07
5 TRAEUIW12903D018F0

Time:  29.184
