In [2]:
#IMPORTS
import numpy as np
import time
import math
import pandas as pd
import collections
import matplotlib.pyplot as plt
import time
import scipy
import random
from scipy.sparse.linalg import svds
from sklearn import linear_model
%matplotlib inline

In [3]:
#COLD START ISSUE AVOIDING
#The function cold_start iteratively deletes all users and songs that have <= eps (5 by default) appearances
def cold_start(matrix, all_users, all_songs, eps = 5, max_iterations = 20):
    """Iteratively prune rare songs and low-activity users from ``matrix``.

    In every pass, each song in ``all_songs`` that is stored for at most
    ``eps`` of the users in ``all_users`` is deleted from those users'
    dictionaries, and afterwards each user left with at most ``eps`` songs is
    deleted from ``matrix``. Removing users can push further songs under the
    threshold, so passes repeat until nothing changes or ``max_iterations``
    passes have run.

    Parameters
    ----------
    matrix : dict of dict
        Maps user -> {song: value}. Modified in place.
    all_users : set
        Users to consider. Removed users are discarded from it in place.
        Users that are not keys of ``matrix`` are ignored.
    all_songs : set
        Songs to consider. Removed songs are discarded from it in place.
    eps : int
        Threshold; songs/users with ``<= eps`` appearances are removed.
    max_iterations : int
        Upper bound on the number of pruning passes.

    Returns
    -------
    The same ``matrix`` object, after pruning.
    """
    for _ in range(max_iterations):

        # Count every song's listeners in one pass over the stored entries
        # instead of scanning all users once per song.
        song_counts = {}
        for u in all_users:
            if u in matrix:  # membership test avoids creating keys in a defaultdict
                for s in matrix[u]:
                    song_counts[s] = song_counts.get(s, 0) + 1

        # Songs that never appear (count 0) are dropped from all_songs as well.
        removed_songs = {s for s in all_songs if song_counts.get(s, 0) <= eps}

        if removed_songs:
            for u in all_users:
                if u in matrix:
                    songs_of_user = matrix[u]
                    # intersection() builds a new set, so deleting while looping is safe
                    for s in removed_songs.intersection(songs_of_user):
                        del songs_of_user[s]

        # Users are checked only after the songs were pruned in this pass.
        removed_users = {u for u in all_users
                         if u in matrix and len(matrix[u]) <= eps}
        for u in removed_users:
            del matrix[u]

        all_songs.difference_update(removed_songs)
        all_users.difference_update(removed_users)

        if not removed_songs and not removed_users:
            break

    return matrix

In [4]:
#The function hash_count calculates the bucket (bin) to which a play count belongs
def hash_count(n, b):
    """Map a play count ``n`` to a logarithmic bucket in the range 1..b.

    The bucket is ``floor(log2(n)) + 1``, capped at ``b``: counts 1 -> 1,
    2-3 -> 2, 4-7 -> 3, and so on; every count whose exponent reaches ``b``
    lands in bucket ``b``. ``math.log`` raises ValueError for ``n <= 0``.
    """
    exponent = int(math.log(n, 2))  # exponent of the largest power of two <= n
    return exponent + 1 if exponent < b else b

#The function preprocess_data reads data from the file and creates a dictionary of dictionaries:
#for each user it stores a dictionary of the songs that the user played, with the corresponding play counts
def preprocess_data(path = "train_triplets.txt", n = 300000, b = 10):
    """Read (user, song, plays) triplets and build a binned user -> song matrix.

    Parameters
    ----------
    path : str
        File read with ``pd.read_table``; its first three columns are taken
        as user, song and play count.
    n : int
        Maximum number of rows to read. Files with fewer rows are fine.
    b : int
        Number of bins passed to ``hash_count``.

    Returns
    -------
    A defaultdict of defaultdicts mapping user -> {song: bin}, after binning
    the summed play counts with ``hash_count`` and pruning with ``cold_start``.
    It may be empty if ``cold_start`` removes everything.
    """
    data_tmp = pd.read_table(path,
                             usecols=[0, 1, 2],
                             names=['user', 'song', 'plays'],
                             nrows=n)

    #matrix is in the format of dictionary of dictionaries and it contains only positive values
    matrix = collections.defaultdict(lambda: collections.defaultdict(int))

    #accumulate play counts; iterating over the rows that were actually read
    #(rather than range(n)) keeps this safe when the file holds fewer than n rows
    for user, song, plays in zip(data_tmp['user'], data_tmp['song'], data_tmp['plays']):
        matrix[user][song] += plays

    #BINNING
    #only values are reassigned, so iterating over items() while updating is safe
    for songs_of_user in matrix.values():
        for song, play_count in songs_of_user.items():
            songs_of_user[song] = hash_count(play_count, b)

    #COLD START ISSUE AVOIDING
    #fresh sets are passed because cold_start shrinks them in place
    matrix = cold_start(matrix, set(data_tmp['user']), set(data_tmp['song']))

    return matrix
        
 





In [10]:
#The function optimize uses the alternating optimization algorithm for the Latent Factors Method
def optimize(M, k = 30, n_iterations = 15, alpha = 1.0):
    """Factorize ``M`` as ``Q.dot(P)`` by alternating ridge regressions.

    Zero entries of ``M`` are treated as missing: every regression is fitted
    only on the non-zero entries of the row or column being updated.

    Parameters
    ----------
    M : 2-D array-like
        Ratings matrix. In this notebook the caller builds it from
        ``pd.DataFrame(matrix).T``, i.e. rows are users and columns are songs.
    k : int
        Number of latent factors; passed to ``svds``.
    n_iterations : int
        Number of alternating sweeps (one update of P followed by one of Q).
    alpha : float
        Regularization strength of the ridge regressions.

    Returns
    -------
    (P, Q) where P has shape (k, n_columns) and Q has shape (n_rows, k).
    """
    #svds expects a floating point matrix; this is a no-op for float input
    M = np.asarray(M, dtype=float)

    #initialize P and Q using SVD
    U, s, V = svds(M, k=k)
    Q = U.dot(np.diag(s))
    P = V

    reg = linear_model.Ridge(alpha=alpha, fit_intercept=False)

    n_rows, n_columns = M.shape

    for _ in range(n_iterations):
        #FIXED Q: refit every column of P
        for j in range(n_columns):
            column = M[:, j]
            observed = np.flatnonzero(column)
            if len(observed) > 0:  #a column without observations keeps its factors
                reg.fit(Q[observed], column[observed])
                P[:, j] = reg.coef_

        #FIXED P: refit every row of Q
        for i in range(n_rows):
            row = M[i]
            observed = np.flatnonzero(row)
            if len(observed) > 0:  #a row without observations keeps its factors
                reg.fit(P.transpose()[observed], row[observed])
                Q[i] = reg.coef_

    return P,Q

In [6]:
#TEST SET
#For the matrix M which is the only parameter for the function, test set that contains 200 random values from M is created
#Test set consists of triples (row, column, value)
#Chosen values are removed from the matrix M
def create_test_set(M, size = 200, seed = None):
    """Move ``size`` randomly chosen positive entries of ``M`` into a test set.

    Parameters
    ----------
    M : 2-D numpy array
        Modified in place: every chosen entry is overwritten with 0.
    size : int
        Number of entries to hold out.
    seed : optional
        If given, a private ``random.Random(seed)`` makes the choice
        reproducible; otherwise the module-level ``random`` state is used.

    Returns
    -------
    (M, test_set) where ``test_set`` is a set of (row, column, value) triples
    holding the original values of the removed entries.

    Raises
    ------
    ValueError
        If ``M`` has fewer than ``size`` positive entries.
    """
    rows, columns = np.where(M > 0)

    if size > len(rows):
        raise ValueError('Cannot sample %d test entries: M has only %d positive entries'
                         % (size, len(rows)))

    rng = random if seed is None else random.Random(seed)

    test_set = set()
    for x in rng.sample(range(len(rows)), size):
        row = rows[x]
        column = columns[x]
        test_set.add((row, column, M[row, column]))
        M[row, column] = 0  #hide the held-out value from training

    return (M, test_set)


In [7]:
# This function is used for the calculation of root-mean-square error (RMSE). 
#The RMSE represents the sample standard deviation of the differences between predicted values and real(observed) values. 
#RMSE is calculated for the dataset which consists of triples (row, column, value)
def calculate_rmse(P, Q, test_set):
    """Return the root-mean-square error of the predictions ``Q.dot(P)``.

    Parameters
    ----------
    P : array-like, shape (k, n_columns)
    Q : array-like, shape (n_rows, k)
    test_set : iterable of (row, column, value) triples with a length
        Held-out entries; ``value`` is the real (observed) value.

    Only the entries listed in ``test_set`` are predicted, one dot product
    each, so the full ``Q.dot(P)`` matrix is never materialized.

    Raises ZeroDivisionError if ``test_set`` is empty.
    """
    P = np.asarray(P)
    Q = np.asarray(Q)

    squared_error = 0

    for row, column, real_value in test_set:
        predicted_value = np.dot(Q[row], P[:, column])
        squared_error += (predicted_value - real_value)**2

    return math.sqrt(squared_error / len(test_set))

In [12]:
# Interactive driver: ask for parameters, preprocess the data, hold out a test
# set, run the optimization and report the RMSE. The loop re-prompts after
# invalid input or when preprocessing leaves no data, and ends after one
# successful run.
while True:
    answer = input('Do you want to use default parameters? y/n  ')

    if answer == 'y':
        print ('Default parameters: n = 300000, b = 10, k = 30, dataset_path =  train_triplets.txt')
        n = 300000
        b = 10
        k = 30
        path = "train_triplets.txt"
    elif answer == 'n':
        try:
            n = int(input('How many rows do you want? n = '))
            b = int(input('How many bins do you want to be used? b = '))
            k = int(input('What is the value of k? k = '))
        except ValueError:
            # a non-numeric answer restarts the prompts instead of crashing
            print('ERROR: Wrong input!')
            continue
        path = input('What is the path for the dataset? path = ')

    else:
        print('ERROR: Wrong input!')
        continue

    start_time = time.time()

    print('\nThe preprocessing has just started. Please be patient.')

    matrix = preprocess_data(path, n, b)

    if len(matrix) == 0:
        print('All data deleted')
        continue

    #converting dictionary to matrix format: rows are users, columns are songs,
    #and missing (user, song) pairs become 0
    df = pd.DataFrame(matrix).T.fillna(0)
    #.values replaces DataFrame.as_matrix(), which was removed in pandas 1.0
    M = df.values

    print('\nPreprocessing has been completed. Dimensions of matrix M: ',len(M),'x',len(M[0]))
    print('\nPreprocessing time:  ', round(time.time() - start_time ,3))

    #test set (the held-out entries are zeroed in M)
    (M, test_set) = create_test_set(M)
    print('\nThe test set has been created.')
    print('-------------------------------------------------------')

    #optimization
    print('\nThe optimization has just started. Please be patient.')
    start_time = time.time()
    (P, Q) = optimize(M, k)
    print('\nOptimization time:  ', round(time.time() - start_time ,3))
    print('-------------------------------------------------------')

    #RMSE calculation
    rmse = calculate_rmse(P, Q, test_set)
    print ('\nRMSE = ', round(rmse,2))

    print ('\n-------------------------------------------------------')
    print ('Thank you for using our program')
    break
        


Do you want to use default parameters? y/n  n
How many rows do you want? n = 300000
How many bins do you want to be used? b = 10
What is the value of k? k = 30
What is the path for the dataset? path = C:\\Users\\Maki\\Desktop\\MMDS\\train_triplets\\train_triplets.txt

The preprocessing has just started. Please be patient.

Preprocessing has been completed. Dimensions of matrix M:  5693 x 10966

Preprocessing time:   218.0

The test set has been created.
-------------------------------------------------------

The optimization has just started. Please be patient.

Optimization time:   135.747
-------------------------------------------------------

RMSE =  1.19

-------------------------------------------------------
Thank you for using our program
