In [4]:
import numpy as np
import pandas as pd
import scipy
from scipy.sparse import csr_matrix
import time
import csv
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from pyfm import pylibfm

In [5]:
d_train = pd.read_csv('./data/train.csv')
d_test = pd.read_csv('./data/test.csv')
d_prof = pd.read_csv('./data/profiles.csv')
d_artists = pd.read_csv('./data/artists.csv')

In [31]:
# Read in data
def loadTrainData(filename,path="./data/"):
    data = []
    y = []
    users=set()
    items=set()
    with open(path+filename) as f:
        i = 0
        for line in f:
            if i == 0:
                i += 1
                continue
            (user,movieid,rating)=line.strip().split(',')
            data.append({ "user_id": str(user), "movie_id": str(movieid)})
            y.append(float(rating))
            users.add(user)
            items.add(movieid)

    return (data, np.array(y), users, items)

def loadTestData(filename,path="./data/"):
    data = []
    users=set()
    items=set()
    with open(path+filename) as f:
        i = 0
        for line in f:
            if i == 0:
                i += 1
                continue
            (Id,user,movieid)=line.split(',')
            data.append({ "user_id": str(user), "movie_id": str(movieid)})
            users.add(user)
            items.add(movieid)

    return (data, users, items)

In [24]:
users_profiles=set(d_prof['user'].values)
artists_artists=set(d_artists['artist'].values)

In [18]:
#dictionary for users id
dict_users=dict(zip(users_profiles, range(len(users_profiles))))
#dictionary for artists id
dict_artists=dict(zip(artists_artists, range(len(artists_artists))))

In [None]:
(train_data, y_train, train_users, train_items) = loadTrainData('train.csv')
(test_data, test_users, test_items) = loadTestData("test.csv")
v = DictVectorizer()
X_train = v.fit_transform(train_data)
X_test = v.transform(test_data)

# Build and train a Factorization Machine
fm = pylibfm.FM(num_factors=100, num_iter=500, verbose=True, task="regression", initial_learning_rate=1e-6, learning_rate_schedule="optimal")

fm.fit(X_train,y_train)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 260084.50039
-- Epoch 2
Training MSE: 258072.29178
-- Epoch 3
Training MSE: 257997.58225
-- Epoch 4
Training MSE: 257923.80142
-- Epoch 5
Training MSE: 257847.67711
-- Epoch 6
Training MSE: 257757.28072
-- Epoch 7
Training MSE: 257616.81034
-- Epoch 8
Training MSE: 257309.57222
-- Epoch 9
Training MSE: 256463.07133
-- Epoch 10
Training MSE: 253927.87874
-- Epoch 11
Training MSE: 247057.31508
-- Epoch 12
Training MSE: 234878.74455
-- Epoch 13
Training MSE: 227538.62074
-- Epoch 14
Training MSE: 223219.90873
-- Epoch 15
Training MSE: 220303.45278
-- Epoch 16
Training MSE: 216967.67312
-- Epoch 17
Training MSE: 211644.31472
-- Epoch 18
Training MSE: 204328.29947
-- Epoch 19
Training MSE: 195952.84688
-- Epoch 20
Training MSE: 187209.57576
-- Epoch 21
Training MSE: 178295.56496
-- Epoch 22
Training MSE: 169393.02050
-- Epoch 23
Training MSE: 160693.41298
-- Epoch 24
Training MSE: 152486.978

In [5]:
num_users = len(d_prof.user)
num_artists = len(d_artists)
Ndim = 10
lbd = 0.1

In [6]:
data=d_train['plays'].values
data_norm = np.double(data) / max(data)

In [7]:
# Rating matrix
row=[dict_users[i] for i in d_train['user'].values]
col=[dict_artists[i] for i in d_train['artist'].values]
RMat=csr_matrix((data_norm, (row, col)), shape=(num_users, num_artists))

In [19]:
# matrix factorization
#P_user = np.ones([num_users, Ndim])
#Q_art = np.ones([num_artists, Ndim])
P_user = np.random.rand(num_users, Ndim)
Q_art = np.random.rand(num_artists, Ndim)

In [24]:
numIter = 0
maxIter = 100
yita = 0.001
ErrVec = np.zeros(numIter)
t0 = time.clock()
while numIter < 100:
    # loop over training set
    for tr in d_train.values:
        u = dict_users[tr[0]]
        a = dict_artists[tr[1]]
        rate = tr[2]
        p_u = P_user[u, :]
        q_a = Q_art[a, :]
        Err = RMat[u, a] - np.dot(p_u, q_a)
        p_u = p_u + yita*(Err*q_a-lbd*p_u)
        q_a = q_a + yita*(Err*p_u-lbd*q_a)
        #print p_new
        #rint q_new
        P_user[u, :] = p_u
        Q_art[a, :] = q_a
#         if numIter > 10000:
#             break
    # calculate error
#     for tr in d_train.values:
#         u = dict_users[tr[0]]
#         a = dict_artists[tr[1]]
#         rate = tr[2]
#         p_u = P_user[u, :]
#         q_a = Q_art[a, :]
#         ErrVec[numIter] += RMat[u, a] - np.dot(p_u, q_a)
#     if ErrVec[numIter] < 0.1:
#         print 'err smaller than 0.001, numLoop:', numIter
#         break
    numIter += 1
print time.clock() - t0, "seconds process time"

14510.423634 seconds process time


In [33]:
Q_art

array([[  9.87816642e-05,   1.20004923e-04,   1.22392421e-04, ...,
          1.56154891e-04,   1.15391705e-04,   7.67940657e-05],
       [  3.38697341e-04,   1.23089923e-04,   2.11607648e-04, ...,
          1.39878924e-04,   1.19596138e-04,   4.36085306e-05],
       [  5.90304217e-05,   4.19185033e-05,   1.50955826e-04, ...,
          8.62375350e-05,   9.87862306e-05,   1.25716585e-04],
       ..., 
       [  2.06705238e-04,   1.27445877e-04,   1.83317150e-04, ...,
          1.74474019e-04,   9.99966211e-05,   1.03595759e-04],
       [  1.57581672e-04,   1.57165700e-04,   1.53954898e-04, ...,
          8.31431302e-05,   1.48812804e-04,  -1.22429660e-05],
       [  7.68036655e-05,   1.11633773e-04,   7.25608346e-05, ...,
          4.24426778e-05,   1.02153193e-04,   1.13658411e-04]])

In [13]:
P_user

array([[ 0.86252341,  0.54441578,  0.37208828, ...,  0.67664195,
         0.18824546,  0.64020052],
       [ 0.93991752,  0.64114049,  0.98100957, ...,  0.49941717,
         0.85338248,  0.85966756],
       [ 0.17271683,  0.98830942,  0.56421805, ...,  0.03465758,
         0.83038321,  0.05105259],
       ..., 
       [ 0.91093163,  0.33463205,  0.16468657, ...,  0.70577038,
         0.7750251 ,  0.63143624],
       [ 0.93545673,  0.66579405,  0.09680115, ...,  0.39475343,
         0.31151672,  0.08343328],
       [ 0.79499278,  0.97529649,  0.06261903, ...,  0.20586603,
         0.55278331,  0.46895966]])

In [35]:
np.save('userMat', P_user)
np.save('artMat', Q_art)

In [122]:
scipy.sparse.find(RMat)

(array([     0,      0,      0, ..., 233285, 233285, 233285], dtype=int32),
 array([ 147,  422,  524, ..., 1516, 1670, 1739], dtype=int32),
 array([  2.00402236e-04,   1.19287045e-04,   1.43144454e-04, ...,
          9.54296362e-05,   5.01005590e-05,   3.74561322e-04]))

In [16]:
mat = pd.read_csv('mat_factorization.csv')
mat.shape

(4154804, 2)

In [8]:
train_file = './data/train.csv'
test_file  = './data/test.csv'
soln_file  = './data/user_median.csv'

# Load the training data.
train_data = {}
with open(train_file, 'r') as train_fh:
    train_csv = csv.reader(train_fh, delimiter=',', quotechar='"')
    next(train_csv, None)
    for row in train_csv:
        user   = row[0]
        artist = row[1]
        plays  = row[2]
    
        if not user in train_data:
            train_data[user] = {}
        
        train_data[user][artist] = int(plays)

# Compute the global median and per-user median.
plays_array  = []
user_medians = {}
for user, user_data in train_data.iteritems():
    user_plays = []
    for artist, plays in user_data.iteritems():
        plays_array.append(plays)
        user_plays.append(plays)

    user_medians[user] = np.median(np.array(user_plays))
global_median = np.median(np.array(plays_array))

In [9]:
maxValue = max(data)
maxValue

419157

In [10]:
# read mat
P_user = np.load('userMat.npy')
Q_art = np.load('artMat.npy')

In [21]:
soln_file  = 'mat_factorization.csv'

# write solution
with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)

    with open(soln_file, 'w') as soln_fh:
        soln_csv = csv.writer(soln_fh,
                              delimiter=',',
                              quotechar='"',
                              quoting=csv.QUOTE_MINIMAL)
        soln_csv.writerow(['Id', 'plays'])
        Iter = 0
        for row in test_csv:
            id     = row[0]
            user   = row[1]
            artist = row[2]
            u = dict_users[user]
            a = dict_artists[artist]
            p_u = P_user[u, :]
            q_a = Q_art[a, :]
            res = np.round(np.dot(p_u, q_a)*maxValue)
            if res < 0:
                res = np.round(user_medians[user])
            soln_csv.writerow([id, res])