In [1]:
import numpy as np
import math
import argparse
from scipy.sparse import rand as sprand
from scipy.sparse import lil_matrix
import torch
from torch.autograd import Variable
import pandas as pd

In [2]:
user_friends = pd.read_csv('hetrec2011-lastfm-2k/user_friends.dat',sep='\t',engine='python')
user_artists = pd.read_csv('hetrec2011-lastfm-2k/user_artists.dat',sep='\t',engine='python')

In [3]:
tag = pd.read_csv('hetrec2011-lastfm-2k/user_taggedartists.dat',sep='\t',engine='python')
tag.head()

Unnamed: 0,userID,artistID,tagID,day,month,year
0,2,52,13,1,4,2009
1,2,52,15,1,4,2009
2,2,52,18,1,4,2009
3,2,52,21,1,4,2009
4,2,52,41,1,4,2009


In [197]:
timetag = pd.read_csv('hetrec2011-lastfm-2k/user_taggedartists-timestamps.dat',sep='\t',engine='python')
timetag.head()

Unnamed: 0,userID,artistID,tagID,timestamp
0,2,52,13,1238536800000
1,2,52,15,1238536800000
2,2,52,18,1238536800000
3,2,52,21,1238536800000
4,2,52,41,1238536800000


In [198]:
user_artists.head()

Unnamed: 0,userID,artistID,weight
0,2,51,13883
1,2,52,11690
2,2,53,11351
3,2,54,10300
4,2,55,8983


In [199]:
# create a table with total number of plays
user_plays = (user_artists.
                groupby(by = ['userID'])['weight'].
                sum().
                reset_index().
                rename(columns = {'weight': 'total_user_plays'})
                [['userID', 'total_user_plays']])
user_plays.head()

Unnamed: 0,userID,total_user_plays
0,2,168737
1,3,20501
2,4,26930
3,5,13159
4,6,1011


In [200]:
user_artists_with_total_plays = user_artists.merge(user_plays, left_on ='userID', right_on = 'userID', how = 'left')
user_artists_with_total_plays

Unnamed: 0,userID,artistID,weight,total_user_plays
0,2,51,13883,168737
1,2,52,11690,168737
2,2,53,11351,168737
3,2,54,10300,168737
4,2,55,8983,168737
5,2,56,6152,168737
6,2,57,5955,168737
7,2,58,4616,168737
8,2,59,4337,168737
9,2,60,4147,168737


In [201]:
user_artists_with_total_plays['prerank'] = user_artists_with_total_plays['weight']/user_artists_with_total_plays['total_user_plays']

In [202]:
user_artists_with_total_plays['rank'] = user_artists_with_total_plays.groupby('userID')['prerank'].rank(ascending=True)

In [203]:
user_artists_with_total_plays.head(10)

Unnamed: 0,userID,artistID,weight,total_user_plays,prerank,rank
0,2,51,13883,168737,0.082276,50.0
1,2,52,11690,168737,0.069279,49.0
2,2,53,11351,168737,0.06727,48.0
3,2,54,10300,168737,0.061042,47.0
4,2,55,8983,168737,0.053237,46.0
5,2,56,6152,168737,0.036459,45.0
6,2,57,5955,168737,0.035292,44.0
7,2,58,4616,168737,0.027356,43.0
8,2,59,4337,168737,0.025703,42.0
9,2,60,4147,168737,0.024577,41.0


In [204]:
df = user_artists_with_total_plays

In [207]:
rating=[]
for i in range(user_artists_with_total_plays.shape[0]):
    if i==0:
        rating.append(5)
        temp=df.iloc[i][4] 
    else:
        if df.iloc[i][0]==df.iloc[i-1][0]:
            rating.append(5*(1-temp))
            temp+=df.iloc[i][4]
        else:
            rating.append(5)
            temp=df.iloc[i][4]


df['rating']=rating

In [208]:
print(rating[80000])

0.02590673575129432


In [209]:
from sklearn.model_selection import train_test_split
rawtrain, rawtest = train_test_split(df, test_size=0.3)

In [210]:
def get_artist_ratings(df):
#     n_users = max(df.userID.unique())
#     n_items = max(df.artistID.unique())
    n_users = 2100
    n_items = 18745
    
    interactions = lil_matrix( (n_users,n_items), dtype=float) #np.zeros((n_users, n_items))
    for row in df.itertuples():
        interactions[row[1] - 1, row[2] - 1] = row[7]
    return interactions

In [211]:
train = get_artist_ratings(rawtrain)
test = get_artist_ratings(rawtest)

In [222]:
train

<2100x18745 sparse matrix of type '<class 'numpy.float64'>'
	with 64983 stored elements in LInked List format>

In [213]:
test

<2100x18745 sparse matrix of type '<class 'numpy.float64'>'
	with 27851 stored elements in LInked List format>

In [223]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train, metric='cosine')
item_similarity = pairwise_distances(train.T, metric='cosine')


In [224]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating)
        pred = mean_user_rating + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [239]:
user_prediction = predict(train, user_similarity, type='user')
# item_prediction = predict(train, item_similarity, type='item')

In [226]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].toarray()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [227]:
print ('User-based CF RMSE: ' + str(rmse(user_prediction, test)))

User-based CF RMSE: 2.559780359259295


In [228]:
class MatrixFactorization(torch.nn.Module):
    
    def __init__(self, n_users, n_items, n_factors=5):
        super().__init__()
        self.user_factors = torch.nn.Embedding(n_users,
                                               n_factors,
                                               sparse=False)
        self.item_factors = torch.nn.Embedding(n_items,
                                               n_factors,
                                               sparse=False)

    # For convenience when we want to predict a sinble user-item pair.
    def predict(self, users, items):
        pred = torch.mm(users,self.item_factors(items))
        pred = torch.mm(pred,torch.transpose(self.item_factors(items),0,1))
        return pred
    
    # Much more efficient batch operator. This should be used for training purposes
    def forward(self, users, items):
        # Need to fit bias factors
        return torch.mm(self.user_factors(users),torch.transpose(self.item_factors(items),0,1))

In [229]:
def get_batch(batch_size,ratings):
    # Sort our data and scramble it
    rows, cols = ratings.shape
    p = np.random.permutation(rows)
    
    # create batches
    sindex = 0
    eindex = batch_size
    while eindex < rows:
        batch = p[sindex:eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
    
    if eindex >= rows:
        batch = range(sindex,rows)
        yield batch

In [230]:
def test_error(model,test,BATCH_SIZE):
    loss_func = torch.nn.MSELoss()
    square_deviation = 0
    msei=0
    for i,batch in enumerate(get_batch(BATCH_SIZE, test)):
        # Turn data into variables
        interactions = Variable(torch.FloatTensor(test[batch, :].toarray()))
        rows = Variable(torch.LongTensor(batch))
        cols = Variable(torch.LongTensor(np.arange(test.shape[1])))
        
        # Predict and calculate loss
        predictions = model.predict(interactions, cols)
        loss = loss_func(predictions, interactions)
        
        # plus the square deviation
        square_deviation += loss*rows.shape[0]*cols.shape[0]
        msei += rows.shape[0]*cols.shape[0]
    rmse = torch.sqrt(square_deviation/msei)
    print("Test RMSE loss", rmse)
    return rmse

In [231]:
def plainvanilla(train, test, EPOCH = 100, BATCH_SIZE = 1000, LR = 0.1,l2_penalty=0.01,latent_factor=3):
    model = MatrixFactorization(train.shape[0], train.shape[1], n_factors=latent_factor)
    loss_func = torch.nn.MSELoss()
    reg_loss_func = torch.optim.SGD(model.parameters(), lr=LR, weight_decay = l2_penalty)
    for i in range(EPOCH):
        print("Epoch:", i)
        square_deviation = 0
        msei=0
        for j,batch in enumerate(get_batch(BATCH_SIZE, train)):
            # Set gradients to zero
            reg_loss_func.zero_grad()
            
            # Turn data into variables
            
            interactions = Variable(torch.FloatTensor(train[batch, :].toarray()))
            rows = Variable(torch.LongTensor(batch))
            cols = Variable(torch.LongTensor(np.arange(train.shape[1])))
            
            # Predict and calculate loss
            predictions = model(rows, cols)
            loss = loss_func(predictions, interactions)
            
            # Backpropagate
            loss.backward()
            
            # Update the parameters
            reg_loss_func.step()
            
            # plus the square deviation
            if i==EPOCH-1:
                square_deviation += loss*rows.shape[0]*cols.shape[0]
                msei += rows.shape[0]*cols.shape[0]
        print(loss)
    
    test_rmse = test_error(model,test,BATCH_SIZE)
    return model,test_rmse

In [241]:
model,rmse = plainvanilla(train,test, EPOCH = 80, BATCH_SIZE = 100, LR = 0.1, l2_penalty=0.01, latent_factor=5)

Epoch: 0
Variable containing:
 5.1323
[torch.FloatTensor of size 1]

Epoch: 1
Variable containing:
 4.6830
[torch.FloatTensor of size 1]

Epoch: 2
Variable containing:
 4.2746
[torch.FloatTensor of size 1]

Epoch: 3
Variable containing:
 3.9028
[torch.FloatTensor of size 1]

Epoch: 4
Variable containing:
 3.5646
[torch.FloatTensor of size 1]

Epoch: 5
Variable containing:
 3.2562
[torch.FloatTensor of size 1]

Epoch: 6
Variable containing:
 2.9752
[torch.FloatTensor of size 1]

Epoch: 7
Variable containing:
 2.7195
[torch.FloatTensor of size 1]

Epoch: 8
Variable containing:
 2.4869
[torch.FloatTensor of size 1]

Epoch: 9
Variable containing:
 2.2749
[torch.FloatTensor of size 1]

Epoch: 10
Variable containing:
 2.0812
[torch.FloatTensor of size 1]

Epoch: 11
Variable containing:
 1.9046
[torch.FloatTensor of size 1]

Epoch: 12
Variable containing:
 1.7435
[torch.FloatTensor of size 1]

Epoch: 13
Variable containing:
 1.5962
[torch.FloatTensor of size 1]

Epoch: 14
Variable containing: