# Collaborative filtering

In [2]:
import pandas as pd
import numpy as np

### Title column conversion to list of integers

In [3]:
import string

def stringParsing(listString):
    """Parse a textual list of integers into a Python list of ints.

    Every character that appears in ``string.punctuation`` is discarded,
    the remaining text is split on whitespace, and each token is converted
    with ``int``. An input without any digits yields an empty list.
    """
    kept_chars = [ch for ch in listString if ch not in string.punctuation]
    tokens = "".join(kept_chars).split()

    return [int(token) for token in tokens]

In [4]:
# Load the tab-separated input files from the Data/ directory.
# Interaction data (playlist_id / track_id pairs) and the target tracks:
interactions = pd.read_csv('Data/train_final.csv', sep='\t')
target_tracks = pd.read_csv('Data/target_tracks.csv', sep='\t')
# Playlist and track metadata:
playlists = pd.read_csv('Data/playlists_final.csv', sep='\t')
tracks = pd.read_csv('Data/tracks_final.csv', sep='\t')

In [5]:
# Turn the textual list columns into real lists of integers.
# NOTE: this overwrites the columns in place, so the cell cannot be run twice
# on the same frames (stringParsing expects the raw text).
tracks['tags'] = tracks['tags'].map(stringParsing)
playlists['title'] = playlists['title'].map(stringParsing)

In [6]:
# Preview the first rows of the playlists table (title column already parsed into lists).
playlists.head()

Unnamed: 0,created_at,playlist_id,title,numtracks,duration,owner
0,1216545588,644838,[12727],27,6522,41504
1,1249326867,7577564,[],9,2650,41504
2,1257766688,3120683,[183],16,3645,44542
3,1248079275,4278112,"[12389, 18698, 18925, 11695, 7117]",15,4151,44542
4,1175201268,8656823,"[12809, 2095, 13257, 12671, 20426, 14448, 18698]",84,18414,44542


In [7]:
# Preview the first rows of the tracks table (tags column already parsed into lists).
tracks.head()

Unnamed: 0,track_id,artist_id,duration,playcount,album,tags
0,2972914,144,224000,49.0,[7],"[54087, 1757, 1718, 116712, 189631]"
1,2750239,246,157000,1.0,[8],"[189631, 3424, 177424, 46208, 205245]"
2,1550729,144,217000,554.0,[9],"[54087, 109806, 46869, 183258, 54337]"
3,2169950,144,207000,200.0,[9],"[54087, 70618, 207003, 109806, 116712]"
4,1903709,144,198000,5.0,[None],"[54087, 81223, 116712, 215342, 71028]"


In [8]:
# Preview the raw playlist/track interaction pairs as loaded from train_final.csv.
interactions.head()

Unnamed: 0,playlist_id,track_id
0,3271849,2801526
1,5616275,727878
2,11267488,2805283
3,10103900,1515105
4,3836898,2945623


In [9]:
# Keep one row per (playlist, track) pair and give every remaining pair an
# implicit rating of 1.0.
# drop_duplicates returns a new frame; the result has to be bound back to
# `interactions`, otherwise the duplicates are silently kept.
# assign() is used for the new column so that it is added to a fresh copy.
interactions = (
    interactions
    .drop_duplicates(subset=['playlist_id', 'track_id'], keep='first')
    .assign(rating=1.0)
)

In [10]:
# Preview the interactions after the constant `rating` column has been added.
interactions.head()

Unnamed: 0,playlist_id,track_id,rating
0,3271849,2801526,1.0
1,5616275,727878,1.0
2,11267488,2805283,1.0
3,10103900,1515105,1.0
4,3836898,2945623,1.0


In [11]:
# Distinct playlist and track ids that occur in the interactions.
# NOTE: `playlists` and `tracks` are rebound here from the metadata frames
# loaded earlier to plain arrays of ids.
playlists = interactions['playlist_id'].unique()
tracks = interactions['track_id'].unique()

# Number of distinct ids; these become the dimensions of the rating matrices.
n_playlists = interactions['playlist_id'].nunique()
n_tracks = interactions['track_id'].nunique()

print("Num of Playlists: %d" % n_playlists)
print("Num of Tracks: %d" % n_tracks)

Num of Playlists: 45649
Num of Tracks: 99999


In [12]:
# Map every original playlist / track id to a dense index 0..N-1 so the ids
# can be used directly as matrix coordinates.
playlist_to_idx = pd.Series(data=np.arange(len(playlists)), index=playlists)
track_to_idx = pd.Series(data=np.arange(len(tracks)), index=tracks)

# Inverse lookups (dense index -> original id).
# `.values` replaces the `Series.data` attribute, which no longer exists in
# current pandas releases.
idx_to_playlist = pd.Series(data=playlist_to_idx.index, index=playlist_to_idx.values)
idx_to_track = pd.Series(data=track_to_idx.index, index=track_to_idx.values)

# Replace the original ids with their dense indices. Passing the Series to
# map() does the lookup in one vectorised step instead of one Python call
# per row.
interactions['playlist_id'] = interactions['playlist_id'].map(playlist_to_idx)
interactions['track_id'] = interactions['track_id'].map(track_to_idx)

In [13]:
# Preview the interactions after the original ids were replaced by dense indices.
interactions.head()

Unnamed: 0,playlist_id,track_id,rating
0,0,0,1.0
1,1,1,1.0
2,2,2,1.0
3,3,3,1.0
4,4,4,1.0


In [14]:
# Train/Test split
# `sklearn.cross_validation` was removed from scikit-learn; the same function
# lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
# 75% / 25% split of the interaction rows. A fixed random_state makes the
# split (and every number computed from it) reproducible across runs.
train_data, test_data = train_test_split(interactions, test_size=0.25, random_state=42)



In [15]:
from tempfile import mkdtemp
import os.path as path

# Backing files for the two memory-mapped matrices; each one is placed in
# its own freshly created temporary directory.
train_file, test_file = (
    path.join(mkdtemp(), file_name)
    for file_name in ('trainFile.dat', 'testFile.dat')
)

In [None]:
# Dense playlist x track rating matrices, memory-mapped onto the temp files
# so they do not have to fit in RAM.
matrix_shape = (n_playlists, n_tracks)

# Fill each matrix with one vectorised assignment instead of iterating over
# the rows with iterrows() and positional Series access: same cells, same
# values (the rating truncated to an integer), without a Python-level loop.
train_data_matrix = np.memmap(train_file, dtype='float32', mode='w+', shape=matrix_shape)
train_data_matrix[
    train_data['playlist_id'].values.astype(int),
    train_data['track_id'].values.astype(int),
] = train_data['rating'].values.astype(int)  # or equals just 1

test_data_matrix = np.memmap(test_file, dtype='float32', mode='w+', shape=matrix_shape)
test_data_matrix[
    test_data['playlist_id'].values.astype(int),
    test_data['track_id'].values.astype(int),
] = test_data['rating'].values.astype(int)  # or equals just 1

In [None]:
# Playlist-playlist and track-track cosine similarities, both computed from
# the training matrix (rows = playlists, columns = tracks).
from sklearn.metrics.pairwise import pairwise_distances
# pairwise_distances(metric='cosine') returns the cosine *distance*
# (1 - cosine similarity). The prediction step uses these matrices as
# similarity weights, so convert the distances back to similarities.
user_similarity = 1.0 - pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = 1.0 - pairwise_distances(train_data_matrix.T, metric='cosine')

In [None]:
def predict(ratings, similarity, type='user'):
    """Predict ratings with a similarity-weighted neighbourhood model.

    Parameters
    ----------
    ratings : 2-D array
        Rating matrix; rows are averaged as users and columns are treated
        as items.
    similarity : 2-D array
        Square similarity matrix: row-by-row for ``type='user'``,
        column-by-column for ``type='item'``.
    type : {'user', 'item'}
        Which neighbourhood model to apply.

    Returns
    -------
    2-D array with the same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'user'`` nor ``'item'``.
    """
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        # np.newaxis turns the per-row mean into a column so that it
        # broadcasts against the rows of `ratings`.
        ratings_diff = ratings - mean_user_rating[:, np.newaxis]
        # Column vector: total absolute similarity of each row.
        weight_sum = np.abs(similarity).sum(axis=1)[:, np.newaxis]
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / weight_sum
    elif type == 'item':
        # Row vector: total absolute similarity of each item.
        weight_sum = np.abs(similarity).sum(axis=1)[np.newaxis, :]
        pred = ratings.dot(similarity) / weight_sum
    else:
        # Previously an unknown value fell through to `return pred` with
        # `pred` unbound; fail with an explicit message instead.
        raise ValueError("type must be 'user' or 'item', got %r" % (type,))
    return pred

In [None]:
# Predicted playlist-track scores from the two neighbourhood models, both
# computed on the training matrix.
user_prediction = predict(ratings=train_data_matrix, similarity=user_similarity, type='user')
item_prediction = predict(ratings=train_data_matrix, similarity=item_similarity, type='item')

# Evaluation

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    """Root-mean-squared error over the non-zero entries of ``ground_truth``.

    Only positions where ``ground_truth`` is non-zero are compared; the
    matching entries of ``prediction`` are taken at the same positions.
    """
    observed = ground_truth.nonzero()
    predicted_values = prediction[observed].flatten()
    true_values = ground_truth[observed].flatten()
    return sqrt(mean_squared_error(predicted_values, true_values))

In [None]:
# Report the RMSE of each model against the held-out test matrix.
user_rmse = rmse(user_prediction, test_data_matrix)
print('User-based CF RMSE: ' + str(user_rmse))
item_rmse = rmse(item_prediction, test_data_matrix)
print('Item-based CF RMSE: ' + str(item_rmse))

In [1]:
# -*- coding: utf-8 -*-

import pandas as pd 
import numpy as np
import scipy.sparse as sps
from sklearn.metrics.pairwise import cosine_similarity

from similarity import Cosine
from scipy.io import mmwrite, mmread

import time

import proba as prb
#read interactions
int_data = pd.read_csv('./Data/train_final.csv', sep='\t', header=0)
# The interaction file only provides the playlist_id and track_id columns
# (those are the columns the rest of this script reads), so sorting on
# 'interaction_type' and de-duplicating on 'user_id'/'item_id' raised a
# KeyError. Sort and de-duplicate on the columns that exist.
int_data = int_data.sort_values(['playlist_id', 'track_id'], ascending=False)
int_data = int_data.drop_duplicates(subset=['playlist_id', 'track_id'], keep='first')


# Distinct tracks ("items") and playlists ("users") seen in the interactions.
items = int_data['track_id'].unique()
users = int_data['playlist_id'].unique()

# Original id -> dense index 0..N-1.
item_to_idx = pd.Series(data=np.arange(len(items)), index=items)
user_to_idx = pd.Series(data=np.arange(len(users)), index=users)

# Dense index -> original id. `.values` replaces the `Series.data`
# attribute, which no longer exists in current pandas releases.
idx_to_item = pd.Series(index=item_to_idx.values, data=item_to_idx.index)
idx_to_user = pd.Series(index=user_to_idx.values, data=user_to_idx.index)

# Target playlists, restricted to those that occur in the interaction data.
tusers = pd.read_csv('./Data/target_playlists.csv', header=0)
has_interactions = tusers['playlist_id'].isin(users)
tusers_that_rated = tusers[has_interactions].values.ravel()

# Tracks contained in the target playlists; similarity is computed only
# for these items.
tusers_data = int_data[int_data['playlist_id'].isin(tusers_that_rated)]
items_to_compute = tusers_data['track_id'].unique()

# Item profiles: only the track id and its artist id are loaded; missing
# values are replaced by 0.
# Columns listed but not loaded: 'discipline_id', 'industry_id', 'country', 'region', 'latitude', 'longitude', 'employment', 'active_during_test', 'tags', 'title'
data = pd.read_csv(
    './Data/tracks_final.csv',
    sep='\t',
    header=0,
    usecols=['track_id', 'artist_id'],
).fillna(0)

# #not activeitems
# not_active_items = data.iloc[:][data['active_during_test'] == 0]
# not_active_items = not_active_items['id'].values



# Distinct artist ids; each one becomes a feature column of the item-content matrix.
c = data['artist_id'].unique()
# d = data['discipline_id'].unique()
# i = data['industry_id'].unique()
# cn = data['country'].unique()
# r = data['region'].unique()
# la = data['latitude'].unique()
# lg = data['longitude'].unique()
# e = data['employment'].unique()
# tags = prb.get_disj_tags()
# titles = prb.get_disj_titles()

# Track id -> row position in the item-profile table, and the inverse lookup.
pitem_to_idx = pd.Series(data=np.arange(data.shape[0]), index=data['track_id'])
# `.values` replaces the `Series.data` attribute, which no longer exists in
# current pandas releases.
idx_to_pitem = pd.Series(index=pitem_to_idx.values, data=pitem_to_idx.index)



# Artist id -> feature column index (0 .. number of distinct artists - 1).
c_to_idx = pd.Series(np.arange(len(c)), index=c)
# d_to_idx = pd.Series(index=d, data=np.arange(c.shape[0], c.shape[0] + d.shape[0]))
# i_to_idx = pd.Series(index=i, data=np.arange(c.shape[0] + d.shape[0], c.shape[0] + d.shape[0] + i.shape[0]))
# cn_to_idx = pd.Series(index=cn, data=np.arange(c.shape[0] + d.shape[0] + i.shape[0], c.shape[0] + d.shape[0] + i.shape[0] + cn.shape[0]))
# r_to_idx = pd.Series(index=r, data=np.arange(c.shape[0] + d.shape[0] + i.shape[0] +cn.shape[0], c.shape[0] + d.shape[0] + i.shape[0] + cn.shape[0] + r.shape[0]))
# la_to_idx = pd.Series(index=la, data=np.arange(c.shape[0] + d.shape[0] + i.shape[0] +cn.shape[0] + r.shape[0], c.shape[0] + d.shape[0] + i.shape[0] + cn.shape[0] + r.shape[0] + la.shape[0]))
# lg_to_idx = pd.Series(index=lg, data=np.arange(c.shape[0] + d.shape[0] + i.shape[0] +cn.shape[0] + r.shape[0] + la.shape[0], c.shape[0] + d.shape[0] + i.shape[0] + cn.shape[0] + r.shape[0] + la.shape[0] + lg.shape[0]))
# e_to_idx = pd.Series(index=e, data=np.arange(c.shape[0] + d.shape[0] + i.shape[0] +cn.shape[0] + r.shape[0] + la.shape[0] + lg.shape[0], c.shape[0] + d.shape[0] + i.shape[0] + cn.shape[0] + r.shape[0] + la.shape[0] + lg.shape[0] + e.shape[0]))
# tags_to_idx = pd.Series(index=tags, data=np.arange(c.shape[0] + d.shape[0] + i.shape[0] +cn.shape[0] + r.shape[0] + la.shape[0] + lg.shape[0], c.shape[0] + d.shape[0] + i.shape[0] + cn.shape[0] + r.shape[0] + la.shape[0] + lg.shape[0] + e.shape[0] + len(tags)))
# titles_to_idx = pd.Series(index=titles, data=np.arange(c.shape[0] + d.shape[0] + i.shape[0] +cn.shape[0] + r.shape[0] + la.shape[0] + lg.shape[0], c.shape[0] + d.shape[0] + i.shape[0] + cn.shape[0] + r.shape[0] + la.shape[0] + lg.shape[0] + e.shape[0] + len(tags) +len(tags)))


# Item-content matrix: one row per track in `data`, one column per distinct
# artist, with a single 1 marking the artist of each track.
# The matrix is built directly from its (row, column) coordinates.
# Assigning into an empty CSC matrix with fancy indexing changes the
# sparsity structure entry by entry, which is slow. The artist column is
# also selected by name rather than by position.
icm_rows = np.arange(data.shape[0])
icm_cols = data['artist_id'].map(c_to_idx).values
icm = sps.csc_matrix(
    (np.ones(data.shape[0]), (icm_rows, icm_cols)),
    shape=(data.shape[0], c.shape[0]),
)
# icm[np.arange(0,data.shape[0]), d_to_idx[data.iloc[:,2].values]] = 1
# icm[np.arange(0,data.shape[0]), i_to_idx[data.iloc[:,3].values]] = 1
# icm[np.arange(0,data.shape[0]), cn_to_idx[data.iloc[:,4].values]] = 1
# icm[np.arange(0,data.shape[0]), r_to_idx[data.iloc[:,5].values]] = 1
# icm[np.arange(0,data.shape[0]), la_to_idx[data.iloc[:,6].values]] = 1
# icm[np.arange(0,data.shape[0]), lg_to_idx[data.iloc[:,7].values]] = 1
# icm[np.arange(0,data.shape[0]), e_to_idx[data.iloc[:,8].values]] = 1    
# icm[np.arange(0,data.shape[0]), tags_to_idx[data.iloc[:,9].values]] = 1
# icm[np.arange(0,data.shape[0]), titles_to_idx[data.iloc[:,10].values]] = 1    

#icm for rated items by target users
tdata = data[data['track_id'].isin(items_to_compute)]

titems = tdata['track_id'].unique()

# Track id -> row index in the target item-content matrix, and the inverse.
# `.values` replaces the `Series.data` attribute, which no longer exists in
# current pandas releases.
titem_to_idx = pd.Series(data=np.arange(len(titems)), index=titems)
idx_to_titem = pd.Series(index=titem_to_idx.values, data=titem_to_idx.index)

# Same layout as the full item-content matrix (one column per artist),
# restricted to the rows of `tdata`. Built from coordinates instead of
# assigning into an empty CSC matrix, with the artist column selected by name.
ticm_rows = np.arange(tdata.shape[0])
ticm_cols = tdata['artist_id'].map(c_to_idx).values
ticm = sps.csc_matrix(
    (np.ones(tdata.shape[0]), (ticm_rows, ticm_cols)),
    shape=(tdata.shape[0], c.shape[0]),
)
# ticm[np.arange(0,tdata.shape[0]), d_to_idx[tdata.iloc[:,2].values]] = 1
# ticm[np.arange(0,tdata.shape[0]), i_to_idx[tdata.iloc[:,3].values]] = 1
# ticm[np.arange(0,tdata.shape[0]), cn_to_idx[tdata.iloc[:,4].values]] = 1
# ticm[np.arange(0,tdata.shape[0]), r_to_idx[tdata.iloc[:,5].values]] = 1
# ticm[np.arange(0,tdata.shape[0]), la_to_idx[tdata.iloc[:,6].values]] = 1
# ticm[np.arange(0,tdata.shape[0]), lg_to_idx[tdata.iloc[:,7].values]] = 1
# ticm[np.arange(0,tdata.shape[0]), e_to_idx[tdata.iloc[:,8].values]] = 1
# ticm[np.arange(0,tdata.shape[0]), tags_to_idx[tdata.iloc[:,9].values]] = 1
# ticm[np.arange(0,tdata.shape[0]), titles_to_idx[tdata.iloc[:,10].values]] = 1

def compute_sim():
    """Compute the item similarity, remove self-similarity, save and return it.

    The similarity is produced by ``Cosine().compute(icm, ticm)``; the result
    is indexed below as (target-item index, profile-item index).
    The matrix is written to ``./data/item_similarity.mtx`` with ``mmwrite``.

    Returns
    -------
    The similarity matrix. Earlier versions returned nothing, so the matrix
    could only be obtained by re-reading the file; callers that ignore the
    return value are unaffected.
    """
    # Local name chosen so that it does not shadow the module-level `c`
    # (the array of artist ids).
    cosine = Cosine()

    sim = cosine.compute(icm, ticm)

    # Zero the similarity of every target item with itself so an item is
    # never recommended because of its own profile.
    for count, item in enumerate(titems, start=1):
        sim[titem_to_idx[item], pitem_to_idx[item]] = 0.0
        print("Finished for: ", count)

    # NOTE(review): inputs are read from './Data/' while this writes to
    # './data/' — confirm the directory name on case-sensitive file systems.
    mmwrite("./data/item_similarity.mtx", sim)

    return sim

def filter_seen(user_id, ranking, rated_items):
    """Return ``ranking`` without the entries that belong to ``rated_items``.

    ``rated_items`` holds track ids; they are translated to profile-item
    indices through ``pitem_to_idx`` and every element of ``ranking`` equal
    to one of those indices is removed. ``user_id`` is accepted but not used.
    Both inputs are treated as free of duplicates (``assume_unique=True``).
    """
    seen_idx = pitem_to_idx[rated_items].values
    is_seen = np.in1d(ranking, seen_idx, assume_unique=True)
    return ranking[~is_seen]

def filter_active(ranking):
    """Return ``ranking`` without the entries listed in ``not_active_items``.

    NOTE(review): ``not_active_items`` is read from module scope, but its
    only visible definition is commented out, so calling this function as
    the file stands would raise NameError — confirm before enabling it.
    """
    inactive_idx = pitem_to_idx[not_active_items]
    is_inactive = np.in1d(ranking, inactive_idx, assume_unique=True)
    return ranking[~is_inactive]

#estimate rating of all items to user
def recommend(user_id, n=None, exclude_seen=True):
    """Rank all profile items for one playlist and return the top ``n``.

    Parameters
    ----------
    user_id :
        Playlist id whose interactions are looked up in ``int_data``.
    n : int or None
        Number of item indices to return; ``None`` returns the full ranking.
    exclude_seen : bool
        When true, items the playlist already contains are removed from the
        ranking.

    Returns
    -------
    1-D array of column indices of ``sim``, best first.
    (presumably profile-item indices, matching ``pitem_to_idx`` — confirm
    against the similarity implementation).
    """
    rated = int_data[int_data['playlist_id'] == user_id]

    rated_items = rated['track_id'].values
    # Implicit feedback: every interaction counts as a rating of 1.
    ratings = np.ones(rated.shape[0])#rated['interaction_type'].values

    # Similarity rows of the rated items (one row per rated item).
    s = sim[titem_to_idx[rated_items].values, :].toarray()
    suma = s.sum(axis=0)

    # Rating-weighted similarity sum per candidate item.
    weighted = (s * ratings[:, np.newaxis]).sum(axis=0)

    # Normalise by the similarity mass. Candidates with zero mass keep a
    # score of 0 instead of becoming NaN (0/0), which previously sorted to
    # the *top* of the reversed ranking.
    # NOTE(review): with all ratings equal to 1 every non-zero score
    # normalises to the same value — confirm the normalisation is intended.
    scores = np.zeros(weighted.shape, dtype=float)
    has_mass = suma != 0
    scores[has_mass] = weighted[has_mass] / suma[has_mass]

    # Rank first, then filter: filter_seen compares item *indices*, so it
    # must receive the ranking rather than the raw score vector.
    ranking = np.argsort(scores)[::-1]

    if exclude_seen:
        ranking = filter_seen(user_id, ranking, rated_items)
        # ranking = filter_active(ranking)

    return ranking[:n]


# Main
# Computes the item similarity and writes it to ./data/item_similarity.mtx;
# the commented-out lines below reload that file with mmread before
# producing recommendations.
compute_sim()
# sim = mmread("./data/item_similarity.mtx")
# sim = sim.tocsc()
# #print(sim)


# result = np.zeros((tusers_that_rated.shape[0], 6))

# r = 0

# for u in tusers_that_rated:

#     result[r,0] = u
#     result_idx = recommend(u, 5, False)
#     result[r, 1:] = idx_to_pitem[result_idx].values
#     r+=1
#     print("Finished for: ", r)

# np.savetxt('./data/result_content1.csv', result, fmt='%d, %d %d %d %d %d')

FileNotFoundError: File b'./Data/item_profile.csv' does not exist