# Recommender Systems

In [1]:
import pandas as pd 
import numpy as np
import scipy.sparse as sps
from sklearn.metrics.pairwise import cosine_similarity

from similarity import Cosine
from scipy.io import mmwrite, mmread

import time

In [2]:
int_data=pd.read_csv('./Data/train_final.csv', sep='\t', header=0)

In [3]:
# Order the interactions by playlist and track id (both descending), then keep
# a single row for every (playlist, track) pair.
int_data = (
    int_data
    .sort_values(['playlist_id', 'track_id'], ascending=False)
    .drop_duplicates(subset=['playlist_id', 'track_id'], keep='first')
)

In [4]:
tracks = int_data.track_id.unique() # items
playlists = int_data.playlist_id.unique() # users

In [5]:
# Assign a dense, 0-based position to every distinct track id and playlist id.
track_to_idx = pd.Series(np.arange(tracks.shape[0]), index=tracks)
playlist_to_idx = pd.Series(np.arange(playlists.shape[0]), index=playlists)

In [6]:
# Inverse lookups: dense position -> original track / playlist id.
# `.values` is used instead of `Series.data`: that attribute was deprecated and
# later removed from pandas, where accessing it raises AttributeError.
idx_to_track = pd.Series(data=track_to_idx.index, index=track_to_idx.values)
idx_to_playlist = pd.Series(data=playlist_to_idx.index, index=playlist_to_idx.values)

In [7]:
target_playlists = pd.read_csv('./Data/target_playlists.csv', header=0)

In [8]:
# Flat array of the target playlist ids that also occur in the interaction data.
created_playlists = (
    target_playlists
    .loc[target_playlists['playlist_id'].isin(playlists)]
    .values
    .ravel()
)

In [9]:
print(created_playlists.shape, target_playlists.shape, playlists.shape)

(10000,) (10000, 1) (45649,)


In [10]:
# Interactions that belong to the target playlists, and the distinct tracks
# appearing in them.
target_playlist_data = int_data.loc[int_data['playlist_id'].isin(created_playlists)]
tracks_to_compute = target_playlist_data.track_id.unique()

In [11]:
# Read the item (track) profiles, keeping only the columns listed in `usecols`.
data=pd.read_csv('./Data/tracks_final.csv', sep='\t', header=0, usecols=['track_id', 'artist_id','playcount', 'album', 'tags'])
# NOTE(review): this replaces missing values in *every* column with the integer 0,
# including `album` and `tags`, which are parsed as strings further down —
# confirm that a bare 0 is the intended placeholder there.
data = data.fillna(0)

In [12]:
data.head()

Unnamed: 0,track_id,artist_id,playcount,album,tags
0,2972914,144,49.0,[7],"[54087, 1757, 1718, 116712, 189631]"
1,2750239,246,1.0,[8],"[189631, 3424, 177424, 46208, 205245]"
2,1550729,144,554.0,[9],"[54087, 109806, 46869, 183258, 54337]"
3,2169950,144,200.0,[9],"[54087, 70618, 207003, 109806, 116712]"
4,1903709,144,5.0,[None],"[54087, 81223, 116712, 215342, 71028]"


In [13]:
# data.playcount = data.playcount.fillna(0)

### Parsing albums

In [14]:
def parse_albums(album):
    """Convert a raw album cell into an integer album id.

    Parameters
    ----------
    album : str or scalar
        Normally a bracketed string such as ``'[7]'``, ``'[None]'`` or ``'[]'``.
        Non-string values are accepted too, so that the ``0`` placeholder left
        by ``fillna(0)`` and already-parsed integer ids (when the cell is
        re-run) do not crash.

    Returns
    -------
    int
        The album id, or 0 when no album is specified.

    Raises
    ------
    ValueError
        If a string cell holds something other than a single integer.
    """
    if not isinstance(album, str):
        # Already numeric (or a missing-value marker): keep integers as they
        # are and map anything that cannot be an id (None, NaN) to "no album".
        try:
            return int(album)
        except (TypeError, ValueError):
            return 0

    inner = album.strip().strip('[]').strip()
    if inner in ('', 'None'):
        return 0  # no album specified
    return int(inner)

In [15]:
# Replace the raw album strings with their integer ids (0 = no album).
data['album'] = data['album'].apply(parse_albums)

In [16]:
print(min(data.album.unique()), max(data.album.unique()))

0 217325


In [17]:
len(data.album.unique())

27605

### Parsing tags

In [18]:
import string

def stringParsing(listString):
    """Parse a stringified list of integers, e.g. ``"[54087, 1757]"``.

    Parameters
    ----------
    listString : str, iterable of int-like, or scalar
        Usually the raw text of a list. An already-parsed list is returned as
        a list of ints (so re-running the cell is harmless), and a
        non-iterable scalar such as the ``0`` left by ``fillna(0)`` is treated
        as "no tags".

    Returns
    -------
    list of int
        The parsed integers, in their original order; empty when there are none.

    Raises
    ------
    ValueError
        If a token that remains after stripping punctuation is not an integer.
    """
    if not isinstance(listString, str):
        try:
            items = iter(listString)
        except TypeError:
            # Scalar placeholder (0, None, NaN): nothing to parse.
            return []
        return [int(item) for item in items]

    # Drop brackets, commas and every other punctuation character, leaving
    # whitespace-separated numbers.
    cleaned = "".join(ch for ch in listString if ch not in string.punctuation)
    return list(map(int, cleaned.split()))

In [19]:
data.tags = data.tags.apply(stringParsing)

In [20]:
data.head()

Unnamed: 0,track_id,artist_id,playcount,album,tags
0,2972914,144,49.0,7,"[54087, 1757, 1718, 116712, 189631]"
1,2750239,246,1.0,8,"[189631, 3424, 177424, 46208, 205245]"
2,1550729,144,554.0,9,"[54087, 109806, 46869, 183258, 54337]"
3,2169950,144,200.0,9,"[54087, 70618, 207003, 109806, 116712]"
4,1903709,144,5.0,0,"[54087, 81223, 116712, 215342, 71028]"


In [21]:
len(data.album.unique())

27605

In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
track_id     100000 non-null int64
artist_id    100000 non-null int64
playcount    100000 non-null float64
album        100000 non-null int64
tags         100000 non-null object
dtypes: float64(1), int64(3), object(1)
memory usage: 3.8+ MB


In [23]:
from sklearn.preprocessing import StandardScaler

In [24]:
sc = StandardScaler()

In [25]:
# Standardise the play counts with the StandardScaler created above.
# The scaler needs a 2-D array, so reshape the underlying NumPy values:
# `Series.reshape` is deprecated (it produced the warning captured below) and
# is no longer available in current pandas.
data.playcount = sc.fit_transform(data.playcount.values.reshape(-1, 1)).ravel()

  """Entry point for launching an IPython kernel.


### Tags

In [27]:
# Collect every distinct tag id that appears on any track.
tagset = {tag for tag_list in data.tags.values for tag in tag_list}

In [28]:
# Number of distinct tags.
len(tagset)

31900

In [29]:
len(data.tags.values)

100000

### Album as a categorical variable

In [None]:
# One-hot encode the album id into sparse indicator columns (drop_first=True
# omits the column for the first category) and join them onto the frame in
# place of the original `album` column.
# NOTE(review): the resulting frame is very wide — the `data.info()` output
# below reports 27608 columns and 2.6+ GB of memory.
dummies = pd.get_dummies(data['album'],drop_first=True, sparse=True)
data = data.drop('album', axis=1)
data = data.join(dummies)

In [347]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 27608 entries, track_id to 217325
dtypes: float64(1), int64(2), object(1), uint8(27604)
memory usage: 2.6+ GB


### Recommender system (continued)

In [11]:
artists = data['artist_id'].unique()
tracks_d = data['track_id'].unique()

In [12]:
# Row position of every track in the item-profile table, and the inverse lookup.
# `.values` replaces `Series.data`, which was deprecated and later removed from
# pandas.
ptrack_to_idx = pd.Series(data=np.arange(data.shape[0]), index=data['track_id'])
idx_to_ptrack = pd.Series(index=ptrack_to_idx.values, data=ptrack_to_idx.index)

In [13]:
# NOTE(review): despite its name, this series is keyed by track_id and holds row
# positions (the same construction as `ptrack_to_idx`); the artist_id -> column
# lookup is `artists_to_idx`.
# `.values` replaces `Series.data`, which was deprecated and later removed from
# pandas.
artist_to_idx = pd.Series(data=np.arange(data.shape[0]), index=data['track_id'])
idx_to_artist = pd.Series(data=artist_to_idx.index, index=artist_to_idx.values)

In [14]:
artists_to_idx = pd.Series(index=artists, data=np.arange(artists.shape[0]))

In [15]:
icm = sps.csc_matrix((data.shape[0], artists.shape[0]))

In [16]:
#fancy indexing
icm[np.arange(0,data.shape[0]), artists_to_idx[data.iloc[:,1].values]] = 1



In [17]:
#icm for rated items by target users
# Item profiles restricted to the tracks that occur in the target playlists.
tdata = data.loc[data['track_id'].isin(tracks_to_compute)]

In [18]:
ttracks = tdata['track_id'].unique()

In [19]:
# Dense position of every target track, and the inverse lookup.
# `.values` replaces `Series.data`, which was deprecated and later removed from
# pandas.
ttrack_to_idx = pd.Series(data=np.arange(len(ttracks)), index=ttracks)
idx_to_ttrack = pd.Series(index=ttrack_to_idx.values, data=ttrack_to_idx.index)

In [20]:
ticm = sps.csc_matrix((tdata.shape[0], artists.shape[0]))

In [None]:
# Fill the target item-content matrix: row i gets a 1 in the column of its artist.
# The column lookup must be `artists_to_idx` (artist_id -> column), exactly as
# for `icm`; `artist_to_idx` is keyed by track_id, so indexing it with artist ids
# either fails or addresses columns that do not exist in `ticm`.
ticm[np.arange(tdata.shape[0]), artists_to_idx[tdata['artist_id'].values].values] = 1

In [1]:
def compute_sim():
    """Compute the item similarity matrix and write it to disk.

    Reads the notebook-level `icm`, `ticm`, `ttracks`, `ttrack_to_idx` and
    `ptrack_to_idx`, and stores the result in ``./Data/item_similarity.mtx``
    (Matrix Market format). Returns nothing.

    The lookups used here are the ones this notebook actually defines; the
    previous names (`titems`, `titem_to_idx`, `pitem_to_idx`) do not exist in it.
    """
    c = Cosine()
    sim = c.compute(icm, ticm)

    # Zero the similarity of every target track with itself.
    # Assumes rows of `sim` follow `ttrack_to_idx` and columns follow
    # `ptrack_to_idx`, as this indexing implies — TODO confirm against
    # Cosine.compute.
    for track in ttracks:
        sim[ttrack_to_idx[track], ptrack_to_idx[track]] = 0.0
    print("Finished for: ", len(ttracks))

    mmwrite("./Data/item_similarity.mtx", sim)

In [2]:
def filter_seen(user_id, ranking, rated_items):
    """Remove already-rated tracks from a ranking.

    Parameters
    ----------
    user_id
        Unused; kept so existing calls keep working.
    ranking : numpy.ndarray
        1-D array of item indices (positions in `ptrack_to_idx`), best first.
    rated_items : array-like
        Track ids the user has already interacted with.

    Returns
    -------
    numpy.ndarray
        `ranking` without the indices of the rated tracks, order preserved.
    """
    # Translate track ids to item indices with the lookup this notebook defines
    # (`ptrack_to_idx`). Selecting through `isin` skips rated tracks that have
    # no profile row instead of failing on the missing label.
    seen = ptrack_to_idx[ptrack_to_idx.index.isin(rated_items)].values
    unseen_mask = np.in1d(ranking, seen, invert=True)
    return ranking[unseen_mask]

def filter_active(ranking):
        """Return the entries of `ranking` that are not indices of inactive items.

        NOTE(review): `pitem_to_idx` and `not_active_items` are not defined
        anywhere in the visible notebook, so calling this raises NameError unless
        they are supplied elsewhere — confirm before using it.
        """

        # Keep only the ranking entries absent from the looked-up inactive indices.
        active_mask = np.in1d(ranking, pitem_to_idx[not_active_items], assume_unique=True, invert=True)
        return ranking[active_mask]

In [3]:
#estimate rating of all items to user
def recommend(user_id, n=None, exclude_seen=True):

    rated = int_data[int_data['playlist_id'] == user_id]

    rated_items = rated['track_id'].values
    ratings = np.ones(rated.shape[0])#rated['interaction_type'].values
    
    s = sim[titem_to_idx[rated_items], :].toarray()
    suma = s.sum(axis = 0)

    ratings = ratings.reshape(1, ratings.shape[0]).T

    ratings = np.tile(ratings, (1, s.shape[1]))


    s = s*ratings
    s = s.sum(axis = 0)

    s = s/suma

    if exclude_seen:
            s = filter_seen(user_id, s, rated_items)
            # s = filter_active(s)

    s = np.argsort(s)[::-1]


    return s[:n]

In [5]:
compute_sim()

In [None]:
# Load the similarity matrix from the path compute_sim() writes to, in a format
# that supports the indexing done in recommend().
sim = mmread("./Data/item_similarity.mtx").tocsc()

# One output row per target playlist: its id followed by five recommended track ids.
# `created_playlists` and `idx_to_ptrack` are the names this notebook defines for
# the target playlists and for the item-index -> track-id lookup.
result = np.zeros((created_playlists.shape[0], 6))

for row, playlist_id in enumerate(created_playlists):
    result[row, 0] = playlist_id
    top_idx = recommend(playlist_id, 5, False)
    result[row, 1:] = idx_to_ptrack[top_idx].values

print("Finished for: ", result.shape[0])

# Written next to the input files; the directory is spelled `Data` everywhere
# else in the notebook, and the lower-case spelling fails on case-sensitive
# file systems.
np.savetxt('./Data/result_content2.csv', result, fmt='%d, %d %d %d %d %d')