# Collaborative filtering

In [1]:
import pandas as pd
import numpy as np

### Converting the title and tags columns to lists of integers

In [20]:
import string

def stringParsing(listString):
    """Parse the text form of an integer list (e.g. ``"[12, 34]"``) into a list of ints.

    Every punctuation character is treated as a separator, and the remaining
    whitespace-delimited tokens are converted with ``int``.

    Parameters
    ----------
    listString : str or iterable of str
        Text such as ``"[12389, 18698]"`` or ``"[]"``. ``None`` and float NaN
        are accepted and treated as an empty list.

    Returns
    -------
    list of int
        The parsed integers, in order of appearance.

    Raises
    ------
    ValueError
        If a token left after removing punctuation is not an integer
        (for example ``"[None]"``).
    """
    # None or float NaN (NaN is the only float not equal to itself) means
    # "no value"; iterating over it would raise TypeError.
    if listString is None or (isinstance(listString, float) and listString != listString):
        return []

    # Replace punctuation with a space instead of deleting it, so that
    # "[1,2]" parses as [1, 2] rather than collapsing into [12].
    cleaned = "".join(" " if ch in string.punctuation else ch for ch in listString)

    return [int(token) for token in cleaned.split()]

In [21]:
# Load the four tab-separated datasets, in the same order as the names on the left.
playlists, tracks, target_tracks, interactions = (
    pd.read_csv(csv_path, sep='\t')
    for csv_path in (
        'Data/playlists_final.csv',
        'Data/tracks_final.csv',
        'Data/target_tracks.csv',
        'Data/train_final.csv',
    )
)

In [22]:
# Turn the stringified integer lists into real Python lists, column by column.
for frame, column in ((playlists, 'title'), (tracks, 'tags')):
    frame[column] = frame[column].apply(stringParsing)

In [23]:
# Preview the first rows; 'title' now holds lists of integers after stringParsing.
playlists.head()

Unnamed: 0,created_at,playlist_id,title,numtracks,duration,owner
0,1216545588,644838,[12727],27,6522,41504
1,1249326867,7577564,[],9,2650,41504
2,1257766688,3120683,[183],16,3645,44542
3,1248079275,4278112,"[12389, 18698, 18925, 11695, 7117]",15,4151,44542
4,1175201268,8656823,"[12809, 2095, 13257, 12671, 20426, 14448, 18698]",84,18414,44542


In [24]:
# Preview the first rows; 'tags' now holds lists of integers after stringParsing.
tracks.head()

Unnamed: 0,track_id,artist_id,duration,playcount,album,tags
0,2972914,144,224000,49.0,[7],"[54087, 1757, 1718, 116712, 189631]"
1,2750239,246,157000,1.0,[8],"[189631, 3424, 177424, 46208, 205245]"
2,1550729,144,217000,554.0,[9],"[54087, 109806, 46869, 183258, 54337]"
3,2169950,144,207000,200.0,[9],"[54087, 70618, 207003, 109806, 116712]"
4,1903709,144,198000,5.0,[None],"[54087, 81223, 116712, 215342, 71028]"


In [25]:
# Preview the raw (playlist_id, track_id) interaction pairs as loaded from the CSV.
interactions.head()

Unnamed: 0,playlist_id,track_id
0,3271849,2801526
1,5616275,727878
2,11267488,2805283
3,10103900,1515105
4,3836898,2945623


In [26]:
# drop_duplicates returns a new frame rather than modifying in place, so the
# result must be bound back to `interactions` for duplicates to be removed.
# The constant implicit-feedback rating (1.0 per observed playlist/track pair)
# is added via assign(), which works on a fresh copy and so avoids a
# SettingWithCopyWarning on the filtered frame.
interactions = (
    interactions
    .drop_duplicates(subset=['playlist_id', 'track_id'], keep='first')
    .assign(rating=1.0)
)

In [27]:
# Confirm the constant 'rating' column was added to every interaction.
interactions.head()

Unnamed: 0,playlist_id,track_id,rating
0,3271849,2801526,1.0
1,5616275,727878,1.0
2,11267488,2805283,1.0
3,10103900,1515105,1.0
4,3836898,2945623,1.0


In [28]:
# Distinct ids that actually occur in the interactions.
# NOTE: this rebinds `playlists` and `tracks`, which held the metadata
# DataFrames read from CSV, to plain arrays of unique ids.
playlists, tracks = (
    interactions[column].unique() for column in ('playlist_id', 'track_id')
)
n_playlists, n_tracks = (
    interactions[column].nunique() for column in ('playlist_id', 'track_id')
)

print("Num of Playlists: %d" % n_playlists)
print("Num of Tracks: %d" % n_tracks)

Num of Playlists: 45649
Num of Tracks: 99999


In [29]:
# Forward lookups: original id -> dense 0-based index (position in the unique-id arrays).
playlist_to_idx = pd.Series(data=np.arange(len(playlists)), index=playlists)
track_to_idx = pd.Series(data=np.arange(len(tracks)), index=tracks)

# Reverse lookups: dense index -> original id.
# `.values` replaces `Series.data`, which was deprecated and then removed in pandas 1.0.
idx_to_playlist = pd.Series(data=playlist_to_idx.index, index=playlist_to_idx.values)
idx_to_track = pd.Series(data=track_to_idx.index, index=track_to_idx.values)

# Replace the original ids with their dense indices. Mapping through the Series
# is a vectorised lookup instead of one Python-level call per row.
for column, to_idx in (('playlist_id', playlist_to_idx), ('track_id', track_to_idx)):
    dense = interactions[column].map(to_idx)
    # Series.map yields NaN for ids absent from the lookup; fail loudly, as the
    # per-element `to_idx[x]` lookup did, instead of propagating NaN.
    if dense.isnull().any():
        raise KeyError(
            "%s contains ids missing from the lookup table "
            "(was this cell already run on remapped data?)" % column
        )
    interactions[column] = dense.astype(to_idx.dtype)

In [33]:
# Confirm playlist_id / track_id now hold dense 0-based indices instead of the original ids.
interactions.head()

Unnamed: 0,playlist_id,track_id,rating
0,0,0,1.0
1,1,1,1.0
2,2,2,1.0
3,3,3,1.0
4,4,4,1.0


In [12]:
# Train/Test split
# `sklearn.cross_validation` was removed in scikit-learn 0.20; train_test_split
# is provided by `sklearn.model_selection` (available since 0.18).
from sklearn.model_selection import train_test_split

# Hold out 25% of the interactions for testing. The fixed random_state makes the
# split reproducible across kernel restarts.
train_data, test_data = train_test_split(interactions, test_size=0.25, random_state=42)



In [13]:
from tempfile import mkdtemp
import os.path as path

# Build the train and test file paths, each inside its own freshly created
# temporary directory (train first, then test).
train_file, test_file = (
    path.join(mkdtemp(), file_name)
    for file_name in ('trainFile.dat', 'testFile.dat')
)

In [19]:
# Disabled draft: memory-mapped playlist x track matrix backed by train_file.
# train_data_matrix = np.memmap(train_file, dtype='float32', mode='w+', shape=(n_playlists,n_tracks))

# Inspect only the first training row. iterrows() yields 2-tuples of
# (index label, row Series), so the disabled drafts below, which index
# `line[1], line[2], line[3]`, would need to read the fields from the row
# Series (the old `line[1]`) instead.
for row_label, row in train_data[:1].iterrows():
#     train_data_matrix[line[1], line[2]] = line[3] # or equals just 1
    print(row_label, row)
# test_data_matrix = np.memmap(test_file, dtype='float32', mode='w+', shape=(n_playlists,n_tracks))
# for line in test_data.iterrows():
#     test_data_matrix[line[1], line[2]] = line[3] # or equals just 1

444429 playlist_id    40843.0
track_id        5669.0
rating             1.0
Name: 444429, dtype: float64


In [None]:
# Train and Test split
# Creating two playlist-track matrices, one for training and one for testing
from sklearn.metrics.pairwise import pairwise_distances
playlist_similarity = pairwise