# Model
Let's use word2vec to train a user intent model. Nothing too fancy.

In [1]:
import gensim
from idomaar import *
import progressbar
import logging
# Send INFO-and-above log records to the notebook output, prefixed with a
# timestamp and level, so progress messages from later cells are visible.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
class PlaylistIterator():
    """Re-iterable stream of token lists, one list per record of an idomaar file.

    Every call to ``__iter__`` opens a fresh ``idomaarReader`` on ``path``, so
    the same object can be walked more than once.

    Parameters
    ----------
    path
        Location handed to ``idomaarReader``.
    tracks_file
        Forwarded to ``idomaarReader``. When it is ``None`` each record yields
        the stringified ids of its linked objects; otherwise every linked
        object contributes a ``track_<id>`` token followed by an
        ``artist_<artist_id>`` token.
    verbose
        ``0`` iterates silently; any other accepted value wraps the reader in
        ``progressbar.progressbar``.

    Raises
    ------
    ValueError
        If ``verbose`` is negative.
    """

    def __init__(self, path, tracks_file=None, verbose=0):
        # Reject a bad verbosity before storing anything on the instance.
        if verbose < 0:
            raise ValueError("verbosity level must be above or equal to 0")
        self.path = path
        self.tracks_file = tracks_file
        self.verbose = verbose

    def __iter__(self):
        """Yield one list of string tokens per record read from ``self.path``."""
        with idomaarReader(self.path, tracks_file=self.tracks_file, tolerant=False) as reader:
            records = reader if self.verbose == 0 else progressbar.progressbar(reader)
            for record in records:
                try:
                    linked = record.linked.objects
                    if self.tracks_file is None:
                        yield [str(obj.id) for obj in linked]
                    else:
                        tokens = []
                        for obj in linked:
                            pair = "track_{} artist_{}".format(obj.id, obj.properties.artist_id)
                            tokens.extend(pair.split(" "))
                        yield tokens
                except Exception as e:
                    # Show the error and the offending record, then let the
                    # exception propagate to the caller unchanged.
                    print(e)
                    print(record)
                    raise

# Session- and playlist-based similarity

In [3]:
import os

def w2v_model(session, out, tracks_file=None, overwrite=False, min_count=1, workers=4, size=100):
    """Return a Word2Vec model for ``session``, training only when needed.

    A model already saved at ``out`` is loaded and returned as-is unless
    ``overwrite`` is true. Otherwise a new model is trained on a
    ``PlaylistIterator`` over ``session``, saved to ``out`` and returned.

    Parameters
    ----------
    session
        Path of the idomaar file to iterate over.
    out
        Path the model is loaded from or saved to.
    tracks_file
        Forwarded to ``PlaylistIterator``.
    overwrite
        When true, retrain even if ``out`` already exists.
    min_count, workers, size
        Passed straight through to ``gensim.models.Word2Vec``.
        NOTE(review): ``size`` is the keyword used by gensim releases before
        4.0; newer releases are believed to call it ``vector_size`` -- confirm
        against the installed version.
    """
    if overwrite or not os.path.exists(out):
        sentences = PlaylistIterator(session, tracks_file=tracks_file)
        trained = gensim.models.Word2Vec(
            sentences, min_count=min_count, workers=workers, size=size
        )
        trained.save(out)
        return trained
    # A saved model exists and the caller did not ask to retrain.
    return gensim.models.Word2Vec.load(out)

In [5]:
# Train on the playlist entities only: no tracks_file is passed, so each
# playlist becomes a list of bare track-id strings. overwrite=True retrains
# even when cheap_playlists.w2v already exists on disk.
model = w2v_model("../data/ThirtyMusic/entities/playlist.idomaar", "cheap_playlists.w2v", overwrite=True, workers=8, size=100)

2018-11-27 04:02:15,491 : INFO : collecting all words and their counts
2018-11-27 04:02:15,645 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-11-27 04:02:15,716 : ERROR : Expecting ',' delimiter: line 1 column 34 (char 33)
 Offending line: playlist	381	1357156475	{"ID":10985280,"Title":"2012 m. "Radiocentro" Top 100","numtracks":66,"duration":13712}	{"subjects":[{"type":"user","id":43580}],"objects":[{"type":"track","id":2374504},{"type":"track","id":633023},{"type":"track","id":2205687},{"type":"track","id":2056701},{"type":"track","id":122518},{"type":"track","id":2733092},{"type":"track","id":2514711},{"type":"track","id":686532},{"type":"track","id":1736577},{"type":"track","id":3241885},{"type":"track","id":2026968},{"type":"track","id":2552800},{"type":"track","id":1203212},{"type":"track","id":1590256},{"type":"track","id":3618565},{"type":"track","id":568987},{"type":"track","id":1748173},{"type":"track","id":1047128},{"type":"track","id":236924

In [None]:
# Warning: training on the full sessions file will likely exhaust your RAM.
#model = w2v_model("../data/ThirtyMusic/relations/sessions.idomaar", "cheap_sessions.w2v", overwrite=True, workers=8, size=30)

# Song data

In [6]:
import pandas as pd
from idomaar import *
import progressbar
import logging
# NOTE(review): identical to the basicConfig call in the first cell; presumably
# a no-op once that cell has run, since basicConfig leaves an already
# configured root logger alone -- confirm before relying on it.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [10]:
# Peek at the first five raw records of the tracks file to see the on-disk layout.
!head -n5 ../data/ThirtyMusic/entities/tracks.idomaar

track	0	-1	{"duration":-1,"playcount":4,"MBID":null,"name":"000003+Music+Instructor/_/Dj%27s+Rock+Da+House+%C3%82%E2%89%88%C3%86%E2%89%88%C3%8A01+-+Dj+Max-Pulemet+Vs.+Bomfunk+Mc%27s+-+Electro+Breakdance+party+1+%5B2000%5D+=+CD+ONE%C3%82%E2%89%88%C3%86%E2%89%88%C3%8A"}	{"artists":[{"type":"person","id":0}],"albums":[],"tags":[]}
track	1	-1	{"duration":-1,"playcount":495,"MBID":null,"name":"00-01/_/%D0%A2%D0%B5%D0%BA%D1%81%D1%82"}	{"artists":[{"type":"person","id":1}],"albums":[],"tags":[]}
track	2	-1	{"duration":-1,"playcount":2,"MBID":null,"name":"0005.+Overkill/_/Overkill"}	{"artists":[{"type":"person","id":2}],"albums":[],"tags":[]}
track	3	-1	{"duration":-1,"playcount":2,"MBID":null,"name":"000C+Tony+Dize/_/Ruleta+Rusa"}	{"artists":[{"type":"person","id":3}],"albums":[],"tags":[]}
track	4	-1	{"duration":-1,"playcount":1,"MBID":null,"name":"000+Oscarcito/_/Tumbay%E2%80%9A+(Lyrics)"}	{"artists":[{"type":"person","id":4}],"albums":[],"tags":[]}


In [11]:
def idomaar_df(path): # TODO: chunksize
    """Read the idomaar file at ``path`` into a pandas DataFrame.

    Each record contributes one row holding its ``id`` plus the ``MBID``,
    ``duration``, ``name`` and ``playcount`` attributes of its ``properties``.
    All rows are collected in a list before the frame is built, and a progress
    bar is shown while reading.

    Parameters
    ----------
    path
        Location handed to ``idomaarReader``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``MBID``, ``duration``, ``name``, ``playcount``.
    """
    columns = ["id", "MBID", "duration", "name", "playcount"]

    def _rows():
        # Stream one row per record; the reader is closed when the generator ends.
        with idomaarReader(path) as reader:
            for track in progressbar.progressbar(reader):
                props = track.properties
                yield [track.id, props.MBID, props.duration, props.name, props.playcount]

    return pd.DataFrame(list(_rows()), columns=columns)

# Load every track entity into memory. The recorded run below took about six
# minutes for 5,675,143 records.
df = idomaar_df("../data/ThirtyMusic/entities/tracks.idomaar")

100% (5675143 of 5675143) |##############| Elapsed Time: 0:05:57 Time:  0:05:57


In [12]:
# Show the first rows as a quick sanity check of the parsed columns.
df.head()

Unnamed: 0,id,MBID,duration,name,playcount
0,0,,-1.0,000003 Music Instructor/_/Dj's Rock Da House Â...,4.0
1,1,,-1.0,00-01/_/Текст,495.0
2,2,,-1.0,0005. Overkill/_/Overkill,2.0
3,3,,-1.0,000C Tony Dize/_/Ruleta Rusa,2.0
4,4,,-1.0,000 Oscarcito/_/Tumbay‚ (Lyrics),1.0
