In [None]:
import artm

In [None]:
import logging

logging.basicConfig(level=0)

formatter = logging.Formatter('%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s', datefmt='%H:%M:%S')

logger = logging.getLogger('ARTM')
fh = logging.FileHandler('/tmp/lastfm')
fh.setLevel(0)
fh.setFormatter(formatter)

logger.addHandler(fh)

logger.info('Logging started.')
logger.info('Initialisation started...')


In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import requests
from lxml import etree

from urllib.parse import quote_plus

from time import sleep

from collections import defaultdict

from gensim.utils import simple_tokenize, simple_preprocess
from gensim.parsing import preprocess_string

from multiprocessing import Pool

from scipy.spatial.distance import cosine

In [None]:
from annoy import AnnoyIndex

In [None]:
playlists = pd.read_csv('/data/lastfm_datafest5/lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv', sep='\t', header=None)

In [None]:
users = pd.read_csv('/data/lastfm_datafest5/lastfm-dataset-360K/usersha1-profile.tsv', sep='\t',  header=None)

In [None]:
playlists.head(5)

In [None]:
tracks = pd.read_csv('/data/lastfm_datafest5/dump.csv', sep='|')

In [None]:
tracks.head(5)

In [None]:
artist_needed = set(playlists.artmbid)

In [None]:
len(artist_needed)

In [None]:
tracks_needed = tracks.loc[tracks.artist_gid.isin(artist_needed)]

tracks_needed.shape

In [None]:
tracks_needed.head()

In [None]:
big_lyrics = pd.read_csv('/data/lastfm_datafest5/lyrics.csv')

big_lyrics.dropna(axis=0, inplace=True)

In [None]:
with Pool(16) as pool:
    big_lyrics['preprocess'] = list(tqdm(pool.map(preprocess_string, big_lyrics.lyrics, chunksize=1000)))

In [None]:
big_lyrics.artist = big_lyrics.artist.apply(lambda x: list(simple_tokenize(x.lower())))

big_lyrics.genre = big_lyrics.genre.str.lower()

In [None]:
playlists.columns = ['user_sha', 'artmbid', 'artname', 'plays']

In [None]:
raw_playlist_data = []

for user, playlist in tqdm(playlists.groupby('user_sha')):
    raw_playlist_data.append((user, list(playlist.artname)))

In [None]:
def to_vw_playlist(user, playlist):
    res = '%s '%user
    res += '|artist %s '%' '.join(' '.join(simple_tokenize(str(_))) for _ in playlist)
    res += '|artist_f %s '%' '.join(str(_).replace(',', ' ').replace(':', '_').replace(' ', '_') for _ in playlist)
    
    return res

In [None]:
def to_vw_song(song, text, artist):
    res = '%s '%song
    res += '|text %s '%' '.join(str(_) for _ in text)
    res += '|artist %s'%' '.join(artist)
    
    return res

In [None]:
common_wv_path =  '/data/lastfm_datafest5/vw/common'
! rm $common_wv_path

In [None]:
raw_playlists_vw_path = '/data/lastfm_datafest5/vw/raw_playlists'

with open(common_wv_path, 'a') as outcome:
    for rec in tqdm(raw_playlist_data):
        outcome.write('%s\n'%to_vw_playlist(*rec))

In [None]:
raw_songs_vw_path = '/data/lastfm_datafest5/vw/raw_songs'

with open(common_wv_path, 'a') as outcome:
    for _, row in tqdm(big_lyrics.iterrows()):
        outcome.write('%s\n'%to_vw_song(row.song, row.preprocess, row.artist))

In [None]:
!rm -rf /data/lastfm_datafest5/common_batches

!mkdir /data/lastfm_datafest5/common_batches

In [None]:
common_batch = artm.BatchVectorizer(
    target_folder='/data/lastfm_datafest5/common_batches',
    data_format='vowpal_wabbit',
    data_path=raw_playlists_vw_path, 
    batch_size=10000)

In [None]:
songs_batch.dictionary.filter(min_df=3, max_df_rate=0.1, )

In [None]:
common_batch.dictionary.save_text('/data/common_dict.txt')

In [None]:
common_batch.dictionary.filter(min_df=2, max_df_rate=0.3)

In [523]:
common_batch.dictionary

artm.Dictionary(name=60040dda-04e0-48a1-9cbb-313b36958985, num_entries=272280)

In [None]:
hier = artm.hARTM(
    class_ids={'artist':2, 'artist_f':8, 'text':0.1},
    dictionary=common_batch.dictionary, 
    cache_theta=True,
    theta_columns_naming='title', 
    num_document_passes=1, 
    num_processors=24)

In [None]:
level0 = hier.add_level(num_topics=100)

level0.initialize(dictionary=common_batch.dictionary)

In [None]:
level0.regularizers.add(artm.DecorrelatorPhiRegularizer(
    name='level0_phi_decor', 
    class_ids=['artist_f', 'artist'], 
    tau=10))
level0.regularizers.add(artm.SmoothSparseThetaRegularizer(name='level0_theta_smoother', tau=2))

In [None]:
level0.fit_offline(batch_vectorizer=common_batch, num_collection_passes=25)

In [None]:
hier.tmp_files_path = '/tmp/'
level1 = hier.add_level(num_topics=1000, parent_level_weight=1)

In [None]:
level1.initialize(dictionary=common_batch.dictionary)

level1.regularizers.add(artm.HierarchySparsingThetaRegularizer(name="HierSp", tau=10**5))
level1.regularizers.add(artm.SmoothSparsePhiRegularizer(name="level1_phi_sparser", tau=-4, class_ids=['text']))
level1.regularizers.add(artm.DecorrelatorPhiRegularizer(name='level1_phi_decor', tau=4))

In [None]:
level1.fit_offline(batch_vectorizer=common_batch, num_collection_passes=5)

In [None]:
phi = hier.get_level(0).get_phi(class_ids=['artist_f'])

In [524]:
cosine(phi.loc['linkin_park'], phi.loc['tupac'])

0.28731784025325469

In [525]:
cosine(phi.loc['linkin_park'], phi.loc['linkin_park'])

5.3968269519977241e-08

In [526]:
cosine(phi.loc['linkin_park'], phi.loc['coldplay'])

0.21693327637531012

In [None]:
f = 100
t = AnnoyIndex(f)  # Length of item vector that will be indexed

for i, v in tqdm(enumerate(phi.values)):
    t.add_item(i, v)

t.build(100)

In [None]:
    # for id2see in range(10000, 10500, 10):
    id2see = np.where(phi.index == 'linkin_park')[0][0]
    print(phi.index[id2see])
    print('\t' + ', '.join(phi.index[t.get_nns_by_vector(phi.iloc[id2see], 20)]))
    print('+'*80)

In [530]:
def get_desc_h(tm, score_name, n_objcts, renamer={}, hlevel=0):
    topic_desc = {}

    collection = tm.get_level(hlevel).get_phi(class_ids=[score_name]).T.copy()
    
    if score_name in renamer:
        collection.columns = [renamer[score_name][c] for c in collection.columns]
    
    non_active_topics = collection.index[collection.sum(axis=1) < 0.5]
    logger.warn('Next topics are not active for "%s" modality: %s', score_name, list(non_active_topics))
    collection = collection[~collection.index.isin(non_active_topics)]
    
    collection.iloc[:,:] = (collection.values/collection.sum(axis=1).values[:, np.newaxis])

    for topic in collection.index:
        topic_sample = collection.loc[topic].sort_values(ascending=False)[:n_objcts]

        topic_desc[topic] = ', '.join(['%s:%3.3f'%(word, weight) for word, weight in topic_sample.iteritems()])
    return topic_desc

In [531]:
for n, desc in get_desc_h(hier, 'artist_f', 15).items():
    print('%s |||  %s'%(n, desc))
    print('#'*80)

  args = messages.GetTopicModelArgs(model_name=model)
  # Remove the CWD from sys.path while we load stuff.


topic_0 |||  rammstein:0.096, nirvana:0.080, marilyn_manson:0.059, ac/dc:0.048, the_offspring:0.044, linkin_park:0.041, serj_tankian:0.030, system_of_a_down:0.025, limp_bizkit:0.024, [unknown]:0.021, guano_apes:0.021, pink_floyd:0.019, bloodhound_gang:0.019, the_prodigy:0.016, metallica:0.014
################################################################################
topic_1 |||  nightwish:0.073, sonata_arctica:0.033, lacuna_coil:0.028, blind_guardian:0.022, apocalyptica:0.021, kamelot:0.020, epica:0.019, stratovarius:0.016, within_temptation:0.016, therion:0.016, hammerfall:0.016, avantasia:0.015, in_extremo:0.015, edguy:0.014, rhapsody_of_fire:0.013
################################################################################
topic_2 |||  red_hot_chili_peppers:0.120, evanescence:0.116, nirvana:0.060, nickelback:0.056, linkin_park:0.048, him:0.043, placebo:0.043, audioslave:0.042, lenny_kravitz:0.033, muse:0.030, 3_doors_down:0.025, bon_jovi:0.023, the_rasmus:0.023, foo_fighte