In [1]:
import pandas as pd
import json
import os
import numpy as np
import difflib
import re
import sys

sys.path.append('../..')

from tqdm import tqdm
from collections import deque, namedtuple

from IPython.display import display
from IPython.core.debugger import set_trace

# Listened log

In [2]:
DATA = './source/music.json'

FIX_ARTISTS = {
    'Tarja Turunen': 'Tarja',
    'Thomas Bergersen': 'Two Steps from Hell',
    'Томас Бергерсен': 'Two Steps from Hell',
    'Two Steps from Hell & Thomas Bergersen': 'Two Steps from Hell',
    'Tribe': 'Amaranthe',
    'Two Steps From Hell': 'Two Steps from Hell',
    'Two Steps From Hell, Thomas Bergersen': 'Two Steps from Hell',
    'Two Steps From Hell, Nick Phoenix': 'Two Steps from Hell',
    'Андреас Вальдетофт': 'Andreas Waldetoft',
    'Andreas Waltedoft': 'Andreas Waldetoft',
    'Paradox Interactive': 'Andreas Waldetoft',
    'Meyer': 'Andreas Waldetoft',
    'Saxon': 'Sabaton',
    'Visions Of Atlantis': 'Visions of Atlantis',
    'Leaves´ Eyes': 'Leaves\' Eyes',
    'Leaves \' Eyes': 'Leaves\' Eyes',
    'Russell Allen & Anette Olzon': 'Allen/Olzon'
}

FIX_TITLES = {
    'inf': 'infinity'
}

def fix_artist(artist):
    return FIX_ARTISTS.get(artist, artist)

def fix_title(title):
    title = title.lower()
    if not re.match('.*(orchestral|instrumental).*', title):
        title = re.sub(r" ?\([^)]+\)", "", title)
    title = re.sub(r"listened to", "", title)
    title = re.sub(r"(:|~|/).*", "", title)
    title = title.strip()
    return FIX_TITLES.get(title, title)

def check_title(title):
    return "skipped" not in title and "google play music" not in title

with open(DATA, 'r') as f:
    data = json.load(f)

In [3]:
df = pd.DataFrame(data)
df

Unnamed: 0,header,title,description,time,products,subtitles
0,Google Play Music,Listened to Into the Sun,Tarja,2020-07-24T19:11:16.641Z,[Google Play Music],
1,Google Play Music,Listened to Sleeping Sun,Nightwish,2020-07-24T18:55:45.200Z,[Google Play Music],
2,Google Play Music,Listened to Kuolleiden jumalten poika,Marko Hietala,2020-07-24T09:34:55.564Z,[Google Play Music],
3,Google Play Music,"Listened to Tähti, hiekka ja varjo",Marko Hietala,2020-07-24T09:30:13.489Z,[Google Play Music],
4,Google Play Music,Listened to Isäni ääni,Marko Hietala,2020-07-24T09:25:11.939Z,[Google Play Music],
...,...,...,...,...,...,...
13836,Google Play Music,Listened to Ocean Princess,Томас Бергерсен,2019-05-12T09:20:50.609Z,[Google Play Music],
13837,Google Play Music,Listened to Hurt,Томас Бергерсен,2019-05-12T09:17:57.848Z,[Google Play Music],
13838,Google Play Music,Listened to Dreammaker,Томас Бергерсен,2019-05-12T09:16:14.505Z,[Google Play Music],
13839,Google Play Music,Listened to Starvation,Томас Бергерсен,2019-05-12T09:11:56.597Z,[Google Play Music],


In [4]:
df['artist'] = df['description'].apply(fix_artist)
df['title'] = df['title'].apply(fix_title)
df = df[[check_title(t) for t in df.title]]
df = df.reset_index(drop=True)
df = df[['artist', 'title', 'time']]
df

Unnamed: 0,artist,title,time
0,Tarja,into the sun,2020-07-24T19:11:16.641Z
1,Nightwish,sleeping sun,2020-07-24T18:55:45.200Z
2,Marko Hietala,kuolleiden jumalten poika,2020-07-24T09:34:55.564Z
3,Marko Hietala,"tähti, hiekka ja varjo",2020-07-24T09:30:13.489Z
4,Marko Hietala,isäni ääni,2020-07-24T09:25:11.939Z
...,...,...,...
13600,Two Steps from Hell,ocean princess,2019-05-12T09:20:50.609Z
13601,Two Steps from Hell,hurt,2019-05-12T09:17:57.848Z
13602,Two Steps from Hell,dreammaker,2019-05-12T09:16:14.505Z
13603,Two Steps from Hell,starvation,2019-05-12T09:11:56.597Z


# Tracks data

In [5]:
from api import DBConn
from models import Base, MpdSong, SongListened

DBConn()

<api.db.DBConn at 0x7f52b03f4f50>

In [6]:
data = deque()

with DBConn.get_session() as db:
    for track in db.query(MpdSong).all():
        data.append({
            'title': track.title,
            'id': track.id,
            'artist': track.album_artist,
            'album': track.album
        })

df_t = pd.DataFrame(data)
df_t.title = df_t.title.apply(fix_title)
df_t.artist = df_t.artist.apply(fix_artist)
df_t = df_t.set_index('id')
df_t.head()

Unnamed: 0_level_0,title,artist,album
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,enter,After Forever,Remagine: The Album - The Sessions
2,come,After Forever,Remagine: The Album - The Sessions
3,boundaries are open,After Forever,Remagine: The Album - The Sessions
4,living shields,After Forever,Remagine: The Album - The Sessions
5,being everyone,After Forever,Remagine: The Album - The Sessions


# Join listened with tracks

In [7]:
tracks_by_name = {}

for track in df_t.itertuples(index=True):
    try:
        tracks_by_name[track.title].append(track)
    except KeyError:
        tracks_by_name[track.title] = [track]
        
track_names = list(tracks_by_name.keys())

In [8]:
def get_tracks(title, cutoff=0.6):
    try:
        track = tracks_by_name[title]
        return track
    except KeyError:
        pass
    if len(title) > 5:
        try:
            matches = [t for t in track_names if title in t or t in title]
            if len(matches) > 1:
                match = difflib.get_close_matches(title, matches, cutoff=0.4)[0]
            else:
                match = matches[0]
            return tracks_by_name[match]
        except (StopIteration, IndexError):
            pass
    elif "Instrumental" in title:
        title = re.sub('Instrumental', "", title)
        title = re.sub('-', "", title)
        title = title.strip()
        return get_tracks(title, cutoff)
    matches = difflib.get_close_matches(title, track_names, cutoff=cutoff)
    try:
        closest = matches[0]
        return tracks_by_name[closest]
    except IndexError:
        return get_tracks(title, cutoff=0.3)
    return None
    
def pick_track(tracks, track_listened, id_series):
    same_artists = [t for t in tracks if t.artist == track_listened.artist]
    if len(same_artists) == 0:
        same_artists = [t for t in tracks if t.artist == 'nan']
        if len(same_artists) == 0:
            return None
    if len(same_artists) == 1:
        return same_artists[0]
    by_album = {t.album: t for t in same_artists}
    try:
        previous_track = df_t.loc[id_series[len(id_series) - 1]]
        if previous_track.artist == track_listened.artist:
            return by_album[previous_track.album]
    except (KeyError, TypeError):
        pass
    return same_artists[0]
    
id_series = deque()
not_matched = []
NotMatched = namedtuple('NotMatched', ['track', 'candidates'])

for track_listened in tqdm(list(df.itertuples(index=True))):
    tracks = get_tracks(track_listened.title)
    if tracks is None:
        id_series.append(None)
        not_matched.append(NotMatched(track_listened))
        continue
    track = pick_track(tracks, track_listened, id_series)
    if track is None:
        id_series.append(None)
        not_matched.append(NotMatched(track_listened, tracks))
        continue
    id_series.append(track.Index)

100%|██████████| 13605/13605 [00:05<00:00, 2660.52it/s]


In [9]:
df['id'] = id_series
df_nm = pd.DataFrame([t.track for t in not_matched])
# display(len(id_series))

with pd.option_context('display.max_rows', 1000):
    display(df_nm)

Unnamed: 0,Index,artist,title,time
0,1145,Nightwish,elvenpath,2020-06-26T10:16:48.569Z
1,2644,Blackbriar,madwoman in the attic,2020-04-28T15:34:54.239Z
2,2645,Blackbriar,beautiful delirium,2020-04-28T15:30:10.383Z
3,2646,Blackbriar,mortal remains,2020-04-28T15:26:23.859Z
4,2647,Blackbriar,the rooster's crow,2020-04-28T15:21:54.243Z
5,4937,Blackbriar,beautiful delirium,2020-01-23T18:37:03.411Z
6,5522,Blackbriar,beautiful delirium,2020-01-09T17:46:39.460Z
7,5787,Blackbriar,beautiful delirium,2020-01-03T16:48:39.536Z
8,5799,Leaves' Eyes,to france,2020-01-03T10:21:20.687Z
9,5800,Leaves' Eyes,krakevisa,2020-01-03T10:16:43.386Z


In [10]:
with DBConn.get_session() as db:
    for record in df.itertuples(index=False):
        listened = SongListened(song_id=record['id'], time=record['time'])
        db.merge(listened)
    db.commit()

TypeError: tuple indices must be integers or slices, not str