In [1]:
import itertools
import logging
import multiprocessing
import os
import re
import sys
import time
from collections import namedtuple
from random import shuffle

import gensim
import gensim.models.doc2vec
import pandas as pd
import seaborn as sns
from gensim.models import Doc2Vec

In [2]:
# Lightweight record in the (words, tags) shape handed to Doc2Vec below.
TrainingDocument = namedtuple('TrainingDocument', 'words tags')

# Every line of the input file becomes one document: its comma-separated tokens
# are the words and its 1-based line number is the single tag.
docs = []
processed_doc_count = 0
with open("../../midi-embeddings/data/1_measure_binned_full.txt", 'rb') as data:
    for line_no, raw_line in enumerate(data, 1):
        text = gensim.utils.to_unicode(raw_line)
        words = [token.strip() for token in text.split(",")]
        docs.append(TrainingDocument(words=words, tags=[line_no]))
        processed_doc_count += 1
        # Progress report every 100000 documents.
        if not processed_doc_count % 100000:
            print("Loaded " + str(processed_doc_count) + " documents")

Loaded 100000 documents
Loaded 200000 documents
Loaded 300000 documents
Loaded 400000 documents
Loaded 500000 documents
Loaded 600000 documents
Loaded 700000 documents
Loaded 800000 documents
Loaded 900000 documents
Loaded 1000000 documents
Loaded 1100000 documents
Loaded 1200000 documents
Loaded 1300000 documents
Loaded 1400000 documents
Loaded 1500000 documents
Loaded 1600000 documents
Loaded 1700000 documents


In [3]:
def get_id(params):
    """Build an identifier string for one parameter combination.

    Every ``key: value`` pair is rendered as ``<shortened key>_<value>``; the
    rendered pieces are sorted and joined with ``-``.

    :param params: mapping of parameter name to value.
    :return: the identifier string.
    """
    pieces = []
    for key, value in params.items():
        pieces.append(shorten_param_name(key) + "_" + str(value))
    pieces.sort()
    return '-'.join(pieces)

def shorten_param_name(name):
    """Strip every ``doc2vec_`` occurrence and then every underscore from ``name``.

    :param name: parameter name, e.g. ``doc2vec_min_count``.
    :return: the compacted name, e.g. ``mincount``.
    """
    # Both patterns are plain literals, so str.replace matches what re.sub did.
    without_prefix = name.replace("doc2vec_", "")
    return without_prefix.replace("_", "")

In [4]:
def train(params):
    """
    Fit a Doc2Vec model on the module-level ``docs`` corpus.

    :param params: mapping that provides the ``doc2vec_*`` settings read below
        (dm, dm_mean, epochs, hs, negative, min_count, vector_size, window and the
        start/end learning rates).
    :return: the trained ``Doc2Vec`` model.
    """
    assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

    # One worker per CPU core; everything else comes from the caller's settings.
    worker_count = multiprocessing.cpu_count()
    dm_mode = params['doc2vec_dm']
    dm_mean_mode = params['doc2vec_dm_mean']
    final_alpha = params['doc2vec_learning_rate_end']
    epoch_count = params['doc2vec_epochs']
    hierarchical_softmax = params['doc2vec_hs']
    negative_samples = params['doc2vec_negative']
    minimum_count = params['doc2vec_min_count']
    initial_alpha = params['doc2vec_learning_rate_start']
    dimensions = params['doc2vec_vector_size']
    context_window = params['doc2vec_window']

    print("Training encoder model...")

    started_at = time.time()

    # Build the model and its vocabulary from the corpus.
    model = Doc2Vec(
        dm=dm_mode,
        dm_mean=dm_mean_mode,
        vector_size=dimensions,
        window=context_window,
        negative=negative_samples,
        hs=hierarchical_softmax,
        min_count=minimum_count,
        workers=worker_count,
    )
    model.build_vocab(docs)

    # Run training with the requested epoch count and learning-rate bounds.
    model.train(
        docs,
        total_examples=len(docs),
        epochs=epoch_count,
        start_alpha=initial_alpha,
        end_alpha=final_alpha,
    )

    elapsed = time.time() - started_at
    print("Trained encoder model in " + str(elapsed) + " seconds")
    return model

In [5]:
# The twelve note names in ascending order starting at "c", spelled with sharps only.
# _get_fifth relies on list position: it moves 7 entries up and wraps past the end.
NOTE_NAMES = ["c", "c#", "d", "d#", "e", "f", "f#", "g", "g#", "a", "a#", "b"]

def _get_instrument(word):
    fields = word.split("_")
    if len(fields) > 1:
        instrument = fields[0]
        return instrument
    else:
        return None

def _get_note_and_octave(word):
    """Split the second underscore-separated field of ``word`` into note and octave.

    The note is that field with every digit character removed; the octave is
    whatever ``_get_octave`` extracts from the same field.

    :return: ``(note, octave)``, or None when ``word`` contains no underscore.
    """
    parts = word.split("_")
    if len(parts) <= 1:
        return None
    pitch = parts[1]
    letters = ''.join(ch for ch in pitch if not ch.isdigit())
    return (letters, _get_octave(pitch))
    
def _get_duration(word):
    fields = word.split("_")
    if len(fields) > 1:
        return fields[2]
    else:
        return None
        
def _get_octave(s):
    m = re.search(r'\d+$', s)
    return int(m.group()) if m else None

def _get_fifth(note, octave):
    """Return the note seven ``NOTE_NAMES`` positions above ``note``.

    When the shifted position runs past the end of ``NOTE_NAMES`` it wraps
    around and the octave is incremented by one.

    :param note: a name present in ``NOTE_NAMES``.
    :param octave: octave number of ``note``.
    :return: ``(note, octave)`` of the shifted pitch.
    """
    raised = NOTE_NAMES.index(note) + 7
    if raised >= len(NOTE_NAMES):
        raised %= 12
        octave += 1
    return (NOTE_NAMES[raised], octave)

def _get_fifth_word(word):
    """Build the token for the same instrument and duration, pitched a fifth higher.

    :param word: token of the form ``<instrument>_<note><octave>_<duration>``.
    :return: the transposed token, or None when ``word`` cannot be parsed into
        an instrument, a note listed in ``NOTE_NAMES`` with an octave number,
        and a duration.
    """
    instrument = _get_instrument(word)
    duration = _get_duration(word)
    note_and_octave = _get_note_and_octave(word)
    if instrument is None or duration is None or note_and_octave is None:
        return None

    note, octave = note_and_octave
    # _get_fifth raises ValueError for a note missing from NOTE_NAMES and
    # TypeError when it has to increment a missing octave; treat such tokens
    # as unparseable instead of aborting the caller's loop.
    if note not in NOTE_NAMES or octave is None:
        return None

    fifth_note, fifth_octave = _get_fifth(note, octave)
    return instrument + "_" + fifth_note + str(fifth_octave) + "_" + str(duration)

def get_fifth_score(model):
    """Average similarity between each vocabulary word and its fifth-transposed word.

    Words without an underscore and words whose first field is ``percussion``
    are skipped, as are words whose transposed counterpart is not in the
    vocabulary.

    :param model: trained model exposing word vectors as ``model.wv``.
    :return: mean of ``model.wv.similarity(word, fifth)`` over all scored pairs,
        or ``nan`` when no pair could be scored.
    """
    keyed_vectors = model.wv
    # gensim >= 4 exposes the vocabulary as `key_to_index`; older releases use `vocab`.
    if hasattr(keyed_vectors, "key_to_index"):
        vocab = keyed_vectors.key_to_index
    else:
        vocab = keyed_vectors.vocab

    scores = []
    for word in vocab:
        fields = word.split("_")
        if len(fields) > 1 and fields[0] != "percussion":
            fifth = _get_fifth_word(word)
            if fifth in vocab:
                scores.append(keyed_vectors.similarity(word, fifth))

    # Without this guard an empty score list raised ZeroDivisionError.
    if not scores:
        return float("nan")
    return sum(scores) / float(len(scores))

In [6]:
# Accumulates one dict per evaluated parameter combination (its settings plus "score").
# Kept in its own cell, so re-running the search cell appends to it rather than resetting it.
eval_result_rows = []

In [7]:
# Hyper-parameter grid: every key maps to the list of values to try.
params = {
    'doc2vec_dm': [1],
    'doc2vec_dm_mean': [1],
    'doc2vec_epochs': [2,4,8,16],
    'doc2vec_hs': [0],
    'doc2vec_learning_rate_start': [0.025],
    # NOTE(review): the end rate (0.2) is larger than the start rate (0.025), so the
    # learning rate handed to model.train() would rise during training - confirm intended.
    'doc2vec_learning_rate_end': [0.2],
    'doc2vec_min_count': [16,24,32],
    'doc2vec_negative': [32,64,128],
    'doc2vec_vector_size': [20],
    'doc2vec_window': [1]
}

# Directory that receives one saved model per combination.
model_dir = "doc2vec"
# Set to a small integer for a quick smoke test; None evaluates the whole grid.
max_runs = None

# Expand the grid into every (key, value) assignment, in a stable key order.
values = [[(key, option) for option in options] for (key, options) in sorted(params.items())]
combinations = list(itertools.product(*values))
print("Defined {} combinations".format(len(combinations)))

# Randomise the visiting order so an interrupted run still samples the grid evenly.
shuffle(combinations)

# Make sure the output directory exists before spending time on training.
os.makedirs(model_dir, exist_ok=True)

for combination in combinations[:max_runs]:
    combination = dict(combination)
    model_id = get_id(combination)
    print(model_id)
    model = train(combination)
    # Persist before scoring so a failure during evaluation cannot discard the model.
    model.save(os.path.join(model_dir, model_id))
    score = get_fifth_score(model)
    combination["score"] = score
    print("- score: {}".format(score))
    eval_result_rows.append(combination)
    del model

Defined 36 combinations
dm_1-dmmean_1-epochs_8-hs_0-learningrateend_0.2-learningratestart_0.025-mincount_32-negative_128-vectorsize_20-window_1
Training encoder model...


KeyboardInterrupt: 

In [None]:
# One row per evaluated combination: its parameter values plus the resulting score.
df = pd.DataFrame(data=eval_result_rows)
df

In [None]:
# Median score for each tested number of negative samples.
variable = "doc2vec_negative"

grouped = (
    df.loc[:, [variable, 'score']]
    .groupby(variable, as_index=False)
    .median()
)
sns.pointplot(data=grouped, x=variable, y='score')

In [None]:
# Median score for each tested epoch count.
variable = "doc2vec_epochs"

grouped = (
    df.loc[:, [variable, 'score']]
    .groupby(variable, as_index=False)
    .median()
)
sns.pointplot(data=grouped, x=variable, y='score')

In [None]:
# Median score for each tested minimum word count.
variable = "doc2vec_min_count"

grouped = (
    df.loc[:, [variable, 'score']]
    .groupby(variable, as_index=False)
    .median()
)
sns.pointplot(data=grouped, x=variable, y='score')

In [None]:
import seaborn as sns
%matplotlib inline

heatmap_x = "doc2vec_negative"
heatmap_y = "doc2vec_min_count"

#filtered = df[df['doc2vec_window'].astype(int) == 1]
grouped = df.groupby([heatmap_y, heatmap_x], as_index=False).median()
pivoted = grouped.pivot(heatmap_y, heatmap_x, "score")
sns.heatmap(pivoted, annot=False, fmt="g", cmap='viridis')