In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import operator
import random
import datetime

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from scipy import stats
from itertools import islice

random.seed(2021)

In [2]:
def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

In [3]:
def pick_rand_note(set_X, yhat, n=5):
    # pick random note from top n notes
    yhat_dict = dict(zip(set_X, yhat[0].tolist()))
    yhat_dict_sorted = dict(sorted(yhat_dict.items(), key=operator.itemgetter(1),reverse=True))
    top_n = take(n, yhat_dict_sorted.items())
    return random.choice(top_n)[0]

In [4]:
def get_voice_notes_ranges(X):
    # Get all note ranges per note of a voice
    range_dict = {}
    c = 0

    # Count the ranges
    for idx, x in enumerate(X):
        if x not in range_dict.keys():
            range_dict[x] = []
        if idx == 0:
            c += 1
        elif x == X[idx-1]:
            c += 1
        else:
            range_dict[X[idx-1]].append(c)
            c = 1
    return range_dict

def add_mode_mean_pd(notes_ranges_dict):
    # Add mode, mean, probabilities
    for k,v in notes_ranges_dict.items():
        num = round(np.mean(v))
        if num % 2 != 0:
            num += 1
        d = {}
        for i in v:
            d[i] = round(v.count(i)/len(v), 2)
        notes_ranges_dict[k] = {"range": v, "mode": stats.mode(v)[0][0], "mean": num, "pd": d}
    return notes_ranges_dict

In [5]:
def series_to_one(X):
    # convert series of notes of length n to 1
    X_one = []

    for idx, x in enumerate(X):
        if idx == 0:
            X_one.append(x)
        elif x != X[idx-1]:
            X_one.append(x)
    return X_one

In [6]:
def to_one_hot(X):
    y = []
    for x in X:
        # Conversion to n dimensional one hot vector
        n = set_X.index(x)
        y.append([1 if i==n else 0 for i in range(len(set_X))])
    return y

In [7]:
def train_cv(X, y):
    # Cross validation for time series
    tscv = TimeSeriesSplit()

    avg_acc = 0

    for train_index, test_index in tscv.split(X):
        print(f"TRAIN: [{train_index[0]} ... {train_index[-1]}] TEST: [{test_index[0]} ... {test_index[-1]}]")
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        reg = LinearRegression().fit(X_train, y_train)

        y_pred_vec_train = reg.predict(X_train)
        y_pred_train = [set_X[_y.tolist().index(max(_y.tolist()))] for _y in y_pred_vec_train]
        y_train_notes = [set_X[_y.tolist().index(max(_y.tolist()))] for _y in y_train]
        print(f"\t#MSE train\t{mean_squared_error(y_train_notes, y_pred_train)} #train error\t{reg.score(X_train, y_train)}")

        y_pred_vec_test = reg.predict(X_test)
        y_pred_test = [set_X[_y.tolist().index(max(_y.tolist()))] for _y in y_pred_vec_test]
        y_test_notes = [set_X[_y.tolist().index(max(_y.tolist()))] for _y in y_test]
        print(f"\t#MSE test\t{mean_squared_error(y_test_notes, y_pred_test)} #test error\t{reg.score(X_test, y_test)}")
    
    return reg

In [8]:
# Generate notes for each voice

dat = pd.read_csv("F.txt", sep="\t", header=None)

gen_voices_dict = {}

for i in dat.keys():
    # Select voice, get set of notes
    X = list(dat[i])
    set_X = list(set(X))
    
    # get ranges of the notes
    len_dict = get_voice_notes_ranges(X)
    
    # convert series of notes of length n to 1
    X = series_to_one(X)
    
    # n dimensional one hot vectors
    y = to_one_hot(X)
    
    # Sliding window with one-step forecasting
    X_t0 = np.array(X[:-1]).reshape(-1, 1)
    y_t1 = np.array(y[1:])
    
#     reg = LinearRegression().fit(X_t0, y_t1)
    reg = train_cv(X_t0, y_t1)
    
    # Predict t+1 note n times from t0
    preds = [X_t0[-1].tolist()]
    for j in range(100):
        yhat = reg.predict([preds[-1]])
        # Pick note, add it
        preds.append([pick_rand_note(set_X, yhat, n=10)])
    preds = [p[0] for p in preds]
    
    # Prolong the notes
    gen_voice = []
    for idx, p in enumerate(preds):
        # Weighted choice
        range_n = [p] * random.choice(len_dict[p])
        gen_voice.extend(range_n)
    gen_voices_dict[i] = gen_voice[:410]

TRAIN: [0 ... 104] TEST: [105 ... 206]
	#MSE train	46.50476190476191 #train error	0.19012145790221685
	#MSE test	11.754901960784315 #test error	0.07463744366292661
TRAIN: [0 ... 206] TEST: [207 ... 308]
	#MSE train	29.381642512077295 #train error	0.10224462544522703
	#MSE test	49.15686274509804 #test error	-0.047961534986492874
TRAIN: [0 ... 308] TEST: [309 ... 410]
	#MSE train	38.80582524271845 #train error	0.01054364315552584
	#MSE test	99.66666666666667 #test error	-0.0416250203495007
TRAIN: [0 ... 410] TEST: [411 ... 512]
	#MSE train	55.78345498783455 #train error	0.007827642602303987
	#MSE test	141.36274509803923 #test error	-0.014995829838405417
TRAIN: [0 ... 512] TEST: [513 ... 614]
	#MSE train	73.39376218323586 #train error	0.006861481787477172
	#MSE test	179.2156862745098 #test error	-0.02435372210823494
TRAIN: [0 ... 124] TEST: [125 ... 246]
	#MSE train	35.272 #train error	0.4149106977981958
	#MSE test	67.80327868852459 #test error	0.3096696350980007
TRAIN: [0 ... 246] TEST: 

In [9]:
df = pd.DataFrame(gen_voices_dict)
df

Unnamed: 0,0,1,2,3
0,68,57,51,42
1,68,57,51,42
2,68,57,51,42
3,68,57,51,42
4,62,57,51,49
...,...,...,...,...
405,62,59,52,37
406,62,52,52,37
407,62,52,51,37
408,62,52,51,37


In [10]:
for i in df.keys():
    print(i, set(df[i]))

0 {64, 65, 66, 67, 68, 69, 71, 59, 61, 62, 63}
1 {64, 66, 52, 54, 56, 57, 58, 59, 61, 62, 63}
2 {44, 45, 47, 49, 50, 51, 52, 53, 54, 55, 56}
3 {35, 37, 38, 40, 42, 43, 44, 45, 46, 47, 49, 50}


In [11]:
# f = f"F_410_{datetime.datetime.now()}.txt"
# df.to_csv("F_410.txt", sep="\t", header=None, index=None)