In [1]:
import operator
import random
from itertools import groupby, islice

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import TimeSeriesSplit

In [2]:
def bar_chart(notes, freq, set_X):
    """Show a bar chart with one bar per entry of ``set_X``.

    Parameters
    ----------
    notes
        Not used by the body; kept so existing calls keep working.
    freq
        Bar heights, matched by position with ``set_X``. Must be non-empty,
        because ``max(freq)`` sets the upper end of the y ticks.
    set_X
        Tick labels for the x axis; its length fixes the number of bar slots.
    """
    bar_width = 0.75
    tick_step = 100
    positions = np.arange(len(set_X))  # x locations of the bars

    figure = plt.figure()
    # Axes rectangle given as [left, bottom, width, height] fractions.
    axes = figure.add_axes([0, 0, 1, 1])
    axes.bar(positions, freq, bar_width)

    axes.set_title('Frequency by note for a voice')
    axes.set_xlabel('Notes')
    axes.set_ylabel('Freq')

    plt.xticks(positions, set_X)
    # One y tick every `tick_step`, reaching past the tallest bar.
    axes.set_yticks(np.arange(0, max(freq) + tick_step, tick_step))
    plt.show()

In [3]:
# Read the tab-separated score; the file has no header row, so the columns
# receive integer labels.
dat = pd.read_csv("F.txt", delimiter="\t", header=None)
dat

Unnamed: 0,0,1,2,3
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0
...,...,...,...,...
3819,0,0,49,0
3820,0,0,51,0
3821,0,0,51,0
3822,0,0,54,0


In [None]:
# Pick the 4th voice (column label 3) and list the distinct notes it contains.
X = [note for note in dat[3]]
set_X = [*set(X)]
print(X[:10])
print(len(set_X), set_X)

In [None]:
# Count how many entries of the voice carry each distinct note, then plot it.
notes = [str(note) for note in set_X]
freq = [X.count(note) for note in set_X]

bar_chart(notes, freq, set_X)

In [None]:
# Per note of the voice: the lengths of its runs of consecutive repeats
# ("range"), plus the mode, the mean rounded to an even number, and the
# empirical probability of each run length ("pd").
len_dict = {}

# groupby yields one group per maximal run of equal consecutive notes, so the
# final run of the series is recorded as well (a manual "append on change"
# loop never flushes it).
for note, run in groupby(X):
    len_dict.setdefault(note, []).append(sum(1 for _ in run))

for k, v in len_dict.items():
    # Mean run length, rounded to the nearest integer and bumped to the next
    # even number when odd.
    num = int(round(float(np.mean(v))))
    if num % 2 != 0:
        num += 1
    # Probability of each distinct run length (counted once per length).
    d = {length: round(v.count(length) / len(v), 2) for length in dict.fromkeys(v)}
    # Depending on the SciPy version, stats.mode returns the mode as a
    # one-element array or as a scalar; atleast_1d handles both.
    mode = np.atleast_1d(stats.mode(v).mode)[0]
    len_dict[k] = {"range": v, "mode": mode, "mean": num, "pd": d}

# example note: 0
print(len_dict[0])

In [None]:
# Collapse every run of repeated notes into a single occurrence: keep a note
# only when it is the first element or differs from its predecessor.
X_one = [note for pos, note in enumerate(X) if pos == 0 or note != X[pos - 1]]

print(X_one[:10])
# From here on X refers to the collapsed series.
X = X_one

In [None]:
# Same count as before, now on the collapsed series: how often each distinct
# note occurs once runs have been reduced to length 1.
notes = [str(note) for note in set_X]
freq = [X.count(note) for note in set_X]

bar_chart(notes, freq, set_X)

In [None]:
# One-hot encode every note: a vector of len(set_X) zeros with a single 1 at
# the note's position in set_X.
y = []
for note in X:
    hot = set_X.index(note)
    row = [0] * len(set_X)
    row[hot] = 1
    y.append(row)
y[0]

In [None]:
# One-step-ahead pairs: the note at position t is the input, the one-hot
# vector of the note at t+1 is the target.
y_t1 = np.array(y[1:])
X_t0 = np.array(X[:-1]).reshape(-1, 1)  # single feature column
print(X_t0[:3])
print(y_t1[:2])

In [None]:
# Fit a linear model from the current note to the one-hot scores of the next.
reg = LinearRegression().fit(X=X_t0, y=y_t1)

In [None]:
# Sanity check: score vector predicted for the last training input.
reg.predict(X_t0[-1:])[0]

In [None]:
# Example: predict from the training input at index 49, take the highest
# score, and map its position back to the note stored at that index in set_X.
yhat = reg.predict([X_t0[49]])
first_row = yhat.tolist()[0]
pred = set_X[first_row.index(max(first_row))]
pred

In [4]:
def take(n, iterable):
    """Collect at most the first ``n`` items of ``iterable`` into a list."""
    head = islice(iterable, n)
    return [*head]

In [5]:
def pick_rand_note(set_X, yhat, n=5):
    """Return a note drawn uniformly at random from the ``n`` best-scored notes.

    Parameters
    ----------
    set_X
        Candidate notes; position k is paired with score k.
    yhat
        Prediction output; only its first row (``yhat[0]``) is used as scores.
    n
        How many of the highest-scoring notes to draw from.
    """
    scores = dict(zip(set_X, yhat[0].tolist()))
    # Highest score first; ties keep their original relative order.
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    candidates = list(islice(ranked, n))
    note, _score = random.choice(candidates)
    return note

In [None]:
# Predict t+1 note n times from t0
preds = [X_t0[-1].tolist()]
for i in range(50):
    yhat = reg.predict([preds[-1]])
    # Pick note, add it
    preds.append([pick_rand_note(set_X, yhat, n=10)])
preds = [p[0] for p in preds]
set(preds), preds[:10]

In [None]:
# Stretch every predicted note over a duration drawn from the run lengths
# recorded for it. Drawing uniformly from the raw list makes frequent
# lengths proportionally more likely.
output = []
for note in preds:
    duration = random.choice(len_dict[note]["range"])
    output += [note] * duration
len(output), output[:10]

In [None]:
# Wrap the generated sequence in a single-column frame for display.
df = pd.DataFrame(data=output)
df

In [None]:
# Cross validation for time series: every fold trains on a prefix of the
# series and tests on the block that follows it.
tscv = TimeSeriesSplit()
print(tscv)

fold_accuracies = []

for train_index, test_index in tscv.split(X_t0):
    print(f"TRAIN: [{train_index[0]} ... {train_index[-1]}] TEST: [{test_index[0]} ... {test_index[-1]}]")
    X_train, X_test = X_t0[train_index], X_t0[test_index]
    y_train, y_test = y_t1[train_index], y_t1[test_index]

    # Use a fold-local estimator so the model fitted on the full series
    # (`reg`) is not overwritten by the last fold.
    fold_reg = LinearRegression().fit(X_train, y_train)
    yhat_vec = fold_reg.predict(X_test)

    # The model outputs one score per class; the predicted class is the
    # arg-max, compared against the position of the 1 in the one-hot target.
    fold_acc = accuracy_score(y_test.argmax(axis=1), yhat_vec.argmax(axis=1))
    fold_accuracies.append(fold_acc)
    print(f"  accuracy: {fold_acc:.3f}")

avg_acc = float(np.mean(fold_accuracies))
print(f"Average accuracy over {len(fold_accuracies)} folds: {avg_acc:.3f}")

In [6]:
def get_voice_notes_ranges(X):
    """Collect, for every note, the lengths of its runs of consecutive repeats.

    Parameters
    ----------
    X : iterable
        Notes of one voice in time order.

    Returns
    -------
    dict
        Maps each distinct note to the list of its run lengths, in order of
        occurrence. The final run of the series is included, so every note
        present in ``X`` has at least one entry. Empty input gives ``{}``.
    """
    range_dict = {}
    # groupby yields one group per maximal run of equal consecutive items.
    for note, run in groupby(X):
        range_dict.setdefault(note, []).append(sum(1 for _ in run))
    return range_dict

def add_mode_mean_pd(notes_ranges_dict):
    """Replace each note's run-length list with a summary dict, in place.

    Parameters
    ----------
    notes_ranges_dict : dict
        Maps a note to a list of run lengths. It is modified in place.

    Returns
    -------
    dict
        The same dict object. Each value becomes
        ``{"range": lengths, "mode": ..., "mean": ..., "pd": {...}}`` where
        "mean" is the rounded mean bumped to the next even number when odd,
        and "pd" maps each distinct length to its relative frequency rounded
        to two decimals. A note with no lengths gets mode ``None``, mean 0
        and an empty "pd".
    """
    for k, v in notes_ranges_dict.items():
        if len(v) == 0:
            # Nothing to summarise; avoid taking the mean or mode of an empty list.
            notes_ranges_dict[k] = {"range": v, "mode": None, "mean": 0, "pd": {}}
            continue

        num = int(round(float(np.mean(v))))
        if num % 2 != 0:
            num += 1

        # Count each distinct length once instead of once per element.
        d = {length: round(v.count(length) / len(v), 2) for length in dict.fromkeys(v)}

        # Depending on the SciPy version, stats.mode returns the mode as a
        # one-element array or as a scalar; atleast_1d handles both.
        mode = np.atleast_1d(stats.mode(v).mode)[0]

        notes_ranges_dict[k] = {"range": v, "mode": mode, "mean": num, "pd": d}
    return notes_ranges_dict

In [7]:
def series_to_one(X):
    """Collapse every run of equal consecutive notes into a single note.

    An element is kept when it is the first one or differs from the element
    right before it; the result is returned as a new list.
    """
    return [note for pos, note in enumerate(X) if pos == 0 or note != X[pos - 1]]

In [8]:
def to_one_hot(X, classes=None):
    """Encode each element of ``X`` as a one-hot list.

    Parameters
    ----------
    X : iterable
        Values to encode.
    classes : list, optional
        Ordered list of distinct values; the position of a value in this list
        is the position of the 1 in its vector. When omitted, the
        notebook-level ``set_X`` is used, which is what the function relied
        on before this parameter existed.

    Returns
    -------
    list of list of int
        One vector of length ``len(classes)`` per element of ``X``.

    Raises
    ------
    ValueError
        If an element of ``X`` does not occur in ``classes``.
    """
    if classes is None:
        # Backward-compatible fallback to the global list of distinct notes.
        classes = set_X
    width = len(classes)
    y = []
    for x in X:
        hot = classes.index(x)
        row = [0] * width
        row[hot] = 1
        y.append(row)
    return y

In [35]:
# Generate notes for each voice

N_STEPS = 410  # rows kept per generated voice
TOP_N = 10     # next note is sampled among this many best-scoring candidates

gen_voices_dict = {}

for i in dat.keys():
    # Select voice, get set of notes
    X = list(dat[i])
    set_X = list(set(X))

    # get ranges of the notes
    len_dict = get_voice_notes_ranges(X)

    # convert series of notes of length n to 1
    X = series_to_one(X)

    # n dimensional one hot vectors
    y = to_one_hot(X)

    # Sliding window with one-step forecasting
    X_t0 = np.array(X[:-1]).reshape(-1, 1)
    y_t1 = np.array(y[1:])

    reg = LinearRegression().fit(X_t0, y_t1)

    # Walk the chain from the last training input: emit the current note for a
    # sampled duration, then predict its successor. Generation continues until
    # the voice holds N_STEPS rows; a fixed number of predictions can fall
    # short, which leaves columns of unequal length and makes the DataFrame
    # construction below fail.
    gen_voice = []
    note = X_t0[-1].tolist()[0]
    while len(gen_voice) < N_STEPS:
        # Drawing uniformly from the observed run lengths weights each
        # duration by how often it occurred. A note without any recorded run
        # falls back to a single step instead of crashing random.choice.
        durations = len_dict[note] or [1]
        gen_voice.extend([note] * random.choice(durations))

        yhat = reg.predict([[note]])
        note = pick_rand_note(set_X, yhat, n=TOP_N)
    gen_voices_dict[i] = gen_voice[:N_STEPS]

df = pd.DataFrame(gen_voices_dict)
df

Unnamed: 0,0,1,2,3
0,68,57,51,42
1,68,57,51,42
2,69,52,53,42
3,69,52,53,42
4,66,52,56,49
...,...,...,...,...
405,64,59,54,49
406,64,59,54,49
407,64,59,54,49
408,64,59,54,49


In [36]:
# List the distinct notes that ended up in each generated voice.
for voice, column in df.items():
    print(voice, set(column))

0 {64, 65, 66, 68, 69, 71, 59, 61, 62, 63}
1 {64, 66, 52, 54, 56, 57, 58, 59, 61, 62, 63}
2 {0, 42, 44, 45, 46, 47, 49, 50, 51, 52, 53, 54, 56}
3 {0, 35, 37, 38, 39, 40, 42, 43, 44, 45, 46, 47, 49}


In [37]:
# Write the generated voices as tab-separated values, without header row or
# index column.
df.to_csv("F_410.txt", sep="\t", header=False, index=False)