In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score

In [2]:
def bar_chart(notes, freq, set_X, steps=100):
    """Show a bar chart with one bar per entry of ``set_X``.

    Parameters
    ----------
    notes : sequence
        Not referenced by the plot; kept so existing calls keep working.
    freq : sequence of numbers
        Bar heights, given in the same order as ``set_X``.
    set_X : sequence
        Values used as x tick labels; its length sets the number of bars.
    steps : number, optional
        Spacing between y-axis ticks. Defaults to 100, the value that was
        previously fixed inside the function.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    N = len(set_X)
    ind = np.arange(N)  # the x locations for the groups

    fig = plt.figure()
    ax = fig.add_axes([0, 0, 1, 1])  # axes rectangle spans the whole figure

    ax.bar(ind, freq, 0.75)

    ax.set_ylabel('Freq')
    ax.set_xlabel('Notes')
    ax.set_title('Frequency by note for a voice')

    # Label every bar through the Axes object rather than the pyplot state machine
    ax.set_xticks(ind)
    ax.set_xticklabels(set_X)

    # Ticks start at 0 and reach at least the tallest bar;
    # default=0 keeps an empty ``freq`` from raising in max()
    ax.set_yticks(np.arange(0, max(freq, default=0) + steps, steps))
    plt.show()

In [3]:
def to_one_hot(X):
    """One-hot encode a sequence of hashable values.

    Parameters
    ----------
    X : iterable of hashable
        Values to encode.

    Returns
    -------
    X_one_hot : list of list of int
        One vector per element of ``X``; it holds a single 1 at the index of
        that element's category and 0 everywhere else.
    set_X : list
        The distinct values of ``X`` in order of first appearance. Index ``k``
        of this list corresponds to position ``k`` of every one-hot vector.
    """
    X = list(X)  # materialise once so one-shot iterables can be walked twice

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # category order no longer depends on set/hash iteration order
    set_X = list(dict.fromkeys(X))

    # value -> column index, replacing a linear set_X.index() scan per element
    position = {value: k for k, value in enumerate(set_X)}
    width = len(set_X)

    X_one_hot = []
    for x in X:
        one_hot = [0] * width
        one_hot[position[x]] = 1
        X_one_hot.append(one_hot)

    return X_one_hot, set_X

In [4]:
def from_one_hot(y_vec, set_X):
    """Map score (or one-hot) vectors back to their category values.

    Parameters
    ----------
    y_vec : iterable of 1-D sequences
        One vector per sample; rows may be numpy arrays or plain lists.
    set_X : sequence
        Category values, indexed by vector position.

    Returns
    -------
    list
        For every row, the entry of ``set_X`` at the position of the row's
        largest value. When several positions share the maximum, the first
        one is used.
    """
    # np.argmax returns the first index of the maximum; int() turns the numpy
    # integer into a plain index for set_X
    return [set_X[int(np.argmax(row))] for row in y_vec]

In [5]:
# Load the tab-separated note table. header=None means the first line of the
# file is treated as data and the columns get integer labels.
dat = pd.read_csv("F.txt", header=None, delimiter="\t")
dat

Unnamed: 0,0,1,2,3
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0
...,...,...,...,...
3819,0,0,49,0
3820,0,0,51,0
3821,0,0,51,0
3822,0,0,54,0


In [6]:
# Number of trailing notes of each voice that are fed to the model for the
# exported predictions
N_PREDICT = 410

predictions = {}

# Iterate over the actual column labels instead of assuming they are 0..n-1
for i in dat.columns:

    X = list(dat[i])
    X_vec, set_X = to_one_hot(X)

    # Sliding window with one-step forecasting: the encoded note at position t
    # is the input, the encoded note at position t+1 is the target
    X_t0 = np.array(X_vec[:-1])
    y = np.array(X_vec[1:])

    # Cross validation for time series
    tscv = TimeSeriesSplit()

    avg_acc = 0

    for train_index, test_index in tscv.split(X_t0):
        X_train, X_test = X_t0[train_index], X_t0[test_index]
        y_train, y_test = y[train_index], y[test_index]

        reg = LinearRegression().fit(X_train, y_train)
        y_pred_vec = reg.predict(X_test)

        # Decode predicted score vectors and one-hot targets back to notes
        y_pred = from_one_hot(y_pred_vec, set_X)
        y_true = from_one_hot(y_test, set_X)

        avg_acc += accuracy_score(y_true, y_pred)

    # Mean accuracy over all folds for this column
    print(avg_acc/tscv.n_splits)

    # One-step prediction for each of the last N_PREDICT notes: every output
    # row is the model's guess for the note following the corresponding input
    # note. This is not a recursive multi-step forecast.
    # NOTE(review): `reg` is the estimator fitted in the last CV fold above;
    # it is not refit on the full series -- confirm that this is intended.
    pred_vec = reg.predict(X_vec[-N_PREDICT:])
    pred = from_one_hot(pred_vec, set_X)

    predictions[i] = pred

# Same name as before for the resulting frame so later cells keep working
dat2 = pd.DataFrame(predictions)
dat2

0.7836734693877551
0.721193092621664
0.8100470957613816
0.8072213500784929


Unnamed: 0,0,1,2,3
0,61,57,52,42
1,61,57,52,42
2,61,54,51,47
3,61,54,51,47
4,61,54,51,47
...,...,...,...,...
405,0,0,49,0
406,0,0,51,0
407,0,0,51,0
408,0,0,54,0


In [7]:
# Write the predictions as a bare tab-separated table: no header row and no
# index column
dat2.to_csv("F_pred.txt", index=False, header=False, sep="\t")

In [None]:
# Column 3 holds the 4th voice; pull its notes into a plain list
X = [note for note in dat[3]]
print(X[:10])

In [None]:
# One-hot encode the selected voice. set_X holds the distinct notes, and the
# index of a note in set_X is the position of the 1 in its one-hot vector.
X_vec, set_X = to_one_hot(X)
# Number of distinct notes, followed by the notes themselves
print(len(set_X), set_X)
# One-hot vector of the first note, shown as the cell output
X_vec[0]

In [None]:
# String label and occurrence count for every distinct note of the voice,
# both in the order of set_X
notes = [str(x) for x in set_X]
freq = [X.count(x) for x in set_X]

bar_chart(notes, freq, set_X)

In [None]:
# One-step-ahead framing: every encoded note except the last is an input,
# and the encoded note right after it is the target
X_t0, y = np.array(X_vec[:-1]), np.array(X_vec[1:])
print(X_t0[0], y[0], sep="\n")

In [None]:
# Fit a linear model on every (input, target) row of X_t0 / y at once,
# i.e. without any train/test split
reg = LinearRegression().fit(X_t0, y)

In [None]:
# Sanity check: predict from the last training row only
# (the one-row slice stays 2-D, which is what predict receives)
reg.predict(X_t0[-1:])

In [None]:
# Quick way to get from a predicted score vector back to a note:
#   1. find the highest score in the vector
#   2. look up the note stored at that same index in set_X
yhat = reg.predict([X_t0[49]])
scores = yhat.tolist()[0]
best_index = scores.index(max(scores))
set_X[best_index]

In [None]:
# Time-series cross validation for the selected voice, printing each fold's
# index range and accuracy followed by the mean accuracy
tscv = TimeSeriesSplit()
print(tscv)

avg_acc = 0

for train_index, test_index in tscv.split(X_t0):
    print(f"TRAIN: [{train_index[0]} ... {train_index[-1]}] TEST: [{test_index[0]} ... {test_index[-1]}]")

    # Fit on this fold's training rows, then score its test rows
    reg = LinearRegression()
    reg.fit(X_t0[train_index], y[train_index])
    y_pred_vec = reg.predict(X_t0[test_index])

    # Decode predictions and targets back to notes before comparing them
    y_pred = from_one_hot(y_pred_vec, set_X)
    y_true = from_one_hot(y[test_index], set_X)

    fold_acc = accuracy_score(y_true, y_pred)
    print(fold_acc)
    avg_acc += fold_acc

print(avg_acc/tscv.n_splits)

In [None]:
# One-step predictions for the last 410 input rows, decoded back to notes.
# NOTE(review): this slices X_t0, which leaves out the final note of the
# voice, whereas the per-voice loop earlier in the notebook slices
# X_vec[-410:] -- confirm which window is intended.
last_inputs = X_t0[-410:]
pred_vec = reg.predict(last_inputs)
print(len(pred_vec))
pred = from_one_hot(pred_vec, set_X)
pred