In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score

In [2]:
# F.txt is parsed as tab-delimited text with no header row, so pandas labels
# the columns with integers starting at 0.
dat = pd.read_csv("F.txt", delimiter="\t", header=None)
dat

Unnamed: 0,0,1,2,3
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0
...,...,...,...,...
3819,0,0,49,0
3820,0,0,51,0
3821,0,0,51,0
3822,0,0,54,0


In [3]:
# Column 3 is the 4th voice; pull it out as a plain Python list.
X = dat[3].tolist()
print(X[:10])

[0, 0, 0, 0, 0, 0, 0, 0, 42, 42]


In [4]:
# Vocabulary of distinct values in X. Sorting makes the column <-> note mapping
# explicit and reproducible instead of depending on set iteration order, which
# the language does not specify.
set_X = sorted(set(X))
print(len(set_X))

# Conversion to n dimensional one hot vector.
# A dict gives constant-time note -> column lookup (list.index rescans the
# vocabulary for every sample).
note_to_index = {note: idx for idx, note in enumerate(set_X)}
n_notes = len(set_X)

X_vec = []
for x in X:
    vec = [0] * n_notes
    vec[note_to_index[x]] = 1  # single hot position for this sample's note
    X_vec.append(vec)
X_vec[0]

26


[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [5]:
# One-step-ahead pairs with a window of one: the encoding at step t is the
# input (X_t0) and the encoding at step t+1 is the target (y).
X_t0, y = (np.array(part) for part in (X_vec[:-1], X_vec[1:]))
print(X_t0[0], y[0], sep="\n")

[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [6]:
# Fit a linear regression on every (input, next-step target) pair.
reg = LinearRegression().fit(X=X_t0, y=y)

In [7]:
# Sanity check: predict from the final row of X_t0
# (sliced rather than indexed so the input stays 2-D).
reg.predict(X_t0[-1:])

array([[ 9.59716797e-01, -7.28583860e-17, -1.21430643e-16,
        -2.12503626e-17, -2.08166817e-16, -6.93889390e-18,
         3.26442719e-03, -6.33174069e-17,  3.29589844e-03,
         3.26633453e-03, -7.97972799e-17,  5.29479980e-03,
         1.69372559e-03,  7.08007812e-03,  1.84631348e-03,
         3.57055664e-03,  4.89997864e-03,  4.74880552e-17,
         1.65557861e-03,  1.59931183e-03,  2.88391113e-03,
        -1.04904175e-05, -1.14440918e-05, -3.27971157e-18,
        -5.01443505e-18, -5.09913834e-18]])

In [8]:
# Possible (quick/easy) way to trace a prediction back to a note:
# find the position of the largest output value (these are regression scores,
# not true probabilities) and look that position up in set_X, e.g.
yhat = reg.predict(X_t0[49:50])
set_X[int(np.argmax(yhat[0]))]

47

In [9]:
# Cross validation for time series: each split trains on a prefix of the
# sequence and tests on the block that immediately follows it.
tscv = TimeSeriesSplit()
print(tscv)

notes = np.array(set_X)  # column index -> note, used to decode one-hot rows
total_acc = 0

for train_index, test_index in tscv.split(X_t0):
    print(f"TRAIN: [{train_index[0]} ... {train_index[-1]}] TEST: [{test_index[0]} ... {test_index[-1]}]")
    X_train, X_test = X_t0[train_index], X_t0[test_index]
    y_train, y_test_vec = y[train_index], y[test_index]

    # Fold-local model: keeps the notebook-level `reg` (fitted on the full
    # data) from being silently replaced by the last fold's model.
    fold_reg = LinearRegression().fit(X_train, y_train)
    y_pred_vec = fold_reg.predict(X_test)

    # Convert one hot to note: argmax per row picks the first largest entry,
    # matching list.index(max(...)) without the per-row list round-trips.
    y_pred = notes[np.argmax(y_pred_vec, axis=1)]
    y_test = notes[np.argmax(y_test_vec, axis=1)]

    # Score each fold once and reuse the value for printing and averaging.
    fold_acc = accuracy_score(y_test, y_pred)
    print(fold_acc)
    total_acc += fold_acc

avg_acc = total_acc / tscv.n_splits
print(avg_acc)

TimeSeriesSplit(max_train_size=None, n_splits=5)
TRAIN: [0 ... 637] TEST: [638 ... 1274]
0.8006279434850864
TRAIN: [0 ... 1274] TEST: [1275 ... 1911]
0.8649921507064364
TRAIN: [0 ... 1911] TEST: [1912 ... 2548]
0.7692307692307693
TRAIN: [0 ... 2548] TEST: [2549 ... 3185]
0.7645211930926217
TRAIN: [0 ... 3185] TEST: [3186 ... 3822]
0.8367346938775511
0.8072213500784929
