In [5]:
from sklearn.neural_network import MLPClassifier
import torch
from torch import nn
import numpy as np
import csv

# canonical order
alphabet = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
# vowels in front (non-iotating then iotating) -- might make for easier insight into syllable structure (also no yo)
alphabet_sorted = 'аоуыэиеюябвгджзйклмнпрстфхцчшщъь'

# Parallel lists: words[k] is the letter encoding of dictionary entry k and
# stresses[k] is its 0/1 stress mask over the same feature_count positions.
words = []
stresses = []

feature_count = 37 # largest entry + 5

# The dictionary holds Cyrillic text, so the encoding is pinned rather than left
# to the platform default (assumes the CSV is UTF-8 -- TODO confirm);
# newline='' is what the csv module asks for when it is handed a file object.
with open('russian3 - words.csv', encoding='utf-8', newline='') as openrussian_dictionary_file:
    for word in csv.DictReader(openrussian_dictionary_file):
        features = [0] * feature_count
        stress = [0] * feature_count

        # Pad the *start* of the word--it's like representing a "null prefix" if anything
        # NOTE(review): assumes no 'bare' entry is longer than feature_count; a longer
        # one would produce negative indices here -- confirm against the data.
        first_slot = feature_count - len(word['bare'])

        for offset, letter in enumerate(word['bare']):
            if letter == 'ё': # don't let the model cheat :P
                letter = 'е'
            # str.find returns -1 for anything outside alphabet_sorted, so
            # hyphen or whatever else -> 0
            features[first_slot + offset] = alphabet_sorted.find(letter) + 1

        i = first_slot
        # Can have stress in multiple places -- don't think about it, just let the model be confused
        for character in word['accented']:
            if character == "'":
                # The apostrophe follows the stressed letter: mark the previous
                # slot and do not advance, since it occupies no slot of its own.
                stress[i - 1] = 1
                continue
            if character == 'ё':
                # 'ё' is marked as stressed without needing an apostrophe.
                stress[i] = 1
            i += 1

        words.append(features)
        stresses.append(stress)


        
class StressNet(nn.Module):
    """Fully connected network with feature_count inputs and feature_count outputs.

    Layer order: Linear(feature_count, 128) -> Dropout -> ReLU ->
    Linear(128, 128) -> Dropout -> ReLU -> Linear(128, feature_count).
    The final layer has no activation, so forward() returns raw scores.
    """

    def __init__(self):
        super().__init__()
        hidden_width = 128
        self.flatten = nn.Flatten()
        # Built as a list first; the Sequential indices (0..6) are unchanged.
        layers = [
            nn.Linear(feature_count, hidden_width),
            nn.Dropout(),
            nn.ReLU(),
            nn.Linear(hidden_width, hidden_width),
            nn.Dropout(),
            nn.ReLU(),
            nn.Linear(hidden_width, feature_count),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten ``x`` past the batch dimension and run it through the stack."""
        flattened = self.flatten(x)
        return self.linear_relu_stack(flattened)
    
# Use the GPU when PyTorch reports one is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f'On {device}')

# Build the network and move its parameters onto the selected device.
model = StressNet().to(device)

def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over every batch in ``dataloader``.

    Each (X, y) batch is moved to the module-level ``device``, the loss
    ``loss_fn(model(X), y)`` is computed, and one optimizer step is taken.
    On every 100th batch (including the first) the batch loss is printed along
    with ``batch_index * len(X)`` and the dataset size.

    Args:
        dataloader: iterable of (X, y) batches that exposes a ``dataset``
            attribute supporting ``len()``.
        model: module to optimise; put into training mode before the loop.
        loss_fn: callable taking (prediction, target) and returning a loss
            tensor that supports ``backward()`` and ``item()``.
        optimizer: optimizer providing ``zero_grad()`` and ``step()``.
    """
    total_samples = len(dataloader.dataset)
    model.train()
    for step, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass and prediction error
        batch_loss = loss_fn(model(inputs), targets)

        # Clear stale gradients, backpropagate, then update the parameters
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Only report progress on every 100th batch
        if step % 100 != 0:
            continue
        loss_value = batch_loss.item()
        seen = step * len(inputs)
        print(f"loss: {loss_value:>7f}  [{seen:>5d}/{total_samples:>5d}]")

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

On cuda


In [6]:
# NOTE(review): train_dataloader, test_dataloader, loss_fn and optimizer are not
# defined in any visible cell -- they must exist before this cell can run.
epoch_count = 5
for epoch_number in range(1, epoch_count + 1):
    print(f"Epoch {epoch_number}\n-------------------------------")
    # One training pass followed by one evaluation pass per epoch.
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)

Epoch 1
-------------------------------


NameError: name 'train_dataloader' is not defined

Epoch 1
-------------------------------


NameError: name 'train' is not defined

In [5]:
    
# Originally planned on a CNN, but there's almost nothing useful about the spatial properties on small pieces of 1d data

# tune alpha later

# random_state pins the otherwise random weight initialisation and shuffling so
# that re-running the cell reproduces the same fit and the same score.
classifier = MLPClassifier(hidden_layer_sizes=(200,200,200), max_iter=300, random_state=0)
classifier.fit(words, stresses)

# NOTE(review): this scores on the same data the model was fitted on, so the
# printed number is a training-set figure, not a held-out one.
print(classifier.score(words, stresses))



0.8743073749930319


In [7]:
# Each element of coefs_ is a whole layer's weight matrix (per the scikit-learn
# docs), so print(*matrix) writes all of that matrix's rows, space-separated,
# in a single print call. The output is very large.
for layer_weights in classifier.coefs_:
    print(*layer_weights)

[-2.14166161e-315  2.89617646e-315  9.25788122e-316 -8.30428947e-316
 -1.28246052e-315  3.75047225e-315 -2.30643233e-316  2.32248650e-316
  3.26031514e-316 -4.20108911e-315  2.18630358e-316 -3.61313374e-316
  3.24579003e-315 -5.03194447e-316 -2.26912904e-316 -1.18208263e-315
 -3.91251839e-315  9.76751710e-316  5.85167008e-316 -8.52797695e-316
 -7.22419951e-316  4.03866071e-315  1.49960565e-315 -9.16177982e-316
 -3.43027427e-315 -3.90234203e-316  2.56587080e-315  8.41631826e-316
  1.92454083e-316 -1.50011333e-315 -2.14004119e-315 -2.59406331e-316
  3.33162271e-315  3.96217839e-315  1.52499075e-315 -3.82708219e-315
 -3.38158051e-315  2.28650085e-315  1.57575165e-315  1.14602759e-315
 -3.31217469e-316 -9.99480874e-316 -2.41162822e-315 -1.39159417e-315
 -1.25100630e-315 -1.65431375e-316 -2.64396432e-315 -5.49600680e-316
 -3.03466325e-316 -1.01610404e-315 -6.86300388e-316 -1.60999938e-315
 -3.00115370e-315  2.50022453e-315  1.52381752e-315  2.16723590e-316
 -7.28694205e-316 -1.40651276e-315

  1.28516212e-001  3.64354233e-001 -4.71725718e-001 -1.60308208e+000] [-1.82340585e-001  1.65145520e-001  4.48032850e-002  2.18304067e-001
  3.78845742e-002  1.15820007e-003  2.29002920e-001 -6.94666954e-002
  1.03369569e-001  3.22387124e-001 -4.76831713e-001  3.62157101e-001
 -8.33830300e-002  7.38198291e-002  2.24953159e-001  9.73047170e-002
  9.53622012e-002 -2.43062722e-002 -8.44512865e-002 -1.53566214e-001
 -3.67290571e-001 -5.33333104e-001 -1.25100079e-002  2.22935776e-001
 -1.17514211e-315 -2.04984899e-001 -9.19962724e-002 -2.71155166e-315
  2.18704202e-001  5.48334907e-002  1.45542395e-001  8.34305142e-002
  1.33076963e-001 -1.18194071e-001  6.80513279e-002 -3.64649028e-001
 -1.14488085e-001 -5.82810496e-001  2.56180911e-001 -1.03740349e-002
  1.24880302e-001  2.05421510e-001 -1.08930100e-001  3.59070119e-003
  3.29578197e-315  4.06986350e-002 -8.36564121e-002 -1.72623498e-001
  7.57684194e-002 -3.85367050e-001  4.41322192e-002 -8.46761883e-002
  1.35958758e-001  9.55161037e-00