In [None]:
from sklearn.neural_network import MLPClassifier
import numpy as np
import csv

# canonical order
alphabet = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
# vowels in front (non-iotating then iotating) -- might make for easier insight into syllable structure (also no yo)
alphabet_sorted = 'аоуыэиеюябвгджзйклмнпрстфхцчшщъь'

words = []
stresses = []

with open('russian3 - words.csv') as openrussian_dictionary_file:
    feature_count = 37 # largest entry + 5
    for word in csv.DictReader(openrussian_dictionary_file):
        features = [0 for _ in range(feature_count)]
        stress = [0 for _ in range(feature_count)]
        
        for i, letter in enumerate(word['bare']):
            # Pad the *start* of the word--it's like representing a "null prefix" if anything
            feature_index = i + feature_count - len(word['bare'])
            if letter == 'ё': # don't let the model cheat :P
                letter = 'е'
            features[feature_index] = alphabet_sorted.find(letter) + 1 # hyphen or whatever else -> 0
            
        i = feature_count - len(word['bare'])
        # Can have stress in multiple places -- don't think about it, just let the model be confused
        for character in word['accented']:
            if character == "'":
                stress[i-1] = 1
            elif character == 'ё':
                stress[i] = 1
                i += 1
            else:
                i += 1
        
        words.append(features)
        stresses.append(stress)

words = np.array(words)
stresses = np.array(stresses)
        

In [5]:
    
# Originally planned on a CNN, but there's almost nothing useful about the spatial properties on small pieces of 1d data

# tune alpha later

classifier = MLPClassifier(hidden_layer_sizes=(200,200,200),max_iter=300)
classifier.fit(words, stresses)

print(classifier.score(words, stresses))



0.8743073749930319


In [6]:
print(classifier.coefs_)

[array([[-2.14166161e-315,  2.89617646e-315,  9.25788122e-316, ...,
        -1.22778363e-315,  3.78986869e-315, -3.92964968e-315],
       [-7.55422627e-316, -2.15166411e-315, -2.04893544e-315, ...,
        -1.92778294e-316,  1.14098543e-315, -3.61356657e-315],
       [ 6.68158204e-316, -4.08005602e-315, -1.42391882e-315, ...,
        -3.46061543e-316,  3.44828094e-315,  1.07073981e-315],
       ...,
       [ 2.48186980e-001, -4.19809817e-001, -1.21669605e-001, ...,
         4.40267097e-002, -5.78575093e-001, -2.76404038e-002],
       [-1.27667671e+000,  2.08438312e-001, -1.03565773e-001, ...,
        -5.87495757e-002, -5.37411841e-001,  8.80037522e-002],
       [-3.58068778e-001,  3.41089552e-001, -2.86585701e-001, ...,
         2.15443815e-001,  2.70196347e-001, -4.70165529e-001]]), array([[ 0.61161837,  0.14006146,  0.25180594, ...,  0.112296  ,
        -0.21036003, -0.63045392],
       [-0.14936018,  0.12845298, -0.60418518, ..., -0.11606904,
        -0.44147855, -0.15357353],
     