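The code in `pyldl.applications.emphasis_selection` is an unofficial implementation of the paper "Learning Emphasis Selection for Written Text in Visual Media from Crowd-Sourced Label Distributions" (*ACL* 2019). This notebook walks through it end to end: loading the SemEval-2020 Task 10 data, training the `DL_BiLSTM` model on GloVe embeddings, and visualizing its predictions next to the ground truth on a held-out test split.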

In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
import pyldl.applications.emphasis_selection as es

In [3]:
from pyldl.metrics import euclidean
from pyldl.utils import LDLEarlyStopping

In [4]:
import keras

In [5]:
# Location of the SemEval-2020 Task 10 data, relative to this notebook.
SEMEVAL_DIR = './SemEval2020_Task10_Emphasis_Selection/'
# The loader returns two values: `words` and `freqs`
# (presumably the tokens of each instance and their emphasis frequencies -- TODO confirm).
words, freqs = es.load_semeval2020(SEMEVAL_DIR)

In [6]:
# Quick look at the raw data: render only the first five entries of `words`.
preview_words = words[:5]
es.visualization(preview_words)

In [7]:
# Hold out 10% of the instances as a test split; the fixed random_state keeps
# the split identical between runs.
words_train, words_test, freqs_train, freqs_test = train_test_split(
    words,
    freqs,
    test_size=0.1,
    random_state=0,
)

In [8]:
# Preprocess the training split. Called without a tokenizer, es.preprocessing
# returns four values; `tokenizer` and `maxlen` are kept so the test split can
# be preprocessed with the same settings later in the notebook.
X_train, y_train, tokenizer, maxlen = es.preprocessing(words_train, freqs_train)

In [9]:
# Directory holding the pre-trained GloVe vectors, relative to this notebook.
GLOVE_DIR = './glove.6B/'
# Build the embedding matrix for the tokenizer obtained from preprocessing
# (presumably one 100-dimensional row per vocabulary entry -- TODO confirm).
embeddings_matrix = es.load_glove(GLOVE_DIR, tokenizer, embedding_dim=100)

In [10]:
# Seed the Python, NumPy and Keras-backend RNGs before the model is created so
# that weight initialisation and training are as repeatable as the backend
# allows under "Restart & Run All". The data split above is already pinned with
# random_state=0; without this call every run trains a different model.
keras.utils.set_random_seed(0)

# Build the model from the tokenizer and the GloVe embedding matrix.
model = es.DL_BiLSTM(tokenizer, embeddings_matrix)

In [11]:
# Row index that separates the fitting part (rows before `val`) from the
# validation part (rows from `val` onwards): 90% of the training rows, truncated.
n_train_rows = X_train.shape[0]
val = int(n_train_rows * 0.9)

In [12]:
# Stop training once the monitored 'euclidean' metric has gone 50 epochs
# without improving (presumably measured on the validation rows -- TODO confirm).
early_stopping = LDLEarlyStopping(monitor='euclidean', patience=50)

# Fit on the rows before `val` and pass the remaining rows as validation data.
# The trailing semicolon suppresses the cell's return-value display.
model.fit(
    X_train[:val],
    y_train[:val],
    verbose=1,
    optimizer=keras.optimizers.Adam(1e-3),
    X_val=X_train[val:],
    y_val=y_train[val:],
    callbacks=[early_stopping],
);

[1m 132/1000[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m19s[0m 130ms/step - loss: 250.8580 - euclidean: 0.5938 
Epoch 132: early stopping (best euclidean: 0.576362669467926).


In [13]:
# Preprocess the test split with the tokenizer and maxlen obtained from the
# training split; with both passed in, es.preprocessing returns only two values.
X_test, y_test = es.preprocessing(words_test, freqs_test, tokenizer, maxlen)

In [14]:
# Predict for the held-out test inputs; y_pred is compared with y_test below.
y_pred = model.predict(X_test)

In [15]:
# Euclidean distance between target and prediction with reduction disabled
# (presumably one value per test row -- TODO confirm).
per_row_score = euclidean(y_test, y_pred, reduction=None)

# Group each test row by the column index of its largest target value,
# looking only at the first 10 columns.
peak_column = y_test[:, :10].argmax(axis=1)

df = pd.DataFrame({'score': per_row_score, 'group': peak_column})

# For every group keep the row with the smallest score, i.e. the example the
# model fits most closely; `samples` holds the row labels of those examples.
best_per_group = df.groupby('group')['score'].idxmin()
samples = best_per_group.values

In [16]:
# Render the selected test examples together with the model's predictions.
selected_words = [words_test[i] for i in samples]
es.visualization(selected_words, y_pred[samples])

In [17]:
# Render the same selected test examples with their ground-truth targets,
# for comparison with the prediction plot.
selected_words = [words_test[i] for i in samples]
es.visualization(selected_words, y_test[samples])