The code in `pyldl.applications.emphasis_selection` is an unofficial implementation of the paper "Learning Emphasis Selection for Written Text in Visual Media from Crowd-Sourced Label Distributions" (*ACL* 2019). This notebook trains its `DL_BiLSTM` model on the SemEval-2020 Task 10 data and evaluates it on a held-out test split.

In [1]:
import pyldl.applications.emphasis_selection as es
from pyldl.utils import LDLEarlyStopping

In [2]:
import keras
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split

In [3]:
# Load the SemEval-2020 Task 10 emphasis-selection data from a local directory.
# NOTE(review): `words` / `freqs` are presumably per-sentence token sequences and their
# annotation frequencies — confirm against `es.load_semeval2020`.
words, freqs = es.load_semeval2020('./SemEval2020_Task10_Emphasis_Selection/')

In [4]:
# Visualize the first five loaded samples as a quick look at the data.
es.visualization(words[:5])

In [5]:
# Hold out 10% of the samples as the final test split; the fixed seed keeps
# the partition identical from run to run.
words_train, words_test, freqs_train, freqs_test = train_test_split(
    words,
    freqs,
    test_size=0.1,
    random_state=0,
)

In [6]:
# Preprocess the training split. Besides `X` and `D` (later passed to `model.fit`),
# this call also returns the `tokenizer` and `maxlen` that are reused when the
# test split is preprocessed further down.
X, D, tokenizer, maxlen = es.preprocessing(words_train, freqs_train)

In [7]:
# Build the embedding matrix from the GloVe files under ./glove.6B/ with
# embedding_dim=100. The tokenizer is passed in — presumably so that rows line up
# with its vocabulary; confirm against `es.load_glove`.
embeddings_matrix = es.load_glove('./glove.6B/', tokenizer, embedding_dim=100)

In [8]:
# Split off 10% of the preprocessed training data as a validation set (fixed seed).
X_train, X_val, D_train, D_val = train_test_split(X, D, test_size=.1, random_state=0)

In [9]:
# Seed Python's `random`, NumPy and the Keras backend before the model is created so
# that weight initialisation and training are repeatable under "Restart & Run All".
# The data splits are already seeded via `random_state=0`; this covers the model side.
# (Bit-exact results on GPU may additionally need backend-specific determinism settings.)
keras.utils.set_random_seed(0)

# Build the DL_BiLSTM model from the fitted tokenizer and the GloVe embedding matrix,
# using n_hidden=2048.
model = es.DL_BiLSTM(tokenizer, embeddings_matrix, n_hidden=2048)

In [10]:
# Train with Adam (learning rate 1e-3) and a batch size of 64, passing the validation
# split and a callback that monitors zero-one loss.
# NOTE(review): `patience=None` presumably disables patience-based stopping — confirm
# against `LDLEarlyStopping`.
early_stopping = LDLEarlyStopping(monitor='zero_one_loss', patience=None)
model.fit(
    X_train,
    D_train,
    X_val=X_val,
    D_val=D_val,
    optimizer=Adam(1e-3),
    batch_size=64,
    callbacks=[early_stopping],
    verbose=1,
);

[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2410s[0m 2s/step - loss: 56.7350 - zero_one_loss: 0.5709


In [11]:
# Preprocess the held-out test split, passing in the `tokenizer` and `maxlen` obtained
# from the training data (presumably so both splits are encoded the same way — confirm).
X_test, D_test = es.preprocessing(words_test, freqs_test, tokenizer, maxlen)

In [12]:
# Evaluate on the test split and print one "<metric>: <value>" line per entry,
# with every number rounded to four decimals; list-valued metrics are shown in brackets.
scores = model.score(X_test, D_test, return_dict=True)
for metric, value in scores.items():
    if isinstance(value, list):
        rendered = "[" + ", ".join(f"{item:.4f}" for item in value) + "]"
    else:
        rendered = f"{value:.4f}"
    print(f"{metric}: {rendered}")

top_k: [0.0185, 0.0473, 0.0772, 0.1105]
match_m: [0.3600, 0.4727, 0.5273, 0.5800]
max_roc_auc: 0.6788


In [13]:
# Run the trained model on the test inputs; the predictions are visualized below.
D_pred = model.predict(X_test)

In [14]:
# Visualize the first five test samples together with the model's predictions.
es.visualization(words_test[:5], D_pred[:5])

In [15]:
# Visualize the same five test samples with the preprocessed test targets `D_test`,
# for side-by-side comparison with the predictions.
es.visualization(words_test[:5], D_test[:5])