In [27]:
import os

embedding_dim = 200
corpus_base_dir = os.path.join('..', '..', '..', 'corpora', 'wired_it_20190821')

vocab_size = 10000
max_length = 1000
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
num_epochs = 100

training_dir = os.path.join(corpus_base_dir, 'training')
test_dir = os.path.join(corpus_base_dir, 'test')
classes = ['attualit_','attualit__ambiente','attualit__media','attualit__politica','attualit__tech','economia_business','economia_finanza','economia_lavoro','economia_startup','gadget_accessori','gadget_audio_e_tv','gadget_computer','gadget_elettrodomestici','gadget_foto_e_video','gadget_motori','gadget_outdoor','gadget_videogiochi','internet_regole','internet_social_network','internet_tlc','internet_web','lifestyle_design','lifestyle_food','lifestyle_mobilit_','lifestyle_salute','lifestyle_viaggi','lol','mobile_app','mobile_smartphone','mobile_tablet','play_cinema','play_cultura','play_fumetti','play_libri','play_musica','play_tv','scienza','scienza_biotech','scienza_ecologia','scienza_lab','scienza_medicina','scienza_spazio']

In [2]:
def read_file_to_text(file_path):
  with open(file_path, 'r', encoding="utf8") as file:
    return file.read().replace('\n', ' ')
def list_text_files(folder):
  return [os.path.join(folder, file_name) for file_name in os.listdir(folder) if file_name.endswith('.txt')]
flatten = lambda l: [item for sublist in l for item in sublist]

In [3]:
load_class_text_pairs = lambda folder: map(lambda classLabel: map(lambda file_name: (classLabel, read_file_to_text(file_name)), list_text_files(os.path.join(folder, classLabel))) , classes)

In [4]:
training_classes, training_texts = zip(*flatten(list(load_class_text_pairs(training_dir))))

In [5]:
test_classes, test_texts = zip(*flatten(list(load_class_text_pairs(test_dir))))

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from functools import reduce
import tensorflow as tf
import numpy as np

In [7]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)

In [8]:
tokenizer.fit_on_texts(training_texts)

In [9]:
training_sequences = tokenizer.texts_to_sequences(training_texts)
training_padded = pad_sequences(training_sequences, padding=padding_type, maxlen=max_length)

In [10]:
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_padded = pad_sequences(test_sequences, padding=padding_type, maxlen=max_length)

In [15]:
# We remove the '_' because class names features '_' as separators (and they need NOT to be split, thay must be a unique token)
classes_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
classes_tokenizer.fit_on_texts(classes)

training_classes_seq = np.array(classes_tokenizer.texts_to_sequences(training_classes))
test_classes_seq = np.array(classes_tokenizer.texts_to_sequences(test_classes))

In [16]:
training_classes_encoded = [class_encoded[0] for class_encoded in training_classes_seq]
classes_word_index = classes_tokenizer.word_index
print(classes_word_index)
classes_by_index = dict([(index, key) for (key, index) in classes_word_index.items()])

{'attualit_': 1, 'attualit__ambiente': 2, 'attualit__media': 3, 'attualit__politica': 4, 'attualit__tech': 5, 'economia_business': 6, 'economia_finanza': 7, 'economia_lavoro': 8, 'economia_startup': 9, 'gadget_accessori': 10, 'gadget_audio_e_tv': 11, 'gadget_computer': 12, 'gadget_elettrodomestici': 13, 'gadget_foto_e_video': 14, 'gadget_motori': 15, 'gadget_outdoor': 16, 'gadget_videogiochi': 17, 'internet_regole': 18, 'internet_social_network': 19, 'internet_tlc': 20, 'internet_web': 21, 'lifestyle_design': 22, 'lifestyle_food': 23, 'lifestyle_mobilit_': 24, 'lifestyle_salute': 25, 'lifestyle_viaggi': 26, 'lol': 27, 'mobile_app': 28, 'mobile_smartphone': 29, 'mobile_tablet': 30, 'play_cinema': 31, 'play_cultura': 32, 'play_fumetti': 33, 'play_libri': 34, 'play_musica': 35, 'play_tv': 36, 'scienza': 37, 'scienza_biotech': 38, 'scienza_ecologia': 39, 'scienza_lab': 40, 'scienza_medicina': 41, 'scienza_spazio': 42}


In [21]:
from tensorflow.keras import backend as K
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [29]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    #tf.keras.layers.GlobalMaxPooling1D(),
    #tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(len(classes) + 1, activation='softmax')
])

In [30]:
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=[f1])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1000, 200)         2000000   
_________________________________________________________________
global_average_pooling1d_2 ( (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 43)                8643      
Total params: 2,008,643
Trainable params: 2,008,643
Non-trainable params: 0
_________________________________________________________________


In [31]:
history = model.fit(training_padded, training_classes_seq, epochs=num_epochs, validation_data=(test_padded, test_classes_seq), verbose=2)

Train on 13351 samples, validate on 5709 samples
Epoch 1/100
13351/13351 - 4s - loss: 3.6803 - f1: 40.0775 - val_loss: 3.6124 - val_f1: 36.9009
Epoch 2/100
13351/13351 - 4s - loss: 3.5382 - f1: 36.3205 - val_loss: 3.4601 - val_f1: 35.1849
Epoch 3/100
13351/13351 - 4s - loss: 3.3067 - f1: 34.5056 - val_loss: 3.1946 - val_f1: 32.7956
Epoch 4/100
13351/13351 - 4s - loss: 3.0146 - f1: 31.0974 - val_loss: 2.9380 - val_f1: 29.4432
Epoch 5/100
13351/13351 - 4s - loss: 2.7483 - f1: 27.7662 - val_loss: 2.7105 - val_f1: 26.5041
Epoch 6/100
13351/13351 - 4s - loss: 2.5133 - f1: 24.8285 - val_loss: 2.5140 - val_f1: 24.0448
Epoch 7/100
13351/13351 - 4s - loss: 2.3079 - f1: 22.0826 - val_loss: 2.3501 - val_f1: 21.7033
Epoch 8/100
13351/13351 - 4s - loss: 2.1317 - f1: 19.7401 - val_loss: 2.2130 - val_f1: 19.5845
Epoch 9/100
13351/13351 - 4s - loss: 1.9805 - f1: 17.6849 - val_loss: 2.0967 - val_f1: 17.8867
Epoch 10/100
13351/13351 - 4s - loss: 1.8495 - f1: 15.8991 - val_loss: 2.0052 - val_f1: 16.1443


Epoch 88/100
13351/13351 - 4s - loss: 0.0239 - f1: 1.0805 - val_loss: 2.4218 - val_f1: 2.4287
Epoch 89/100
13351/13351 - 4s - loss: 0.0236 - f1: 1.0664 - val_loss: 2.4411 - val_f1: 2.4567
Epoch 90/100
13351/13351 - 4s - loss: 0.0222 - f1: 1.0677 - val_loss: 2.4711 - val_f1: 2.4032
Epoch 91/100
13351/13351 - 4s - loss: 0.0218 - f1: 1.0666 - val_loss: 2.4684 - val_f1: 2.3950
Epoch 92/100
13351/13351 - 4s - loss: 0.0220 - f1: 1.0669 - val_loss: 2.4901 - val_f1: 2.4078
Epoch 93/100
13351/13351 - 4s - loss: 0.0222 - f1: 1.0662 - val_loss: 2.5390 - val_f1: 2.3558
Epoch 94/100
13351/13351 - 4s - loss: 0.0217 - f1: 1.0624 - val_loss: 2.5301 - val_f1: 2.3990
Epoch 95/100
13351/13351 - 4s - loss: 0.0214 - f1: 1.0648 - val_loss: 2.5536 - val_f1: 2.3446
Epoch 96/100
13351/13351 - 4s - loss: 0.0216 - f1: 1.0631 - val_loss: 2.5652 - val_f1: 2.3247
Epoch 97/100
13351/13351 - 4s - loss: 0.0210 - f1: 1.0637 - val_loss: 2.5876 - val_f1: 2.2460
Epoch 98/100
13351/13351 - 4s - loss: 0.0208 - f1: 1.0573 - 

In [32]:
from sklearn import metrics
classes_probabilties = model.predict(test_padded, batch_size=32, verbose=1)
predicted = np.argmax(classes_probabilties, axis=1).tolist()
expected = flatten(test_classes_seq)
precision_scores = metrics.precision_score(expected, predicted, labels=np.unique(expected), average=None)
recall_scores = metrics.recall_score(expected, predicted, labels=np.unique(expected), average=None)
f1_scores = metrics.f1_score(expected, predicted, labels=np.unique(expected), average=None)
precion_recall_f1_scores = list(zip(precision_scores, recall_scores, f1_scores))
print('|Class Label'.ljust(40, ' ') + '|Pre |Rec |F1')
print('|--- |--- |--- |---')
for index, precision_recall_f1 in enumerate(precion_recall_f1_scores):
    classLabel = classes_by_index[index + 1]
    print('|%s|%.2f|%.2f|%.2f' % (classLabel.ljust(40, ' '), precision_recall_f1[0], precision_recall_f1[1], precision_recall_f1[2]))

precision_score_micro_averaged = metrics.precision_score(expected, predicted, labels=np.unique(expected), average='micro')
precision_score_macro_averaged = metrics.precision_score(expected, predicted, labels=np.unique(expected), average='macro')

recall_score_micro_averaged = metrics.recall_score(expected, predicted, labels=np.unique(expected), average='micro')
recall_score_macro_averaged = metrics.recall_score(expected, predicted, labels=np.unique(expected), average='macro')

f1_score_micro_averaged = metrics.f1_score(expected, predicted, labels=np.unique(expected), average='micro')
f1_score_macro_averaged = metrics.f1_score(expected, predicted, labels=np.unique(expected), average='macro')

print()
print('|Average Type |Prec |Rec |F1')
print('|--- |--- |--- |---')
print('|micro|%.2f|%.2f|%.2f' % (precision_score_micro_averaged, recall_score_micro_averaged, f1_score_micro_averaged))
print('|macro|%.2f|%.2f|%.2f' % (precision_score_macro_averaged, recall_score_macro_averaged, f1_score_macro_averaged))

|Class Label                            |Pre |Rec |F1
|--- |--- |--- |---
|attualit_                               |0.21|0.26|0.23
|attualit__ambiente                      |0.53|0.55|0.54
|attualit__media                         |0.44|0.49|0.46
|attualit__politica                      |0.56|0.57|0.56
|attualit__tech                          |0.34|0.25|0.29
|economia_business                       |0.36|0.42|0.39
|economia_finanza                        |0.66|0.63|0.64
|economia_lavoro                         |0.74|0.59|0.66
|economia_startup                        |0.73|0.64|0.68
|gadget_accessori                        |0.45|0.42|0.43
|gadget_audio_e_tv                       |0.71|0.79|0.74
|gadget_computer                         |0.73|0.62|0.67
|gadget_elettrodomestici                 |0.57|0.47|0.51
|gadget_foto_e_video                     |0.69|0.74|0.71
|gadget_motori                           |0.75|0.82|0.78
|gadget_outdoor                          |0.57|0.61|0.59
|gadget_videog