In [1]:
import os

embedding_dim = 200
corpus_base_dir = os.path.join('..', '..', '..', 'corpora', 'wired_it_20190821')

vocab_size = 10000
max_length = 1000
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
num_epochs = 30

training_dir = os.path.join(corpus_base_dir, 'training')
test_dir = os.path.join(corpus_base_dir, 'test')
classes = ['attualit_','attualit__ambiente','attualit__media','attualit__politica','attualit__tech','economia_business','economia_finanza','economia_lavoro','economia_startup','gadget_accessori','gadget_audio_e_tv','gadget_computer','gadget_elettrodomestici','gadget_foto_e_video','gadget_motori','gadget_outdoor','gadget_videogiochi','internet_regole','internet_social_network','internet_tlc','internet_web','lifestyle_design','lifestyle_food','lifestyle_mobilit_','lifestyle_salute','lifestyle_viaggi','lol','mobile_app','mobile_smartphone','mobile_tablet','play_cinema','play_cultura','play_fumetti','play_libri','play_musica','play_tv','scienza','scienza_biotech','scienza_ecologia','scienza_lab','scienza_medicina','scienza_spazio']

In [2]:
def read_file_to_text(file_path):
  with open(file_path, 'r', encoding="utf8") as file:
    return file.read().replace('\n', ' ')
def list_text_files(folder):
  return [os.path.join(folder, file_name) for file_name in os.listdir(folder) if file_name.endswith('.txt')]
flatten = lambda l: [item for sublist in l for item in sublist]

In [3]:
load_class_text_pairs = lambda folder: map(lambda classLabel: map(lambda file_name: (classLabel, read_file_to_text(file_name)), list_text_files(os.path.join(folder, classLabel))) , classes)

In [4]:
training_classes, training_texts = zip(*flatten(list(load_class_text_pairs(training_dir))))

In [5]:
test_classes, test_texts = zip(*flatten(list(load_class_text_pairs(test_dir))))

In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from functools import reduce
import tensorflow as tf
import numpy as np
import keras 
from keras.models import Sequential, Model 
from keras import layers
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, Input, Embedding
from keras.layers.merge import Concatenate
from keras.layers import concatenate

In [7]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)

In [8]:
tokenizer.fit_on_texts(training_texts)

In [9]:
training_sequences = tokenizer.texts_to_sequences(training_texts)
training_padded = pad_sequences(training_sequences, padding=padding_type, maxlen=max_length)

In [10]:
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_padded = pad_sequences(test_sequences, padding=padding_type, maxlen=max_length)

In [21]:
# We remove the '_' because class names features '_' as separators (and they need NOT to be split, thay must be a unique token)
classes_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
classes_tokenizer.fit_on_texts(classes)

training_classes_seq = np.array(classes_tokenizer.texts_to_sequences(training_classes))
test_classes_seq = np.array(classes_tokenizer.texts_to_sequences(test_classes))

In [22]:
training_classes_encoded = [class_encoded[0] for class_encoded in training_classes_seq]
classes_word_index = classes_tokenizer.word_index
print(classes_word_index)
classes_by_index = dict([(index, key) for (key, index) in classes_word_index.items()])

{'attualit_': 1, 'attualit__ambiente': 2, 'attualit__media': 3, 'attualit__politica': 4, 'attualit__tech': 5, 'economia_business': 6, 'economia_finanza': 7, 'economia_lavoro': 8, 'economia_startup': 9, 'gadget_accessori': 10, 'gadget_audio_e_tv': 11, 'gadget_computer': 12, 'gadget_elettrodomestici': 13, 'gadget_foto_e_video': 14, 'gadget_motori': 15, 'gadget_outdoor': 16, 'gadget_videogiochi': 17, 'internet_regole': 18, 'internet_social_network': 19, 'internet_tlc': 20, 'internet_web': 21, 'lifestyle_design': 22, 'lifestyle_food': 23, 'lifestyle_mobilit_': 24, 'lifestyle_salute': 25, 'lifestyle_viaggi': 26, 'lol': 27, 'mobile_app': 28, 'mobile_smartphone': 29, 'mobile_tablet': 30, 'play_cinema': 31, 'play_cultura': 32, 'play_fumetti': 33, 'play_libri': 34, 'play_musica': 35, 'play_tv': 36, 'scienza': 37, 'scienza_biotech': 38, 'scienza_ecologia': 39, 'scienza_lab': 40, 'scienza_medicina': 41, 'scienza_spazio': 42}


In [23]:
convs = []
filter_sizes = [3,4,5]

embedding_layer = keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length)
sequence_input = Input(shape=(max_length,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

for filter_size in filter_sizes:
    l_conv = keras.layers.Conv1D(filters=128, kernel_size=filter_size, activation='relu')(embedded_sequences)
    l_pool = keras.layers.MaxPooling1D(pool_size=3)(l_conv)
    convs.append(l_pool)

l_merge = concatenate([convs[0],convs[1],convs[2]],axis=1)

# add a 1D convnet with global maxpooling, instead of Yoon Kim model
conv = keras.layers.Conv1D(filters=128, kernel_size=3, activation='relu')(embedded_sequences)
pool = keras.layers.MaxPooling1D(pool_size=3)(conv)

x = keras.layers.Dropout(0.5)(l_merge)  
x = keras.layers.Flatten()(x)
x = keras.layers.Dense(128, activation='relu')(x)
preds = keras.layers.Dense(len(classes) + 1, activation='softmax')(x)

model = Model(sequence_input, preds)

In [31]:
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=[tf.keras.metrics.Precision()])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 1000)         0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 1000, 200)    2000000     input_4[0][0]                    
__________________________________________________________________________________________________
conv1d_14 (Conv1D)              (None, 998, 128)     76928       embedding_6[0][0]                
__________________________________________________________________________________________________
conv1d_15 (Conv1D)              (None, 997, 128)     102528      embedding_6[0][0]                
__________________________________________________________________________________________________
conv1d_16 

In [32]:
history = model.fit(training_padded, training_classes_seq, epochs=num_epochs, validation_data=(test_padded, test_classes_seq), verbose=2)

Train on 13351 samples, validate on 5709 samples
Epoch 1/30


InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument: Incompatible shapes: [1,1376] vs. [1,32]
	 [[{{node metrics_5/precision/LogicalAnd_1}}]]
	 [[metrics_5/precision/Mean/_377]]
  (1) Invalid argument: Incompatible shapes: [1,1376] vs. [1,32]
	 [[{{node metrics_5/precision/LogicalAnd_1}}]]
0 successful operations.
0 derived errors ignored.

In [27]:
from sklearn import metrics
classes_probabilties = model.predict(test_padded, batch_size=32, verbose=1)
predicted = np.argmax(classes_probabilties, axis=1).tolist()
expected = flatten(test_classes_seq)
precision_scores = metrics.precision_score(expected, predicted, labels=np.unique(expected), average=None)
recall_scores = metrics.recall_score(expected, predicted, labels=np.unique(expected), average=None)
f1_scores = metrics.f1_score(expected, predicted, labels=np.unique(expected), average=None)
precion_recall_f1_scores = list(zip(precision_scores, recall_scores, f1_scores))
print('|Class Label'.ljust(40, ' ') + '|Pre |Rec |F1')
print('|--- |--- |--- |---')
for index, precision_recall_f1 in enumerate(precion_recall_f1_scores):
    classLabel = classes_by_index[index + 1]
    print('|%s|%.2f|%.2f|%.2f' % (classLabel.ljust(40, ' '), precision_recall_f1[0], precision_recall_f1[1], precision_recall_f1[2]))

precision_score_micro_averaged = metrics.precision_score(expected, predicted, labels=np.unique(expected), average='micro')
precision_score_macro_averaged = metrics.precision_score(expected, predicted, labels=np.unique(expected), average='macro')

recall_score_micro_averaged = metrics.recall_score(expected, predicted, labels=np.unique(expected), average='micro')
recall_score_macro_averaged = metrics.recall_score(expected, predicted, labels=np.unique(expected), average='macro')

f1_score_micro_averaged = metrics.f1_score(expected, predicted, labels=np.unique(expected), average='micro')
f1_score_macro_averaged = metrics.f1_score(expected, predicted, labels=np.unique(expected), average='macro')

print()
print('|Average Type |Prec |Rec |F1')
print('|--- |--- |--- |---')
print('|micro|%.2f|%.2f|%.2f' % (precision_score_micro_averaged, recall_score_micro_averaged, f1_score_micro_averaged))
print('|macro|%.2f|%.2f|%.2f' % (precision_score_macro_averaged, recall_score_macro_averaged, f1_score_macro_averaged))

|Class Label                            |Pre |Rec |F1
|--- |--- |--- |---
|attualit_                               |0.15|0.14|0.15
|attualit__ambiente                      |0.38|0.34|0.36
|attualit__media                         |0.41|0.21|0.28
|attualit__politica                      |0.31|0.49|0.38
|attualit__tech                          |0.09|0.12|0.11
|economia_business                       |0.22|0.23|0.22
|economia_finanza                        |0.57|0.55|0.56
|economia_lavoro                         |0.54|0.40|0.46
|economia_startup                        |0.62|0.52|0.57
|gadget_accessori                        |0.23|0.10|0.14
|gadget_audio_e_tv                       |0.58|0.62|0.60
|gadget_computer                         |0.61|0.33|0.43
|gadget_elettrodomestici                 |0.37|0.18|0.24
|gadget_foto_e_video                     |0.63|0.58|0.61
|gadget_motori                           |0.62|0.73|0.67
|gadget_outdoor                          |0.35|0.40|0.38
|gadget_videog