In order to run this notebook:

```
pip install tqdm
pip install pandas
pip install ipywidgets
pip install jupyter_http_over_ws
jupyter nbextension enable --py widgetsnbextension
jupyter labextension install @jupyter-widgets/jupyterlab-manager
```

And then restart Jupyter.

In [None]:
%%capture
# The %%capture cell magic above keeps this cell's output (including the bar
# instantiated below) out of the rendered notebook.
from tqdm import tqdm_notebook as tqdm
# Instantiates a notebook progress bar and calls .pandas() on it — presumably to
# register tqdm's progress helpers on pandas objects (TODO confirm; pandas is not
# used in the cells visible here).
tqdm().pandas()

In [None]:
import sys 
import os
import numpy as np
import datetime

In [None]:
# Root folder of the segmented/preprocessed corpus. It must contain a 'training'
# and a 'test' sub-folder, each holding one sub-folder per class label.
# Set the WIRED_IT_CORPUS_DIR environment variable to run the notebook on another
# machine; when it is unset the original author's local path is used.
corpus_base_dir = os.environ.get(
    'WIRED_IT_CORPUS_DIR',
    'C:\\Users\\piercarlo\\Documents\\workspace\\personal\\wired-it-text-classification-ml\\corpora\\wired_it_20190821_segmented_preprocessed'
)
training_dir = os.path.join(corpus_base_dir, 'training')
test_dir = os.path.join(corpus_base_dir, 'test')

# Class labels; each one is also the name of the sub-folder holding its documents.
classes = [
    'attualit_',
    'attualit__ambiente',
    'attualit__media',
    'attualit__politica',
    'attualit__tech',
    'economia_business',
    'economia_finanza',
    'economia_lavoro',
    'economia_startup',
    'gadget_accessori',
    'gadget_audio_e_tv',
    'gadget_computer',
    'gadget_elettrodomestici',
    'gadget_foto_e_video',
    'gadget_motori',
    'gadget_outdoor',
    'gadget_videogiochi',
    'internet_regole',
    'internet_social_network',
    'internet_tlc',
    'internet_web',
    'lifestyle_design',
    'lifestyle_food',
    'lifestyle_mobilit_',
    'lifestyle_salute',
    'lifestyle_viaggi',
    'lol',
    'mobile_app',
    'mobile_smartphone',
    'mobile_tablet',
    'play_cinema',
    'play_cultura',
    'play_fumetti',
    'play_libri',
    'play_musica',
    'play_tv',
    'scienza',
    'scienza_biotech',
    'scienza_ecologia',
    'scienza_lab',
    'scienza_medicina',
    'scienza_spazio',
]

In [None]:
def read_file_to_lines(file_path):
  """Read the UTF-8 text file at `file_path` and return its lines as a list.

  Every newline character is removed from each line; blank lines are kept as
  empty strings.
  """
  stripped_lines = []
  with open(file_path, 'r', encoding="utf8") as handle:
    for raw_line in handle:
      stripped_lines.append(raw_line.replace('\n', ''))
  return stripped_lines
def list_text_files(folder):
  """Return the paths of the entries of `folder` whose name ends with '.txt'.

  Each path is `folder` joined with the entry name; the order is the one
  reported by os.listdir.
  """
  text_file_paths = []
  for entry_name in os.listdir(folder):
    if not entry_name.endswith('.txt'):
      continue
    text_file_paths.append(os.path.join(folder, entry_name))
  return text_file_paths
def flatten(l):
  """Concatenate the items of every sub-iterable of `l` into one flat list (one level deep)."""
  flattened = []
  for sublist in l:
    flattened.extend(sublist)
  return flattened

In [None]:
def load_class_text_pairs(folder):
  """Lazily pair every class label with the lines of each of its text files.

  Returns an iterator with one item per entry of `classes`; each item is itself
  an iterator of `(class_label, lines)` tuples, one per '.txt' file found in
  `folder/<class_label>`.
  """
  def pairs_for_class(class_label):
    class_files = list_text_files(os.path.join(folder, class_label))
    return map(lambda text_file: (class_label, read_file_to_lines(text_file)), class_files)
  return map(pairs_for_class, classes)

# Flatten the per-class groups into one sequence of pairs, then split it into
# two parallel tuples: the labels and the corresponding texts (lists of lines).
training_classes, training_texts = zip(
  *flatten(list(load_class_text_pairs(training_dir)))
)
test_classes, test_texts = zip(
  *flatten(list(load_class_text_pairs(test_dir)))
)

In [None]:
# Tokenize classes 
from tensorflow.keras.preprocessing.text import Tokenizer

# The filter string omits '_' — presumably so that every class label stays a
# single token (TODO confirm against the Keras Tokenizer defaults).
classes_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
classes_tokenizer.fit_on_texts(classes)
classes_word_index = classes_tokenizer.word_index

# texts_to_sequences yields one sequence per label; flatten them into plain id lists.
tokenized_training_classes = flatten(
    classes_tokenizer.texts_to_sequences(training_classes)
)
tokenized_test_classes = flatten(
    classes_tokenizer.texts_to_sequences(test_classes)
)

# Reverse lookup: numeric class id -> class label.
classes_by_index = {index: key for key, index in classes_word_index.items()}


*BEWARE!*

We assume that the BERT server encodes whole lines, so its `-pooling_strategy` must be set to something other than `NONE`, e.g. `REDUCE_MEAN`.

In [None]:
from bert_serving.client import BertClient
# Client for the BERT encoding server, addressed on the local machine.
bc = BertClient(
    ip='127.0.0.1',
    port=5555,
    port_out=5556,
)

In [None]:
# Number of lines per text handed to bert_encode_texts_lines as its
# `max_lines_to_encode` argument: longer texts are truncated to this many lines,
# shorter ones are padded up to it.
MAX_LINES_PER_TEXT = 10

In [None]:
def bert_encode_texts_lines(texts_lines, data_set_name, max_lines_to_encode):
    """Encode the leading lines of every text with the module-level BERT client `bc`.

    Each text contributes exactly `max_lines_to_encode` lines: blank or
    whitespace-only lines are dropped (the BERT client rejects empty strings),
    the remaining lines are truncated to the maximum, and shorter texts are
    padded with the placeholder line 'dummy'.

    Args:
        texts_lines: sequence of texts, each one a sequence of line strings.
        data_set_name: label used only in the start-of-run log message.
        max_lines_to_encode: number of lines encoded per text; must be >= 1.

    Returns:
        A numpy array stacking, for every text, whatever `bc.encode` returned
        for its padded lines.

    Raises:
        ValueError: if `max_lines_to_encode` is smaller than 1.
    """
    if max_lines_to_encode < 1:
        raise ValueError(
            "max_lines_to_encode must be >= 1, got {}".format(max_lines_to_encode))

    print("start|bert encoding|{}|max lines|{}|at|{}".format(data_set_name, max_lines_to_encode, datetime.datetime.now()))

    padding_line = 'dummy'
    texts_lines_encoded = []
    # We pass the lines per each document because otherwise the server is prone to crash
    with tqdm(total=len(texts_lines)) as progress_bar:
        for text_lines in texts_lines:
            # Blank lines would make the client raise, so they never reach it.
            non_blank_lines = [line for line in text_lines if line.strip()]
            lines_to_encode = non_blank_lines[:max_lines_to_encode]

            # Pad so every text yields the same number of encoded lines; an
            # empty text becomes a batch made only of padding lines.
            padding = [padding_line] * (max_lines_to_encode - len(lines_to_encode))

            texts_lines_encoded.append(bc.encode(lines_to_encode + padding))
            progress_bar.update(1)
    return np.array(texts_lines_encoded)

In [None]:
# Encode both splits with the same per-text line budget so their shapes match.
training_texts_lines_encoded = bert_encode_texts_lines(
    texts_lines=training_texts,
    data_set_name='training',
    max_lines_to_encode=MAX_LINES_PER_TEXT,
)
test_texts_lines_encoded = bert_encode_texts_lines(
    texts_lines=test_texts,
    data_set_name='test',
    max_lines_to_encode=MAX_LINES_PER_TEXT,
)

In [None]:
import tensorflow as tf

# Average the encoded lines of each text, then classify with one softmax layer.
model = tf.keras.Sequential()
model.add(
    # The input shape is that of a single encoded text (batch axis dropped).
    tf.keras.layers.GlobalAveragePooling1D(input_shape=training_texts_lines_encoded.shape[1:])
)
model.add(
    # One output unit more than there are classes — presumably because the
    # tokenizer-assigned class ids start at 1, leaving id 0 unused (TODO confirm).
    tf.keras.layers.Dense(len(classes) + 1, activation='softmax')
)
model.build()

In [None]:
# The sparse loss is used because the labels are integer class ids, not one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
callbacks = [
    # Stop training once val_loss has not improved for 8 consecutive epochs.
    EarlyStopping(
        monitor='val_loss',
        patience=8,
    ),
    # Keep on disk only the weights of the epoch with the best val_loss so far.
    ModelCheckpoint(
        filepath='best_model.h5',
        monitor='val_loss',
        save_best_only=True,
    ),
]

In [None]:
# The labels are plain Python lists of ints; convert them to NumPy arrays because
# newer tf.keras releases reject a NumPy `x` paired with a list `y`
# ("Failed to find data adapter that can handle input"). The conversion is a
# no-op for the training semantics.
# NOTE(review): the test split doubles as validation data, so early stopping and
# checkpoint selection are tuned on the same data later used for the report.
history = model.fit(training_texts_lines_encoded,
                    np.asarray(tokenized_training_classes),
                    epochs=60,
                    callbacks=callbacks,
                    batch_size=128,
                    validation_data=(test_texts_lines_encoded,
                                     np.asarray(tokenized_test_classes)))


In [None]:
from sklearn import metrics
# Predict a class id for every test text: the id with the highest probability.
classes_probabilties = model.predict(test_texts_lines_encoded, batch_size=32, verbose=1)
predicted = np.argmax(classes_probabilties, axis=1).tolist()
expected = tokenized_test_classes

# Class ids actually present in the test set, sorted ascending. The per-class
# scores below are returned in exactly this order, so the ids are reused to look
# up the class names instead of assuming they are the contiguous range 1..N
# (which mislabels rows as soon as one class has no test document).
evaluated_labels = np.unique(expected)

precision_scores = metrics.precision_score(expected, predicted, labels=evaluated_labels, average=None)
recall_scores = metrics.recall_score(expected, predicted, labels=evaluated_labels, average=None)
f1_scores = metrics.f1_score(expected, predicted, labels=evaluated_labels, average=None)
precion_recall_f1_scores = list(zip(precision_scores, recall_scores, f1_scores))

# Per-class table, printed as a Markdown-style table.
print('|Class Label'.ljust(40, ' ') + '|Pre |Rec |F1')
print('|--- |--- |--- |---')
for label, precision_recall_f1 in zip(evaluated_labels, precion_recall_f1_scores):
    classLabel = classes_by_index[int(label)]
    print('|%s|%.2f|%.2f|%.2f' % (classLabel.ljust(40, ' '), precision_recall_f1[0], precision_recall_f1[1], precision_recall_f1[2]))

# Aggregated scores, restricted to the same set of evaluated labels.
precision_score_micro_averaged = metrics.precision_score(expected, predicted, labels=evaluated_labels, average='micro')
precision_score_macro_averaged = metrics.precision_score(expected, predicted, labels=evaluated_labels, average='macro')

recall_score_micro_averaged = metrics.recall_score(expected, predicted, labels=evaluated_labels, average='micro')
recall_score_macro_averaged = metrics.recall_score(expected, predicted, labels=evaluated_labels, average='macro')

f1_score_micro_averaged = metrics.f1_score(expected, predicted, labels=evaluated_labels, average='micro')
f1_score_macro_averaged = metrics.f1_score(expected, predicted, labels=evaluated_labels, average='macro')

print()
print('|Average Type |Prec |Rec |F1')
print('|--- |--- |--- |---')
print('|micro|%.2f|%.2f|%.2f' % (precision_score_micro_averaged, recall_score_micro_averaged, f1_score_micro_averaged))
print('|macro|%.2f|%.2f|%.2f' % (precision_score_macro_averaged, recall_score_macro_averaged, f1_score_macro_averaged))
