C MODEL
========

This model is a bit more sophisticated. It is a recursive model that generates its results from the embeddings of the text as well as from the penultimate layer computed for the previously processed writings. 

The different writings are put through a TF-IDF tokenizer; its output is the input to the next stages.

The tokenized data is then put through a Neural Network model, which will learn from it to later classify the server occurrences.

Initialization of the Environment
==========================

In [None]:
import numpy as np
import collections
import tensorflow as tf
import os
import xml.etree.ElementTree as ET
import csv
import pandas as pd
import glob
import random


# Seed every RNG the notebook relies on so that "Restart & Run All" is
# reproducible. The stdlib `random` module drives the `random.shuffle` call
# that decides the train/test split in xml_csv, so it must be seeded too.
random.seed(0)
np.random.seed(1)
tf.random.set_seed(2)

Load Data
========

In [None]:
# Root folder of the eRisk workspace; the trailing slash is relied upon by
# every path that is built from it by plain concatenation.
SST_HOME = '/datos/erisk/eRisk/'
# Locations of the CSV dumps for the train and the held-out partitions.
path_train = f'{SST_HOME}dataset/t2_training_data/train.csv'
path_test = f'{SST_HOME}dataset/t2_test_data/test.csv'

In [None]:
def _parse_subject_writings(xml_file):
  """Return one record (dict) per <WRITING> element of a subject XML file."""
  root = ET.parse(xml_file).getroot()
  subject_id = root.find('ID').text

  def child_text(node, tag):
    # A missing child yields None instead of an AttributeError.
    child = node.find(tag)
    return None if child is None else child.text

  return [
      {
          'ID': subject_id,
          'TITLE': child_text(writing, 'TITLE'),
          'DATE': child_text(writing, 'DATE'),
          'TEXT': child_text(writing, 'TEXT'),
          'INFO': child_text(writing, 'INFO'),
      }
      for writing in root.findall('WRITING')
  ]


def _parse_golden_truth(lines):
  """Turn '<subject_id> <label>' lines into a {subject_id: int(label)} dict."""
  labels = {}
  for line in lines:
    if not line.strip():
      # Blank lines are skipped rather than treated as end-of-file.
      continue
    rec_id, value = line.split(None, 1)
    labels[rec_id] = int(value)
  return labels


def _label_frame(df, labels, split_name):
  """Add an integer LABEL column to `df`, looked up by the ID index level."""
  subject_ids = df.index.get_level_values('ID')
  missing = sorted({sid for sid in subject_ids if sid not in labels}, key=str)
  if missing:
    raise ValueError(
        "No golden-truth label for {} subject(s): {}".format(split_name, missing))
  df['LABEL'] = np.array([labels[sid] for sid in subject_ids], dtype=int)
  return df


def xml_csv(path, risk_path=None, train_path=None, test_path=None, rate=0.8):
  """
  Parse the per-subject XML files in `path`, split the subjects (whole files,
  not individual writings) into a train and a test partition, attach the
  golden-truth labels and dump both partitions to CSV.

  :param path: Folder containing one ``*.xml`` file per subject.
  :param risk_path: Golden-truth file with ``<subject_id> <label>`` lines.
                    Defaults to the notebook-level ``path_risk``.
  :param train_path: Destination CSV of the train partition.
                     Defaults to the notebook-level ``path_train``.
  :param test_path: Destination CSV of the test partition.
                    Defaults to the notebook-level ``path_test``.
  :param rate: Fraction of subject files assigned to the train partition.

  :returns: ``(train_df, test_df)``, both indexed by ``(ID, DATE)``, sorted by
            that index, with the columns TITLE, TEXT, INFO and LABEL.
  :raises ValueError: If a parsed subject has no entry in the golden truth.
  """
  # Fall back to the notebook globals so existing `xml_csv(path)` calls work.
  risk_path = path_risk if risk_path is None else risk_path
  train_path = path_train if train_path is None else train_path
  test_path = path_test if test_path is None else test_path

  # glob order depends on the file system; sort first so that a seeded
  # shuffle always produces the same split.
  files = sorted(glob.glob(path+'/*.xml'))
  random.shuffle(files)
  train = int(len(files)*rate)

  xml_list = []
  xml_list_test = []
  for position, xml_file in enumerate(files):
    target = xml_list if position < train else xml_list_test
    target.extend(_parse_subject_writings(xml_file))
  print(len(files))

  column_name = ['ID','DATE','TITLE','TEXT','INFO']
  xml_df = pd.DataFrame(xml_list,columns=column_name).set_index(['ID', 'DATE']).sort_index()
  xml_df_test = pd.DataFrame(xml_list_test,columns=column_name).set_index(['ID', 'DATE']).sort_index()

  with open(risk_path) as gt:
    print('llego')
    labels = _parse_golden_truth(gt)

  # Ids that appear in the golden truth but in neither partition are ignored
  # instead of being written into the test frame.
  xml_df = _label_frame(xml_df, labels, 'train')
  xml_df_test = _label_frame(xml_df_test, labels, 'test')

  xml_df.to_csv(train_path)
  xml_df_test.to_csv(test_path)

  return xml_df, xml_df_test



In [None]:
path = '/datos/erisk/eRisk/dataset/t2_training_data/eRisk2021_T1/data'  # folder where the per-subject XML files are stored
# Golden-truth file; xml_csv reads it through this module-level name.
path_risk = '/datos/erisk/eRisk/dataset/t2_training_data/eRisk2021_T1/risk_golden_truth.txt'
# Parse the XML files, split them and write the train/test CSV files.
train_df,test_df = xml_csv(path)

In [None]:
# Feature frames are explicit copies: later cells add TOKENIZED / EMBEDDINGS
# columns to them, and assigning into a plain column slice of train_df/test_df
# would be a chained assignment (SettingWithCopyWarning, undefined write-through).
train_x_df = train_df[["TEXT", "TITLE"]].copy()
train_y_df = train_df["LABEL"]

test_x_df = test_df[["TEXT", "TITLE"]].copy()
test_y_df = test_df["LABEL"]

A Model Text
===========

In [None]:
# One raw string per writing: the body followed directly by the title (no
# separator). str() also turns missing values into text.
train_x = [str(body) + str(heading)
           for body, heading in zip(train_df["TEXT"], train_df["TITLE"])]
train_y = train_df["LABEL"].to_numpy()

test_x = [str(body) + str(heading)
          for body, heading in zip(test_df["TEXT"], test_df["TITLE"])]
test_y = test_df["LABEL"].to_numpy()

In [None]:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# English stop-word list consulted by tokenize_clean_text.
stop_words=stopwords.words('english')
# Define maximum vocabulary length
MAX_WORDS = 5000

In [None]:
# Shared cleaning / tokenisation routine, applied to the train and test texts.
def tokenize_clean_text(text, tfidf=True, tokenizer=None, max_length=None, max_words=MAX_WORDS):
  """
  Clean and tokenize a collection of texts.

  Cleaning steps, applied on whitespace-separated words:
    1. the literal "[removed]" is replaced by the special token "R3MOV3D";
    2. words made only of digits are dropped;
    3. stop-words and words of 3 characters or fewer are dropped;
    4. (only if `tfidf`) words outside the TF-IDF vocabulary are dropped.
  Dropped words are replaced by an empty string, so the cleaned text may
  contain runs of spaces.

  :param text: Iterable of strings, one per document.
  :param tfidf: Whether to restrict the text to the `max_words` features kept
                by a TfidfVectorizer fitted on `text`.
  :param tokenizer: An already fitted Keras Tokenizer to reuse; when None a
                    new one is fitted on the cleaned text.
  :param max_length: Padding length; when None the longest tokenized
                     sequence is used.
  :param max_words: Vocabulary size for the TF-IDF vectorizer and tokenizer.

  :returns: A 4-tuple: the padded token matrix, the tokenizer used, the
            padding length, and the list of cleaned texts.
  """
  # Set membership instead of scanning the stop-word list for every word.
  stop_set = set(stop_words)

  # set [removed] as a special token
  text_removed = [t.replace("[removed]", "R3MOV3D") for t in text]

  # We remove the numbers
  cropped_numbers_text = [" ".join([word if not word.isdigit() else ""
                                for word in sentence.split()])
                               for sentence in text_removed]

  # Delete stopwords as well as every word of 3 characters or fewer.
  cropped_numbers_stopw_text = [" ".join([word if not (word in stop_set or len(word) <= 3) else ""
                                      for word in sentence.split()])
                                     for sentence in cropped_numbers_text]

  cleaned_text = cropped_numbers_stopw_text
  if tfidf:
    # NOTE(review): the vectorizer is fitted on whatever `text` is passed in,
    # so the test set gets its own TF-IDF vocabulary even when the training
    # tokenizer is reused - confirm this is intended.
    vec = TfidfVectorizer(max_features=max_words)
    # Only the vocabulary is needed; the (potentially huge) dense TF-IDF
    # matrix is no longer materialised.
    vec.fit(cropped_numbers_stopw_text)
    tfid_words = set(vec.get_feature_names_out())

    cleaned_text = [" ".join([word if word in tfid_words else ""
                          for word in sentence.split()])
                         for sentence in cropped_numbers_stopw_text]

  if tokenizer is None:
    tokenizer = Tokenizer(num_words=max_words) # They use 5k words too
    tokenizer.fit_on_texts(cleaned_text)
  # We tokenize the sentences
  tokenized_text = tokenizer.texts_to_sequences(cleaned_text)

  if max_length is None:
    max_length = max((len(sentence) for sentence in tokenized_text), default=0)

  # Now we return the padded sequences.
  return pad_sequences(tokenized_text, max_length), tokenizer, max_length, cleaned_text
  

In [None]:
# Fit the tokenizer on the training texts and keep its padding length...
train_x_token, tokenizer, max_length, train_x_clean = tokenize_clean_text(train_x) 
# ...then reuse that tokenizer for the test texts. max_length is not passed,
# so the test matrix is padded to its own longest sequence.
test_x_token, _, _, test_x_clean = tokenize_clean_text(test_x, tokenizer=tokenizer)

In [None]:
# Save the tokenizer for test stage purposes
import joblib

# Destination of the serialized tokenizer inside the workspace.
PATH_TOKENIZER = SST_HOME + "DL/tokenizer.pkl"

joblib.dump(tokenizer, PATH_TOKENIZER)

In [None]:
# Distinct token ids present anywhere in the padded training matrix
# (presumably including the padding id - TODO confirm).
token_index = np.unique(train_x_token)

In [None]:
# Number of distinct token ids seen in training; stored later as the
# "max_words" entry of every network configuration.
max_words = len(token_index)

In [None]:
# The sequence length is fixed to 50: only the last 50 columns of each padded
# matrix are kept.
max_length = 50
train_x_token_cropp = train_x_token[:,-max_length:]
test_x_token_cropp = test_x_token[:,-max_length:]

In [None]:
# Sanity checks: the arrays must line up row by row with the data frames
# before they are attached as columns.
assert len(train_x_token) == len(train_x_df)
assert len(train_y) == len(train_y_df)
assert len(test_x_token) == len(test_x_df)
assert len(test_y) == len(test_y_df)

# Store each cropped token sequence as a Python list in a TOKENIZED column.
train_x_df["TOKENIZED"] = train_x_token_cropp.tolist()
test_x_df["TOKENIZED"] = test_x_token_cropp.tolist()

Neural Network
==============
The neural network begins here. Its first part is a pre-trained model A, which was also used in this task. It will be used as an "embedding layer" for the writings of each subject.

The rest of the network takes the embeddings of each subject's writings as the input of an RNN-type layer, specifically to take the temporal information into account. (It may change to a CNN, but we can look at that in future work.)

In [None]:
import keras
from keras import backend as K
from keras.models import Model
from keras.layers import Dense, Activation, Embedding, Dropout, Input
from keras.layers import Lambda, Flatten, RepeatVector, Permute, Multiply
from keras.layers import LSTM, GRU, Bidirectional, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, Concatenate

from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import RMSprop, Adam

from keras.models import load_model
from keras.models import Sequential

In [None]:
# Folder with the saved A models, and the specific A model (named after its
# hyper-parameters) that is loaded in the next cell.
A_MODELS_PATH = SST_HOME + "DL/models/"
MODEL_PATH = A_MODELS_PATH + "copy_load_emb_False_num_classes_1_emb_size_300_trainable_emb_True_cnn_size_128_cnn_filter_3_pool_rnn_size_None_cell_type_LSTM_bidirectional_False_attention_False_dropout_0.5_dnn_size_32_batch_size_1024"

In [None]:
from keras.models import load_model

# Load the pre-trained A model from disk.
a_model = load_model(MODEL_PATH)

In [None]:
# Build the "embedding" sub-network from model A without its last two layers
# (the output layer and the last Dropout layer).
# In TF2-era Keras `Model.layers` returns a new list on every access, so
# `a_model.layers.pop()` only mutates a throw-away copy and removes nothing.
# Slicing the list states the intent explicitly.
b_model = Sequential()

for layer in a_model.layers[:-2]:
  b_model.add(layer)

In [None]:
# Largest token id appearing in each split; useful to check it against the
# vocabulary size the embedding model can index.
max_token_id_train = max(max(sequence) for sequence in train_x_df["TOKENIZED"])
max_token_id_test = max(max(sequence) for sequence in test_x_df["TOKENIZED"])
print("Max token ID in train dataset:", max_token_id_train)
print("Max token ID in test dataset:", max_token_id_test)

In [None]:
# Ids at or above max_vocab_size are folded onto the last valid id.
max_vocab_size = 4152
oov_token_id = max_vocab_size - 1

def replace_oov_tokens(sequences, oov_token_id):
    """
    Return a copy of `sequences` (as a list of lists) in which every token id
    that is not below the module-level `max_vocab_size` is replaced by
    `oov_token_id`. The input is not modified.
    """
    cleaned = []
    for seq in sequences:
        row = []
        for token in seq:
            row.append(token if token < max_vocab_size else oov_token_id)
        cleaned.append(row)
    return cleaned

# Overwrite the TOKENIZED column of both splits with the clamped sequences.
train_x_df["TOKENIZED"] = replace_oov_tokens(train_x_df["TOKENIZED"].tolist(), oov_token_id)
test_x_df["TOKENIZED"] = replace_oov_tokens(test_x_df["TOKENIZED"].tolist(), oov_token_id)

In [None]:
# Stack the per-writing token lists into arrays that can be fed to the model.
instances = np.array(train_x_df["TOKENIZED"].tolist())
test_instances = np.array(test_x_df["TOKENIZED"].tolist())

In [None]:
# Run every writing through the truncated A model; its output is used as the
# writing's embedding.
embedded_text = b_model.predict(instances)
embedded_text_test = b_model.predict(test_instances)

In [None]:
# Keep each writing's embedding next to its text, as a Python list per row.
train_x_df["EMBEDDINGS"] = embedded_text.tolist()
test_x_df["EMBEDDINGS"] = embedded_text_test.tolist()

In [None]:
# Set the size of the embeddings depending on the embeddings generated
# (length of the first training row's embedding).
input_size = len(train_x_df.iloc[0]["EMBEDDINGS"])

In [None]:
def _group_by_subject(x_df, y_df):
  """
  Collect, for every subject (first index level, in order of appearance), the
  list of its writings' embeddings and the label of its first row.
  """
  embeddings_per_subject = []
  label_per_subject = []
  for subject in x_df.index.get_level_values(0).unique():
    embeddings_per_subject.append(x_df.loc[subject]["EMBEDDINGS"].tolist())
    label_per_subject.append(y_df.loc[subject].values[0])
  return embeddings_per_subject, label_per_subject


train_x_subjects, train_y_subjects = _group_by_subject(train_x_df, train_y_df)
test_x_subjects, test_y_subjects = _group_by_subject(test_x_df, test_y_df)

Experiments Configuration
=====================

In [None]:
# Grid-search space: each name holds the list of candidate values for one
# hyper-parameter. Only the names listed in `indexes` / `param` below take
# part in the grid (tfidf, concept_emb and num_classes are not among them).
tfidf = [False] # Not changing
concept_emb = [False]  # Not Changing 
load_emb = [False]  # Not Changing
num_classes = [None]  # Not changing
emb_size = [None]  # Not changing, addressed later
trainable_emb = [False]  # Not changing
cnn_size = [[None]]  # Not changing
cnn_filter = [[3]]  # Not changing
rnn_size = [[64]]
cell_type = [GRU, LSTM]
bidirectional = [True, False]
attention = [False]
dropout = [0.5]
dnn_size = [[32]]
batch_size = [1]

# Parallel lists: indexes[k] is the configuration key for the candidates in
# param[k]; both must stay in the same order.
indexes = ["load_emb", "emb_size", "trainable_emb", "cnn_size", "cnn_filter", "rnn_size", "cell_type", "bidirectional", "attention", "dropout", "dnn_size", "batch_size"]
param   = [load_emb, emb_size, trainable_emb, cnn_size, cnn_filter, rnn_size, cell_type, bidirectional, attention, dropout, dnn_size, batch_size]

In [None]:
import itertools

def combine_params(param, indexes):
  """
  Build every combination of the given hyper-parameter candidates (the
  Cartesian product), so that a grid search can be run automatically.

  Inputs: `param`, a list of candidate lists, and `indexes`, the key name of
          each candidate list (same order as `param`).

  Outputs: A list with one configuration dictionary per combination. Every
           dictionary also carries an "embedding_matrix" key, because
           create_model reads it unconditionally.
  """
  param_combinations = [dict(zip(indexes, combination))
                        for combination in itertools.product(*param)]
  for p in param_combinations:
    # The embeddings size must adapt to the embeddings loaded.
    if p["load_emb"]:
      p["emb_size"] = None
      # Keep a matrix supplied through `indexes`; otherwise make sure the key
      # exists so that later lookups do not raise KeyError.
      p.setdefault("embedding_matrix", None)
    else:
      p["embedding_matrix"] = None
  return param_combinations

In [None]:
# One configuration dictionary per point of the grid defined above.
network_parameters = combine_params(param, indexes)

In [None]:
# We change the number of classes automatically
for p in network_parameters:
  # "max_length" holds the embedding size here: create_model uses it as the
  # last dimension of the network input.
  p["max_length"] = input_size
  p["max_words"] = max_words
  try:
    p["num_classes"] = train_y.shape[1]
  except IndexError:
    # train_y is one-dimensional: a single output unit is used.
    p["num_classes"] = 1

Neural Network
=============

In [None]:
def add_embeddings(z, load, size, trainable, vocab_size, max_length, embedding_matrix): 
  """
  Optionally stack an Embedding layer on top of `z`.

  When `size` is None no layer is added and `z` is returned untouched. With
  `load` the layer is initialised from `embedding_matrix` and its
  trainability follows `trainable`; otherwise a default Embedding is used.

  Returns the resulting tensor.
  """
  print(size)
  if size is None:
    return z

  if not load:
    return Embedding(vocab_size, size, input_length=max_length)(z)

  pretrained = Embedding(vocab_size, size, input_length=max_length,
                         weights=[embedding_matrix], trainable=trainable)
  return pretrained(z)

def add_cnn(z, size, filter_sizes, flatten):
  """
  Append the convolutional part of the network to `z`.

  One branch is built per entry of `filter_sizes`; inside a branch each entry
  of `size` adds a Conv1D followed by a MaxPooling1D whose pool size equals
  the filter size. A None in either list disables the CNN altogether and the
  input tensor is returned unchanged. Branches are optionally flattened and,
  when there are several, concatenated.

  Returns the resulting tensor.
  """
  branches = []
  for width in filter_sizes:
    if width is None:
      return z
    branch = None
    for n_filters in size:
      if n_filters is None:
        return z
      # The first convolution of a branch reads the block input.
      source = z if branch is None else branch
      branch = Conv1D(n_filters, width, padding='valid', activation='relu', strides=1)(source)
      branch = MaxPooling1D(pool_size=width)(branch)

    if flatten:
      branch = Flatten()(branch)
    branches.append(branch)

  if len(branches) > 1:
    return Concatenate()(branches)
  return branches[0]

def add_rnn(z, size, bidirectional, cell_type, attention):
  """
  Append the recurrent part of the network to `z`, plus an attention layer
  when `attention` is set.

  One recurrent layer of class `cell_type` is added per entry of `size`
  (wrapped in Bidirectional if requested). A None entry stops the
  construction and returns what has been built so far, without attention.
  Every layer but the last returns full sequences; the last one does so only
  when attention needs them.

  Returns the resulting tensor.
  """
  last = len(size) - 1
  for position, units in enumerate(size):
    if units is None:
      return z
    keep_sequences = True if position < last else attention
    recurrent = cell_type(units, return_sequences=keep_sequences)
    if bidirectional:
      recurrent = Bidirectional(recurrent)
    z = recurrent(z)

  return add_attention(z) if attention else z

def add_dnn(z, size, dropout, activation="relu"):
  """
  Append the fully connected part of the network to `z`.

  Each entry of `size` adds a Dense layer with that many units followed by a
  Dropout layer with rate `dropout`. A None entry stops the construction.

  Returns the resulting tensor.
  """
  for units in size:
    if units is None:
      break
    hidden = Dense(units, activation=activation)(z)
    z = Dropout(dropout)(hidden)

  return z

def add_attention(activations):
  """
  Append an attention block on top of `activations`.

  A score per position is computed (BatchNormalization -> Dense(1, tanh)),
  normalised with a softmax, broadcast to the feature size and multiplied
  with the activations; the weighted activations are then summed over
  axis -2.

  Returns the resulting tensor.
  """
  feature_dim = K.int_shape(activations)[-1]

  scores = BatchNormalization()(activations)
  scores = Dense(1, activation='tanh')(scores)
  print(scores)
  weights = Flatten()(scores)
  weights = Activation('softmax')(weights)
  # Repeat the weights once per feature and swap axes so that they line up
  # with `activations` for the element-wise product.
  weights = RepeatVector(feature_dim)(weights)
  weights = Permute([2, 1])(weights)

  weighted = Multiply()([activations, weights])
  return Lambda(lambda xin: K.sum(xin, axis=-2), output_shape=(feature_dim,))(weighted)

In [None]:
def create_model(params):
  """
  Assemble an (uncompiled) Keras model from a configuration dictionary.

  The stages are chained in a fixed order: embeddings -> CNN -> RNN -> DNN ->
  sigmoid output with `params["num_classes"]` units. Each stage may be a
  no-op depending on the configuration.

  Returns the uncompiled model.
  """
  # The first (time) dimension is left open to accept variable length inputs.
  inputs = Input(name='inputs',shape=(None, params["max_length"]))

  # The CNN output is flattened only when no recurrent layer follows it.
  flatten_cnn = [None] == params["rnn_size"]

  z = add_embeddings(inputs,
                     params["load_emb"],
                     params["emb_size"],
                     params["trainable_emb"],
                     params["max_words"],
                     params["max_length"],
                     params["embedding_matrix"])
  z = add_cnn(z, params["cnn_size"], params["cnn_filter"], flatten_cnn)
  z = add_rnn(z, params["rnn_size"], params["bidirectional"], params["cell_type"], params["attention"])
  z = add_dnn(z, params["dnn_size"], params["dropout"])

  outputs = Dense(params["num_classes"], activation='sigmoid', name='output_layer')(z)

  return Model(inputs=inputs,outputs=outputs)

In [None]:
def create_models(network_parameters, verbose=False):
  """
  Build and compile one model per configuration dictionary.

  Every configuration is printed before its model is created. Models are
  compiled with binary cross-entropy, the Adam optimizer and binary accuracy;
  with `verbose` the summary of each model is printed as well.

  Returns the list of compiled models, in the order of the configurations.
  """
  compiled = []
  for config in network_parameters:
    print(config)
    model = create_model(config)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])
    compiled.append(model)
    if verbose:
      model.summary()

  return compiled

In [None]:
# Build and compile every model of the grid, printing each summary.
model_list = create_models(network_parameters, verbose=True)

# TB Colab Callback
We rewrite the tensorboardcolab callbacks to create different sessions depending on the variables our trainings have. This helps to differentiate the models in tensorboard.

In [None]:
import os
from keras.callbacks import TensorBoard
import tensorflow as tf

class TensorBoardColabCallback(TensorBoard):
    """
    TensorBoard callback that writes training and validation metrics to two
    sibling sub-directories ('training_<name>' and 'validation_<name>'), so
    that several runs can be told apart in TensorBoard.

    :param tbc: Object exposing `get_graph_path()`, used as the log root.
                When None, the `log_dir` keyword (default 'logs') is used
                instead of leaving the callback uninitialised.
    :param write_graph: Forwarded to the parent TensorBoard callback.
    :param name: Suffix identifying the run in the directory names.
    :param kwargs: Any other argument accepted by the parent callback.
    """

    def __init__(self, tbc=None, write_graph=True, name=None, **kwargs):
        if tbc is None:
            # Returning early here used to skip the parent constructor and
            # produce an unusable callback; fall back to a local directory.
            log_dir = kwargs.pop('log_dir', 'logs')
        else:
            log_dir = tbc.get_graph_path()

        # Make the original `TensorBoard` log to a subdirectory 'training'
        training_log_dir = os.path.join(log_dir, 'training_{}'.format(name))
        super(TensorBoardColabCallback, self).__init__(
            training_log_dir, write_graph=write_graph, **kwargs)

        # Log the validation metrics to a separate subdirectory
        self.val_log_dir = os.path.join(log_dir, 'validation_{}'.format(name))
        self.val_writer = None

    def set_model(self, model):
        # Setup writer for validation metrics. `tf.summary.FileWriter` only
        # exists in TF1; the notebook uses TF2 APIs (`tf.random.set_seed`), so
        # the TF2 summary writer is used.
        self.val_writer = tf.summary.create_file_writer(self.val_log_dir)
        super(TensorBoardColabCallback, self).set_model(model)

    def on_epoch_end(self, epoch, logs=None):
        # Pop the validation logs and handle them separately with
        # `self.val_writer`. Also rename the keys so that they can
        # be plotted on the same figure with the training metrics
        logs = logs or {}
        val_logs = {k.replace('val_', ''): v for k, v in logs.items() if k.startswith('val_')}

        with self.val_writer.as_default():
            for name, value in val_logs.items():
                # float() accepts Python floats as well as numpy scalars.
                tf.summary.scalar(name, float(value), step=epoch)
        self.val_writer.flush()

        # Pass the remaining logs to `TensorBoard.on_epoch_end`
        logs = {k: v for k, v in logs.items() if not k.startswith('val_')}
        super(TensorBoardColabCallback, self).on_epoch_end(epoch, logs)

    def on_train_end(self, logs=None):
        super(TensorBoardColabCallback, self).on_train_end(logs)
        if self.val_writer is not None:
            self.val_writer.close()

# Expose the customised callback through the `tb` namespace (presumably the
# tensorboardcolab module - TODO confirm). Nothing in this notebook imports
# `tb`, so the patch is applied only when that name exists; otherwise this
# cell would stop a fresh "Run All" with a NameError.
if 'tb' in globals():
    tb.TensorBoardColabCallback = TensorBoardColabCallback

In [None]:
def define_callbacks(name):
  """
  Return the list of Keras callbacks used while training a model: a
  learning-rate reduction on plateau and early stopping with a patience of 4,
  both with their default monitored quantity.

  `name` is currently unused; TensorBoard logging (which needed it) is
  disabled.
  """
  lr_on_plateau = ReduceLROnPlateau()
  early_stopping = EarlyStopping(patience=4)
  return [lr_on_plateau, early_stopping]

In [None]:
# Create an array of names: one per configuration, in the same order as
# network_parameters, encoding the hyper-parameter values.
network_names = []
for p in network_parameters:
  name = "load_emb_{}_num_classes_{}_emb_size_{}_trainable_emb_{}_cnn_size_{}_cnn_filter_{}_pool_rnn_size_{}_cell_type_{}_bidirectional_{}_attention_{}_dropout_{}_dnn_size_{}_batch_size_{}".format(
      p["load_emb"], p["num_classes"], p["emb_size"], p["trainable_emb"], p["cnn_size"], p["cnn_filter"], p["rnn_size"], 
      # Reduce the class repr (e.g. "<class '...GRU'>") to its bare class name.
      str(p["cell_type"]).split(".")[-1].replace("'", "").replace(">", ""), p["bidirectional"],
      p["attention"], p["dropout"], p["dnn_size"], p["batch_size"])
  # Strip list punctuation so the name can be used as a file name.
  name = name.replace(" ", "").replace("[", "").replace("]", "").replace(",", "-")
  network_names.append(name)

In [None]:
for i, net_model in enumerate(model_list):
  # Set a name for the model based on the tweaked parameters
  p = network_parameters[i]
  name = network_names[i]
  model_path = SST_HOME+"DL/models/C/" + name

  # If the model exists, don't compute it again. `exists` is used instead of
  # `isfile` because, depending on the Keras version, an extension-less path
  # is saved as a directory, which `isfile` never detects.
  if os.path.exists(model_path):
    continue
    
  print("\n\n********************************************\n")    
  print(name)
  callbacks = define_callbacks(name)
  # Fit the model and extract its data. Each subject is a batch of its own
  # because subjects have a different number of writings.
  for epoch in range(20):
    print("Epoch {}\n".format(epoch))
    for subject, label in zip(train_x_subjects, train_y_subjects):
      # NOTE(review): every fit() call is a fresh one-epoch run without
      # validation data, so the plateau / early-stopping callbacks have
      # nothing to act on - confirm whether they are still wanted.
      history = net_model.fit(np.array([subject]), np.array([label]), callbacks=callbacks, class_weight={0: 0.11, 1: 0.89})
      # Print the loss and accuracy of the training step
      print(history.history['loss'])
      print(history.history['binary_accuracy'])
    
  # And save the model
  net_model.save(model_path)
  
# To free memory from the gpu
from keras import backend as K
K.clear_session()

Evaluation
========

In [None]:
from sklearn.metrics import classification_report

# The output layer is a sigmoid, so scores lie in [0, 1]: thresholds of 1.0
# and above would label every subject as 0. Only 0.1 ... 0.9 are evaluated.
thresholds = np.arange(1, 10) / 10

# Per-model cache so that each saved model is loaded and run only once
# instead of once per threshold.
model_scores = {}   # model index -> array of per-subject scores, or None
load_errors = {}    # model index -> message explaining why loading failed

for threshold in thresholds:
  print("THRESHOLD: {}\n**********************".format(threshold))
  predictions_list = []
  for i, _ in enumerate(model_list):
    model_name = network_names[i]
    print("\n\n********************************************\n")
    print(model_name)

    if i not in model_scores:
      model_path = SST_HOME+"DL/models/C/" + model_name
      try:
        net_model = load_model(model_path)
      except ValueError:
        # Report the model that actually failed (not a stale `name`).
        load_errors[i] = "The model {} was not loaded correctly".format(model_name)
        model_scores[i] = None
      except OSError:
        load_errors[i] = "The model {} does not exist".format(model_name)
        model_scores[i] = None
      else:
        # One subject per call: subjects have different numbers of writings.
        model_scores[i] = np.array([
            float(np.ravel(net_model.predict(np.array([subject])))[0])
            for subject in test_x_subjects
        ])
        # Release memory
        K.clear_session()

    if model_scores[i] is None:
      print(load_errors[i])
      continue

    predictions = np.where(model_scores[i] < threshold, 0, 1)
    predictions_list.append(predictions)
    # measuring performance on test set
    cr = classification_report(test_y_subjects, predictions)
    print(cr)


In [None]:
!jupyter nbconvert --to script C-Copy2.ipynb