# DEMO
This demo performs supervised text classification on the corpus in `corpus.xlsx`, first with a simple CNN model and then with HCAN using 1 and 2 hierarchies. The data is pre-processed and cleaned beforehand.

# Imports 

In [None]:
import tensorflow as tf
from tensorflow import keras
import  tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
import re
import math
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, concatenate, Dense, Activation, Dropout, Softmax, Layer
from tensorflow.keras.constraints import max_norm
from tensorflow.keras.initializers import glorot_normal
from tensorflow.keras.utils import plot_model
import os
import sys
module_path = '../src'
if module_path not in sys.path:
    sys.path.append(module_path)
from HCAN import *
from simple_CNN import *
import setup




# Initialize some variables

In [None]:
# Placeholder written in place of words that have no pre-trained embedding.
UNKNOWN_TOKEN = "unknowntoken"
# Placeholder written in place of currency amounts found in the text.
CURRENCY_TOKEN = "currencytoken"
# Size of the word vectors; it also selects which GloVe file is loaded.
EMBEDDING_DIM = 50
embedding_root_dir = setup.embedding_root_dir
embedding_file_name = "glove.6B.{}d.txt".format(EMBEDDING_DIM)
words_embedding_path = os.path.join(embedding_root_dir, embedding_file_name)

# Text representation and pretreatment

## Loading the corpus

In [None]:
# Path is resolved relative to the notebook's working directory.
corpus_path = 'corpus.xlsx'
# The first spreadsheet column becomes the DataFrame index.
corpus = pd.read_excel(io=corpus_path, index_col=0)


## Cleaning data
Clean the data by:
* Removing special characters
* Splitting contractions (anyword's, he'll, should've, ...) at the apostrophe so that they become two words (anyword 's, he 'll, should 've)
* Replacing currency amounts with ```CURRENCY_TOKEN```

Then split each line into words on the special characters.

In [None]:

# Patterns used by `pretreat`, compiled once.
# Typographic punctuation that is simply blanked out.
_ODD_PUNCT_RE = re.compile(r'[…―–‚»•]')
# A word followed by an English clitic: 's, 'd, 'll, 't, 've, 're or 'm.
# (The previous character class `[sd(ll)t(ve)(re)m]` matched ONE character out
# of "sd(l)tverm", so it also split things like "d'etat" or "word'(".)
_CONTRACTION_RE = re.compile(r"(\w+)('(?:ll|ve|re|s|d|t|m))\b")
# A euro/pound amount: digits, optional thousands groups, optional decimals,
# optional "m"/"bn" magnitude suffix (only when the suffix ends the word).
_CURRENCY_RE = re.compile(r"[€£]\d+(?:,\d+)*(?:\.\d+)?(?:(?:bn|m)\b)?")


def pretreat(line) :
    """Clean one raw text line and split it into lower-cased words.

    Steps:
      1. blank out a few typographic punctuation characters;
      2. normalise the curly apostrophe to a straight one;
      3. detach contractions so that "he'll" becomes "he 'll";
      4. replace euro/pound amounts with ``CURRENCY_TOKEN``;
      5. tokenise with Keras' ``text_to_word_sequence``.

    Parameters
    ----------
    line : str
        Raw text of one corpus entry.

    Returns
    -------
    list of str
        The words of the line, lower-cased. Apostrophes are not in the filter
        list, so detached clitics keep their leading quote (e.g. "'ll").
    """
    line = _ODD_PUNCT_RE.sub(' ', line)
    line = line.replace("’", "'")
    line = _CONTRACTION_RE.sub(r"\g<1> \g<2>", line)
    # Pad the token with spaces so that text glued to the amount
    # (e.g. "£5million") cannot fuse with the placeholder; the tokenizer
    # below drops the empty strings produced by repeated spaces.
    line = _CURRENCY_RE.sub(" " + CURRENCY_TOKEN + " ", line)
    return keras.preprocessing.text.text_to_word_sequence(line, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~’“—”‘©®™\t\n\xa0', lower=True, split=' ')

words = [pretreat(line) for line in corpus["Campaign Text"]]



Some transformations must be applied to each word rather than to the whole corpus.

In [None]:
# Optional leading quote, a body without any quote, optional trailing quote.
_EDGE_QUOTES = re.compile("^'?([^']*)'?$")


def transform(word) :
    """Remove one leading and/or one trailing apostrophe from `word`.

    Words that contain an apostrophe anywhere else (e.g. "o'clock") do not
    match the pattern and are returned unchanged.
    """
    matched = _EDGE_QUOTES.match(word)
    if matched is None :
        return word
    return matched.group(1)

# Apply `transform` exactly once per word (it used to be evaluated twice:
# once for the filter and once for the value) and drop words that end up empty.
clean_words = [
    [cleaned for cleaned in map(transform, line_words) if cleaned != ""]
    for line_words in words
]


Words of the first sentence

In [None]:
# Display the cleaned words of the first corpus entry as a quick sanity check.
clean_words[0]

# Load embedding data and create words dictionary

In [None]:
def create_word_dict(lines) :
    """Parse embedding text lines into a ``{word: vector}`` dictionary.

    Each line is expected to be ``word v1 v2 ... vn`` with single spaces as
    separators.

    Parameters
    ----------
    lines : iterable of str
        Lines of the embedding file, with or without their line terminator.

    Returns
    -------
    dict
        Maps each word to a 1-D ``np.float32`` array holding its vector.
        Lines that hold no vector (blank or word-only) are skipped.
    """
    word_dict = {}
    for line in lines :
        # Strip only the line terminator. Blindly dropping the last character
        # corrupted the final value of a line that has no trailing newline
        # (typically the last line of the file).
        fields = line.rstrip("\r\n").split(" ")
        if len(fields) < 2 :
            continue
        word_dict[fields[0]] = np.array(fields[1:], dtype=np.float32)
    return word_dict

# Read every line of the embedding file (line terminators are kept),
# then parse the lines into the word -> vector dictionary.
with open(words_embedding_path, mode="r", encoding="utf8") as f:
    lines = list(f)

word_dict = create_word_dict(lines)

# Last Cleaning
Filter by occurrence and replace unknown words with ```UNKNOWN_TOKEN```

In [None]:
# Count how often every word occurs across the whole corpus.
df = pd.Series([word for line in clean_words for word in line])
counts = df.value_counts()
# Only words seen more than twice are kept.
valid_occurences = set(counts[counts > 2].index)
# Drop the rare words, then map every remaining word that has no embedding
# (and is not the currency placeholder) to UNKNOWN_TOKEN.
clean_words = [
    [
        (word if (word in word_dict or word == CURRENCY_TOKEN) else UNKNOWN_TOKEN)
        for word in line
        if word in valid_occurences
    ]
    for line in clean_words
]
# Vocabulary that actually remains in the cleaned corpus.
tokens = {word for line in clean_words for word in line}


# Create embedding matrix

In [None]:
def create_embedding_matrix(tokens, word_dict, dim) :
    """Build the embedding matrix and the word <-> index lookup tables.

    Index layout:
      * 0 - never assigned to a word (its row stays all-zero);
      * 1 - ``UNKNOWN_TOKEN`` (row left all-zero);
      * 2 - ``CURRENCY_TOKEN`` (row left all-zero);
      * 3.. - the remaining tokens, in sorted order, with their vector
        copied from `word_dict`.

    Unlike the previous version this does not modify `tokens` (so the calling
    cell can be re-run), does not fail when a special token is absent from
    `tokens`, and assigns indexes in a reproducible order instead of
    following set iteration order.

    Parameters
    ----------
    tokens : iterable of str
        Vocabulary of the cleaned corpus; may or may not contain the two
        special tokens.
    word_dict : dict
        Maps every regular token to a vector of length `dim`.
    dim : int
        Embedding dimension.

    Returns
    -------
    tuple
        ``(w2i, i2w, mat)``: word -> index dict, index -> word dict and the
        ``np.float32`` embedding matrix.

    Raises
    ------
    KeyError
        If a regular token has no entry in `word_dict`.
    """
    special_tokens = (UNKNOWN_TOKEN, CURRENCY_TOKEN)
    regular_tokens = sorted(set(tokens).difference(special_tokens))
    # 3 reserved rows (0, unknown, currency) + one row per regular token.
    # One spare trailing row is kept so the shape equals what the former
    # sizing (len(tokens) + 2, both special tokens included) produced.
    nrows = len(regular_tokens) + 4
    mat = np.zeros(shape=(nrows, dim), dtype=np.float32)
    w2i = {UNKNOWN_TOKEN: 1, CURRENCY_TOKEN: 2}
    i2w = {1: UNKNOWN_TOKEN, 2: CURRENCY_TOKEN}
    for i, token in enumerate(regular_tokens, start=3) :
        mat[i, :] = word_dict[token]
        w2i[token] = i
        i2w[i] = token

    return (w2i, i2w, mat)
  


# What's on the label side ? 
Due to the imbalanced nature of the dataset's labels, we apply weights here that favor the rare labels.

## Weighting

In [None]:
labels = corpus["LEAD ARCHETYPE"]
# Each label is weighted by the inverse of its number of occurrences ...
label_counts = labels.value_counts()
inverse_weights = 1 / label_counts
# ... and the weights are rescaled so that they sum to 1.
norm_weights = inverse_weights / sum(inverse_weights)


## F1-score function
We create a function that calculates the F1-score

In [None]:
def recall_m(y_true, y_pred):
    """Recall: rounded true positives divided by rounded actual positives.

    Both tensors are clipped to [0, 1] and rounded before being summed;
    ``K.epsilon()`` is added to the denominator to avoid dividing by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision: rounded true positives divided by rounded predicted positives.

    Both tensors are clipped to [0, 1] and rounded before being summed;
    ``K.epsilon()`` is added to the denominator to avoid dividing by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(predicted) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1-score: harmonic mean of `precision_m` and `recall_m`.

    ``K.epsilon()`` in the denominator keeps the result finite when both
    precision and recall are zero.
    NOTE(review): when used as a Keras metric this is presumably evaluated
    per batch and averaged - confirm before comparing with a global F1.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_half

# Dataset creation


## Replace word by indexes and pad
* Replace words by indexes
* Pad with zeros in the end of each sequence

In [None]:
w2i, i2w, mat = create_embedding_matrix(tokens, word_dict, EMBEDDING_DIM)
# Replace every word by its integer index.
index_words = [[w2i[word] for word in line] for line in clean_words]
# Length of the longest entry; with maxlen=None the padding below pads
# every sequence to that same length, appending at the end ('post').
max_text_length = max(map(len, index_words))
padded_seq = keras.preprocessing.sequence.pad_sequences(
    index_words,
    maxlen=None,
    padding='post',
    truncating='post',
)


## Split data
Split the data into a train and a test dataset, label by label. The train data is shuffled later, when the batches are built.

In [None]:
def split_data(dataset, labels, train_ratio, random_state=None):
    """Split `dataset` into train and test parts, one label at a time.

    Each label's rows are split separately with `train_test_split`, so both
    parts contain roughly `train_ratio` / 1 - `train_ratio` of every label.
    The returned arrays are grouped by label (not interleaved).

    Parameters
    ----------
    dataset : np.ndarray
        One row per sample, positionally aligned with `labels`.
    labels : pd.Series
        Label of every sample.
    train_ratio : float
        Passed as ``train_size`` to `train_test_split`.
    random_state : int or None, optional
        Forwarded to `train_test_split`; pass an int to make the split
        reproducible. The default (None) keeps the previous unseeded
        behaviour.

    Returns
    -------
    tuple of np.ndarray
        ``(train_data, train_labels, test_data, test_labels)``.
    """
    train_data, train_labels = [], []
    test_data, test_labels = [], []
    for label in labels.unique() :
        # Plain boolean array: rows are selected by position.
        mask = np.asarray(labels == label)
        train, test = train_test_split(
            dataset[mask], train_size=train_ratio, random_state=random_state
        )
        train_data.extend(train)
        train_labels.extend(np.repeat([label], len(train)))
        test_data.extend(test)
        test_labels.extend(np.repeat([label], len(test)))
    return (np.array(train_data), np.array(train_labels), np.array(test_data), np.array(test_labels))


In [None]:
# 70 % of every label goes to the train set.
train_data, ftrain_labels, test_data, ftest_labels = split_data(padded_seq, labels, 0.7)
# Per-sample weight = normalized inverse-frequency weight of the sample's label.
weights = np.asarray([norm_weights[label] for label in ftrain_labels])
num_train_data, num_test_data = len(train_data), len(test_data)

## One hot encode the labels

In [None]:
label_encoder = OneHotEncoder()
# The encoder works on 2-D input: one column holding the label of each sample.
train_label_column = np.array(ftrain_labels).reshape(-1, 1)
test_label_column = np.array(ftest_labels).reshape(-1, 1)
# Categories are learned from the train labels only, then reused for the test labels.
train_labels = label_encoder.fit_transform(train_label_column)
test_labels = label_encoder.transform(test_label_column)

## Tensorflow dataset creation

In [None]:
# The one-hot labels are sparse matrices; convert them with `.toarray()`
# (the `.A` shortcut is no longer available in recent SciPy releases).
# Train elements are (inputs, one-hot label, sample weight) triples.
train_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_labels.toarray(), weights))
test_dataset = tf.data.Dataset.from_tensor_slices((test_data, test_labels.toarray()))

# Simple CNN
Now we train a first model using a simple CNN architecture

In [None]:
Simple_CNN_BATCH = 21
Simple_CNN_NUM_EPOCHS = 15
# NOTE(review): `bulid_simple_CNN` (sic) is not defined in this notebook;
# presumably it comes from the `simple_CNN` star import - keep the spelling
# in sync with that module.
simpleCnnModel = bulid_simple_CNN(mat, max_text_length,  [2,3], 80, "tanh", 0.3, True, 4 )
simpleCnnModel.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy', f1_m])
# The train set is reshuffled and repeated indefinitely; `steps_per_epoch`
# therefore defines what one epoch is.
SCNN_train_data = train_dataset.shuffle(num_train_data).batch(Simple_CNN_BATCH).repeat()
# The whole test set forms a single validation batch.
SCNN_test_data = test_dataset.batch(num_test_data).repeat()
# Step counts must be integers: round up so that the last, smaller batch of
# each pass over the data is included (the plain division gave a float).
SCNN_steps_per_epoch = math.ceil(num_train_data / Simple_CNN_BATCH)
hist = simpleCnnModel.fit(SCNN_train_data,epochs= Simple_CNN_NUM_EPOCHS, 
                          steps_per_epoch=SCNN_steps_per_epoch, 
                          validation_data= SCNN_test_data, validation_steps=1)

# HCAN
Now we train our model with the HCAN architecture

## Build the model with 1 hierarchy
We first train it on words directly, so there is only one hierarchy.

In [None]:
# One-hierarchy HCAN built on the flat word-index sequences.
# NOTE(review): the meaning of the trailing arguments (4, 10) is defined by
# `build_HCAN`, which is not visible in this notebook.
hcan_model = build_HCAN(max_text_length, mat, 4, 10)
hcan_model.compile(
    metrics=['accuracy', f1_m],
    optimizer='adam',
    loss='categorical_crossentropy',
)
hcan_model.summary()

## Train the model

In [None]:
HCAN1_BATCH = 7
HCAN1_NUM_EPOCHS = 4
HCAN1_train_data = train_dataset.shuffle(num_train_data).batch(HCAN1_BATCH).repeat()
# The whole test set forms a single validation batch.
HCAN1_test_data = test_dataset.batch(num_test_data).repeat()
# Step counts must be integers: round up so that the last, smaller batch of
# each pass over the data is included (the plain division gave a float).
HCAN1_steps_per_epoch = math.ceil(num_train_data / HCAN1_BATCH)
# One validation step already covers the full test set; the former value of
# 13 only re-evaluated that same batch 13 times.
hcan_hist = hcan_model.fit(HCAN1_train_data,epochs= HCAN1_NUM_EPOCHS, steps_per_epoch=HCAN1_steps_per_epoch, 
                           validation_data= HCAN1_test_data, validation_steps=1)

## Build the model with 2 hierarchies
We split each entry into `splt` chunks (10 here). This could be done more meaningfully, for example by splitting the corpus by sentence; the fixed-size split is only used to test the model.

In [None]:
splt = 10
HCAN2_BATCH = 21
HCAN2_NUM_EPOCHS = 30


def _split_into_chunks(sequences, n_chunks) :
    """Reshape ``(n_samples, length)`` into ``(n_samples, n_chunks, chunk_len)``.

    When `length` is not a multiple of `n_chunks` the sequences are first
    padded with zeros at the end, so every chunk has the same length.
    `np.array_split` would return chunks of unequal length in that case,
    which cannot be stacked into a regular array.
    """
    sequences = np.asarray(sequences)
    length = sequences.shape[1]
    chunk_len = math.ceil(length / n_chunks)
    missing = chunk_len * n_chunks - length
    if missing :
        sequences = np.pad(sequences, ((0, 0), (0, missing)), mode="constant", constant_values=0)
    return sequences.reshape(sequences.shape[0], n_chunks, chunk_len)


strain_data = _split_into_chunks(train_data, splt)
stest_data = _split_into_chunks(test_data, splt)
# `.toarray()` replaces the `.A` shortcut, which is no longer available on
# sparse matrices in recent SciPy releases.
strain_dataset = tf.data.Dataset.from_tensor_slices((strain_data, train_labels.toarray(), weights))
stest_dataset = tf.data.Dataset.from_tensor_slices((stest_data, test_labels.toarray()))
HCAN2_train_data = strain_dataset.shuffle(num_train_data).batch(HCAN2_BATCH).repeat()
HCAN2_test_data = stest_dataset.batch(num_test_data).repeat()

# The model input shape is (number of chunks, chunk length).
hcan_model2 = build_HCAN(strain_data.shape[1:],mat, 4, 5)

hcan_model2.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy', f1_m])
hcan_model2.summary()


In [None]:
# Display the shape of the chunked training array.
strain_data.shape

## Train the model

In [None]:
# Step counts must be integers: round up so that the last, smaller batch of
# each pass over the data is included (the plain division gave a float).
HCAN2_steps_per_epoch = math.ceil(num_train_data / HCAN2_BATCH)
hcan2_hist = hcan_model2.fit(HCAN2_train_data,epochs= HCAN2_NUM_EPOCHS, 
                             steps_per_epoch=HCAN2_steps_per_epoch, 
                           validation_data= HCAN2_test_data, validation_steps=1)