# Packages:

In [None]:
try:
  from google.colab import drive
  !nvidia-smi
  drive.mount('/content/drive')
  path = 'drive/MyDrive/Thesis/'
except:
  path = './'

In [None]:
!pip install keras_preprocessing
!pip install transformers
import nltk
nltk.download('punkt')
!pip install lime

In [None]:
# Packages for loading data:
from os import walk
import os
import pprint
import itertools
import json
import re
import pickle
import sys
import warnings

# Packages for effective data storage / math utils:
import pandas as pd
import numpy as np

# Packages for plotting:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm

# Packages for test train data prep:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold

# Packages for classical modeling:
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# Packages for deep learning:
import keras
from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import (Embedding, Dense, Flatten, Input, Lambda,
                          GlobalMaxPooling1D, MaxPooling1D, Conv1D,
                          Bidirectional, GRU, LSTM,
                          TimeDistributed, Dropout)
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from keras.callbacks import ModelCheckpoint
from keras.utils import to_categorical
from nltk.tokenize import sent_tokenize
from tensorflow.keras.regularizers import l2
from tensorflow.keras.mixed_precision import Policy, set_global_policy

# Packages for performance:
from sklearn.metrics import (classification_report, confusion_matrix,
                             accuracy_score, f1_score, roc_auc_score)

# Packages for model interpretation:
from lime import lime_text

# Misc.:
import time
import multiprocessing

seed = 101
cores = multiprocessing.cpu_count()

# Running Classical models:


**Function 1:**

1. Read in input data/model to use.

2. Runs hyperparameter tuning via 5-fold CV on training set with given model and data. Saves optimal hyperparameters and saves best model in a pickle.

---------------

**Function 2:**

3. Predicts test with opt. model.

4. Saves confusion matrix.

---------------

**Classical Models:**

* Logistic Regression -
* Gaussian Naïve Bayes -
* SVM -
* DT -
* RF -
* LightGBM -
* k-NN -
* perceptron (single & multi) -

**Deep Learning Models:**

* BERT
* LSTM
* GRU
* BiLSTM
* CNN
* HAN
* BERT
* Hier-BERT

---------------

**Input Data:**

* Bag-of-ngrams: ngram (1, 1) (+ Truncated SVD) (+ LDA)
* Bag-of-ngrams: ngram (1, 2) (+ Truncated SVD) (+ LDA)
* TF-IDF: ngram (1, 1) (+ Truncated SVD) (+ LDA)
* TF-IDF: ngram (1, 2) (+ Truncated SVD) (+ LDA)

In [None]:
def vec_path_getter(
    vecpath : str,
    contains : str,
):
    """List the files directly inside ``vecpath`` whose name contains ``contains``.

    Only the first entry produced by ``walk`` is used, i.e. the files at the
    top level of ``vecpath``; sub-directories are not descended into.

    :param vecpath: Directory to scan.
    :param contains: Substring a file name must contain to be kept.
    :return: List of ``vecpath + "/" + <file name>`` strings. Empty when
        ``walk`` yields nothing or when no file name matches.
    """
    # The default tuple covers the case where walk() yields nothing at all.
    _, _, top_level_files = next(walk(vecpath), (None, None, []))

    matches = []
    for name in top_level_files:
        if contains in name:
            matches.append(str(vecpath + "/" + name))

    return matches

# Collect every file in the vectorised-data folder whose name contains "x".
vec_x_paths = vec_path_getter(
    path + "ECHR_Dataset_vec",
    "x"
    )
# The dataset name is the text between "vec/" and the
# "_test_x.parquet.gzip" / "_train_x.parquet.gzip" suffix.
# NOTE(review): re.search returns None for a path that contains "x" but does
# not match `pattern`; .group(0) would then raise AttributeError.
pattern = r'(?<=vec/)(.*?)(?=_test_x.parquet.gzip|_train_x.parquet.gzip)'
unique_datasets = list(set([re.search(pattern, string).group(0) for string in vec_x_paths]))
# Regroup the flat path list into one inner list per dataset name, holding
# the paths that contain "<name>_train" or "<name>_test".
vec_x_paths = [[x for x in vec_x_paths if str(i + "_train") in x or str(i + "_test") in x] for i in unique_datasets]

# Label files: every file in the cleaned-data folder whose name contains "y".
y_paths = vec_path_getter(
    path + "ECHR_Dataset_clean",
    "y"
    )

# Estimator classes (not instances) to tune. The order matters:
# run_models_on_datasets pairs classical_models[i] with params[i].
classical_models = [
    LogisticRegression,
    GaussianNB,
    SVC,
    DecisionTreeClassifier,
    RandomForestClassifier,
    LGBMClassifier,
    KNeighborsClassifier,
    Perceptron,
    MLPClassifier,
    ]
# Hyperparameter search spaces, one dict per entry of `classical_models`
# (same order). Each dict maps a constructor argument to its candidate values.
# NOTE(review): some sampled combinations may be rejected by the estimator
# (e.g. LogisticRegression penalty/solver pairs) — confirm how the search
# treats failed fits.
params = [
    { # LogisticRegression
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': [None, 'l1', 'l2'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    },
    { # GaussianNB
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1],
    },
    { # SVC
    'C': [0.001, 0.01, 0.1, 1, 10],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    },
    { # DecisionTreeClassifier
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': np.arange(10, 50, 2),
    'splitter': ['best', 'random'],
    'ccp_alpha': np.arange(0, 0.2, 0.01),
    },
    { # RandomForestClassifier
    'n_estimators': np.arange(10, 200, 10),
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': np.arange(10, 50, 2),
    'ccp_alpha': np.arange(0, 0.2, 0.01),
    },
    { # LGBMClassifier
    'boosting_type': ['gbdt', 'dart', 'goss'],
    'num_leaves': [10, 50, 100, 200],
    'max_depth': [5, 10, 15, 20, 50],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 500, 1000],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    },
    { # KNeighborsClassifier
    'n_neighbors': np.arange(10, 50, 2),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2, 3]
    },
    { # Perceptron
    'penalty': [None, 'l1', 'l2', 'elasticnet'],
    'alpha': [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1],
    'max_iter': np.arange(1000, 5000, 1000),
    },
    { # MLPClassifier
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'alpha': [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    }
    ]

In [None]:
# Manual override of the automatically discovered `vec_x_paths`: uncomment
# the [train, test] path pairs that should be run.
# NOTE(review): with every entry commented out this assigns an empty list,
# so the model run below has no dataset to iterate over. The paths are also
# hard-coded to the Colab Drive prefix rather than built from `path`.
vec_x_paths = [
    #['drive/MyDrive/Thesis/ECHR_Dataset_vec/tfidf_bi_train_x.parquet.gzip', 'drive/MyDrive/Thesis/ECHR_Dataset_vec/tfidf_bi_test_x.parquet.gzip'],
    #['drive/MyDrive/Thesis/ECHR_Dataset_vec/tfidf_uni_train_x.parquet.gzip', 'drive/MyDrive/Thesis/ECHR_Dataset_vec/tfidf_uni_test_x.parquet.gzip'],
    #['drive/MyDrive/Thesis/ECHR_Dataset_vec/bow_bi_train_x.parquet.gzip', 'drive/MyDrive/Thesis/ECHR_Dataset_vec/bow_bi_test_x.parquet.gzip'],
    #['drive/MyDrive/Thesis/ECHR_Dataset_vec/bow_uni_train_x.parquet.gzip', 'drive/MyDrive/Thesis/ECHR_Dataset_vec/bow_uni_test_x.parquet.gzip'],
    #['drive/MyDrive/Thesis/ECHR_Dataset_vec/bow_uni_lda_train_x.parquet.gzip', 'drive/MyDrive/Thesis/ECHR_Dataset_vec/bow_uni_lda_test_x.parquet.gzip'],
    #['drive/MyDrive/Thesis/ECHR_Dataset_vec/bow_bi_lda_train_x.parquet.gzip', 'drive/MyDrive/Thesis/ECHR_Dataset_vec/bow_bi_lda_test_x.parquet.gzip'],
    #['drive/MyDrive/Thesis/ECHR_Dataset_vec/tfidf_uni_lda_train_x.parquet.gzip', 'drive/MyDrive/Thesis/ECHR_Dataset_vec/tfidf_uni_lda_test_x.parquet.gzip'],
    #['drive/MyDrive/Thesis/ECHR_Dataset_vec/tfidf_bi_lda_train_x.parquet.gzip', 'drive/MyDrive/Thesis/ECHR_Dataset_vec/tfidf_bi_lda_test_x.parquet.gzip'],
    #['drive/MyDrive/Thesis/ECHR_Dataset_vec/tfidf_uni_tsvd_train_x.parquet.gzip', 'drive/MyDrive/Thesis/ECHR_Dataset_vec/tfidf_uni_tsvd_test_x.parquet.gzip'],
    #['drive/MyDrive/Thesis/ECHR_Dataset_vec/tfidf_bi_tsvd_train_x.parquet.gzip', 'drive/MyDrive/Thesis/ECHR_Dataset_vec/tfidf_bi_tsvd_test_x.parquet.gzip'],
    #['drive/MyDrive/Thesis/ECHR_Dataset_vec/bow_uni_tsvd_train_x.parquet.gzip', 'drive/MyDrive/Thesis/ECHR_Dataset_vec/bow_uni_tsvd_test_x.parquet.gzip'],
    # ['drive/MyDrive/Thesis/ECHR_Dataset_vec/bow_bi_tsvd_train_x.parquet.gzip', 'drive/MyDrive/Thesis/ECHR_Dataset_vec/bow_bi_tsvd_test_x.parquet.gzip'],
    #['drive/MyDrive/Thesis/ECHR_Dataset_vec/w2v_train_x.parquet.gzip', 'drive/MyDrive/Thesis/ECHR_Dataset_vec/w2v_test_x.parquet.gzip'],
    #['drive/MyDrive/Thesis/ECHR_Dataset_vec/d2v_train_x.parquet.gzip', 'drive/MyDrive/Thesis/ECHR_Dataset_vec/d2v_test_x.parquet.gzip'],
    #['drive/MyDrive/Thesis/ECHR_Dataset_vec/glove_train_x.parquet.gzip', 'drive/MyDrive/Thesis/ECHR_Dataset_vec/glove_test_x.parquet.gzip'],
    ]

In [None]:
def hyperparameter_tuning(model, params, x_train, y_train,
                          n_iter=25, n_splits=5, random_state=42, n_jobs=None):
    """Tune one estimator class with a randomized, cross-validated search.

    :param model: Estimator class (not an instance); it is instantiated with
        its default arguments before the search.
    :param params: Dict mapping constructor argument names to candidate values.
    :param x_train: Training features.
    :param y_train: Training labels.
    :param n_iter: Number of parameter settings sampled by the search.
    :param n_splits: Number of shuffled K-fold splits used for validation.
    :param random_state: Seed shared by the fold shuffling and the sampling.
    :param n_jobs: Passed through to ``RandomizedSearchCV``.
    :return: Tuple ``(best_params, best_model)`` taken from the fitted
        search's ``best_params_`` and ``best_estimator_``.
    """
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    search = RandomizedSearchCV(
        model(), params, cv=kfold, n_iter=n_iter,
        n_jobs=n_jobs, random_state=random_state,
    )
    search.fit(x_train, y_train)

    return search.best_params_, search.best_estimator_

def save_model(best_model, filename):
    """Pickle ``best_model`` to ``filename``.

    The parent directory is created when it does not exist yet, so a missing
    output folder cannot discard a model after a long search.

    :param best_model: Any picklable object (here: a fitted estimator).
    :param filename: Destination file path.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'wb') as f:
        pickle.dump(best_model, f)

def run_models_on_datasets(models, params, datasets_paths):
    """Tune every model on every dataset and pickle each best estimator.

    Reads the notebook globals ``y_paths`` (label files) and ``path``
    (project root). Models are written to
    ``path + 'ECHR_model/model_<model>__dataset_<dataset>.pkl'``.

    :param models: Sequence of estimator classes.
    :param params: Sequence of search spaces; ``params[i]`` belongs to
        ``models[i]``.
    :param datasets_paths: Sequence of path pairs, one pair per dataset,
        holding the ``..._train_x`` and ``..._test_x`` parquet files in any
        order. The caller's lists are not modified.
    """
    # Only the training labels are needed for tuning; the test labels are no
    # longer loaded because nothing in this function uses them.
    y_train = pd.read_pickle([i for i in y_paths if 'train' in i][0])
    pattern = r'(?<=vec/)(.*?)(?=_test_x.parquet.gzip|_train_x.parquet.gzip)'

    for i, model in enumerate(models):
        # Prefer the class name; fall back to parsing repr() for objects
        # that do not expose __name__.
        temp_model_name = (getattr(model, '__name__', None)
                           or str(model).split('.')[-1].split("'")[0])

        for j, datasets in enumerate(datasets_paths):
            # Set up:
            # Sort a copy instead of the caller's list. For a pair sharing the
            # same prefix, "..._test_x" sorts before "..._train_x", so index 0
            # is the test file and index 1 the training file.
            ordered_paths = sorted(datasets)
            test_path, train_path = ordered_paths[0], ordered_paths[1]
            temp_data_name = re.search(pattern, test_path).group(0)
            print(f"Running model {temp_model_name} on dataset {temp_data_name}")

            # Read correct data (the test features are not needed for tuning):
            x_train = pd.read_parquet(train_path)

            # Hypertune:
            best_params, best_model = hyperparameter_tuning(model, params[i], x_train, y_train)

            # Save best model:
            filename = f"model_{temp_model_name}__dataset_{temp_data_name}"
            save_model(best_model, path + 'ECHR_model/' + filename + '.pkl')

            # Done:
            print(f"Best Parameters for model {i+1} on dataset {j+1}: ", best_params)

In [None]:
# Tune every classical model on each dataset pair listed in `vec_x_paths`.
# NOTE(review): `vec_x_paths` is whatever the preceding cell assigned; if all
# of its entries are commented out, no model is trained here.
run_models_on_datasets(classical_models, params, vec_x_paths)

Running model SVC on dataset bow_bi_tsvd


# Running DL models:

**Deep Learning Models:**

* CNN
* GRU
* HAN
* BiLSTM
* BERT
* Hier-BERT

In [None]:
# Load the training texts and labels.
df_train_x_raw = pd.read_pickle(path + "ECHR_Dataset_clean/df_train_x.pkl")
df_train_y_raw = pd.read_pickle(path + "ECHR_Dataset_clean/df_train_y.pkl")

# Create validation split:
# Texts and labels are joined into one frame first so that a single split
# keeps them aligned; 20% of the training data is held out for validation.
df_train_all = pd.concat({"TEXT": df_train_x_raw, "new_CONCLUSION": df_train_y_raw}, axis = 1)
df_train, df_val = train_test_split(df_train_all, test_size=0.2, random_state=seed)
df_train_x = df_train.TEXT
df_train_y = df_train.new_CONCLUSION
df_val_x = df_val.TEXT
df_val_y = df_val.new_CONCLUSION

# Load the held-out test texts and labels.
df_test_x  = pd.read_pickle(path + "ECHR_Dataset_clean/df_test_x.pkl")
df_test_y  = pd.read_pickle(path + "ECHR_Dataset_clean/df_test_y.pkl")

# Full corpus (original training data followed by the test data).
df_all_x = pd.concat([df_train_x_raw, df_test_x])
df_all_y = pd.concat([df_train_y_raw, df_test_y])

In [None]:
vocab_size = 20000  # Upper bound on the vocabulary size
embedding_dim = 256 # Width of each learned word vector

# Fit the word index on the training split only, then encode all three
# splits with that same index.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(df_train_x)
sequences_train = tokenizer.texts_to_sequences(df_train_x)
sequences_val = tokenizer.texts_to_sequences(df_val_x)
sequences_test = tokenizer.texts_to_sequences(df_test_x)

# Length of the longest document across train, validation and test.
max_len = max(
    max(len(seq) for seq in split)
    for split in (sequences_train, sequences_val, sequences_test)
)

# Pad every document to that common length.
token_train_x = pad_sequences(sequences_train, maxlen=max_len)
token_val_x = pad_sequences(sequences_val, maxlen=max_len)
token_test_x = pad_sequences(sequences_test, maxlen=max_len)

In [None]:
# Shared regularisation settings for the "reg" model variants.
drop_param = 0.2            # dropout rate
r_l2_param = l2(0.001)      # L2 penalty passed as kernel_regularizer
# NOTE(review): embed_l2_param is not referenced by any model in the visible
# cells — confirm whether it was meant for the Embedding layers.
embed_l2_param = l2(0.0001)

## CNN:

In [None]:
# Baseline CNN: embedding -> three Conv1D layers (kernel size 2), each
# followed by a max pooling layer -> two dense layers -> sigmoid output.
model_CNN = Sequential()
model_CNN.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model_CNN.add(Conv1D(256, 2, activation='relu'))
model_CNN.add(MaxPooling1D(2))
model_CNN.add(Conv1D(128, 2, activation='relu'))
model_CNN.add(MaxPooling1D(2))
model_CNN.add(Conv1D(128, 2, activation='relu'))
model_CNN.add(MaxPooling1D(4))  # pool size 4; this is NOT a global max pooling layer
# Flatten the remaining (time, channels) feature map into one vector per document
model_CNN.add(Flatten())
# Pass the flattened features through 2 dense layers
model_CNN.add(Dense(128, activation='relu'))
model_CNN.add(Dense(32, activation='relu'))
# Single sigmoid unit for the binary decision
model_CNN.add(Dense(1, activation='sigmoid'))

model_CNN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

print("Simplified convolutional neural network")
model_CNN.summary()

Simplified convolutional neural network
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 28659, 256)        5120000   
                                                                 
 conv1d (Conv1D)             (None, 28658, 256)        131328    
                                                                 
 max_pooling1d (MaxPooling1D  (None, 14329, 256)       0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 14328, 128)        65664     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 7164, 128)        0         
 1D)                                                             
                                                                 
 conv1d_2 (Conv1

In [None]:
# Train for 10 epochs, monitoring loss/accuracy on the validation split.
model_CNN.fit(token_train_x, df_train_y,
              validation_data=(token_val_x, df_val_y),
              epochs=10, verbose=1)

# Persist the model as it stands after the final epoch.
model_CNN.save(path + 'ECHR_model/model_cnn__dataset_all.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions, then
# report accuracy and macro-averaged F1 on the test set.
y_pred = (model_CNN.predict(token_test_x) > 0.5).astype("int32")
print(accuracy_score(df_test_y, y_pred))
print(f1_score(df_test_y, y_pred, average='macro'))

0.8454591068740592
0.8453325942350332


In [None]:
# Regularised CNN: same layer stack as model_CNN, with an L2 kernel penalty
# (r_l2_param) added to each Conv1D layer.
model_reg_CNN = Sequential()
model_reg_CNN.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model_reg_CNN.add(Conv1D(256, 2, activation='relu',
                         kernel_regularizer = r_l2_param))
model_reg_CNN.add(MaxPooling1D(2))
model_reg_CNN.add(Conv1D(128, 2, activation='relu',
                         kernel_regularizer = r_l2_param))
model_reg_CNN.add(MaxPooling1D(2))
model_reg_CNN.add(Conv1D(128, 2, activation='relu',
                         kernel_regularizer = r_l2_param))
model_reg_CNN.add(MaxPooling1D(4))
model_reg_CNN.add(Flatten())
model_reg_CNN.add(Dense(128, activation='relu'))
model_reg_CNN.add(Dense(32, activation='relu'))
model_reg_CNN.add(Dense(1, activation='sigmoid'))

model_reg_CNN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model_reg_CNN.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 28659, 256)        5120000   
                                                                 
 conv1d_10 (Conv1D)          (None, 28658, 256)        131328    
                                                                 
 max_pooling1d_9 (MaxPooling  (None, 14329, 256)       0         
 1D)                                                             
                                                                 
 conv1d_11 (Conv1D)          (None, 14328, 128)        65664     
                                                                 
 max_pooling1d_10 (MaxPoolin  (None, 7164, 128)        0         
 g1D)                                                            
                                                                 
 conv1d_12 (Conv1D)          (None, 7163, 128)        

In [None]:
# Train for 10 epochs, monitoring loss/accuracy on the validation split.
# Unlike the baseline CNN, this model is not saved to disk in this cell.
model_reg_CNN.fit(token_train_x, df_train_y,
                  validation_data=(token_val_x, df_val_y),
                  epochs=10, verbose=1)

# Threshold the sigmoid outputs at 0.5, then report test accuracy and macro F1.
y_pred = (model_reg_CNN.predict(token_test_x) > 0.5).astype("int32")
print(accuracy_score(df_test_y, y_pred))
print(f1_score(df_test_y, y_pred, average='macro'))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.7907676869041645
0.7906023639218092


## GRU:

In [None]:
# Baseline GRU: embedding -> one GRU layer (32 units, last state only)
# -> two dense layers -> sigmoid output.
model_GRU = Sequential()

model_GRU.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model_GRU.add(GRU(32, activation = 'tanh', recurrent_activation = 'sigmoid')) # these two options allow for GPU computation
model_GRU.add(Dense(32, activation='relu'))
model_GRU.add(Dense(16, activation='relu'))
# Single sigmoid unit for the binary decision
model_GRU.add(Dense(1, activation='sigmoid'))

model_GRU.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

model_GRU.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 28659, 256)        5120000   
                                                                 
 gru (GRU)                   (None, 32)                27840     
                                                                 
 dense_3 (Dense)             (None, 32)                1056      
                                                                 
 dense_4 (Dense)             (None, 16)                528       
                                                                 
 dense_5 (Dense)             (None, 1)                 17        
                                                                 
Total params: 5,149,441
Trainable params: 5,149,441
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 10 epochs, monitoring loss/accuracy on the validation split.
model_GRU.fit(token_train_x, df_train_y,
              validation_data=(token_val_x, df_val_y),
              epochs=10, verbose=1)

# Persist the model as it stands after the final epoch.
model_GRU.save(path + 'ECHR_model/model_gru__dataset_all.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions, then
# report accuracy and macro-averaged F1 on the test set.
y_pred = (model_GRU.predict(token_test_x) > 0.5).astype("int32")
print(accuracy_score(df_test_y, y_pred))
print(f1_score(df_test_y, y_pred, average='macro'))

0.6347215253386854
0.6344011240326486


In [None]:
# Regularised GRU: same layer stack as model_GRU, with input dropout
# (drop_param) and an L2 kernel penalty (r_l2_param) on the GRU layer.
model_reg_GRU = Sequential()
model_reg_GRU.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
# activation='tanh' and recurrent_activation='sigmoid' are the two options
# that allow for GPU computation (the remark refers to these two arguments,
# not to the dropout/regulariser arguments below).
model_reg_GRU.add(GRU(32, activation = 'tanh', recurrent_activation = 'sigmoid',
                      dropout = drop_param,
                      kernel_regularizer = r_l2_param))
model_reg_GRU.add(Dense(32, activation='relu'))
model_reg_GRU.add(Dense(16, activation='relu'))
model_reg_GRU.add(Dense(1, activation='sigmoid'))

model_reg_GRU.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model_reg_GRU.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 28659, 256)        5120000   
                                                                 
 gru_3 (GRU)                 (None, 32)                27840     
                                                                 
 dense_9 (Dense)             (None, 32)                1056      
                                                                 
 dense_10 (Dense)            (None, 16)                528       
                                                                 
 dense_11 (Dense)            (None, 1)                 17        
                                                                 
Total params: 5,149,441
Trainable params: 5,149,441
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 10 epochs, monitoring loss/accuracy on the validation split.
# Unlike the baseline GRU, this model is not saved to disk in this cell.
model_reg_GRU.fit(token_train_x, df_train_y,
                  validation_data=(token_val_x, df_val_y),
                  epochs=10, verbose=1)

# Threshold the sigmoid outputs at 0.5, then report test accuracy and macro F1.
y_pred = (model_reg_GRU.predict(token_test_x) > 0.5).astype("int32")
print(accuracy_score(df_test_y, y_pred))
print(f1_score(df_test_y, y_pred, average='macro'))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.6643251379829402
0.6641759974409898


## HAN:

### Backend:

In [None]:
class AttentionLayer(keras.layers.Layer):
    """Single-layer attention applied along the time axis of a 3d tensor.

    Accepts a tensor of shape (batch_size, time_steps, input_dim), scores
    every time step against a trainable context vector and returns the
    attention-weighted sum over the time axis.
    """

    def __init__(self, context_vector_length=100, **kwargs):
        """
        :param context_vector_length: (int) The size of the hidden context vector.
            If set to 1 this layer reduces to a standard attention layer.
        :param kwargs: Any argument that the baseclass Layer accepts.
        """
        super().__init__(**kwargs)
        self.context_vector_length = context_vector_length

    def build(self, input_shape):
        """Create the projection matrix ``W`` and the context vector ``u``."""
        feature_dim = input_shape[2]

        # Projects each time step into the hidden attention space.
        self.W = self.add_weight(
            name='W',
            shape=(feature_dim, self.context_vector_length),
            initializer=keras.initializers.get('uniform'),
            trainable=True
        )

        # Context vector the projected time steps are compared against.
        self.u = self.add_weight(
            name='context_vector',
            shape=(self.context_vector_length, 1),
            initializer=keras.initializers.get('uniform'),
            trainable=True
        )

        super().build(input_shape)

    def _get_attention_weights(self, X):
        """
        Computes the attention weights for each timestep in X
        :param X: 3d-tensor (batch_size, time_steps, input_dim)
        :return: 2d-tensor (batch_size, time_steps) of attention weights
        """
        # Hidden representation of every time step, then its similarity
        # with the context vector: one score per time step.
        hidden = K.tanh(K.dot(X, self.W))
        scores = K.dot(hidden, self.u)

        # Drop the trailing axis of size 1 and normalise the scores over
        # the time axis.
        scores = K.reshape(scores, (-1, scores.shape[1]))
        return K.softmax(scores)

    def call(self, X):
        """Return the attention-weighted sum of ``X`` over the time axis."""
        weights = self._get_attention_weights(X)

        # Give the weights a trailing axis and repeat them along it so they
        # have the same shape as X.
        weights = K.reshape(weights, (-1, weights.shape[1], 1))
        weights = K.repeat_elements(weights, X.shape[-1], -1)

        # Scale every time step by its weight and sum the time axis away.
        weighted = keras.layers.Multiply()([X, weights])
        return K.sum(weighted, axis=1)

    def compute_output_shape(self, input_shape):
        """The time axis is removed: (batch, time, dim) -> (batch, dim)."""
        return (input_shape[0], input_shape[2])

    def get_config(self):
        """Base layer config extended with ``context_vector_length``."""
        merged = dict(super().get_config())
        merged['context_vector_length'] = self.context_vector_length
        return merged

In [None]:
class HAN(Model):
    """Hierarchical Attention Network built as a functional Keras model.

    The network chains a word encoder, word-level attention, a sentence
    encoder, sentence-level attention and a softmax output layer.
    """
    def __init__(
            self, max_words, max_sentences, output_size,
            embedding_matrix, word_encoding_dim=200,
            sentence_encoding_dim=200, inputs=None,
            outputs=None, name='han-for-docla'
    ):
        """
        A Keras implementation of Hierarchical Attention networks
        for document classification.
        :param max_words: The maximum number of words per sentence
        :param max_sentences: The maximum number of sentences
        :param output_size: The dimension of the last layer (i.e.
            the number of classes you wish to predict)
        :param embedding_matrix: The embedding matrix to use for
            representing words
        :param word_encoding_dim: The dimension of the GRU
            layer in the word encoder.
        :param sentence_encoding_dim: The dimension of the GRU
            layer in the sentence encoder.
        :param inputs: Not used by this constructor.
        :param outputs: Not used by this constructor.
        :param name: Name passed on to the Keras Model.
        """
        self.max_words = max_words
        self.max_sentences = max_sentences
        self.output_size = output_size
        self.embedding_matrix = embedding_matrix
        self.word_encoding_dim = word_encoding_dim
        self.sentence_encoding_dim = sentence_encoding_dim

        # The graph is built before calling the base constructor because the
        # base constructor is given the resulting input/output tensors.
        in_tensor, out_tensor = self._build_network()

        super(HAN, self).__init__(
            inputs=in_tensor, outputs=out_tensor, name=name
        )

    def build_word_encoder(self, max_words, embedding_matrix, encoding_dim=200):
        """
        Build the model that embeds and encodes in context the
        words used in a sentence. The return model takes a tensor of shape
        (batch_size, max_length) that represents a collection of sentences
        and returns an encoded representation of these sentences.
        :param max_words: (int) The maximum sentence length this model accepts
        :param embedding_matrix: (2d array-like) A matrix with the i-th row
            representing the embedding of the word represented by index i.
        :param encoding_dim: (int, should be even) The dimension of the
            bidirectional encoding layer. Half of the nodes are used in the
            forward direction and half in the backward direction.
        :return: Instance of keras.Model
        """
        assert encoding_dim % 2 == 0, "Embedding dimension should be even"

        vocabulary_size = embedding_matrix.shape[0]
        embedding_dim = embedding_matrix.shape[1]

        # The embedding weights are fixed to the supplied matrix and are not
        # updated during training (trainable=False).
        embedding_layer = Embedding(
            vocabulary_size, embedding_dim,
            weights=[embedding_matrix], input_length=max_words,
            trainable=False
        )

        sentence_input = Input(shape=(max_words,), dtype='int32')
        embedded_sentences = embedding_layer(sentence_input)
        encoded_sentences = Bidirectional(
            GRU(int(encoding_dim / 2), return_sequences=True)
        )(embedded_sentences)

        return Model(
            inputs=[sentence_input], outputs=[encoded_sentences], name='word_encoder'
        )

    def build_sentence_encoder(self, max_sentences, summary_dim, encoding_dim=200):
        """
        Build the encoder that encodes the vector representation of
        sentences in their context.
        :param max_sentences: The maximum number of sentences that can be
            passed. Use zero-padding to supply shorter sentences.
        :param summary_dim: (int) The dimension of the vectors that summarizes
            sentences. Should be equal to the encoding_dim of the word
            encoder.
        :param encoding_dim: (int, even) The dimension of the vector that
            summarizes sentences in context. Half is used in forward direction,
            half in backward direction.
        :return: Instance of keras.Model
        """
        assert encoding_dim % 2 == 0, "Embedding dimension should be even"

        text_input = Input(shape=(max_sentences, summary_dim))
        encoded_sentences = Bidirectional(
            GRU(int(encoding_dim / 2), return_sequences=True)
        )(text_input)
        return Model(
            inputs=[text_input], outputs=[encoded_sentences], name='sentence_encoder'
        )

    def _build_network(self):
        """
        Build the graph that represents this network
        :return: in_tensor, out_tensor, Tensors representing the input and output
            of this network.
        """
        in_tensor = Input(shape=(self.max_sentences, self.max_words))

        word_encoder = self.build_word_encoder(
            self.max_words, self.embedding_matrix, self.word_encoding_dim
        )

        # Apply the word encoder to every sentence of a document.
        word_rep = TimeDistributed(
            word_encoder, name='word_encoder'
        )(in_tensor)

        # Sentence Rep is a 3d-tensor (batch_size, max_sentences, word_encoding_dim)
        sentence_rep = TimeDistributed(
            AttentionLayer(), name='word_attention'
        )(word_rep)

        doc_rep = self.build_sentence_encoder(
            self.max_sentences, self.word_encoding_dim, self.sentence_encoding_dim
        )(sentence_rep)

        # We get the final representation by applying our attention mechanism
        # to the encoded sentences
        doc_summary = AttentionLayer(name='sentence_attention')(doc_rep)

        # One softmax unit per class.
        out_tensor = Dense(
            self.output_size, activation='softmax', name='class_prediction'
        )(doc_summary)

        return in_tensor, out_tensor

    def get_config(self):
        """
        Return the constructor arguments plus the base Model config
        (stored under the 'base_config' key).
        """
        # NOTE(review): 'embedding_matrix' is stored as given; if it is a
        # NumPy array it is presumably not JSON-serialisable — confirm
        # before relying on saving this model's config.
        config = {
            'max_words': self.max_words,
            'max_sentences': self.max_sentences,
            'output_size': self.output_size,
            'embedding_matrix': self.embedding_matrix,
            'word_encoding_dim': self.word_encoding_dim,
            'sentence_encoding_dim': self.sentence_encoding_dim,
            'base_config': super(HAN, self).get_config()
        }

        return config

    @classmethod
    def from_config(cls, config, custom_objects=None):
        """
        Keras' API isn't really extendible at this point
        therefore we need to use a somewhat hacky solution to
        be able to correctly reconstruct the HAN model
        from a config. This therefore does not reconstruct
        an instance of the HAN model, but actually a standard
        Keras model that behaves exactly the same.
        """
        # Note: pop() removes 'base_config' from the dict passed in.
        base_config = config.pop('base_config')

        return Model.from_config(
            base_config, custom_objects=custom_objects
        )

    def predict_sentence_attention(self, X):
        """
        For a given set of texts predict the attention
        weights for each sentence.
        :param X: 3d-tensor, similar to the input for predict
        :return: 2d array (num_obs, max_sentences) containing
            the attention weights for each sentence
        """
        att_layer = self.get_layer('sentence_attention')
        prev_tensor = att_layer.input

        # Create a temporary dummy layer to hold the
        # attention weights tensor
        dummy_layer = Lambda(
            lambda x: att_layer._get_attention_weights(x)
        )(prev_tensor)

        # A throw-away model from the network input to the attention weights.
        return Model(self.input, dummy_layer).predict(X)

### Run:

In [None]:
# HAN input settings.
MAX_WORDS_PER_SENT = 200  # tokens kept per sentence (padded/truncated to this)
MAX_SENT = 20             # sentences kept per document (padded/truncated to this)
MAX_VOC_SIZE = 20000      # num_words passed to the word tokenizer
GLOVE_DIM = 100           # width of the embedding matrix
# NOTE(review): TEST_SPLIT is not referenced in the visible cells; the splits
# below pass the literal 0.2 instead.
TEST_SPLIT = 0.2

In [None]:
# NOT ORIGINAL!

#####################################################
# Tokenization                                      #
#####################################################
# Build a Keras Tokenizer that can encode every token.
# Note: it is fitted on df_all_x, i.e. on the training and the test texts.
word_tokenizer = Tokenizer(num_words=MAX_VOC_SIZE)
word_tokenizer.fit_on_texts(df_all_x)

# Construct the input matrix. This should be an nd-array of
# shape (n_samples, MAX_SENT, MAX_WORDS_PER_SENT).
# We zero-pad this matrix (this does not influence
# any predictions due to the attention mechanism).
X = np.zeros((len(df_all_x), MAX_SENT, MAX_WORDS_PER_SENT), dtype='int32')

for i, text in enumerate(df_all_x):
    # Split the document into sentences and encode each one as word indices.
    sentences = sent_tokenize(text)
    tokenized_sentences = word_tokenizer.texts_to_sequences(
        sentences
    )
    # Pad/truncate every sentence to MAX_WORDS_PER_SENT tokens.
    tokenized_sentences = pad_sequences(
        tokenized_sentences, maxlen=MAX_WORDS_PER_SENT
    )

    # Number of all-zero sentences needed to reach MAX_SENT rows
    # (negative when the document has more than MAX_SENT sentences).
    pad_size = MAX_SENT - tokenized_sentences.shape[0]

    if pad_size < 0:
        # Too many sentences: keep only the first MAX_SENT.
        tokenized_sentences = tokenized_sentences[0:MAX_SENT]
    else:
        # Too few sentences: append rows of zeros.
        tokenized_sentences = np.pad(
            tokenized_sentences, ((0, pad_size), (0, 0)),
            mode='constant', constant_values=0
        )

    # Store this observation as the i-th observation in
    # the data matrix
    X[i] = tokenized_sentences[None, ...]

# Transform the labels into a format Keras can handle
y = to_categorical(df_all_y)

# We make a train/test split
# NOTE(review): X is built from df_all_x (original train + test combined) and
# re-split at random here, so this test set is not the same set of documents
# as df_test_x used to evaluate the CNN/GRU models.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state = seed)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, random_state = seed)

In [None]:
#####################################################
# Word Embeddings                                   #
#####################################################
# Build the embedding matrix from the pretrained 100-dimensional
# GloVe vectors stored in glove.6B.100d.txt.

# Each line of the file is a token followed by its coefficients.
with open(path + 'glove.6B.100d.txt', encoding='utf-8') as file:
    embeddings = {
        fields[0]: np.asarray(fields[1:], dtype='float32')
        for fields in map(str.split, file)
    }

# One row per tokenizer index plus one for index 0, randomly initialised
# so that words without a GloVe vector still get an embedding.
vocab_rows = len(word_tokenizer.word_index) + 1
embedding_matrix = np.random.random((vocab_rows, GLOVE_DIM))

# Index 0 is the padding index: map it to the zero vector so that
# padding cannot influence the results.
embedding_matrix[0] = 0

# Overwrite the random row with the GloVe vector wherever one exists.
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[index] = embedding_vector

In [None]:
#####################################################
# Model Training                                    #
#####################################################
# Build the hierarchical attention network (HAN) from the padded sentence
# length, the per-document sentence cap and the embedding matrix.
# NOTE(review): the positional `2` is presumably the number of output
# classes (labels are one-hot encoded with to_categorical) — confirm
# against HAN.__init__, which is not visible here.
model_HAN = HAN(
    MAX_WORDS_PER_SENT, MAX_SENT, 2, embedding_matrix,
    word_encoding_dim=100, sentence_encoding_dim=100
)

# Adagrad optimiser with binary cross-entropy; accuracy is tracked as 'acc'.
model_HAN.compile(loss='binary_crossentropy', optimizer='adagrad', metrics=['acc'])

# Print the layer-by-layer overview of the compiled model.
model_HAN.summary()

Model: "han-for-docla"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20, 200)]         0         
                                                                 
 word_encoder (TimeDistribut  (None, 20, 200, 100)     15152100  
 ed)                                                             
                                                                 
 word_attention (TimeDistrib  (None, 20, 100)          10100     
 uted)                                                           
                                                                 
 sentence_encoder (Functiona  (None, 20, 100)          45600     
 l)                                                              
                                                                 
 sentence_attention (Attenti  (None, 100)              10100     
 onLayer)                                            

In [None]:
# Train for 10 epochs; the validation split is evaluated after every epoch.
model_HAN.fit(X_train, y_train,
              validation_data=(X_val, y_val),
              epochs=10, verbose=1)

# Persist the trained model as an HDF5 file under the project path.
# NOTE(review): the model contains custom layers (AttentionLayer appears in
# the summary), so reloading this file presumably needs `custom_objects` —
# verify before relying on the saved artefact.
model_HAN.save(path + 'ECHR_model/model_han__dataset_all.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# y_test is one-hot (to_categorical), so score class indices. Thresholding
# every output column at 0.5 hands scikit-learn a multilabel matrix in
# which a row may end up with zero or two positive labels; argmax always
# yields exactly one predicted class per document.
y_pred = np.argmax(model_HAN.predict(X_test), axis=1)
y_true = np.argmax(y_test, axis=1)
print(accuracy_score(y_true, y_pred))
print(f1_score(y_true, y_pred, average='macro'))

0.48319116909182136
0.3257780784844384


## LSTM:

In [None]:
# Two stacked LSTM layers on top of an embedding layer, followed by a
# small dense head that ends in a single sigmoid unit.
model_LSTM = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    LSTM(128, return_sequences=True),  # emit the full sequence for the next LSTM
    LSTM(128),
    Dense(64, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model_LSTM.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

model_LSTM.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 28659, 256)        5120000   
                                                                 
 lstm (LSTM)                 (None, 28659, 128)        197120    
                                                                 
 lstm_1 (LSTM)               (None, 128)               131584    
                                                                 
 dense_6 (Dense)             (None, 64)                8256      
                                                                 
 dense_7 (Dense)             (None, 16)                1040      
                                                                 
 dense_8 (Dense)             (None, 1)                 17        
                                                                 
Total params: 5,458,017
Trainable params: 5,458,017
No

In [None]:
# Train for 10 epochs on the tokenised training set; the validation set
# is evaluated after every epoch.
model_LSTM.fit(token_train_x, df_train_y,
               validation_data=(token_val_x, df_val_y),
               epochs=10, verbose=1)

# Persist the trained model as an HDF5 file under the project path.
model_LSTM.save(path + 'ECHR_model/model_lstm__dataset_all.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Turn the sigmoid outputs into hard 0/1 labels with a 0.5 cut-off, then
# report accuracy and macro-averaged F1 on the test set.
y_pred = (model_LSTM.predict(token_test_x) > 0.5).astype("int32")
print(accuracy_score(df_test_y, y_pred))
print(f1_score(df_test_y, y_pred, average='macro'))

0.6482689412945308
0.6481272017346149


In [None]:
# Regularised variant of the stacked-LSTM model: same layers, with dropout
# and the kernel regulariser `r_l2_param` on both recurrent layers.
# NOTE(review): the first LSTM hard-codes dropout=0.25 while the second
# reads `drop_param` — confirm whether both were meant to share it.
model_reg_LSTM = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    LSTM(128, return_sequences=True,
         dropout=0.25,
         kernel_regularizer=r_l2_param),
    LSTM(128,
         dropout=drop_param,
         kernel_regularizer=r_l2_param),
    Dense(64, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model_reg_LSTM.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

model_reg_LSTM.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 28659, 256)        5120000   
                                                                 
 lstm_4 (LSTM)               (None, 28659, 128)        197120    
                                                                 
 lstm_5 (LSTM)               (None, 128)               131584    
                                                                 
 dense_18 (Dense)            (None, 64)                8256      
                                                                 
 dense_19 (Dense)            (None, 16)                1040      
                                                                 
 dense_20 (Dense)            (None, 1)                 17        
                                                                 
Total params: 5,458,017
Trainable params: 5,458,017
No

In [None]:
# Train the regularised LSTM for 10 epochs; the validation set is
# evaluated after every epoch.
model_reg_LSTM.fit(token_train_x, df_train_y,
                   validation_data=(token_val_x, df_val_y),
                   epochs=10, verbose=1)

# Hard 0/1 labels via a 0.5 cut-off on the sigmoid output, then accuracy
# and macro-averaged F1 on the test set.
y_pred = (model_reg_LSTM.predict(token_test_x) > 0.5).astype("int32")
print(accuracy_score(df_test_y, y_pred))
print(f1_score(df_test_y, y_pred, average='macro'))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.6588058203712995
0.6587748080916729


## BERT:

In [None]:
# Load pre-trained model tokenizer
# NOTE(review): this cell rebinds the notebook-wide names `tokenizer` and
# `max_len`, which other cells (e.g. the LIME `predict_proba` helper) also
# read — re-run the cells that define those before using the other models.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize text.
# bert-base-uncased only has 512 position embeddings, so the previous
# length of 1024 produced position ids the model cannot embed. Cap the
# sequences at the model's limit instead.
max_len = 512
all_text = df_all_x#[0:2000]
all_y = df_all_y#[0:2000]
all_input_ids = [
    tokenizer.encode(doc, add_special_tokens=True, max_length=max_len, truncation=True)
    for doc in all_text
]
# Right-pad with id 0 up to max_len.
# NOTE(review): the fit cell passes only these ids, without an attention
# mask, so the model presumably attends to the padding as well — confirm.
all_input_ids = tf.keras.preprocessing.sequence.pad_sequences(
    all_input_ids, maxlen=max_len, dtype="long", value=0,
    truncating="post", padding="post"
)

# Seeded like the split used for the HAN model so the partition is
# reproducible across runs.
X_temp, X_test, Y_temp, Y_test = train_test_split(
    all_input_ids, all_y, test_size=0.2, random_state=seed
)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_temp, Y_temp, test_size=0.2, random_state=seed
)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

KeyboardInterrupt: ignored

In [None]:
# Make 'mixed_float16' the global Keras dtype policy, so that layers
# created after this cell use mixed precision.
policy = Policy('mixed_float16')
set_global_policy(policy)

In [None]:
# Pre-trained BERT with a freshly initialised single-output classifier head.
model_BERT = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

# The model returns raw logits (the evaluation cell applies the sigmoid by
# hand), whereas the 'binary_crossentropy' string shortcut assumes
# probabilities. The loss therefore has to be told it receives logits.
# NOTE(review): 'acc' still thresholds the raw logit at 0.5, so treat the
# per-epoch accuracy as approximate; 'adam' also runs at Keras' default
# learning rate, which is presumably too high for fine-tuning — confirm.
model_BERT.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['acc'],
)
model_BERT.summary()

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  769       
                                                                 
Total params: 109,483,009
Trainable params: 109,483,009
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Fine-tune for 10 epochs in batches of 16; the validation split is
# evaluated after every epoch.
model_BERT.fit(X_train, Y_train,
               validation_data=(X_val, Y_val),
               batch_size=16, epochs=10)

# Persist the fine-tuned model under the project path.
# NOTE(review): unlike the other save calls, the suffix starts with './';
# it presumably resolves to the same ECHR_model directory — verify.
model_BERT.save(path + './ECHR_model/model_bert__dataset_all')

In [None]:
# sigmoid(x) > 0.5 is equivalent to x > 0, so threshold the logits directly.
# Evaluating exp(x) / (1 + exp(x)) overflows to inf / inf = nan for large
# positive logits, and nan > 0.5 is False, so confident positive
# predictions were silently turned into class 0.
bert_logits = np.asarray(model_BERT.predict(X_test).logits).reshape(-1)
y_pred = (bert_logits > 0).astype("int32")
print(accuracy_score(Y_test, y_pred))
print(f1_score(Y_test, y_pred, average='macro'))

# Evaluation of LR:

In [None]:
# Reload the pickled train/test features and labels from the
# ECHR_Dataset_clean folder under the project path.
# NOTE: unpickling can execute arbitrary code — only load files that were
# produced by this project.
df_train_x = pd.read_pickle(path + "ECHR_Dataset_clean/df_train_x.pkl")
df_train_y = pd.read_pickle(path + "ECHR_Dataset_clean/df_train_y.pkl")
df_test_x  = pd.read_pickle(path + "ECHR_Dataset_clean/df_test_x.pkl")
df_test_y  = pd.read_pickle(path + "ECHR_Dataset_clean/df_test_y.pkl")

In [None]:
# convert to 3000 component tSVD:
# TF-IDF over uni- and bi-grams; the vectorizer is fitted on the training
# texts only and then applied to the test texts.
vec = TfidfVectorizer(ngram_range=(1, 2), min_df = 6, max_df = 0.9, dtype = np.float32)
bow_matrix_train = vec.fit_transform(df_train_x)
bow_matrix_test = vec.transform(df_test_x)

# The matrices stay sparse: TruncatedSVD accepts scipy sparse input, so the
# dense `.toarray()` copies (one of which was never used) only cost memory.
# `abs()` and `.asfptype()` were dropped as well, since the vectorizer
# already yields non-negative float32 values. The SVD is seeded (same value
# as the cross-validation below) so the components are reproducible.
tsvd_algo = TruncatedSVD(algorithm = 'randomized', n_components = 3000, random_state = 42)
tsvd_train = tsvd_algo.fit_transform(bow_matrix_train)
bow_df_train = pd.DataFrame(data=tsvd_train)

# Project the test documents with the components fitted on the training set.
tsvd_test = tsvd_algo.transform(bow_matrix_test)
bow_df_test = pd.DataFrame(data=tsvd_test)

In [None]:
def hyperparameter_tuning(model, params, x_train, y_train):
    """
    Run a randomised hyper-parameter search with 5-fold cross-validation.

    :param model: estimator class (not an instance); it is instantiated
        here without arguments
    :param params: parameter space handed to RandomizedSearchCV
    :param x_train: training features
    :param y_train: training targets
    :return: tuple (best parameters, best estimator) of the fitted search
    """
    search = RandomizedSearchCV(
        model(),
        params,
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        n_iter=100,
        n_jobs=-2,
        random_state=42,
        verbose=1,
    )
    search.fit(x_train, y_train)

    return search.best_params_, search.best_estimator_

def run_model_on_dataset(x_train, y_train):
    """
    Tune a logistic regression on the given data and return the best model.

    The winning parameter combination is printed as a side effect.

    :param x_train: training features
    :param y_train: training targets
    :return: best estimator found by the search
    """
    # NOTE(review): not every solver listed here supports the 'l1' penalty;
    # presumably those candidates fail during the search — verify how they
    # are scored.
    search_space = {
        'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'penalty': ['l1', 'l2'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'saga', 'sag'],
    }
    best_params, best_model = hyperparameter_tuning(
        LogisticRegression, search_space, x_train, y_train
    )

    print("Best Parameters:", best_params)

    return best_model

# Tune and fit the logistic regression on the SVD-reduced training features.
best_mod = run_model_on_dataset(bow_df_train, df_train_y)

In [None]:
# Map the logistic-regression weights from SVD space back onto the original
# TF-IDF features: every SVD component is a linear combination of n-grams,
# so weighting the components by the coefficients gives one value per n-gram.
tsvd_components = tsvd_algo.components_
# Coefficient vector of the fitted logistic regression:
coefficients = best_mod.coef_[0]
# Contribution of each TF-IDF feature to the prediction:
feature_contributions = coefficients @ tsvd_components
# The n-grams those features stand for:
feature_names = vec.get_feature_names_out()
# n-gram -> contribution:
feature_importance_dict = dict(zip(feature_names, feature_contributions))
# Same mapping, ordered by absolute contribution (largest first):
sorted_feature_importance_dict = dict(
    sorted(
        feature_importance_dict.items(),
        key=lambda pair: abs(pair[1]),
        reverse=True,
    )
)

In [None]:
def _score_to_hex(norm_score):
    """Return the hex colour of ``norm_score`` (in [0, 1]) on the coolwarm map."""
    # Imported locally: the notebook's visible import cell binds only `plt`
    # and `cm`, not the bare `matplotlib` name the previous code relied on.
    from matplotlib import cm, colors
    return colors.rgb2hex(cm.coolwarm(norm_score)[:3])


def highlight_text_html(text, importance_dict):
    """
    Render ``text`` as an HTML page with known n-grams coloured by importance.

    The text is split on whitespace and scanned left to right; at every
    position a 2-gram is preferred over a 1-gram. Scores are min-max
    normalised over ``importance_dict``. N-grams whose normalised score is
    below 0.4 or above 0.6 are wrapped in a coloured ``<span>``; everything
    else is emitted as plain text. All text is HTML-escaped.

    :param text: raw document text
    :param importance_dict: mapping n-gram -> importance score
    :return: the HTML document as a string
    """
    from html import escape

    # Normalise against the dictionary that was passed in, not a notebook
    # global, so the function works for any importance mapping.
    if importance_dict:
        max_imp = max(importance_dict.values())
        min_imp = min(importance_dict.values())
    else:
        max_imp = min_imp = 0.0
    score_range = max_imp - min_imp

    words = text.split()
    parts = ['<html><body><p>']
    i = 0
    while i < len(words):
        bigram = ' '.join(words[i:i + 2])
        # Check 2-gram
        if i < len(words) - 1 and bigram in importance_dict:
            ngram = bigram
            i += 2
        # Check 1-gram
        elif words[i] in importance_dict:
            ngram = words[i]
            i += 1
        # No n-gram found, move to next word
        else:
            parts.append(escape(words[i], quote=False) + ' ')
            i += 1
            continue

        # Calculate color based on score. If all scores are equal there is
        # nothing to rank, so treat the n-gram as neutral instead of
        # dividing by zero.
        if score_range:
            norm_score = (importance_dict[ngram] - min_imp) / score_range
        else:
            norm_score = 0.5

        safe_ngram = escape(ngram, quote=False)
        if norm_score < 0.4 or norm_score > 0.6:
            rgb = _score_to_hex(norm_score)
            # Append highlighted n-gram to result
            parts.append(f'<span style="color: {rgb}">{safe_ngram}</span> ')
        else:
            parts.append(safe_ngram + ' ')

    parts.append('</p></body></html>')
    return ''.join(parts)

# Render one training document with its n-grams coloured by importance.
# NOTE(review): if df_train_x is a pandas Series, `[2916]` selects by index
# label rather than by position (a later cell uses `.iloc`) — confirm which
# one is intended.
html_text = highlight_text_html(df_train_x[2916], feature_importance_dict)

# Write the html text to a file
# NOTE(review): no encoding is passed to open(), so the platform default
# encoding is used.
with open(path + 'highlighted_text.html', 'w') as f:
    f.write(html_text)

In [None]:
# Ignore n-grams that contain a digit, then rank the remaining features by
# signed importance, most positive first.
non_numeric_importance = {
    key: value
    for key, value in feature_importance_dict.items()
    if not re.search(r'\d', key)
}
feature_importance_df = pd.DataFrame(
    list(non_numeric_importance.items()), columns=['Feature', 'Importance']
).sort_values(by='Importance', ascending=False)

# The head of the ranking holds the most positive features, the tail the
# most negative ones.
top_pos = feature_importance_df.head(20)
top_neg = feature_importance_df.tail(20)

# One panel per group, side by side.
sns.set(font_scale=1.2)
fig, axs = plt.subplots(ncols=2, figsize=(14, 6))

panels = zip(
    axs,
    (top_pos, top_neg),
    ('Top 20 Positive Features', 'Top 20 Negative Features'),
)
for ax, frame, title in panels:
    sns.barplot(x='Importance', y='Feature', data=frame, ax=ax, palette='viridis')
    ax.set_title(title)

plt.tight_layout()
plt.show()

# Evaluate CNN:

In [None]:
# Class names displayed by LIME, listed in class-index order.
# NOTE(review): predict_proba returns [1 - p, p] per document, so index 1
# ("no-violation") is tied to the model's raw output p — confirm that this
# matches the label encoding used for training.
unique_y = ["violation", "no-violation"]
explainer = lime_text.LimeTextExplainer(class_names=unique_y, verbose=True)

def predict_proba(arr):
  """
  Probability function for LIME built on the CNN model.

  The raw texts are tokenised and padded with the notebook's `tokenizer`
  and `max_len`, scored with `model_CNN`, and the first output column p of
  every prediction is expanded into the pair [1 - p, p].

  :param arr: iterable of raw text strings
  :return: array with one [1 - p, p] row per input text
  """
  sequences = tokenizer.texts_to_sequences(arr)
  padded = pad_sequences(sequences, maxlen=max_len)
  positive = model_CNN.predict(padded)[:, 0]

  return np.column_stack((1 - positive, positive))

In [None]:
# Explain the CNN's prediction for the training document at position 6071
# and render the explanation, including the highlighted text, inline.
explainer.explain_instance(df_train_x.iloc[6071], predict_proba).show_in_notebook(text=True)