In [None]:
# NOTE(review): the import cell below only uses `tensorflow.keras`; no cell visible
# in this notebook imports standalone `keras` or `seqeval` — confirm these pinned
# installs are still required before keeping them.
!pip install seqeval==0.0.5
!pip install keras==2.2.4

In [None]:
# NOTE(review): installs keras-contrib from whatever the default branch currently is
# (no tag/commit pinned); no cell visible in this notebook imports it — confirm it is
# needed, and pin a commit if so.
!pip install git+https://www.github.com/keras-team/keras-contrib.git

In [None]:
!pip install pythainlp

Collecting pythainlp
  Downloading pythainlp-2.2.6-py3-none-any.whl (10.6 MB)
[K     |████████████████████████████████| 10.6 MB 4.4 MB/s eta 0:00:01
[?25hCollecting python-crfsuite>=0.9.6
  Downloading python_crfsuite-0.9.7-cp37-cp37m-manylinux1_x86_64.whl (743 kB)
[K     |████████████████████████████████| 743 kB 26.1 MB/s eta 0:00:01
[?25hCollecting tinydb>=3.0
  Downloading tinydb-4.4.0-py3-none-any.whl (21 kB)
Installing collected packages: tinydb, python-crfsuite, pythainlp
Successfully installed pythainlp-2.2.6 python-crfsuite-0.9.7 tinydb-4.4.0


In [None]:
from collections import Counter
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers , regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense , Bidirectional , LSTM, GlobalMaxPool1D , Concatenate, Dropout, Embedding, Flatten, Dropout, Activation, Input, Dense, concatenate, GRU, Conv1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.optimizers import Nadam
import tensorflow.keras.backend as K
from sklearn.metrics import classification_report, precision_recall_fscore_support, f1_score, precision_score, recall_score
from pythainlp.corpus import thai_stopwords

In [None]:
# Display the TensorFlow version in use, for provenance of the results below.
tf.__version__

# Data Preparation 


In [None]:
def create_article_label(df):
    """Add an integer ``label`` column that encodes ``df.article``.

    A ``LabelEncoder`` is fitted on ``df.article`` and the resulting codes are
    inserted into ``df`` itself as a new last column named ``label``
    (``DataFrame.insert`` raises if a column with that name already exists).

    Returns:
        tuple: ``(df, encoder, code_to_article)`` where ``df`` is the same,
        now-extended frame, ``encoder`` is the fitted ``LabelEncoder`` and
        ``code_to_article`` maps each integer code to its original article value.
    """
    encoder = LabelEncoder()
    codes = encoder.fit_transform(df.article)
    df.insert(len(df.columns), 'label', codes)

    # Decode the per-row codes again so the mapping is built from matching pairs.
    articles = encoder.inverse_transform(codes)
    code_to_article = {code: article for code, article in zip(codes, articles)}
    return df, encoder, code_to_article

In [None]:
# Load the pre-processed torts dataset.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a
# trusted source.
torts_df = pd.read_pickle('../input/processed-torts/processed_torts20200123.pkl')
# create_article_label inserts the `label` column into the frame it is given and
# returns that same frame, so `df` and `torts_df` refer to the same object.
df, article_label_encoder,map_dict = create_article_label(torts_df)

In [None]:
def clean_stop(lst):
    """Return the tokens of ``lst`` that are not Thai stop words.

    Args:
        lst: iterable of token strings.

    Returns:
        list: the tokens of ``lst``, in their original order, that are not in
        PyThaiNLP's ``thai_stopwords()`` collection.
    """
    # Membership is tested once per token, so keep the stop words in a hashed
    # container instead of a list (which costs a linear scan per token).
    stop_words = frozenset(thai_stopwords())
    return [word for word in lst if word not in stop_words]

def load_fasttext_fast(word_index, max_words, embed_size,file_name = "../input/word-vec-thai/cc.th.300.vec"):
    """Build an embedding matrix from a fastText-style text ``.vec`` file.

    Every row starts as a random draw from a normal distribution; rows whose
    word occurs in the vector file are then overwritten with the pretrained
    vector.

    Args:
        word_index: mapping ``word -> integer row index`` (e.g. a Keras
            tokenizer's ``word_index``).
        max_words: number of rows of the matrix; words whose index is
            ``>= max_words`` are ignored.
        embed_size: number of columns of the matrix, i.e. the vector
            dimensionality to read from each line.
        file_name: path to the text file, one ``word v1 v2 ...`` entry per line.

    Returns:
        numpy.ndarray of shape ``(max_words, embed_size)``.

    Raises:
        ValueError: if a selected line contains a non-numeric component.
    """
    # Mean / std used for the random initialisation of words without a
    # pretrained vector.
    # NOTE(review): presumably measured on 125,302 tokenized Thai Wikipedia
    # articles (deepcut tokenizer), per the original comment — confirm.
    emb_mean, emb_std = -0.0033470048, 0.109855264
    embedding_matrix = np.random.normal(emb_mean, emb_std, (max_words, embed_size))
    with open(file_name, 'r', encoding="utf8") as f:
        for line in f:
            # Split off only the word first so the (long) vector part is parsed
            # solely for words that are actually needed.
            word, sep, vec = line.partition(' ')
            if not sep:
                continue
            i = word_index.get(word)
            if i is None or i >= max_words:
                continue
            values = vec.split()
            # Skips the "<count> <dim>" header as well as truncated lines; this
            # replaces a line-length heuristic and a hard-coded 300 that only
            # worked for 300-dimensional files.
            if len(values) < embed_size:
                continue
            embedding_matrix[i] = np.asarray(values[:embed_size], dtype='float32')
    return embedding_matrix

In [None]:
# Collapse the per-(case, article) rows of `df` into one sample per case:
#   * X_plaintiff / X_defendant: stop-word-filtered facts joined into one string
#   * X_both: the [plaintiff, defendant] pair for the two-input model
#   * Y_set: multi-hot vector over all article labels attached to the case
X_plaintiff = []
X_defendant = []
X_both = []
Y_set = []
cases = []
n_labels = article_label_encoder.classes_.shape[0]

# groupby(sort=False) yields the cases in order of first appearance (the order of
# df.case_id.unique()) in a single pass, instead of re-filtering the whole frame
# once per case. NOTE: rows with a missing case_id are dropped by groupby.
for case_id, rows in tqdm(df.groupby('case_id', sort=False)):
    Y = np.zeros(n_labels)
    Y[rows['label'].to_numpy()] = 1

    # The fact tokens are taken from the last row of the case, as before.
    # NOTE(review): this assumes every row of a case carries the same facts — confirm.
    last_row = rows.iloc[-1]
    # Filter each text once and reuse the result for both the single-input lists
    # and the paired X_both entry.
    plaintiff_text = ' '.join(clean_stop(last_row.plaintiff_fact_token))
    defendant_text = ' '.join(clean_stop(last_row.defendant_fact_token))

    cases.append(case_id)
    X_plaintiff.append(plaintiff_text)
    X_defendant.append(defendant_text)
    X_both.append([plaintiff_text, defendant_text])
    Y_set.append(Y)
Y_set = np.array(Y_set)
X_both = np.array(X_both)

100%|██████████| 2352/2352 [00:23<00:00, 98.72it/s] 


In [None]:
# Peek at the first case: [plaintiff text, defendant text].
X_both[0]

In [None]:
# Number of cases carrying each label (column sums of the multi-hot matrix),
# followed by the label ids ordered from most to least frequent.
freqs = Y_set.sum(axis=0)
sorted_idx = freqs.argsort(axis=0)[::-1]
sorted_idx

array([ 687,  251,  692, ..., 1015, 1016,    0])

In [None]:
# Print the article behind each label used for training below. The slice starts at
# 1, so the single most frequent label (sorted_idx[0]) is skipped.
# NOTE(review): the reason for skipping it is not visible here — confirm.
for idx in sorted_idx[1:11]:
  print(f"{idx} : " + map_dict[idx])

In [None]:
# Overview of the (article, label) pairs.
# NOTE(review): .sum() aggregates every remaining column per group; if the goal is
# the number of rows per article, .size() would express that directly — confirm intent.
df.groupby(['article','label']).sum()

In [None]:
import string
def process_input(num_words, X_train, X_test, X_dev):
    """Tokenize the three text splits and pad them to a common length.

    The tokenizer is fitted on ``X_train`` only. ASCII and Thai digits plus
    ASCII punctuation are filtered out, and out-of-vocabulary words map to the
    ``<UNK>`` token. Every split is post-padded to the length of the longest
    *training* sequence.

    Args:
        num_words: vocabulary size passed to the Keras ``Tokenizer``.
        X_train, X_test, X_dev: iterables of whitespace-separated token strings.

    Returns:
        tuple: ``(train_seqs, test_seqs, dev_seqs, max_len, tokenizer)``; note
        that the test split comes before the dev split.
    """
    filtered_chars = '1234567890๑๒๓๔๕๖๗๘๙๐' + string.punctuation
    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=num_words,
        oov_token='<UNK>',
        filters=filtered_chars,
    )
    tokenizer.fit_on_texts(X_train)

    # Text -> list of vocabulary indices.
    train_ids = tokenizer.texts_to_sequences(X_train)
    dev_ids = tokenizer.texts_to_sequences(X_dev)
    test_ids = tokenizer.texts_to_sequences(X_test)

    max_len = max(len(seq) for seq in train_ids)

    def pad(seqs):
        # Zero-pad at the end up to the longest training sequence.
        return tf.keras.preprocessing.sequence.pad_sequences(
            seqs, maxlen=max_len, padding="post")

    return pad(train_ids), pad(test_ids), pad(dev_ids), max_len, tokenizer

In [None]:
def proportional_generator(data, label, p=[0.1, 0.9], batch_size=128):
    """Endlessly yield class-rebalanced ``(batch_data, batch_label)`` batches.

    For every slot of a batch a class id is drawn with probabilities ``p``
    (``len(p)`` is the number of classes, ids ``0 .. len(p) - 1``), then one
    sample of that class is drawn uniformly with replacement. With the default
    ``p`` roughly 90% of each batch comes from class 1.

    Args:
        data: indexable collection of samples, aligned with ``label``.
        label: class id of every sample. It is read once when the generator
            starts, so later changes to it are not picked up.
        p: sampling probability per class; must sum to 1.
        batch_size: number of samples per yielded batch.

    Yields:
        tuple: ``(np.ndarray of samples, np.ndarray of labels)``.

    Raises:
        ValueError: on the first ``next()`` if a class with non-zero sampling
            probability has no samples in ``label``.
    """
    labels = np.asarray(label)
    n_classes = len(p)

    # The per-class sample indices never change, so compute them once instead
    # of on every batch.
    query_idx = [np.where(labels == class_id)[0] for class_id in range(n_classes)]
    for class_id, prob in enumerate(p):
        if prob > 0 and len(query_idx[class_id]) == 0:
            raise ValueError(
                f"class {class_id} has sampling probability {prob} but no samples in `label`"
            )

    while True:
        sample_id = np.random.choice(n_classes, batch_size, p=p)
        batch_ids = [
            np.random.choice(query_idx[class_id], 1)[0] for class_id in sample_id
        ]
        batch_data = [data[query_id] for query_id in batch_ids]
        batch_label = [label[query_id] for query_id in batch_ids]
        yield np.array(batch_data), np.array(batch_label)
     
      

In [None]:
def multi_input_proportional_generator(datasets,
                                       label,
                                       p=[0.1, 0.9],
                                       batch_size=128):
    """Endlessly yield class-rebalanced batches for a multi-input model.

    For every slot of a batch a class id is drawn with probabilities ``p``
    (``len(p)`` is the number of classes), then one sample index of that class
    is drawn uniformly with replacement. The same index is used for every
    dataset, so the inputs stay aligned.

    Args:
        datasets: sequence of indexable collections (one per model input), all
            aligned with ``label``. Previously exactly the first two entries
            were used; now every entry produces one batch array.
        label: class id of every sample. It is read once when the generator
            starts, so later changes to it are not picked up.
        p: sampling probability per class; must sum to 1.
        batch_size: number of samples per yielded batch.

    Yields:
        tuple: ``(list of np.ndarray, one per dataset; np.ndarray of labels)``.

    Raises:
        ValueError: on the first ``next()`` if a class with non-zero sampling
            probability has no samples in ``label``.
    """
    labels = np.asarray(label)
    n_classes = len(p)

    # The per-class sample indices never change, so compute them once instead
    # of on every batch.
    query_idx = [np.where(labels == class_id)[0] for class_id in range(n_classes)]
    for class_id, prob in enumerate(p):
        if prob > 0 and len(query_idx[class_id]) == 0:
            raise ValueError(
                f"class {class_id} has sampling probability {prob} but no samples in `label`"
            )

    while True:
        sample_id = np.random.choice(n_classes, batch_size, p=p)
        batch_ids = [
            np.random.choice(query_idx[class_id], 1)[0] for class_id in sample_id
        ]
        batch_data = [
            np.array([dataset[query_id] for query_id in batch_ids])
            for dataset in datasets
        ]
        batch_label = [label[query_id] for query_id in batch_ids]
        yield batch_data, np.array(batch_label)

In [None]:
# Number of distinct cases in the dataset.
len(df.case_id.unique())

# Evaluation functions

In [None]:
def recall(y_true, y_pred):
    """Recall of the positive class over the tensors passed in.

    Values are clipped to [0, 1] and rounded before counting, so ``y_pred``
    may be probabilities. ``K.epsilon()`` in the denominator avoids division
    by zero when there are no positives.
    """
    def _rounded_count(tensor):
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _rounded_count(y_true * y_pred)
    actual_positives = _rounded_count(y_true)
    return hits / (actual_positives + K.epsilon())


def precision(y_true, y_pred):
    """Precision of the positive class over the tensors passed in.

    Values are clipped to [0, 1] and rounded before counting, so ``y_pred``
    may be probabilities. ``K.epsilon()`` in the denominator avoids division
    by zero when nothing is predicted positive.
    """
    def _rounded_count(tensor):
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _rounded_count(y_true * y_pred)
    flagged_positives = _rounded_count(y_pred)
    return hits / (flagged_positives + K.epsilon())


def microf1(y_true, y_pred):
    """F1 computed from counts pooled over every element of the tensors.

    True positives, predicted positives and actual positives are each counted
    after clipping to [0, 1] and rounding; precision and recall are formed
    from those pooled counts and combined into their harmonic mean.
    ``K.epsilon()`` guards every denominator.
    """
    def _rounded_count(tensor):
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _rounded_count(y_true * y_pred)
    prec = hits / (_rounded_count(y_pred) + K.epsilon())
    rec = hits / (_rounded_count(y_true) + K.epsilon())
    return 2 * ((prec * rec) / (prec + rec + K.epsilon()))


def macrof1(y_true, y_pred):
    """Mean of the F1 scores computed separately along axis 0.

    Predictions are rounded to hard decisions; true positives, false positives
    and false negatives are summed over axis 0 (one count per remaining
    position), turned into an F1 score each, and averaged.
    """
    hard_pred = K.round(y_pred)

    def _sum_over_axis0(tensor):
        return K.sum(K.cast(tensor, 'float'), axis=0)

    tp = _sum_over_axis0(y_true * hard_pred)
    fp = _sum_over_axis0((1 - y_true) * hard_pred)
    fn = _sum_over_axis0(y_true * (1 - hard_pred))

    eps = K.epsilon()
    per_class_precision = tp / (tp + fp + eps)
    per_class_recall = tp / (tp + fn + eps)
    per_class_f1 = (
        2 * per_class_precision * per_class_recall
        / (per_class_precision + per_class_recall + eps)
    )

    # Replace any NaN score by 0 before averaging.
    per_class_f1 = tf.where(
        tf.math.is_nan(per_class_f1), tf.zeros_like(per_class_f1), per_class_f1)
    return K.mean(per_class_f1)

def single_class_accuracy(interesting_class_id):
    """Build a Keras metric measuring one-vs-rest accuracy for one class.

    The returned metric reports the fraction of samples for which
    "predicted class == interesting_class_id" agrees with
    "true class == interesting_class_id".

    Args:
        interesting_class_id: integer id of the class to track.

    Returns:
        callable: ``single_class(y_true, y_pred)`` usable in ``model.compile``.
        ``y_true`` is expected to hold integer class ids (one per sample);
        ``y_pred`` is either a single sigmoid unit or per-class scores.
    """
    def single_class(y_true, y_pred):
        if K.int_shape(y_pred)[-1] == 1:
            # Single sigmoid unit (as used by the models in this notebook):
            # argmax over a size-1 axis is always 0, so threshold instead.
            class_id_preds = K.cast(K.greater(y_pred, 0.5), 'int32')
        else:
            class_id_preds = K.cast(K.argmax(y_pred, axis=-1), 'int32')

        # Flatten both sides to (batch,) so the comparison below is
        # element-wise rather than a (batch,) vs (batch, 1) broadcast.
        class_id_preds = K.flatten(class_id_preds)
        class_id_true = K.cast(K.flatten(y_true), 'int32')

        positive_mask = K.cast(K.equal(class_id_preds, interesting_class_id), 'int32')
        true_mask = K.cast(K.equal(class_id_true, interesting_class_id), 'int32')
        acc_mask = K.cast(K.equal(positive_mask, true_mask), 'float32')
        return K.mean(acc_mask)

    return single_class

Training BiLSTM
===============

*One input : Only plaintiff*

In [None]:
BATCH_SIZE = 64
EPOCHS = 10
NUM_WORDS = 8000
embedding_size = 300

# Without stratification the split depends only on the number of cases and on
# random_state, so it is the same for every label. Split the row positions once
# and reuse them; this also lets the tokenization and the large fastText file
# load run a single time instead of once per label.
X = X_both
case_positions = np.arange(len(X))
train_idx, rest_idx = train_test_split(case_positions, test_size=0.3, random_state=42)
test_idx, dev_idx = train_test_split(rest_idx, test_size=0.5, random_state=42)
X_train, X_test, X_dev = X[train_idx], X[test_idx], X[dev_idx]

# Column 0 of X_both is the plaintiff text.
train_seqs, test_seqs, dev_seqs, max_sequnce_len, tokenizer = process_input(NUM_WORDS, X_train[:, 0], X_test[:, 0], X_dev[:, 0])
embedding_matrix = load_fasttext_fast(tokenizer.word_index, NUM_WORDS + 1, embedding_size)  # load word embedding
num_batches = len(train_seqs) // BATCH_SIZE

# One binary classifier per label; sorted_idx[1:11] skips the most frequent label.
for f_num in sorted_idx[1:11]:
    print(f'<<<<<<label {f_num}>>>>>>>>' )
    # Drop the graph/state of the previous iteration's model so memory does not
    # grow with every label.
    K.clear_session()

    Y = Y_set[:, f_num].astype(int)
    Y_train, Y_test, Y_dev = Y[train_idx], Y[test_idx], Y[dev_idx]

    # Training batches are balanced 50/50.
    train_generator = proportional_generator(train_seqs, Y_train, p=[0.5, 0.5], batch_size=BATCH_SIZE)
    # NOTE(review): validation uses the generator's default p=[0.1, 0.9], so the
    # validation metrics are measured on ~90% positives, not on the natural
    # class distribution of the dev set — confirm this is intended.
    validation_generator = proportional_generator(dev_seqs, Y_dev, batch_size=BATCH_SIZE)

    # Frozen pretrained embeddings -> BiLSTM -> dense head with one sigmoid unit.
    input_layer = Input(shape=(max_sequnce_len,))
    embedding_layer = Embedding(NUM_WORDS+1, embedding_size , weights=[embedding_matrix], trainable= False , mask_zero=True)(input_layer)
    encoded = Bidirectional(LSTM(units=50, recurrent_dropout=0.3))(embedding_layer)
    hidden = Dense(50, activation='relu')(encoded)
    dropped = Dropout(.5)(hidden)
    out = Dense(1, activation='sigmoid')(dropped)
    model = Model(input_layer, out)

    opt = tf.keras.optimizers.Adam(
        learning_rate=0.0001,
        name='Adam'
    )
    model.compile(optimizer= opt,loss="binary_crossentropy", metrics=['accuracy', tf.keras.metrics.AUC(), recall, precision, microf1, macrof1])
    model.summary()

    # NOTE(review): validation_steps is derived from the training-set size.
    history = model.fit(train_generator, epochs=EPOCHS, steps_per_epoch=num_batches,validation_data=validation_generator,validation_steps=num_batches,verbose = 1)

    print("Evaluation")
    print(f'<<<<<<label {f_num}>>>>>>>>' )

    # The held-out test split is evaluated as-is (no resampling), threshold 0.5.
    predictions=model.predict(test_seqs, verbose=1)
    y_pred = [1 if lst[0] > 0.5 else 0  for lst in predictions ]
    print('f1 : ')
    print(f1_score(Y_test, y_pred,labels=[1]))
    print('------------------------------------------------------------ ')
    print('precision : ')
    print(precision_score(Y_test, y_pred,labels=[1]))
    print('------------------------------------------------------------ ')
    print('recall : ')
    print(recall_score(Y_test, y_pred,labels=[1]))

*Two inputs : Plaintiff + Defendant*

In [None]:
BATCH_SIZE=64
EPOCHS=10
NUM_WORDS=8000
embedding_size=300

# Without stratification the split depends only on the number of cases and on
# random_state, so it is the same for every label. Split the row positions once
# and tokenize both text columns once instead of once per label.
X = X_both
case_positions = np.arange(len(X))
train_idx, rest_idx = train_test_split(case_positions, test_size=0.3, random_state=42)
test_idx, dev_idx = train_test_split(rest_idx, test_size=0.5, random_state=42)
X_train, X_test, X_dev = X[train_idx], X[test_idx], X[dev_idx]

# Column 0 = plaintiff text, column 1 = defendant text; each gets its own tokenizer.
train_seqs1, test_seqs1, dev_seqs1, max_sequence_len1, tokenizer1 = process_input(NUM_WORDS, X_train[:, 0], X_test[:, 0], X_dev[:, 0])
train_seqs2, test_seqs2,dev_seqs2, max_sequence_len2, tokenizer2 = process_input(NUM_WORDS, X_train[:, 1], X_test[:, 1], X_dev[:, 1])
num_batches = len(train_seqs1) // BATCH_SIZE

# NOTE(review): this loop covers sorted_idx[4:11] while the one-input experiment
# covers sorted_idx[1:11] — confirm whether three labels are skipped on purpose.
for f_num in sorted_idx[4:11]:
    # Drop the graph/state of the previous iteration's model so memory does not
    # grow with every label.
    K.clear_session()

    Y = Y_set[:, f_num].astype(int)
    Y_train, Y_test, Y_dev = Y[train_idx], Y[test_idx], Y[dev_idx]

    # Training batches are balanced 50/50.
    train_generator = multi_input_proportional_generator([train_seqs1, train_seqs2], Y_train, p=[0.5, 0.5], batch_size=BATCH_SIZE)
    # NOTE(review): validation uses the generator's default p=[0.1, 0.9], so the
    # validation metrics are measured on ~90% positives — confirm this is intended.
    validation_generator = multi_input_proportional_generator([dev_seqs1, dev_seqs2], Y_dev, batch_size=BATCH_SIZE)

    # The sequences are zero-padded at the end; mask_zero=True keeps the LSTMs
    # from consuming the padding steps (and matches the one-input model).
    input_layer1 = Input(shape=(max_sequence_len1,))
    embedding_layer1 = Embedding(input_dim= NUM_WORDS+1, output_dim=128, input_length= max_sequence_len1, trainable=True, mask_zero=True)(input_layer1)
    Bi_layer1 = Bidirectional(LSTM(units=50, recurrent_dropout=0.3))(embedding_layer1)

    input_layer2 = Input(shape=(max_sequence_len2,))
    embedding_layer2 = Embedding(input_dim= NUM_WORDS+1, output_dim=128, input_length=max_sequence_len2, trainable=True, mask_zero=True)(input_layer2)
    Bi_layer2 = Bidirectional(LSTM(units=50, recurrent_dropout=0.3))(embedding_layer2)

    concat_layer = Concatenate()([Bi_layer1, Bi_layer2])

    # Two-layer dense head with dropout, ending in one sigmoid unit.
    hidden1 = Dense(100, activation='relu')(concat_layer)
    dropped1 = Dropout(.5)(hidden1)
    hidden2 = Dense(100, activation='relu')(dropped1)
    dropped2 = Dropout(.5)(hidden2)
    out = Dense(1, activation='sigmoid')(dropped2)
    model = Model(inputs=[input_layer1, input_layer2], outputs = out)

    opt = tf.keras.optimizers.Adam(
        learning_rate=0.0001,
        name='Adam'
    )
    model.compile(optimizer=opt, loss="binary_crossentropy", metrics=[single_class_accuracy(1),tf.keras.metrics.BinaryAccuracy(),tf.keras.metrics.AUC(),recall, precision, microf1, macrof1])
    model.summary()

    # NOTE(review): validation_steps is derived from the training-set size.
    history = model.fit(train_generator, epochs=EPOCHS, steps_per_epoch=num_batches,validation_data=validation_generator,validation_steps=num_batches,verbose = 1)

    print("Evaluation")
    print(f'<<<<<<label {f_num}>>>>>>>>' )

    # The held-out test split is evaluated as-is (no resampling), threshold 0.5.
    predictions=model.predict([test_seqs1,test_seqs2], verbose=1)
    y_pred = [1 if lst[0] > 0.5 else 0  for lst in predictions ]
    print('f1 : ')
    print(f1_score(Y_test, y_pred,labels=[1]))
    print('------------------------------------------------------------ ')
    print('precision : ')
    print(precision_score(Y_test, y_pred,labels=[1]))
    print('------------------------------------------------------------ ')
    print('recall : ')
    print(recall_score(Y_test, y_pred,labels=[1]))


Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           [(None, 1067)]       0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           [(None, 521)]        0                                            
__________________________________________________________________________________________________
embedding_12 (Embedding)        (None, 1067, 128)    1024128     input_13[0][0]                   
__________________________________________________________________________________________________
embedding_13 (Embedding)        (None, 521, 128)     1024128     input_14[0][0]                   
____________________________________________________________________________________________

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Evaluation
<<<<<<label 135>>>>>>>>
f1 : 
0.12820512820512822
------------------------------------------------------------ 
precision : 
0.08928571428571429
------------------------------------------------------------ 
recall : 
0.22727272727272727
Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_17 (InputLayer)           [(None, 1067)]       0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           [(None, 521)]        0                                            
__________________________________________________________________________________________________
embedding_16 (Embedding)        (None, 10

Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Evaluation
<<<<<<label 715>>>>>>>>
f1 : 
0.3333333333333333
------------------------------------------------------------ 
precision : 
0.32
------------------------------------------------------------ 
recall : 
0.34782608695652173
Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           [(None, 1067)]       0                                            
__________________________________________________________________________________________________
input_20 (InputLayer)           [(None, 521)]        0                                            
__________________________________________________________________________________________________
embedding_18 (Embedding)        (None, 1067, 128)    1024128     input_19[0][0]

Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Evaluation
<<<<<<label 688>>>>>>>>
f1 : 
0.2608695652173913
------------------------------------------------------------ 
precision : 
0.3
------------------------------------------------------------ 
recall : 
0.23076923076923078
Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_21 (InputLayer)           [(None, 1067)]       0                                            
__________________________________________________________________________________________________
input_22 (InputLayer)           [(None, 521)]        0                                            
__________________________________________________________________________________________________
embedding_20 (Embedding)        (None, 1067, 128)    1024128     input_21[0][0]                   
___

Epoch 8/10
Epoch 9/10
Epoch 10/10
Evaluation
<<<<<<label 202>>>>>>>>
f1 : 
0.0
------------------------------------------------------------ 
precision : 
0.0
------------------------------------------------------------ 
recall : 
0.0
Model: "model_10"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_23 (InputLayer)           [(None, 1067)]       0                                            
__________________________________________________________________________________________________
input_24 (InputLayer)           [(None, 521)]        0                                            
__________________________________________________________________________________________________
embedding_22 (Embedding)        (None, 1067, 128)    1024128     input_23[0][0]                   
_______________________________________________________

Epoch 10/10
Evaluation
<<<<<<label 100>>>>>>>>
f1 : 
0.05128205128205128
------------------------------------------------------------ 
precision : 
0.037037037037037035
------------------------------------------------------------ 
recall : 
0.08333333333333333
Model: "model_11"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_25 (InputLayer)           [(None, 1067)]       0                                            
__________________________________________________________________________________________________
input_26 (InputLayer)           [(None, 521)]        0                                            
__________________________________________________________________________________________________
embedding_24 (Embedding)        (None, 1067, 128)    1024128     input_25[0][0]                   
____________________________

f1 : 
0.06666666666666667
------------------------------------------------------------ 
precision : 
0.09090909090909091
------------------------------------------------------------ 
recall : 
0.05263157894736842
