In [None]:
!pip install seqeval==0.0.5
!pip install keras==2.2.4

In [None]:
!pip install git+https://www.github.com/keras-team/keras-contrib.git

In [None]:
!pip install pythainlp

In [None]:
from collections import Counter
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers , regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense , Bidirectional , LSTM, GlobalMaxPool1D, Input, Embedding, MaxPooling1D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Embedding
from tensorflow. keras.layers import Flatten, Dropout, Activation, Input, Dense, concatenate, GRU, Dropout, Dense, Activation, Flatten, Conv1D, SpatialDropout1D
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.optimizers import Nadam
import tensorflow.keras.backend as K
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from pythainlp.corpus import thai_stopwords

In [None]:
# Display the TensorFlow version that this kernel imported.
tf.__version__

# Data Preparation 

In [None]:
def create_article_label(df):
    """Attach an integer ``label`` column that encodes ``df.article``.

    A fresh ``LabelEncoder`` is fitted on ``df.article`` and the resulting
    codes are stored in ``df['label']``.  The frame is modified in place and
    also returned.  The column is appended at the end when it is new and
    overwritten when it already exists, so calling the function twice on the
    same frame (e.g. re-running the cell) no longer fails.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame exposing an ``article`` column.

    Returns
    -------
    tuple
        ``(df, article_label_encoder, map_dict)`` where ``map_dict`` maps each
        integer code to the article value it was derived from.
    """
    article_label_encoder = LabelEncoder()
    prediction_encoded = article_label_encoder.fit_transform(df.article)
    # Convert the related article into a numeric label.  Plain assignment is
    # used instead of ``df.insert`` because ``insert`` raises ValueError when
    # the 'label' column is already present.
    df['label'] = prediction_encoded
    prediction_decoded = article_label_encoder.inverse_transform(prediction_encoded)
    map_dict = dict(zip(prediction_encoded, prediction_decoded))
    return df, article_label_encoder, map_dict

In [None]:
# Load the pre-processed torts dataset from a pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
torts_df = pd.read_pickle('../input/processed-torts/processed_torts20200123.pkl')
# create_article_label writes a numeric 'label' column into the frame it is given and
# returns that same frame, the fitted encoder, and a code -> article lookup dict.
df, article_label_encoder,map_dict = create_article_label(torts_df)

In [None]:
# Create labels: one multi-hot label vector and one pair of fact-token lists per case.

X_plaintiff = []   # plaintiff fact tokens, one entry per case
X_defendant = []   # defendant fact tokens, one entry per case
X_both = []        # [plaintiff text, defendant text], tokens joined with spaces
Y_set = []         # multi-hot label vectors, one per case
cases = []         # case ids, aligned with the lists above

n_labels = article_label_encoder.classes_.shape[0]

# groupby(sort=False) hands out each case's rows in order of first appearance
# (the same order as df.case_id.unique()) in a single pass, instead of
# re-scanning the whole frame with a boolean mask for every case.
# NOTE(review): groupby skips rows whose case_id is NaN — confirm none exist.
for case_id, rows in tqdm(df.groupby('case_id', sort=False)):
    # Switch on every label that appears in any row of this case.
    case_labels = np.zeros(n_labels)
    case_labels[rows['label'].to_numpy()] = 1

    # As before, the facts are taken from the LAST row of the case.
    plaintiff_fact_token = rows['plaintiff_fact_token'].iloc[-1]
    defendant_fact_token = rows['defendant_fact_token'].iloc[-1]

    cases.append(case_id)
    X_plaintiff.append(plaintiff_fact_token)
    X_defendant.append(defendant_fact_token)
    X_both.append([' '.join(plaintiff_fact_token), ' '.join(defendant_fact_token)])
    Y_set.append(case_labels)

Y_set = np.array(Y_set)
X_both = np.array(X_both)

In [None]:
# Number of cases carrying each label: column sums of the multi-hot matrix.
freqs = Y_set.sum(axis=0)
# Label indices ordered from the most to the least frequent label.
sorted_idx = np.argsort(freqs)[::-1]
sorted_idx

In [None]:
# Show the article behind each label at positions 1-10 of the frequency ranking
# (position 0, the most frequent label, is skipped by the slice).
for idx in sorted_idx[1:11]:
    # Formatting inside the f-string also works when the article value is not a str,
    # where string concatenation would raise TypeError.
    print(f"{idx} : {map_dict[idx]}")

In [None]:
# Group the rows by (article, label) pair and sum the remaining columns of each group.
# NOTE(review): if the intent is only to count rows per article, .size() is presumably what was meant — confirm.
df.groupby(['article','label']).sum()

In [None]:
def clean_stop(lst):
    """Return the tokens of ``lst`` that are not Thai stop words.

    Parameters
    ----------
    lst : iterable of str
        Tokens to filter; their relative order is preserved.

    Returns
    -------
    list of str
        The tokens that do not occur in ``thai_stopwords()``.
    """
    # A set gives constant-time membership tests; the previous list copy made
    # every lookup scan the whole stop-word collection.
    stop_words = frozenset(thai_stopwords())
    return [word for word in lst if word not in stop_words]

In [None]:
def clean_text(token_list):
    """Strip digits and ASCII punctuation from every token.

    Arabic and Thai digits and all characters of ``string.punctuation`` are
    removed from each token.  Tokens that end up empty or whitespace-only are
    dropped; the surviving tokens keep their original order.

    Parameters
    ----------
    token_list : iterable of str
        Tokens to clean.

    Returns
    -------
    list of str
        The cleaned, non-blank tokens.
    """
    digit_pattern = re.compile(r'[\d๐-๙]')
    # re.escape is required here: unescaped, the sequence ``\]`` inside
    # string.punctuation is parsed as an escaped bracket, which silently left
    # the backslash character out of the class.
    punct_pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))

    clean_lst = []
    for token in token_list:
        clean_tok = punct_pattern.sub('', digit_pattern.sub('', token))
        # Keep the token unless only whitespace (or nothing) is left.
        if clean_tok.strip():
            clean_lst.append(clean_tok)
    return clean_lst

# Evaluation functions

In [None]:
def recall(y_true, y_pred):
    """Recall computed over every element of the two tensors.

    The numerator is the sum of the rounded, [0, 1]-clipped product
    ``y_true * y_pred``; the denominator is the sum of the rounded, clipped
    ``y_true`` plus ``K.epsilon()`` to avoid division by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())


def precision(y_true, y_pred):
    """Precision computed over every element of the two tensors.

    The numerator is the sum of the rounded, [0, 1]-clipped product
    ``y_true * y_pred``; the denominator is the sum of the rounded, clipped
    ``y_pred`` plus ``K.epsilon()`` to avoid division by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged_positives + K.epsilon())


def microf1(y_true, y_pred):
    """F1 score built from counts pooled over every element of the tensors.

    Precision and recall share the same numerator (sum of the rounded, clipped
    product ``y_true * y_pred``) and differ only in the denominator; each
    division is stabilised with ``K.epsilon()``.
    """
    eps = K.epsilon()
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))

    # precision: hits over everything predicted positive
    prec = hits / (K.sum(K.round(K.clip(y_pred, 0, 1))) + eps)
    # recall: hits over everything actually positive
    rec = hits / (K.sum(K.round(K.clip(y_true, 0, 1))) + eps)

    return 2 * ((prec * rec) / (prec + rec + eps))


def macrof1(y_true, y_pred):
    """Mean of the F1 scores obtained separately for each position along axis 0.

    Predictions are rounded first; true positives, false positives and false
    negatives are then summed along axis 0 (presumably the batch axis — TODO
    confirm), turned into precision, recall and F1, and finally averaged.
    NaN entries in the F1 tensor are replaced by zero before averaging.
    """
    hard_pred = K.round(y_pred)
    eps = K.epsilon()

    true_pos = K.sum(K.cast(y_true * hard_pred, 'float'), axis=0)
    false_pos = K.sum(K.cast((1 - y_true) * hard_pred, 'float'), axis=0)
    false_neg = K.sum(K.cast(y_true * (1 - hard_pred), 'float'), axis=0)

    prec = true_pos / (true_pos + false_pos + eps)
    rec = true_pos / (true_pos + false_neg + eps)

    scores = 2 * prec * rec / (prec + rec + eps)
    scores = tf.where(tf.math.is_nan(scores), tf.zeros_like(scores), scores)
    return K.mean(scores)

# **Only plaintiff as feature** 

# Feature engineering functions

In [None]:
def featurize(token_list):
    """Binary unigram features: map every cleaned token to 1.

    Tokens are passed through ``clean_text`` first; repeated tokens collapse
    into a single key.
    """
    return dict.fromkeys(clean_text(token_list), 1)

def unibi_featurize(token_list):
    """Binary unigram + bigram features for a token list.

    Tokens are cleaned with ``clean_text``.  Every cleaned token becomes a key
    and every pair of adjacent tokens becomes a key ``"<left>_<right>"``; all
    values are 1.

    The previous loop ran over ``range(len(tokens) - 1)`` for both feature
    kinds, which dropped the unigram of the final token and produced an empty
    dict for a single-token input.  Unigrams now cover every token.
    """
    token_list = clean_text(token_list)
    unibi_dic = {}
    for tok in token_list:
        unibi_dic[tok] = 1
    # zip stops at the shorter sequence, so this yields exactly the adjacent pairs.
    for left, right in zip(token_list, token_list[1:]):
        unibi_dic[left + '_' + right] = 1
    return unibi_dic

In [None]:
# Sanity check: show the unigram + bigram features of the second case's plaintiff tokens.
unibi_featurize(X_plaintiff[1])

In [None]:
# Sanity check: show the unigram features of the second case's plaintiff tokens.
featurize(X_plaintiff[1])

Training Maxent 
===============


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction import DictVectorizer

In [None]:
# TF-IDF unigram features on the plaintiff text (column 0 of X_both).
# One binary logistic-regression model is trained per label taken from
# positions 1-10 of the frequency ranking.

for f_num in sorted_idx[1:11]:
    X = X_both[:, 0]
    Y = Y_set[:, f_num].astype(int)   # 1 when the case carries label f_num

    # 70 % train; the held-out 30 % is halved into a test and a dev part.
    X_train, X_rest, Y_train, Y_rest = train_test_split(
        X, Y, test_size=0.3, random_state=42)
    X_test, X_dev, Y_test, Y_dev = train_test_split(
        X_rest, Y_rest, test_size=0.5, random_state=42)

    # The vocabulary and IDF weights come from the training split only.
    vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    model = LogisticRegression(
        solver='liblinear', n_jobs=1, verbose=1, random_state=42,
        C=1e5, penalty='l2', max_iter=2000)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    print("Evaluation")
    print(f'<<<<<<label {f_num}>>>>>>>>' )

    metric_report = (('f1 : ', f1_score),
                     ('precision : ', precision_score),
                     ('recall : ', recall_score))
    for position, (heading, metric) in enumerate(metric_report):
        if position:
            # separator between metrics, not before the first one
            print('------------------------------------------------------------ ')
        print(heading)
        print(metric(Y_test, y_pred, labels=[1]))

In [None]:
# Bag of words: binary unigram indicators built from the plaintiff tokens.
# One binary logistic-regression model is trained per label taken from
# positions 1-10 of the frequency ranking.

X_bow_uni = list(map(featurize, X_plaintiff))

for f_num in sorted_idx[1:11]:
    X = X_bow_uni
    Y = Y_set[:, f_num].astype(int)   # 1 when the case carries label f_num

    # 70 % train; the held-out 30 % is halved into a test and a dev part.
    X_train, X_rest, Y_train, Y_rest = train_test_split(
        X, Y, test_size=0.3, random_state=42)
    X_test, X_dev, Y_test, Y_dev = train_test_split(
        X_rest, Y_rest, test_size=0.5, random_state=42)

    # The feature vocabulary comes from the training split only.
    vectorizer = DictVectorizer(sparse=True)
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    model = LogisticRegression(
        solver='liblinear', n_jobs=1, verbose=1, random_state=42,
        C=1e5, penalty='l2', max_iter=2000)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    print("Evaluation")
    print(f'<<<<<<label {f_num}>>>>>>>>' )

    metric_report = (('f1 : ', f1_score),
                     ('precision : ', precision_score),
                     ('recall : ', recall_score))
    for position, (heading, metric) in enumerate(metric_report):
        if position:
            # separator between metrics, not before the first one
            print('------------------------------------------------------------ ')
        print(heading)
        print(metric(Y_test, y_pred, labels=[1]))

In [None]:
# Unigram + bigram indicators built from the plaintiff tokens.
# One binary logistic-regression model is trained per label taken from
# positions 1-10 of the frequency ranking.

X_bow = list(map(unibi_featurize, X_plaintiff))

for f_num in sorted_idx[1:11]:
    X = X_bow
    Y = Y_set[:, f_num].astype(int)   # 1 when the case carries label f_num

    # 70 % train; the held-out 30 % is halved into a test and a dev part.
    X_train, X_rest, Y_train, Y_rest = train_test_split(
        X, Y, test_size=0.3, random_state=42)
    X_test, X_dev, Y_test, Y_dev = train_test_split(
        X_rest, Y_rest, test_size=0.5, random_state=42)

    # The feature vocabulary comes from the training split only.
    vectorizer = DictVectorizer(sparse=True)
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    model = LogisticRegression(
        solver='liblinear', n_jobs=1, verbose=1, random_state=42,
        C=1e5, penalty='l2', max_iter=2000)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    print("Evaluation")
    print(f'<<<<<<label {f_num}>>>>>>>>' )

    metric_report = (('f1 : ', f1_score),
                     ('precision : ', precision_score),
                     ('recall : ', recall_score))
    for position, (heading, metric) in enumerate(metric_report):
        if position:
            # separator between metrics, not before the first one
            print('------------------------------------------------------------ ')
        print(heading)
        print(metric(Y_test, y_pred, labels=[1]))

# **Both plaintiff and defendant as features**

# Feature engineering functions

In [None]:
#unigram
def two_facts_feature(X1, X2):
    """Binary unigram features pooled from two token lists.

    Both lists are cleaned with ``clean_text``; every surviving token, from
    either list, becomes a key with value 1.  No marker records which list a
    token came from.
    """
    first_tokens = clean_text(X1)
    second_tokens = clean_text(X2)
    return dict.fromkeys(first_tokens + second_tokens, 1)

In [None]:
 #identify source
def two_facts_uni_feature(X1, X2):
    """Binary unigram features that also record which party used each token.

    Each cleaned token of ``X1`` yields the keys ``"<token>_plaintiff"`` and
    ``"<token>"``; each cleaned token of ``X2`` yields ``"<token>_defendant"``
    and ``"<token>"``.  All values are 1.
    """
    features = {}
    tagged_sources = (('plaintiff', clean_text(X1)),
                      ('defendant', clean_text(X2)))
    for source, tokens in tagged_sources:
        for tok in tokens:
            features[tok + '_' + source] = 1
            features[tok] = 1
    return features

In [None]:
def two_facts_unibi_feature(X1, X2):
    """Binary unigram + bigram features for both parties, with source tags.

    Both token lists are cleaned with ``clean_text``.  For each party, every
    token and every pair of adjacent tokens (joined by ``'_'``) is emitted
    twice: once plain and once suffixed with ``'_plaintiff'`` (for ``X1``) or
    ``'_defendant'`` (for ``X2``).  All values are 1.

    The previous version emitted unigram features for ``X1`` only; the
    defendant's unigrams were silently missing.  Both parties are now handled
    by the same loop, matching ``two_facts_uni_feature``.
    """
    X_facts_feature = {}
    tagged_sources = (('plaintiff', clean_text(X1)),
                      ('defendant', clean_text(X2)))
    for source, tokens in tagged_sources:
        for tok in tokens:
            X_facts_feature[tok + '_' + source] = 1
            X_facts_feature[tok] = 1
        # zip stops at the shorter sequence, so this yields the adjacent pairs.
        for left, right in zip(tokens, tokens[1:]):
            bigram = left + '_' + right
            X_facts_feature[bigram + '_' + source] = 1
            X_facts_feature[bigram] = 1
    return X_facts_feature

In [None]:
# Sanity check: pooled unigram features for the first case.
two_facts_feature(X_plaintiff[0],X_defendant[0])

In [None]:
# Sanity check: source-tagged unigram features for the second case.
two_facts_uni_feature(X_plaintiff[1],X_defendant[1])

In [None]:
# Sanity check: source-tagged unigram + bigram features for the second case.
two_facts_unibi_feature(X_plaintiff[1],X_defendant[1])

# Training Maxent

In [None]:

# Pooled unigram indicators from both the plaintiff and the defendant facts.
# One binary logistic-regression model is trained per label taken from
# positions 1-10 of the frequency ranking.
X_two_feature = [two_facts_feature(plaintiff, defendant)
                 for plaintiff, defendant in zip(X_plaintiff, X_defendant)]

for f_num in sorted_idx[1:11]:
    X = X_two_feature
    Y = Y_set[:, f_num].astype(int)   # 1 when the case carries label f_num

    # 70 % train; the held-out 30 % is halved into a test and a dev part.
    X_train, X_rest, Y_train, Y_rest = train_test_split(
        X, Y, test_size=0.3, random_state=42)
    X_test, X_dev, Y_test, Y_dev = train_test_split(
        X_rest, Y_rest, test_size=0.5, random_state=42)

    # The feature vocabulary comes from the training split only.
    vectorizer = DictVectorizer(sparse=True)
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    model = LogisticRegression(
        solver='liblinear', n_jobs=1, verbose=1, random_state=42,
        C=1e5, penalty='l2', max_iter=2000)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    print("Evaluation")
    print(f'<<<<<<label {f_num}>>>>>>>>' )

    metric_report = (('f1 : ', f1_score),
                     ('precision : ', precision_score),
                     ('recall : ', recall_score))
    for position, (heading, metric) in enumerate(metric_report):
        if position:
            # separator between metrics, not before the first one
            print('------------------------------------------------------------ ')
        print(heading)
        print(metric(Y_test, y_pred, labels=[1]))


In [None]:
# Source-tagged unigram indicators from the plaintiff and the defendant facts.
# One binary logistic-regression model is trained per label taken from
# positions 1-10 of the frequency ranking.
X_two_uni = [two_facts_uni_feature(plaintiff, defendant)
             for plaintiff, defendant in zip(X_plaintiff, X_defendant)]

for f_num in sorted_idx[1:11]:
    X = X_two_uni
    Y = Y_set[:, f_num].astype(int)   # 1 when the case carries label f_num

    # 70 % train; the held-out 30 % is halved into a test and a dev part.
    X_train, X_rest, Y_train, Y_rest = train_test_split(
        X, Y, test_size=0.3, random_state=42)
    X_test, X_dev, Y_test, Y_dev = train_test_split(
        X_rest, Y_rest, test_size=0.5, random_state=42)

    # The feature vocabulary comes from the training split only.
    vectorizer = DictVectorizer(sparse=True)
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    model = LogisticRegression(
        solver='liblinear', n_jobs=1, verbose=1, random_state=42,
        C=1e5, penalty='l2', max_iter=2000)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    print("Evaluation")
    print(f'<<<<<<label {f_num}>>>>>>>>' )

    metric_report = (('f1 : ', f1_score),
                     ('precision : ', precision_score),
                     ('recall : ', recall_score))
    for position, (heading, metric) in enumerate(metric_report):
        if position:
            # separator between metrics, not before the first one
            print('------------------------------------------------------------ ')
        print(heading)
        print(metric(Y_test, y_pred, labels=[1]))

In [None]:
# Source-tagged unigram + bigram indicators from both parties' facts.
# One binary logistic-regression model is trained per label taken from
# positions 1-10 of the frequency ranking.
X_two_uni_bi = [two_facts_unibi_feature(plaintiff, defendant)
                for plaintiff, defendant in zip(X_plaintiff, X_defendant)]

for f_num in sorted_idx[1:11]:
    X = X_two_uni_bi
    Y = Y_set[:, f_num].astype(int)   # 1 when the case carries label f_num

    # 70 % train; the held-out 30 % is halved into a test and a dev part.
    X_train, X_rest, Y_train, Y_rest = train_test_split(
        X, Y, test_size=0.3, random_state=42)
    X_test, X_dev, Y_test, Y_dev = train_test_split(
        X_rest, Y_rest, test_size=0.5, random_state=42)

    # The feature vocabulary comes from the training split only.
    vectorizer = DictVectorizer(sparse=True)
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    model = LogisticRegression(
        solver='liblinear', n_jobs=1, verbose=1, random_state=42,
        C=1e5, penalty='l2', max_iter=2000)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    print("Evaluation")
    print(f'<<<<<<label {f_num}>>>>>>>>' )

    metric_report = (('f1 : ', f1_score),
                     ('precision : ', precision_score),
                     ('recall : ', recall_score))
    for position, (heading, metric) in enumerate(metric_report):
        if position:
            # separator between metrics, not before the first one
            print('------------------------------------------------------------ ')
        print(heading)
        print(metric(Y_test, y_pred, labels=[1]))

# Evaluation

In [None]:
# TF-IDF unigram features from both parties: column 0 of X_both (plaintiff
# text) and column 1 (defendant text) are vectorized separately and the two
# feature blocks are placed side by side.  One binary logistic-regression
# model is trained per label taken from positions 1-10 of the frequency ranking.
for f_num in sorted_idx[1:11]:
    X = X_both
    Y = Y_set[:, f_num].astype(int)   # 1 when the case carries label f_num

    # 70 % train; the held-out 30 % is halved into a test and a dev part.
    X_train, X_rest, Y_train, Y_rest = train_test_split(X, Y, test_size=0.3, random_state=42)
    X_test, X_dev, Y_test, Y_dev = train_test_split(X_rest, Y_rest, test_size=0.5, random_state=42)

    # One vectorizer per party.  Re-fitting a single shared instance only
    # worked because the plaintiff test matrix happened to be produced before
    # the second fit; separate instances remove that ordering trap.
    plaintiff_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    defendant_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    X_train_0 = plaintiff_vectorizer.fit_transform(X_train[:, 0])
    X_test_0 = plaintiff_vectorizer.transform(X_test[:, 0])
    X_train_1 = defendant_vectorizer.fit_transform(X_train[:, 1])
    X_test_1 = defendant_vectorizer.transform(X_test[:, 1])

    # .toarray() returns plain ndarrays.  .todense() returns np.matrix, which
    # recent scikit-learn releases refuse as estimator input.
    X_train = np.hstack([X_train_0.toarray(), X_train_1.toarray()])
    X_2 = np.hstack([X_test_0.toarray(), X_test_1.toarray()])

    model = LogisticRegression(solver='liblinear', n_jobs=1, verbose=1, random_state=42, C=1e5, penalty='l2', max_iter=2000)
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_2)

    print("Evaluation")
    print(f'<<<<<<label {f_num}>>>>>>>>' )

    metric_report = (('f1 : ', f1_score),
                     ('precision : ', precision_score),
                     ('recall : ', recall_score))
    for position, (heading, metric) in enumerate(metric_report):
        if position:
            # separator between metrics, not before the first one
            print('------------------------------------------------------------ ')
        print(heading)
        print(metric(Y_test, y_pred, labels=[1]))