### IMPORTING THE LIBRARIES 

1. DATA HANDLING

In [70]:
import pandas as pd
import numpy as np

2. DATA PREPROCESSING

In [71]:
from ast import literal_eval
from sklearn.preprocessing import MultiLabelBinarizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import text, sequence
from keras.preprocessing.sequence import pad_sequences
from numpy import array, asarray, zeros

3. MODEL BUILDING

In [72]:
from sklearn.model_selection import train_test_split
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Conv1D, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.losses import binary_crossentropy
from keras.optimizers import Adam
import tensorflow as tf
from sklearn.metrics import average_precision_score, f1_score, recall_score, precision_score

4. SAVING THE MODEL 

In [73]:
import joblib

### DATASET 

In [74]:
# Loading preprocessed dataset
file_path = "/Users/lalitaneeharikavajjhala/Desktop/Research credits /Data/preprocessed_data.csv"
df = pd.read_csv(file_path)

In [75]:
# Previewing data head and extend the max column width
pd.set_option('display.max_colwidth', None)
df.head()

Unnamed: 0,tag,sentence
0,['obligation'],we will issue a certificate of completion for each manager trainee who completes the initial training program we require to our satisfaction each such person will be referred to a a certified manager
1,['obligation'],elephant talk bear the risk of and shall indemnify against high usage fraud and bed of it elephant talk customer
2,['obligation'],subject to the term and condition of this agreement aimmune shall be responsible for the development of the product a set forth herein aimmune itself or with or through it affiliate and sublicensees shall use commercially reasonable effort to perform the development activity for the product to i achieve the development milestone set forth in section and ii obtain regulatory approval for the product
3,['obligation'],ediets shall ensure that the ediets content complies with editorial guideline
4,['obligation'],auriemma will participate in one recording session annually during the service period of not more than two hour not including travel time to record a radio advertising spot at a date and location to be mutually agreed upon


In [76]:
# Converting tags from strings to lists
df['tag'] = df['tag'].apply(lambda x: literal_eval(x))

In [77]:
# Encoding tags 'y'
y = df['tag']
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(y)
y

array([[1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       ...,
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 1]])

In [78]:
# Standard keras pre-processing
maxlen = 200 # Highest word count is 691 and mean is 52; however, 691 is an outlier
max_words = 2000
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(df.sentence)

# Functions to transform text to feature_vectors 
def get_features(text_series):
    sequences = tokenizer.texts_to_sequences(text_series)
    return pad_sequences(sequences, maxlen=maxlen)

In [79]:
# Calling function to create features 'X'
X = get_features(df.sentence)

# Transforming y
y = multilabel.transform(df.tag)

print(X.shape, y.shape)

(947, 200) (947, 3)


In [80]:
X

array([[  0,   0,   0, ...,   8, 577, 372],
       [  0,   0,   0, ..., 105, 106, 109],
       [  0,   0,   0, ...,  19,   1,  31],
       ...,
       [  0,   0,   0, ...,  14,  11,  37],
       [  0,   0,   0, ...,   1,  12,   9],
       [  0,   0,   0, ..., 276,   5, 238]], dtype=int32)

In [81]:
# law2vec 100 dimensional word embeddings
vocab_size = len(tokenizer.word_index) + 1 # Adding 1 accounts for the possibility of having an out-of-vocabulary token

embeddings_dictionary = dict()

law2vec_file = open("/Users/lalitaneeharikavajjhala/Desktop/Research credits /Data/Law2Vec.100d.txt", encoding="utf8")

In [82]:
# Parsing each line and storing word-vector pairs in a dictionary
for line in law2vec_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
law2vec_file.close()

In [83]:
# Each row corresponds to a word with its 100 dimensional word vector
embedding_matrix = zeros((vocab_size, 100))

# tokenizer.word_index is a list of (word, id) tuples
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

### MODEL DEVELOPMENT

In [84]:
# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0, shuffle=True, stratify=y)

In [85]:
# Modelling - Convolutional Neural Network with law2vec embedding

filter_length = 300
num_classes = 3 #so that the final layer is capable of outputting multiple labels

embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen, trainable=False)
model = Sequential()
model.add(embedding_layer)
model.add(Dropout(0.1))
model.add(Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPool1D())
model.add(Dense(num_classes))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 200, 100)          292700    
                                                                 
 dropout_3 (Dropout)         (None, 200, 100)          0         
                                                                 
 conv1d_3 (Conv1D)           (None, 198, 300)          90300     
                                                                 
 global_max_pooling1d_3 (Gl  (None, 300)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_3 (Dense)             (None, 3)                 903       
                                                                 
 activation_3 (Activation)   (None, 3)                 0         
                                                      

In [86]:
# Fitting the model
callbacks = [
    ReduceLROnPlateau(), # This callback reduces the learning rate when a monitored metric has stopped improving
    EarlyStopping(patience=4)
]

history = model.fit(X_train, y_train,
                    epochs=20,
                    batch_size=32,
                    callbacks=callbacks)

Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### EVALUATION METRICS

In [87]:
# Evaluation metrics
metrics = model.evaluate(X_test, y_test)
print("{}: {}".format(model.metrics_names[0], metrics[0]))
print("{}: {}".format(model.metrics_names[1], metrics[1]))

loss: 0.20908187329769135
categorical_accuracy: 0.8736842274665833


In [88]:
y_pred = model.predict(X_test)
thresholded_preds = (y_pred > 0.5).astype(int)  # Applying threshold for binary classification
precision = precision_score(y_test, thresholded_preds, average = 'weighted')
recall = recall_score(y_test, thresholded_preds, average = 'weighted')
f1 = f1_score(y_test, thresholded_preds, average= 'weighted')
# print("Precision Score: {:.2}".format(average_precision_score(y_test,y_pred)))
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Precision: 0.9053182330863692
Recall: 0.8651162790697674
F1 Score: 0.8839369000968755


### PREDICTION 

In [89]:
# x = ["Each Party shall return to the other all of the other’s Confidential Information and any other material, information or samples relating to the Product which have been provided or made available to the other and shall not retain any copies and the Parties further agree not to make any further use of each other’s Confidential Information or any other information, data or samples relating to the Product provided or made available by the other Party, except as necessary to comply with its statutory, regulatory or licensing obligations; provided, however, that Kitov may retain such material, information and/or samples relating to the Product as may be necessary for Kitov to continue to sell the Product as permitted by Section ​5.4.4 below, following which, Kitov shall refrain from making any further use of Dexcel’s Confidential Information or any other information, data or samples and shall return any remaining Confidential Information and material, information or samples relating to the Product."]
x = ["The confidentiality obligations contained in this section XI shall not apply to the extent that the receiving Party (the 'Recipient') is required (a) to disclose information by law, order or regulation of a governmental agency or a court of competent jurisdiction , or (b) to disclose information to any governmental agency for purposes of obtaining approval to test or market a Product , provided in either case that the Recipient shall provide written notice thereof to the other Party and sufficient opportunity to object to any such disclosure or to request confidential treatment thereof."]
xt = get_features(x)
prediction = model.predict(xt)
# probas = np.array(prediction)
# labels = (probas > 0.5).astype(np.int)

probas = (prediction > 0.5).astype(int)
tags = multilabel.inverse_transform(probas)
# tags = multilabel.inverse_transform(labels)

print(prediction)
# print(labels)
print(tags)

[[0.9671399  0.0086486  0.61855996]]
[('obligation', 'prohibition')]


### SAVING THE MODEL 

In [90]:
# Save tokenizer
# joblib.dump(tokenizer, "/Users/lalitaneeharikavajjhala/Desktop/Research credits /Models/MultiLabelTokenizer.pkl")

In [91]:
# Save binarizer
# joblib.dump(multilabel, "/Users/lalitaneeharikavajjhala/Desktop/Research credits /Models/MultiLabelBinarizer_CNN.pkl")

In [92]:
# Save model
# joblib.dump(model, "/Users/lalitaneeharikavajjhala/Desktop/Research credits /Models/MultiLabelModel_CNN.pkl")