In [73]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.preprocessing import MultiLabelBinarizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from numpy import array, asarray, zeros
from sklearn.model_selection import train_test_split, KFold
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalMaxPool1D, Conv1D, Dropout
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.losses import binary_crossentropy
from sklearn.metrics import average_precision_score, recall_score, f1_score, precision_score


In [74]:

# Load preprocessed dataset
file_path = "/Users/lalitaneeharikavajjhala/Desktop/Research credits /Data/preprocessed_data.csv"
df = pd.read_csv(file_path)


In [75]:

# Previewing data head and extend the max column width
pd.set_option('display.max_colwidth', None)
df.head()


Unnamed: 0,tag,sentence
0,['obligation'],we will issue a certificate of completion for each manager trainee who completes the initial training program we require to our satisfaction each such person will be referred to a a certified manager
1,['obligation'],elephant talk bear the risk of and shall indemnify against high usage fraud and bed of it elephant talk customer
2,['obligation'],subject to the term and condition of this agreement aimmune shall be responsible for the development of the product a set forth herein aimmune itself or with or through it affiliate and sublicensees shall use commercially reasonable effort to perform the development activity for the product to i achieve the development milestone set forth in section and ii obtain regulatory approval for the product
3,['obligation'],ediets shall ensure that the ediets content complies with editorial guideline
4,['obligation'],auriemma will participate in one recording session annually during the service period of not more than two hour not including travel time to record a radio advertising spot at a date and location to be mutually agreed upon


In [76]:

# Converting tags from strings to lists
df['tag'] = df['tag'].apply(lambda x: literal_eval(x))

# Encoding tags 'y'
y = df['tag']
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(y)




In [77]:
# Standard keras pre-processing
maxlen = 200
max_words = 2000
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(df.sentence)

# Functions to transform text to feature_vectors
def get_features(text_series):
    sequences = tokenizer.texts_to_sequences(text_series)
    return pad_sequences(sequences, maxlen=maxlen)

# Calling function to create features 'X'
X = get_features(df.sentence)


In [78]:

# Load Law2Vec embeddings
embeddings_dictionary = {}
law2vec_file = open("/Users/lalitaneeharikavajjhala/Desktop/Research credits /Data/Law2Vec.100d.txt", encoding="utf8")

for line in law2vec_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions

law2vec_file.close()


In [79]:

# Create embedding matrix
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100
embedding_matrix = zeros((vocab_size, embedding_dim))

for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, shuffle=True, stratify=y)

# Define CNN model with Law2Vec embeddings
filter_length = 300
num_classes = 3

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=maxlen, trainable=False))  # Removed weights argument
model.add(Dropout(0.2))
model.add(Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPool1D())
model.add(Dense(num_classes, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy'])
model.summary()

# Set weights for the embedding layer
print("Shape of embedding matrix:", embedding_matrix.shape)
model.layers[0].set_weights([embedding_matrix])

# Define the callbacks
callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4)
]

# Perform cross-validation
kf = KFold(n_splits=6, shuffle=True)
train_loss = []
val_loss = []

for train_index, val_index in kf.split(X_train):
    X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
    y_fold_train, y_fold_val = y_train[train_index], y_train[val_index]

    history = model.fit(
        X_fold_train,
        y_fold_train,
        epochs=25,
        batch_size=32,
        callbacks=callbacks,
        validation_data=(X_fold_val, y_fold_val)
    )

    train_loss.append(history.history['loss'][-1])
    val_loss.append(history.history['val_loss'][-1])

# Print the training and validation loss for each fold
for fold in range(5):
    print("Fold %d - Train Loss: %.4f - Val Loss: %.4f" % (fold+1, train_loss[fold], val_loss[fold]))



Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 200, 100)          292700    
                                                                 
 dropout_8 (Dropout)         (None, 200, 100)          0         
                                                                 
 conv1d_8 (Conv1D)           (None, 198, 300)          90300     
                                                                 
 global_max_pooling1d_8 (Gl  (None, 300)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_8 (Dense)             (None, 3)                 903       
                                                                 


Total params: 383903 (1.46 MB)
Trainable params: 91203 (356.26 KB)
Non-trainable params: 292700 (1.12 MB)
_________________________________________________________________
Shape of embedding matrix: (2927, 100)
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Fold 1 - Train Loss: 0.0652 - Val Loss: 0.1795
Fold 2 - Train Loss: 0.0561 - Val Loss: 0.0588
Fold 3 - Train Loss: 0.0267 - Val Loss: 0.0246
Fold 4 - Train 

### EVALUATION METRICS

In [80]:
# Evaluate model on test data
metrics = model.evaluate(X_test, y_test)
print("Test Loss: {}".format(metrics[0]))
print("Test Accuracy: {}".format(metrics[1]))

Test Loss: 0.2122933268547058
Test Accuracy: 0.878947377204895


In [81]:
y_pred = model.predict(X_test)
thresholded_preds = (y_pred > 0.5).astype(int)  # Applying threshold for binary classification
precision = precision_score(y_test, thresholded_preds, average = 'weighted')
recall = recall_score(y_test, thresholded_preds, average = 'weighted')
f1 = f1_score(y_test, thresholded_preds, average= 'weighted')
# print("Precision Score: {:.2}".format(average_precision_score(y_test,y_pred)))
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Precision: 0.9070055869770711
Recall: 0.8651162790697674
F1 Score: 0.8854923548241157
