### IMPORTING LIBRARIES 

1. DATA HANDLING

In [125]:
import pandas as pd
import numpy as np

2. DATA PRE PROCESSING

In [126]:
from ast import literal_eval
from sklearn.preprocessing import MultiLabelBinarizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import text, sequence
from keras.preprocessing.sequence import pad_sequences

3. MODEL BUILDING

In [127]:
from sklearn.model_selection import train_test_split
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Conv1D, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.losses import binary_crossentropy
from keras.optimizers import Adam
import tensorflow as tf
from sklearn.metrics import average_precision_score, precision_score, recall_score, f1_score

4. TO SAVE MODEL

In [128]:
import joblib

### DATASET 

In [129]:
# Loading preprocessed dataset
file_path = "/Users/lalitaneeharikavajjhala/Desktop/Research credits /Data/preprocessed_data.csv"
df = pd.read_csv(file_path)

In [130]:
# Previewing data head and extend the max column width
pd.set_option('display.max_colwidth', None)
df.head()

Unnamed: 0,tag,sentence
0,['obligation'],we will issue a certificate of completion for each manager trainee who completes the initial training program we require to our satisfaction each such person will be referred to a a certified manager
1,['obligation'],elephant talk bear the risk of and shall indemnify against high usage fraud and bed of it elephant talk customer
2,['obligation'],subject to the term and condition of this agreement aimmune shall be responsible for the development of the product a set forth herein aimmune itself or with or through it affiliate and sublicensees shall use commercially reasonable effort to perform the development activity for the product to i achieve the development milestone set forth in section and ii obtain regulatory approval for the product
3,['obligation'],ediets shall ensure that the ediets content complies with editorial guideline
4,['obligation'],auriemma will participate in one recording session annually during the service period of not more than two hour not including travel time to record a radio advertising spot at a date and location to be mutually agreed upon


In [131]:
# Converting tags from strings to lists
df['tag'] = df['tag'].apply(lambda x: literal_eval(x))

In [132]:
# Encodeing tags 'y'
y = df['tag']
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(y)
y

array([[1, 0, 0],
       [1, 0, 0],
       [1, 0, 0],
       ...,
       [0, 1, 0],
       [0, 0, 1],
       [0, 0, 1]])

In [133]:
# Standard keras pre-processing
maxlen = 200 # Highest word count is 691 and mean is 52; however, 691 is an outlier
max_words = 2000
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(df.sentence)

# Functions to transform text to feature_vectors 
def get_features(text_series):
    sequences = tokenizer.texts_to_sequences(text_series)
    return pad_sequences(sequences, maxlen=maxlen)

In [134]:
# Calling function to create features 'X'
X = get_features(df.sentence)

# Transforming y
y = multilabel.transform(df.tag)

print(X.shape, y.shape)

(947, 200) (947, 3)


In [135]:
X

array([[  0,   0,   0, ...,   8, 577, 372],
       [  0,   0,   0, ..., 105, 106, 109],
       [  0,   0,   0, ...,  19,   1,  31],
       ...,
       [  0,   0,   0, ...,  14,  11,  37],
       [  0,   0,   0, ...,   1,  12,   9],
       [  0,   0,   0, ..., 276,   5, 238]], dtype=int32)

### MODEL DEVELOPMENT

INITIAL ORIGINAL MODEL

In [136]:
# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0, shuffle=True, stratify=y)

In [137]:
from sklearn.model_selection import KFold

In [138]:
filter_length = 300
num_classes = 3

model = Sequential()
model.add(Embedding(max_words, 20, input_length=maxlen))
model.add(Dropout(0.1))
model.add(Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPool1D())
model.add(Dense(num_classes))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['categorical_accuracy'])
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 200, 20)           40000     
                                                                 
 dropout_8 (Dropout)         (None, 200, 20)           0         
                                                                 
 conv1d_8 (Conv1D)           (None, 198, 300)          18300     
                                                                 
 global_max_pooling1d_8 (Gl  (None, 300)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_12 (Dense)            (None, 3)                 903       
                                                                 
 activation_4 (Activation)   (None, 3)                 0         
                                                      

In [139]:
# Define the callbacks
callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4)
]

# Perform cross-validation
kf = KFold(n_splits=5, shuffle=True)
train_loss = []
val_loss = []

for train_index, val_index in kf.split(X_train):
    X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
    y_fold_train, y_fold_val = y_train[train_index], y_train[val_index]

    history = model.fit(
        X_fold_train,
        y_fold_train,
        epochs=20,
        batch_size=32,
        callbacks=callbacks,
        validation_data=(X_fold_val, y_fold_val)
    )

    train_loss.append(history.history['loss'][-1])
    val_loss.append(history.history['val_loss'][-1])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


In [140]:
# Print the training and validation loss for each fold
for fold in range(5):
    print("Fold %d - Train Loss: %.4f - Val Loss: %.4f" % (fold+1, train_loss[fold], val_loss[fold]))

Fold 1 - Train Loss: 0.0404 - Val Loss: 0.2262
Fold 2 - Train Loss: 0.0298 - Val Loss: 0.0257
Fold 3 - Train Loss: 0.0186 - Val Loss: 0.0151
Fold 4 - Train Loss: 0.0085 - Val Loss: 0.0142
Fold 5 - Train Loss: 0.0088 - Val Loss: 0.0046


### EVALUATION METRICS

In [141]:
metrics = model.evaluate(X_test, y_test)
print("{}: {}".format(model.metrics_names[0], metrics[0]))
print("{}: {}".format(model.metrics_names[1], metrics[1]))

loss: 0.21197573840618134
categorical_accuracy: 0.8631578683853149


In [142]:
# Calculating loss and precision
y_pred = model.predict(X_test)
print("Precision Score: {:.2}".format(average_precision_score(y_test,y_pred)))

Precision Score: 0.97


#### HIGHER ACCURACY MODEL

In [143]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.preprocessing import MultiLabelBinarizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from numpy import array, asarray, zeros
from sklearn.model_selection import train_test_split, KFold
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalMaxPool1D, Conv1D, Dropout, SpatialDropout1D, Bidirectional, LSTM
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.losses import binary_crossentropy
from sklearn.metrics import label_ranking_average_precision_score, label_ranking_loss, average_precision_score

In [144]:
# Load preprocessed dataset
file_path = "/Users/lalitaneeharikavajjhala/Desktop/Research credits /Data/preprocessed_data.csv"
df = pd.read_csv(file_path)

# Previewing data head and extend the max column width
pd.set_option('display.max_colwidth', None)
df.head()

# Converting tags from strings to lists
df['tag'] = df['tag'].apply(lambda x: literal_eval(x))

# Encoding tags 'y'
y = df['tag']
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(y)


In [145]:
# Standard keras pre-processing
maxlen = 200
max_words = 2000
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(df.sentence)

# Functions to transform text to feature_vectors
def get_features(text_series):
    sequences = tokenizer.texts_to_sequences(text_series)
    return pad_sequences(sequences, maxlen=maxlen)

# Calling function to create features 'X'
X = get_features(df.sentence)


In [146]:
# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, shuffle=True, stratify=y)

# Define CNN model with Word2Vec embeddings
filter_length = 300
num_classes = y.shape[1]

model1 = Sequential()
model1.add(Embedding(max_words, 100, input_length=maxlen))  
model1.add(SpatialDropout1D(0.2))  
model1.add(Conv1D(64, 5, activation='relu'))  
model1.add(GlobalMaxPool1D())
model1.add(Dense(128, activation='relu'))  
model1.add(Dropout(0.5))  
model1.add(Dense(num_classes, activation='sigmoid'))

model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model1.summary()

# Define the callbacks
callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4)
]


Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 200, 100)          200000    
                                                                 
 spatial_dropout1d_4 (Spati  (None, 200, 100)          0         
 alDropout1D)                                                    
                                                                 
 conv1d_9 (Conv1D)           (None, 196, 64)           32064     
                                                                 
 global_max_pooling1d_9 (Gl  (None, 64)                0         
 obalMaxPooling1D)                                               
                                                                 
 dense_13 (Dense)            (None, 128)               8320      
                                                                 
 dropout_9 (Dropout)         (None, 128)              

In [147]:

# Perform cross-validation
kf = KFold(n_splits=6, shuffle=True)
train_loss = []
val_loss = []

for train_index, val_index in kf.split(X_train):
    X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
    y_fold_train, y_fold_val = y_train[train_index], y_train[val_index]

    history = model1.fit(
        X_fold_train,
        y_fold_train,
        epochs=25,
        batch_size=64,  # Increase batch size
        callbacks=callbacks,
        validation_data=(X_fold_val, y_fold_val)
    )

    train_loss.append(history.history['loss'][-1])
    val_loss.append(history.history['val_loss'][-1])

# Print the training and validation loss for each fold
for fold in range(5):
    print("Fold %d - Train Loss: %.4f - Val Loss: %.4f" % (fold+1, train_loss[fold], val_loss[fold]))




Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 

HIGH ACCURACY MODEL EVALUATION

In [148]:
# Evaluate model on test data
metrics = model1.evaluate(X_test, y_test)
print("Test Loss: {}".format(metrics[0]))
print("Test Accuracy: {}".format(metrics[1]))

Test Loss: 0.18982639908790588
Test Accuracy: 0.9105263352394104


In [149]:
y_pred = model.predict(X_test)
thresholded_preds = (y_pred > 0.5).astype(int)  # Applying threshold for binary classification
precision = precision_score(y_test, thresholded_preds, average = 'weighted')
recall = recall_score(y_test, thresholded_preds, average = 'weighted')
f1 = f1_score(y_test, thresholded_preds, average= 'weighted')
print("Precision Score: {:.2}".format(average_precision_score(y_test,y_pred)))
# print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


1/6 [====>.........................] - ETA: 0s

Precision Score: 0.97
Recall: 0.8930232558139535
F1 Score: 0.8943316883064705


### PREDICTION

In [150]:
# Predicting
# x = ["Each Party shall return to the other all of the other’s Confidential Information and any other material, information or samples relating to the Product which have been provided or made available to the other and shall not retain any copies and the Parties further agree not to make any further use of each other’s Confidential Information or any other information, data or samples relating to the Product provided or made available by the other Party, except as necessary to comply with its statutory, regulatory or licensing obligations; provided, however, that Kitov may retain such material, information and/or samples relating to the Product as may be necessary for Kitov to continue to sell the Product as permitted by Section ​5.4.4 below, following which, Kitov shall refrain from making any further use of Dexcel’s Confidential Information or any other information, data or samples and shall return any remaining Confidential Information and material, information or samples relating to the Product."]
x = ["The confidentiality obligations contained in this section XI shall not apply to the extent that the receiving Party (the 'Recipient') is required (a) to disclose information by law, order or regulation of a governmental agency or a court of competent jurisdiction , or (b) to disclose information to any governmental agency for purposes of obtaining approval to test or market a Product , provided in either case that the Recipient shall provide written notice thereof to the other Party and sufficient opportunity to object to any such disclosure or to request confidential treatment thereof."]
xt = get_features(x)
prediction = model.predict(xt)
# probas = np.array(prediction)
# labels = (probas > 0.5).astype(np.int)

probas = (prediction > 0.5).astype(int)
tags = multilabel.inverse_transform(probas)
# tags = multilabel.inverse_transform(labels)

print(prediction)
# print(labels)
print(tags)

[[0.9649912  0.00227412 0.78824747]]
[('obligation', 'prohibition')]


In [151]:
print('\n'.join(tags[0]).upper())

OBLIGATION
PROHIBITION


In [152]:
print(*tags[0], sep=' ')

obligation prohibition


### SAVING THE MODELS

In [153]:
# Save tokenizer
joblib.dump(tokenizer, "/Users/lalitaneeharikavajjhala/Desktop/Research credits /Models/MultiLabelTokenizer.pkl")

['/Users/lalitaneeharikavajjhala/Desktop/Research credits /Models/MultiLabelTokenizer.pkl']

In [154]:
# Save binarizer
joblib.dump(multilabel, "/Users/lalitaneeharikavajjhala/Desktop/Research credits /Models/MultiLabelBinarizer_CNN.pkl")

['/Users/lalitaneeharikavajjhala/Desktop/Research credits /Models/MultiLabelBinarizer_CNN.pkl']

In [155]:
# Save the model
joblib.dump(model, "/Users/lalitaneeharikavajjhala/Desktop/Research credits /Models/MultiLabelModel_CNN.pkl")

['/Users/lalitaneeharikavajjhala/Desktop/Research credits /Models/MultiLabelModel_CNN.pkl']