In [1]:
from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
from keras.preprocessing.text import *

import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import GRU
from keras.layers import TimeDistributed
from keras.layers import Dense
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
import json




**Loading the data**

General Function for getting train_test_val

In [2]:
def create_train_test_val(file):
    lst_out_text=[]
    lst_out_labels=[]
    for i in file.values():
        lst =[]
        sentence = i['text']
        sentence = sentence.lower()
        labels = i['labels']
        lst = sentence.split()
        lst_out_text.append(lst)
        lst_out_labels.append(labels)
    return lst_out_text,lst_out_labels    

Reading ATE JSON Files

In [3]:
ATE_train = None
ATE_test = None
ATE_val = None
with open("Task1\processed\ATE_train.json") as f1:
    ATE_train  = json.load(f1)

with open("Task1\processed\ATE_test.json") as f2:
    ATE_test = json.load(f2)

with open("Task1\processed\ATE_val.json") as f3:
    ATE_val = json.load(f3)


Reading NER JSON Files

In [4]:
NER_train = None
NER_test = None
NER_val = None
with open("Task1\processed\\NER_train.json") as f1:
    NER_train  = json.load(f1)

with open("Task1\processed\\NER_test.json") as f2:
    NER_test = json.load(f2)

with open("Task1\processed\\NER_val.json") as f3:
    NER_val = json.load(f3)


ATE_Dataset

In [5]:
X_train_ate,Y_train_ate= create_train_test_val(ATE_train)
X_test_ate,Y_test_ate = create_train_test_val(ATE_test)
X_val_ate,Y_val_ate = create_train_test_val(ATE_val)

print(X_train_ate)
print(Y_train_ate)



[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'O'], ['O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'B', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'B', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'B', 'O', 'O', 'O', 'B', 'I', 'O', 'O', 'B', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'O', 'O', 'O'], ['O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'B', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 

NER_Dataset

In [6]:
X_train_ner,Y_train_ner = create_train_test_val(NER_train)
X_test_ner,Y_test_ner = create_train_test_val(NER_test)
X_val_ner,Y_val_ner = create_train_test_val(NER_val)

print(X_train_ner)
print(Y_train_ner)

[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'B_CASE_NUMBER', 'I_CASE_NUMBER', 'I_CASE_NUMBER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_ORG', 'I_ORG', 'I_ORG', 'I_ORG', 'I_ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_OTHER_PERSON', 'I_OTHER_PERSON', 'I_OTHER_PERSON', 'O', 'O'], ['O', 'O', 'B_CASE_NUMBER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_STATUTE', 'I_STATUTE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_STATUTE', 'I_STATUTE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'B_OTHER_PERSON', 'I_OTHER_PERSON', 'O', 'O', 'O', 'O', 'B_CASE_NUMBER', 'I_CASE_NUMBER', 'I_CASE_NUMBER', 'O', 'O', 'O', 'O', 'O', 'B_CASE_NUMBER', 'I_CASE_NUMBER', 'I_CASE_NUMBE

In [7]:
model = KeyedVectors.load_word2vec_format('WordEmbeddings/Word2Vec.bin',binary=True)

In [8]:
def TokenCreator(sentences,tokenizer=None):
    
    temp_wordlist = None
    if(tokenizer == None):
        tokenizer2 = Tokenizer()
        tokenizer2.fit_on_texts(sentences)
        
        temp_wordlist = tokenizer2.texts_to_sequences(sentences)
        return (temp_wordlist,tokenizer2)
    
    else:
        temp_wordlist = tokenizer.texts_to_sequences(sentences)
        return (temp_wordlist,tokenizer)
    


In [9]:
def find_vocab(dataset):
    lst=[]
    for i in dataset:
        for j in i:
            lst.append(j)
    st = set(lst)
    return st

In [10]:
st1 = find_vocab(X_train_ate)
vocab_ate  = list(st1)
vocab_ate.append("#UK")



In [11]:
for i in X_test_ate:
    for j in range(0,len(i)):
        if i[j] not in st1:
            i[j]="#UK"

for i in X_val_ate:
    for j in range(0,len(i)):
        if i[j] not in st1:
            i[j]="#UK"
        

In [12]:
X_train_ate_tokenized, toke = TokenCreator(X_train_ate)
X_test_ate_tokenized = TokenCreator(X_test_ate, tokenizer = toke)
X_val_ate_tokenized = TokenCreator(X_val_ate, tokenizer = toke)
Y_train_ate_tokenized,toke2 = TokenCreator(Y_train_ate)
Y_test_ate_tokenized = TokenCreator(Y_test_ate, tokenizer = toke2)
Y_val_ate_tokenized = TokenCreator(Y_val_ate, tokenizer = toke2)

# print(X_test_ate_tokenized)
# print(X_val_ate_tokenized)
# print(X_test_ate_tokenized)


In [13]:
vocab_size_ate = len(vocab_ate)
vocab_size_ate

2405

In [14]:
embedding_weight = np.zeros((vocab_size_ate, 300))

word2id = toke.word_index

In [15]:
for word, index in word2id.items():
    try:
        embedding_weight[index:] = model[word]
    except KeyError:
        pass

In [16]:
print(X_test_ate_tokenized)

([[290, 82, 8, 1002, 106, 3, 209, 1103, 67, 1452, 2227, 6, 352, 1874, 2], [537, 167, 63, 19, 247, 1, 92, 806, 7, 145, 117, 766, 11, 195, 317, 2], [297, 52, 17, 46, 2], [107, 19, 491, 1, 93, 55, 671, 4, 551, 2], [88, 56, 19, 307, 9, 269, 10, 1685, 41, 596, 354, 578, 36, 4, 1, 1948, 361, 3, 5, 42, 65, 11, 44, 6, 244, 250, 75, 16, 306, 7, 59, 29, 73, 3, 523, 2281, 1, 195, 66, 1516, 2], [5, 42, 106, 3, 243, 3, 4, 335, 6, 24, 2], [104, 68, 3, 4, 7, 113, 410, 569, 6, 37, 116, 6, 72, 87, 235, 2], [449, 5, 42, 19, 243, 4, 22, 1, 95, 200, 52, 11, 5, 2347, 703, 2], [7, 113, 554, 12, 1, 106, 20, 3, 1711, 1950, 485, 4, 1, 142, 33, 53, 41, 1619, 1981, 1013, 36, 2], [1, 87, 25, 19, 338, 902, 1, 1207, 252, 2], [213, 41, 890, 10, 14, 466, 36, 21, 2372, 6, 2], [3, 11, 39, 10, 894, 1, 3, 1, 74, 8, 698, 45, 171, 2], [50, 92, 13, 7, 34, 17, 13, 1, 770, 186, 17, 19, 21, 70, 11, 44, 3, 7, 50, 34, 9, 92, 260, 49, 792, 12, 5, 3, 22, 700, 14, 23, 17, 9, 233, 2313, 2], [5, 8, 1002, 106, 4, 31, 1775, 152, 2], [2

In [17]:
MAX_SEQ_LENGTH = 100  # sequences greater than 100 in length will be truncated

X_padded = pad_sequences(X_train_ate_tokenized, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
Y_padded = pad_sequences(Y_train_ate_tokenized, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")

X_padded_test = pad_sequences(X_test_ate_tokenized, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
Y_padded_test = pad_sequences(Y_test_ate_tokenized, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")

# X_padded_val = pad_sequences(X_val_ate_tokenized, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")
# Y_padded_val = pad_sequences(Y_val_ate_tokenized, maxlen=MAX_SEQ_LENGTH, padding="pre", truncating="post")

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (328,) + inhomogeneous part.

In [None]:
Y_padded[0]
Y_padded.shape

(906, 100)

In [None]:
Y_padded = to_categorical(Y_padded)
# can change to label based encoding

array([[[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        ...,
        [0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.]],

       [[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        ...,
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]],

       [[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        ...,
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]],

       ...,

       [[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        ...,
        [0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [0., 1., 0., 0.]],

       [[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        ...,
        [0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.]],

       [[1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        ...,
        [0., 1., 0., 0.],
        [0., 1.

In [None]:
X_padded.shape

(906, 100)

In [None]:
Y_padded.shape


(906, 100, 4)

<a href="https://www.kaggle.com/code/tanyadayanand/pos-tagging-using-rnn#1.-Preprocess-data"> Reference </a>

In [None]:
gru_model = Sequential()
gru_model.add(Embedding(input_dim = vocab_size_ate,
                        output_dim = 300,
                        input_length  = 100,
                        weights = [embedding_weight],
                        ))

gru_model.add(GRU(64, return_sequences=True))
gru_model.add(TimeDistributed(Dense(Y_padded.shape[2], activation='softmax')))

In [None]:
gru_model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [None]:
gru_model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 100, 300)          822000    
                                                                 
 gru_5 (GRU)                 (None, 100, 64)           70272     
                                                                 
 time_distributed_5 (TimeDi  (None, 100, 4)            260       
 stributed)                                                      
                                                                 
Total params: 892532 (3.40 MB)
Trainable params: 892532 (3.40 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
gru_training = gru_model.fit(X_padded, Y_padded, batch_size=128, epochs=5, validation_data=(X_padded_val, Y_padded_val))

ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {'(<class \'list\'> containing values of types {"<class \'int\'>"})'}), <class 'numpy.ndarray'>