In [39]:
import pandas as pd
import numpy as np
from keras.layers import Embedding
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences

In [3]:
xtrain = pd.read_csv("/content/drive/MyDrive/NLP/2Competição/train.conll", sep = ' ', skip_blank_lines=False)

In [4]:
len(xtrain)

314385

In [5]:
xtest = pd.read_csv("/content/drive/MyDrive/NLP/2Competição/test.conll", sep = ' ', skip_blank_lines=False)

In [6]:
len(xtest)

14078

In [7]:
from sklearn.model_selection import train_test_split

train, dev = train_test_split(xtrain, test_size=0.210014 , shuffle=False)

In [8]:
train.head(50)

Unnamed: 0,E,O
0,M,O
1,E,O
2,N,O
3,T,O
4,A,O
5,Órgão,O
6,:,O
7,8ª,B-ORGANIZACAO
8,TURMA,I-ORGANIZACAO
9,CÍVEL,I-ORGANIZACAO


In [9]:
dev.head(5)

Unnamed: 0,E,O
248359,Quanto,O
248360,às,O
248361,apresentações,O
248362,",",O
248363,não,O


In [None]:
# train.to_csv('/content/drive/MyDrive/NLP/2 Competição/trainC.conll',sep=' ',index=False)

In [None]:
# dev.to_csv('/content/drive/MyDrive/NLP/2 Competição/testC.conll',sep=' ',index=False)

In [15]:
def split_text_label(filename):
  f = open(filename)
  split_labeled_text = []
  sentence = []
  for line in f:
    if len(line)==0 or line.startswith('-DOCSTART') or line[0]=="\n":
       if len(sentence) > 0:
         split_labeled_text.append(sentence)
         sentence = []
       continue
    splits = line.split(' ')
    sentence.append([splits[0],splits[-1].rstrip("\n")])
  if len(sentence) > 0:
    split_labeled_text.append(sentence)
    sentence = []
  return split_labeled_text

In [10]:
!cat /content/drive/MyDrive/NLP/2Competição/trainC.conll | cut -d " " -f 1,3 > train_temp.txt
!cat /content/drive/MyDrive/NLP/2Competição/testC.conll | cut -d " " -f 1,3 > dev_temp.txt

In [20]:
split_train = split_text_label("train_temp.txt")
split_valid = split_text_label("dev_temp.txt")

In [21]:
labelSet = set()
wordSet = set()
# words and labels
for data in [split_train, split_valid]:
  for labeled_text in data:
    for word, label in labeled_text:
      labelSet.add(label)
      wordSet.add(word.lower())


In [22]:
# Sort the set to ensure '0' is assigned to 0
sorted_labels = sorted(list(labelSet), key=len)
# Create mapping for labels
label2Idx = {}
for label in sorted_labels:
  label2Idx[label] = len(label2Idx)
idx2Label = {v: k for k, v in label2Idx.items()}
# Create mapping for words
word2Idx = {}
if len(word2Idx) == 0:
  word2Idx["PADDING_TOKEN"] = len(word2Idx)
  word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
for word in wordSet:
  word2Idx[word] = len(word2Idx)

In [24]:
def createMatrices(data, word2Idx, label2Idx):
  sentences = []
  labels = []
  for split_labeled_text in data:
     wordIndices = []
     labelIndices = []
     for word, label in split_labeled_text:
       if word in word2Idx:
          wordIdx = word2Idx[word]
       elif word.lower() in word2Idx:
          wordIdx = word2Idx[word.lower()]
       else:
          wordIdx = word2Idx['UNKNOWN_TOKEN']
       wordIndices.append(wordIdx)
       labelIndices.append(label2Idx[label])
     sentences.append(wordIndices)
     labels.append(labelIndices)
  return sentences, labels
train_sentences, train_labels = createMatrices(split_train, word2Idx, label2Idx)
valid_sentences, valid_labels = createMatrices(split_valid, word2Idx, label2Idx)

In [28]:
def padding(sentences, labels, max_len, padding='post'):
  padded_sentences = pad_sequences(sentences, max_len,       
  padding='post')
  padded_labels = pad_sequences(labels, max_len, padding='post')
  return padded_sentences, padded_labels

max_seq_len = 128  
train_features, train_labels = padding(train_sentences, train_labels, max_seq_len, padding='post' )
valid_features, valid_labels = padding(valid_sentences, valid_labels, max_seq_len, padding='post' )

In [37]:
# Loading glove embeddings
embeddings_index = {}
f = open('/content/drive/MyDrive/NLP/2Competição/embeddings/glove.6B.100d.txt', encoding="utf-8")
for line in f:
  values = line.strip().split(' ')
  word = values[0] # the first entry is the word
  coefs = np.asarray(values[1:], dtype='float32') #100d vectors   
  embeddings_index[word] = coefs
f.close()
embedding_matrix = np.zeros((len(word2Idx), 100))
# Word embeddings for the tokens
for word,i in word2Idx.items():
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

In [40]:
train_batch_size = 32
valid_batch_size = 64
test_batch_size = 64
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))
valid_dataset = tf.data.Dataset.from_tensor_slices((valid_features, valid_labels))
# test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_labels))
shuffled_train_dataset = train_dataset.shuffle(buffer_size=train_features.shape[0], reshuffle_each_iteration=True)
batched_train_dataset = shuffled_train_dataset.batch(train_batch_size, drop_remainder=True)
batched_valid_dataset = valid_dataset.batch(valid_batch_size, drop_remainder=True)
# batched_test_dataset = test_dataset.batch(test_batch_size, drop_remainder=True)

In [46]:
import tensorflow as tf
from tensorflow.keras import layers
class TFNer(tf.keras.Model):
  def __init__(self, max_seq_len, embed_input_dim, embed_output_dim, num_labels, weights):
   super(TFNer, self).__init__() 
   self.embedding = layers.Embedding(input_dim=embed_input_dim, 
   output_dim=embed_output_dim, weights=weights,    
   input_length=max_seq_len, trainable=False, mask_zero=True)        
   
   self.bilstm = layers.Bidirectional(layers.LSTM(128,  
   return_sequences=True))
   self.dense = layers.Dense(num_labels)
  def call(self, inputs):
   x = self.embedding(inputs) # batchsize, max_seq_len,      
   embedding_output_dim
   x = self.bilstm(x) #batchsize, max_seq_len, hidden_dim_bilstm
   logits = self.dense(x) #batchsize, max_seq_len, num_labels
   return logits

In [49]:
model = TFNer(max_seq_len=max_seq_len,embed_input_dim=len(word2Idx), embed_output_dim=100, weights=[embedding_matrix], num_labels=9)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [76]:
train_loss_metric = tf.keras.metrics.Mean('training_loss', dtype=tf.float32)
valid_loss_metric = tf.keras.metrics.Mean('valid_loss', dtype=tf.float32)


def train_step_fn(sentences_batch, labels_batch):
  with tf.GradientTape() as tape:
    logits = model(sentences_batch)
    loss = scce(labels_batch, logits)
  grads = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(list(zip(grads,   
  model.trainable_variables)))
  return loss, logits

def valid_step_fn(sentences_batch, labels_batch):
  logits = model(sentences_batch)
  loss = scce(labels_batch, logits)
  return loss, logits

for epoch in epoch_bar:
  for sentences_batch, labels_batch in progress_bar(batched_train_dataset, total=train_pb_max_len,parent=epoch_bar):
    loss, logits = train_step_fn(sentences_batch, labels_batch)
    train_loss_metric(loss)
  train_loss_metric.reset_states()
  for sentences_batch, labels_batch in progress_bar(batched_valid_dataset, total=valid_pb_max_len,parent=epoch_bar):
    loss,logits = valid_step_fn(sentences_batch,labels_batch,valid_loss_metric.update_state(loss))
    valid_loss_metric.reset_states()
    
model.save_weights(f"{args.output}/model_weights",save_format='tf')

TypeError: ignored