In [1]:
import os
import sys
sys.path.append("..")
import tensorflow as tf
from src.utils.reader import SciciteReader
import configparser
from src.models.keras_model import MultitaskLearner, SingletaskLearner
import numpy as np
from sklearn.metrics import classification_report

In [2]:
# Location of the experiment configuration, relative to the notebook's working directory.
CONFIG_PATH = "../configs/default.conf"

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list here so a wrong path fails
# immediately instead of surfacing later as a KeyError on config["preprocessor"].
loaded_files = config.read(CONFIG_PATH)
if not loaded_files:
    raise FileNotFoundError(f"Could not read config file: {CONFIG_PATH}")
# Displayed as the cell output, same as the bare config.read(...) call did.
loaded_files

['../configs/default.conf']

In [3]:
# Override the dataset location from the config file with a path relative to this notebook.
config["preprocessor"]["dataset"] = "../data/scicite/"

In [4]:
# Build the SciCite reader from the [preprocessor] section (including the dataset path override above).
reader = SciciteReader(config["preprocessor"])

In [5]:
print("Loading data...")
# The train split is loaded with multitask=True and yields four parallel
# sequences; for dev/test only the text and labels are kept.
text, labels, sections, worthiness = reader.load_data(_type="train", multitask=True)
text_dev, labels_dev, _, _ = reader.load_data(_type="dev", multitask=False)
text_test, labels_test, _, _ = reader.load_data(_type="test", multitask=False)

# Single-task setup. For the multitask variant, build MultitaskLearner(config)
# here and additionally pass sections_tensor and worthiness_tensor to
# create_dataset below.
keras_model = SingletaskLearner(config)

print("Preparing data...")
# Sequence length cap shared by the train, dev and test text tensors.
max_len = int(config["multitask_trainer"]["max_len"])

# Each prepare_data call returns the encoded tensor plus the tokenizer it used.
text_tensor, text_tokenizer = keras_model.prepare_data(text, max_len=max_len)
labels_tensor, labels_tokenizer = keras_model.prepare_data(labels)
sections_tensor, sections_tokenizer = keras_model.prepare_data(sections)
worthiness_tensor, worthiness_tokenizer = keras_model.prepare_data(worthiness)

# Dev and test data reuse the tokenizers obtained from the training split.
text_tensor_dev = keras_model.prepare_dev_data(text_dev, text_tokenizer, max_len=max_len)
labels_tensor_dev = keras_model.prepare_dev_data(labels_dev, labels_tokenizer)

text_tensor_test = keras_model.prepare_dev_data(text_test, text_tokenizer, max_len=max_len)
labels_tensor_test = keras_model.prepare_dev_data(labels_test, labels_tokenizer)

print("Creating datasets...")
dataset = keras_model.create_dataset(text_tensor, labels_tensor)
dev_dataset = keras_model.create_dev_dataset(text_tensor_dev, labels_tensor_dev)
test_dataset = keras_model.create_dev_dataset(text_tensor_test, labels_tensor_test)

Loading data...
Preparing data...
Creating datasets...


In [6]:
# Size of each vocabulary = number of entries in the corresponding tokenizer's word_index.
# NOTE(review): whether create_model adds the usual +1 for a padding index is not visible here — confirm.
vocab_size, labels_size, section_size, worthiness_size = (
    len(tokenizer.word_index)
    for tokenizer in (
        text_tokenizer,
        labels_tokenizer,
        sections_tokenizer,
        worthiness_tokenizer,
    )
)

# Single-task model: only the text vocabulary and label set sizes are passed.
# The multitask call would be create_model(vocab_size, labels_size, section_size, worthiness_size).
keras_model.create_model(vocab_size, labels_size)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 150)         8381250   
_________________________________________________________________
bidirectional (Bidirectional [(None, None, 128), (None 110080    
_________________________________________________________________
weird_attention (WeirdAttent (None, 128)               128       
_________________________________________________________________
dense (Dense)                (None, 5)                 645       
Total params: 8,492,103
Trainable params: 8,492,103
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Batched view of the test set for model.predict: padded batches of 64, keeping the final partial batch.
b_test_dataset = test_dataset.padded_batch(64, drop_remainder=False)

In [7]:
# Evaluate on the (unbatched) test dataset.
# NOTE(review): save_output=False presumably skips persisting predictions — confirm in keras_model.eval.
keras_model.eval(test_dataset, save_output=False)

[4 4 2 ... 4 3 2]
[2, 4, 2, 3, 2, 3, 4, 4, 2, 3, 2, 2, 4, 4, 3, 2, 3, 4, 2, 3, 3, 2, 2, 3, 2, 2, 4, 2, 2, 4, 3, 2, 3, 2, 4, 2, 2, 3, 2, 2, 2, 3, 2, 3, 2, 2, 2, 3, 2, 3, 2, 3, 4, 3, 2, 2, 2, 2, 2, 3, 3, 3, 2, 3, 3, 3, 4, 3, 3, 2, 3, 2, 2, 3, 2, 2, 2, 2, 3, 2, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 2, 2, 2, 3, 4, 2, 2, 3, 3, 2, 2, 3, 2, 2, 3, 4, 4, 2, 4, 4, 4, 4, 3, 4, 3, 2, 3, 2, 4, 2, 2, 4, 3, 2, 4, 4, 4, 3, 3, 2, 2, 2, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 4, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 3, 4, 2, 4, 2, 2, 4, 3, 2, 2, 4, 3, 2, 3, 2, 4, 4, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3, 3, 2, 3, 3, 3, 2, 2, 2, 3, 3, 2, 2, 2, 3, 2, 2, 3, 3, 4, 2, 2, 2, 2, 3, 2, 2, 2, 2, 4, 2, 4, 3, 3, 3, 3, 2, 2, 2, 3, 2, 2, 2, 3, 3, 4, 3, 2, 2, 4, 2, 3, 3, 2, 3, 2, 4, 3, 2, 2, 2, 2, 3, 2, 2, 3, 3, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 3, 3, 3, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 2, 3, 3, 2, 2, 3, 2, 2, 4, 3, 2, 2, 2, 2, 3, 3, 2, 2, 3, 3, 3, 

In [None]:
# NOTE(review): `preds` is only assigned in a later cell (model.predict on b_test_dataset),
# so this cell fails on a fresh top-to-bottom run.
preds

In [None]:
# Compare the number of gold labels before and after filtering.
# NOTE(review): `y_true` is not assigned anywhere in the visible notebook and `_true` is
# created in the following cell — this relies on leftover kernel state.
len(y_true), len(_true)

In [None]:
flat_true = y_true.flatten()

# Positions whose gold label id is 0 or 1 (zeros first, then ones); these
# examples are removed from both the gold labels and the predictions.
false_idxs = np.concatenate(
    [np.flatnonzero(flat_true == 0), np.flatnonzero(flat_true == 1)]
)
_true = np.delete(flat_true, false_idxs)

# Only score columns 2+ compete in the argmax; adding 2 maps the winning
# column back to its label id (2, 3 or 4 in the classification report).
_pred = np.delete(y_pred[:, 2:].argmax(axis=1) + 2, false_idxs)

In [None]:
# Argmax over columns 2+ of the first prediction array, shifted by 2 to recover label ids.
# NOTE(review): indexing preds[0] as 2-D implies `preds` held several output arrays here,
# unlike the single array assigned later — possibly left over from a multitask run; confirm.
preds[0][:,2:].argmax(1)+2

In [None]:
# Text report of per-class metrics for the filtered labels, restricted to label ids 2, 3 and 4.
print(classification_report(_true, _pred, labels=[2,3,4], output_dict=False))

In [None]:
# Inspect the first element of `preds` (depends on `preds` from another cell; see the predict cell below).
preds[0]

In [None]:
# take(-1) selects every element (see tf.data.Dataset.take); this cell only displays the resulting dataset object.
test_dataset.take(-1)

In [None]:
# Peek at the last ten filtered gold labels.
_true[-10:]

In [None]:
# Collect the gold label of every test example by iterating the unbatched
# dataset; each element is an (inputs, targets) pair and the label is the
# first entry of targets["dense"].
# A comprehension keeps the loop variables scoped: the original for-loop
# rebound the notebook-level `labels` (the training labels loaded earlier)
# and left a misleadingly named `images` variable behind.
__true = [
    example_targets["dense"][0].numpy()
    for _example_inputs, example_targets in test_dataset.take(-1)
]

In [None]:
# Same report as above, but scored against the gold labels collected by iterating test_dataset.
print(classification_report(np.asarray(__true), _pred, labels=[2,3,4], output_dict=False))

In [15]:
# Predict on the batched and on the raw (unbatched) test dataset to compare output shapes.
# NOTE(review): the recorded shapes are (1861, 5) batched vs (93050, 5) unbatched, and
# 93050 = 1861 * 50, so the unbatched call appears to score every token position as its
# own example (50 presumably being max_len — confirm). Prefer the batched result.
_BPREDS = keras_model.model.predict(b_test_dataset)
_PREDS = keras_model.model.predict(test_dataset)

In [17]:
# Show the (rows, classes) shape of the unbatched and batched predictions side by side.
_PREDS.shape, _BPREDS.shape

((93050, 5), (1861, 5))

In [None]:
# Class scores for the batched test set (the identical call above produced shape (1861, 5)).
preds = keras_model.model.predict(b_test_dataset)

In [None]:
# Re-run the evaluation on the unbatched test dataset, this time with eval's default save_output.
keras_model.eval(test_dataset)

In [None]:
# Predicted label id per example: argmax over score columns 2+ and shift by 2 to get the label id.
# NOTE(review): this binds y_pred to a 1-D array, while the filtering cell earlier indexes
# y_pred[:, 2:] as 2-D — the two cells cannot both run against the same value.
y_pred = preds[:,2:].argmax(1)+2
y_pred

In [None]:
# Print the actual report for the batched predictions. The original call
# printed the classification_report function object itself.
# Gold labels come from the per-example list collected from test_dataset;
# predictions are the label ids derived from `preds` in the previous cell.
print(classification_report(np.asarray(__true), y_pred, labels=[2, 3, 4]))

In [None]:
# Materialise gold labels and predictions as NumPy arrays for comparison.
__true = np.asarray(__true)
# The original cell ended in a dangling `__pred =`, which is a SyntaxError.
# The per-example predicted label ids live in y_pred (argmax over `preds`).
__pred = np.asarray(y_pred)

In [2]:
import nlp

# Load the reference SciCite release through the `nlp` package to compare split sizes
# with the local copy. NOTE(review): this rebinds `dataset`, which an earlier cell used
# for the training tf dataset.
dataset = nlp.load_dataset('scicite')

# Number of examples per split, in (train, validation, test) order.
tuple(len(dataset[split]) for split in ("train", "validation", "test"))

Using custom data configuration default


(8194, 916, 1859)

In [3]:
# Total number of examples across the three splits.
sum(len(dataset[split]) for split in ("train", "validation", "test"))

10969