# Testing for Bert and LSTM

In [1]:
from transformers import BertTokenizer
from transformers import TFBertModel

model_name = "allenai/scibert_scivocab_uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import tensorflow as tf
large_capacity_model = tf.keras.models.load_model('sci_classification_model')

In [1]:
import pandas as pd

df = pd.read_csv("test data.csv")

import numpy as np
import sklearn
import tensorflow as tf
import re

regex = r"(\$+)(?:(?!\1)[\s\S])*\1"

x = df['abstract'].to_numpy()
y = df['category'].to_numpy()


df['encoded_cat'] = df['category'].astype('category').cat.codes

processed_x = []


for t in x:
    string_encode = t.encode("ascii", "ignore")
    t = string_encode.decode()
    new_t = re.sub(regex, '', t)
    new_t = new_t.replace('\n', ' ')
    processed_x.append(new_t)

df['processed abstract'] = processed_x

df.groupby('encoded_cat').describe()
df.to_csv('processed_labeled_test.csv')

In [4]:
from tqdm import tqdm
X_input_ids = np.zeros((len(df), 256))
X_attn_masks = np.zeros((len(df), 256))

def generate_training_data(df, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(df['processed abstract'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256, 
            truncation=True, 
            padding='max_length', 
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [5]:
X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

1400it [00:03, 404.80it/s]


In [2]:
labels = np.zeros((len(df), 7))
labels[np.arange(len(df)), df['encoded_cat'].values] = 1

In [3]:
labels

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [16]:
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))

In [17]:
def MyMap(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

dataset = dataset.map(MyMap)
dataset.take(1)

<TakeDataset shapes: ({input_ids: (256,), attention_mask: (256,)}, (7,)), types: ({input_ids: tf.float64, attention_mask: tf.float64}, tf.float64)>

In [18]:
dataset = dataset.shuffle(10000).batch(100, drop_remainder=True)
dataset.take(1)

<TakeDataset shapes: ({input_ids: (100, 256), attention_mask: (100, 256)}, (100, 7)), types: ({input_ids: tf.float64, attention_mask: tf.float64}, tf.float64)>

In [19]:
import time

start = time.time()
acc1 = tf.keras.metrics.TopKCategoricalAccuracy(k=2)
large_capacity_model.compile(loss='categorical_crossentropy', metrics=[acc1])
results = large_capacity_model.evaluate(dataset)
end = time.time()
print(end - start)

15.607146501541138


In [4]:
import pickle
with open('LSTM_tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [5]:
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

test_abstract = df['processed abstract']
processed_x = []

for t in test_abstract:
    string_encode = t.encode("ascii", "ignore")
    t = string_encode.decode()
    new_t = re.sub(regex, '', t)
    for word in STOPWORDS:
        token = ' ' + word + ' '
        new_t = new_t.replace(token, ' ')
        new_t = new_t.replace(' ', ' ')
    processed_x.append(new_t)

In [6]:
processed_x[1]

'The need modern data analytics combine relational, procedural, map-reduce-style functional processing widely recognized. State-of-the-art systems like Spark added SQL front-ends relational query optimization, promise increase expressiveness performance. But good extensions extracting high performance modern hardware platforms?   While Spark made impressive progress, show relational workloads, still significant gap compared best-of-breed query engines. And stepping outside relational world, query optimization techniques ineffective large parts computation treated user-defined functions (UDFs).   We present Flare: new back-end Spark brings performance closer best SQL engines, without giving added expressiveness Spark. We demonstrate order magnitude speedups relational workloads TPC-H, well range machine learning kernels combine relational iterative functional processing.   Flare achieves results (1) compilation native code, (2) replacing parts Spark runtime system, (3) extending scope o

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocab_size = 8000
embedding_dim = 256
max_length = 180
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'

sequences = tokenizer.texts_to_sequences(processed_x)
padded_x = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [8]:
labels.shape

(1400, 7)

In [9]:
padded_x.shape

(1400, 180)

In [10]:
lstm = tf.keras.models.load_model('fine_tuned_student model attention')

In [12]:
import time
start = time.time()
results_1 = lstm.evaluate(padded_x, labels)
end = time.time()
print(end - start)

0.7929182052612305


In [13]:
import time

start = time.time()
acc1 = tf.keras.metrics.TopKCategoricalAccuracy(k=2)
lstm.compile(loss='categorical_crossentropy', metrics=[acc1])
results_2 = lstm.evaluate(padded_x, labels)
end = time.time()
print(end - start)

1.3225352764129639
