In [12]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

!pip install --quiet tensorflow-text
!pip install tensorflow_text
import tensorflow_text as text

import os

# Install Transformers library 
!pip install transformers
!pip install bert-for-tf2

import seaborn as sns
import matplotlib.pyplot as plt

plt.style.use('dark_background')

[K     |████████████████████████████████| 4.4 MB 38.1 MB/s 
Collecting transformers
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 29.6 MB/s 
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 5.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 50.5 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 75.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 75.6 MB/s 
Installing collected packages: pyyaml, to

The input data is already provided as a Python list of sentences.

It is defined as `input_list`.

In [1]:
# Demo abstract: each list element is one entry to classify.
# Every element must end with a comma; adjacent string literals without one
# are silently concatenated by Python into a single element.
input_list = ['This RCT examined the efficacy of a manualized social intervention for children with HFASDs',
              'Participants were randomly assigned to treatment or wait-list conditions. Treatment included instruction and therapeutic activities targeting social skills, face-emotion recognition, interest expansion, and interpretation of non-literal language. A response-cost program was applied to reduce problem behaviors and foster skills acquisition',
              'Significant treatment effects were found for five of seven primary outcome measures (parent ratings and direct child measures)',
              'Secondary measures based on staff ratings (treatment group only) corroborated gains reported by parents',
              'High levels of parent, child and staff satisfaction were reported, along with high levels of treatment fidelity',
              'Standardized effect size estimates were primarily in the medium and large ranges and favored the treatment group']

## I. TOKEN-LEVEL DATA

In [10]:
# Lowercase every entry of input_list; the result feeds the BERT tokenizer
# and the character-level split further down.
token_inputs = [sentence.lower() for sentence in input_list]

token_inputs

['this rct examined the efficacy of a manualized social intervention for children with hfasds. participants were randomly assigned to treatment or wait-list conditions. treatment included instruction and therapeutic activities targeting social skills, face-emotion recognition, interest expansion, and interpretation of non-literal language. a response-cost program was applied to reduce problem behaviors and foster skills acquisition. significant treatment effects were found for five of seven primary outcome measures (parent ratings and direct child measures).',
 'secondary measures based on staff ratings (treatment group only) corroborated gains reported by parents.',
 'high levels of parent, child and staff satisfaction were reported, along with high levels of treatment fidelity.',
 'standardized effect size estimates were primarily in the medium and large ranges and favored the treatment group.']

In [13]:
# TF Hub handle of the uncased English BERT encoder
# (per the handle name: 24 layers, hidden size 1024, 16 attention heads).
module = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
# Wrap the hub module as a Keras layer. In this notebook the layer is only
# queried for its vocabulary file and lower-casing flag (see the tokenizer cell).
bert_layer = hub.KerasLayer(module, trainable=True)

In [14]:
# NOTE(review): `bert` is presumably provided by the bert-for-tf2 package installed in the setup cell — confirm.
from bert import bert_tokenization

# Read the vocabulary file path and the lower-casing flag stored on the loaded hub module.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# Build the tokenizer from that vocabulary so its token ids agree with the hub module.
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

In [15]:
def bert_encode(texts, tokenizer, max_len=512):
    """Convert raw strings into fixed-length BERT input arrays.

    Each text is tokenized, truncated so that ``[CLS]`` and ``[SEP]`` still
    fit, converted to ids and right-padded with zeros up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(str) -> list`` and
            ``convert_tokens_to_ids(list) -> list``.
        max_len: Length of every output row, including the two special
            tokens. Must be at least 2.

    Returns:
        A tuple ``(token_ids, pad_masks, segment_ids)`` of NumPy arrays with
        one row per text and ``max_len`` columns. ``pad_masks`` is 1 on real
        tokens (including the special tokens) and 0 on padding;
        ``segment_ids`` is all zeros.

    Raises:
        ValueError: If ``max_len`` is smaller than 2, because there would be
            no room for ``[CLS]`` and ``[SEP]`` and the rows would come out
            with inconsistent lengths.
    """
    if max_len < 2:
        raise ValueError(
            "max_len must be at least 2 to hold [CLS] and [SEP], got %r" % (max_len,)
        )

    all_tokens = []
    all_masks = []
    all_segments = []

    # `raw_text` (not `text`) so the notebook-level `tensorflow_text as text`
    # alias is not shadowed inside this function.
    for raw_text in texts:
        # Reserve two positions for the special tokens added below.
        word_pieces = tokenizer.tokenize(raw_text)[:max_len - 2]
        input_sequence = ["[CLS]"] + word_pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        # Id 0 is used as the padding value.
        token_ids = list(tokenizer.convert_tokens_to_ids(input_sequence)) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        # Single-segment input: every position belongs to segment 0.
        segment_ids = [0] * max_len

        all_tokens.append(token_ids)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)

    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [16]:
# Encode the lowercased sentences; bert_encode returns a tuple of three arrays
# (token ids, padding masks, segment ids), each row fixed to 128 positions.
token_bert_inputs = bert_encode(token_inputs, tokenizer, max_len=128)

In [17]:
token_bert_inputs

(array([[  101,  2023, 22110,  2102,  8920,  1996, 21150,  1997,  1037,
          6410,  3550,  2591,  8830,  2005,  2336,  2007,  1044,  7011,
         16150,  2015,  1012,  6818,  2020, 18154,  4137,  2000,  3949,
          2030,  3524,  1011,  2862,  3785,  1012,  3949,  2443,  7899,
          1998, 17261,  3450, 14126,  2591,  4813,  1010,  2227,  1011,
          7603,  5038,  1010,  3037,  4935,  1010,  1998,  7613,  1997,
          2512,  1011, 18204,  2653,  1012,  1037,  3433,  1011,  3465,
          2565,  2001,  4162,  2000,  5547,  3291, 15592,  1998,  6469,
          4813,  7654,  1012,  3278,  3949,  3896,  2020,  2179,  2005,
          2274,  1997,  2698,  3078,  9560,  5761,  1006,  6687,  8599,
          1998,  3622,  2775,  5761,  1007,  1012,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,

## II. CHARACTER-LEVEL DATA

In [18]:
# Character-level tokenization helper
def split_characters(text):
    """Return `text` with a single space placed between every pair of adjacent characters.

    Existing spaces count as characters too, so "a b" becomes "a   b".
    """
    return ' '.join(text)

In [19]:
# Split every lowercased demo sentence into space-separated characters and
# pack the resulting strings into a NumPy array
character_inputs = np.array(list(map(split_characters, token_inputs)))

In [20]:
character_inputs

array(['t h i s   r c t   e x a m i n e d   t h e   e f f i c a c y   o f   a   m a n u a l i z e d   s o c i a l   i n t e r v e n t i o n   f o r   c h i l d r e n   w i t h   h f a s d s .   p a r t i c i p a n t s   w e r e   r a n d o m l y   a s s i g n e d   t o   t r e a t m e n t   o r   w a i t - l i s t   c o n d i t i o n s .   t r e a t m e n t   i n c l u d e d   i n s t r u c t i o n   a n d   t h e r a p e u t i c   a c t i v i t i e s   t a r g e t i n g   s o c i a l   s k i l l s ,   f a c e - e m o t i o n   r e c o g n i t i o n ,   i n t e r e s t   e x p a n s i o n ,   a n d   i n t e r p r e t a t i o n   o f   n o n - l i t e r a l   l a n g u a g e .   a   r e s p o n s e - c o s t   p r o g r a m   w a s   a p p l i e d   t o   r e d u c e   p r o b l e m   b e h a v i o r s   a n d   f o s t e r   s k i l l s   a c q u i s i t i o n .   s i g n i f i c a n t   t r e a t m e n t   e f f e c t s   w e r e   f o u n d   f o r   f i v e   o f   s e v e n   p r 

## III. LINE_NUMBER AND TOTAL_LINE DATA

#### 1. line_number

In [23]:
# Zero-based position of each entry within input_list
line_number = list(range(len(input_list)))

# One-hot encode the positions with a depth of 15
line_number_encoded = np.array(tf.one_hot(line_number, depth=15))

In [24]:
line_number_encoded

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
      dtype=float32)

#### 2. total_line

In [27]:
# Every entry carries the same value: the number of entries in input_list
total_line = [len(input_list)] * len(input_list)

# One-hot encode that count with a depth of 20
total_line_encoded = np.array(tf.one_hot(total_line, depth=20))

In [28]:
total_line_encoded

array([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.]], dtype=float32)

## MODEL INPUT DATASET

In [46]:
# Bundle the four feature groups: BERT inputs, character strings, line-number
# one-hots and total-line one-hots. Note that token_bert_inputs is itself a
# tuple of three arrays, so the list is nested.
# NOTE(review): presumably this order and nesting must match the saved model's inputs — confirm against the training notebook.
model_inputs = [token_bert_inputs, character_inputs, line_number_encoded, total_line_encoded]

In [48]:
model_inputs

[(array([[  101,  2023, 22110,  2102,  8920,  1996, 21150,  1997,  1037,
           6410,  3550,  2591,  8830,  2005,  2336,  2007,  1044,  7011,
          16150,  2015,  1012,  6818,  2020, 18154,  4137,  2000,  3949,
           2030,  3524,  1011,  2862,  3785,  1012,  3949,  2443,  7899,
           1998, 17261,  3450, 14126,  2591,  4813,  1010,  2227,  1011,
           7603,  5038,  1010,  3037,  4935,  1010,  1998,  7613,  1997,
           2512,  1011, 18204,  2653,  1012,  1037,  3433,  1011,  3465,
           2565,  2001,  4162,  2000,  5547,  3291, 15592,  1998,  6469,
           4813,  7654,  1012,  3278,  3949,  3896,  2020,  2179,  2005,
           2274,  1997,  2698,  3078,  9560,  5761,  1006,  6687,  8599,
           1998,  3622,  2775,  5761,  1007,  1012,   102,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,

## MODEL PREDICTION

In [None]:
# Load the saved classifier from Google Drive
# NOTE(review): the path assumes Drive is mounted under /content/gdrive; no mount step is visible in this notebook — confirm before running.
loaded_model = tf.keras.models.load_model('/content/gdrive/MyDrive/abstract_section_classification/model/Abstract_Section_Classification')

In [None]:
# Use the loaded model to predict on the demo data
# NOTE(review): presumably yields one row of class probabilities per entry — confirm against the model's output layer.
model_pred_proba = loaded_model.predict(model_inputs) # model_inputs is required to have the same format as the training dataset
model_pred_proba

In [None]:
# Convert probability predictions into class indices:
# for each row, take the position of the largest value along axis 1
model_preds = tf.argmax(model_pred_proba, axis=1)
model_preds

In [None]:
# Create a function to translate predicted class ids into section labels and pair them with their sentences
def display_results(model_preds_list, input_list):
    """Pair each input sentence with the name of its predicted abstract section.

    Args:
        model_preds_list: Iterable of predicted class ids in the range 0-4.
            Each value is passed through ``int()``, so plain ints, NumPy
            integers and digit strings such as ``'3'`` are all accepted.
        input_list: Sequence of sentences, aligned by position with
            ``model_preds_list``.

    Returns:
        A list of ``(section_label, sentence)`` tuples, one per prediction,
        in the same order as ``model_preds_list``.

    Raises:
        ValueError: If a prediction cannot be converted to an int or is not
            one of the five known class ids.
        IndexError: If there are more predictions than sentences.
    """
    # Position in this tuple is the class id.
    section_labels = ('BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS')

    results = []
    for index, prediction in enumerate(model_preds_list):
        # Normalise to a plain int so the lookup works for ints, NumPy
        # scalars and digit strings alike (comparing against '0'..'4' never
        # matched integer predictions).
        class_id = int(prediction)
        if not 0 <= class_id < len(section_labels):
            raise ValueError(
                "Unknown class id %r at position %d; expected 0-%d"
                % (class_id, index, len(section_labels) - 1)
            )
        results.append((section_labels[class_id], input_list[index]))

    return results

### DISCOVER MORE:

1. Training, Validation, and Testing data preparation: https://colab.research.google.com/drive/1rN-BfkA5Vc5nMToka_Pyr1q1DtxlXxki#scrollTo=FhIyNphGtFAY

2. Model creation and training: https://colab.research.google.com/drive/1ZrEdJBmeU0FDtNN5F_s0eU6yqvAEIPMt#scrollTo=d0etEZ_Mpqw1&uniqifier=1

3. Demo data preprocessing and model prediction:
https://colab.research.google.com/drive/1l3fQn91sOx7KbsLBiIPls_Aaw9CvAkFY#scrollTo=vS0raMb7tNqR