# Create baseline model

In [36]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers

Read csv files

In [3]:
train_data = pd.read_csv("dataset/csv/train.csv")
test_data = pd.read_csv("dataset/csv/test.csv")
val_data = pd.read_csv("dataset/csv/dev.csv")

In [9]:
train_data

Unnamed: 0,ID,line_number,discourse_type,discourse_text,total_lines
0,24464531,0,OBJECTIVE,To investigate the efficacy of 6 weeks of dail...,12
1,24464531,1,METHODS,A total of 125 patients with primary knee OA w...,12
2,24464531,2,METHODS,Outcome measures included pain reduction and i...,12
3,24464531,3,METHODS,Pain was assessed using the visual analog pain...,12
4,24464531,4,METHODS,Secondary outcome measures included the Wester...,12
...,...,...,...,...,...
180035,24464531,7,RESULTS,For the absolute change in percent atheroma vo...,12
180036,24464531,8,RESULTS,"For PAV , a significantly greater percentage o...",12
180037,24464531,9,RESULTS,Both strategies had acceptable side effect pro...,12
180038,24464531,10,CONCLUSIONS,"Compared with standard statin monotherapy , th...",12


Turn into text data

In [12]:
train_sentences = train_data["discourse_text"]
test_sentences = test_data["discourse_text"]
val_sentences = val_data["discourse_text"]

## Create text vectorizer layer

### turn vocab into tokenaization

See how many words are in our vocab

( taken from table 2 in: https://arxiv.org/pdf/1710.06071/pdf)

In [6]:
max_tokens = 68000  # the amount of words in dataset

Create text vectorizer

Plese refer the output_sequence_length in https://github.com/mrdbourke/tensorflow-deep-learning/blob/main/09_SkimLit_nlp_milestone_project_2.ipynb

In [7]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

text_vectorizer = TextVectorization(max_tokens = max_tokens
                                   , output_sequence_length = 55) # this is kind of magic number. See the github later

In [13]:
text_vectorizer.adapt(train_sentences)

In [19]:
# Test our text vectorizer on randome sentences
import random
target_sentence = random.choice(train_sentences)
print(f"Text:\n{target_sentence}")
print(f"\nLength of text: {len(target_sentence)}")
print(f"\nVectorized text: {text_vectorizer([target_sentence])}")

Text:
The primary end point of IS/MaR was not significantly reduced .

Length of text: 63

Vectorized text: [[    2    57   208   350     4 34919    10    31    41   200     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0]]


In [24]:
# How many words in our training vocablary

rct_20k_text_vocab = text_vectorizer.get_vocabulary()
print(f"Number of words in vocab: {len(rct_20k_text_vocab)}")
print(f"Most common words in the vocab: {rct_20k_text_vocab[:5]}")
print(f"Least common words in the vocab: {rct_20k_text_vocab[-5:]}")

Number of words in vocab: 68000
Most common words in the vocab: ['', '[UNK]', 'the', 'and', 'of']
Least common words in the vocab: ['httpsuploaduminacjpcgiopenbinctrctrcgifunctionbrowsactionbrowstypesummaryrecptnor000008238languagee', 'httpsregisterclinicaltrialsgov', 'httpsmartmicrosurgerycom', 'httpseudractemaeuropaeuindexhtml', 'httpseudractemaeuropaeu']


Get config of our text vectorizer

In [25]:
text_vectorizer.get_config()

{'name': 'text_vectorization',
 'trainable': True,
 'batch_input_shape': (None, None),
 'dtype': 'string',
 'max_tokens': 68000,
 'standardize': 'lower_and_strip_punctuation',
 'split': 'whitespace',
 'ngrams': None,
 'output_mode': 'int',
 'output_sequence_length': 55,
 'pad_to_max_tokens': False,
 'sparse': False,
 'ragged': False,
 'vocabulary': None,
 'idf_weights': None}

### turn vocab numbers into embedding

In [27]:
embedding = layers.Embedding(input_dim = 68000 # the size of input shape: the size of vocab in our data
                            ,output_dim = 128 # the output shape
                            ,mask_zero = True # use masking to handle variable
                            ,name="tokes_embedding")

In [32]:
# Show example embedding
target_original_text = random.choice(train_sentences)
target_vec_text = text_vectorizer([target_original_text])
print(f"Original text: {target_original_text}")
print(embedding(target_vec_text))

Original text: After RP , 388 patients with pT3 pN0 prostate cancer ( PCa ) were randomized to WS or three-dimensional conformal ART with 60 Gy .
tf.Tensor(
[[[-0.00244201 -0.01134448  0.02494658 ...  0.04407774 -0.02176236
   -0.03933561]
  [-0.03842627 -0.02807379 -0.03776301 ... -0.00694171  0.01628477
    0.02554039]
  [-0.03869795  0.02921561  0.01425767 ...  0.04785431 -0.03675254
    0.04732194]
  ...
  [ 0.01163961  0.01285144 -0.01969146 ... -0.01965425  0.0349629
   -0.02381796]
  [ 0.01163961  0.01285144 -0.01969146 ... -0.01965425  0.0349629
   -0.02381796]
  [ 0.01163961  0.01285144 -0.01969146 ... -0.01965425  0.0349629
   -0.02381796]]], shape=(1, 55, 128), dtype=float32)


In [33]:
embedding.get_config()

{'name': 'tokes_embedding',
 'trainable': True,
 'batch_input_shape': (None, None),
 'dtype': 'float32',
 'input_dim': 68000,
 'output_dim': 128,
 'embeddings_initializer': {'class_name': 'RandomUniform',
  'config': {'minval': -0.05, 'maxval': 0.05, 'seed': None}},
 'embeddings_regularizer': None,
 'activity_regularizer': None,
 'embeddings_constraint': None,
 'mask_zero': True,
 'input_length': None}

## Create Simple Conv1D model

In [39]:
inputs = layers.Input(shape = (1,), dtype = tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.Conv1D(filters = 64, kernel_size = 5, activation = "relu", padding = "valid")(x)
x = layers.GlobalMaxPool1D()(x)
outputs = layers.Dense(1, activation = "sigmoid")(x)
model1_Conv1D = tf.keras.Model(inputs, outputs, name="model1_Conv1D")

In [41]:
model1_Conv1D.summary()

Model: "model1_Conv1D"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 55)               0         
 torization)                                                     
                                                                 
 tokes_embedding (Embedding)  (None, 55, 128)          8704000   
                                                                 
 conv1d_1 (Conv1D)           (None, 51, 64)            41024     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dense (Dense)               (None, 1)               