# Create baseline model

In [17]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from helper_function import calculate_results

Read csv files

In [2]:
# Load the three dataset splits; the validation split is stored on disk as "dev.csv".
train_data, test_data, val_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "dataset/csv/train.csv",
        "dataset/csv/test.csv",
        "dataset/csv/dev.csv",
    )
)

In [3]:
# Preview the training DataFrame (the notebook renders a truncated view of the rows)
train_data

Unnamed: 0,ID,line_number,discourse_type,discourse_text,total_lines
0,24464531,0,OBJECTIVE,To investigate the efficacy of 6 weeks of dail...,12
1,24464531,1,METHODS,A total of 125 patients with primary knee OA w...,12
2,24464531,2,METHODS,Outcome measures included pain reduction and i...,12
3,24464531,3,METHODS,Pain was assessed using the visual analog pain...,12
4,24464531,4,METHODS,Secondary outcome measures included the Wester...,12
...,...,...,...,...,...
180035,24464531,7,RESULTS,For the absolute change in percent atheroma vo...,12
180036,24464531,8,RESULTS,"For PAV , a significantly greater percentage o...",12
180037,24464531,9,RESULTS,Both strategies had acceptable side effect pro...,12
180038,24464531,10,CONCLUSIONS,"Compared with standard statin monotherapy , th...",12


Turn into text data

In [4]:
# Pull the raw sentence column ("discourse_text") out of each split.
train_sentences, test_sentences, val_sentences = (
    split_frame["discourse_text"]
    for split_frame in (train_data, test_data, val_data)
)

## Create text vectorizer layer

### Turn text into tokens (tokenization)

See how many words are in our vocab

(taken from Table 2 in: https://arxiv.org/pdf/1710.06071.pdf)

In [5]:
max_tokens = 68000  # vocabulary size cap passed to the text vectorizer; figure taken from Table 2 of arXiv:1710.06071

Create text vectorizer

Please refer to the discussion of `output_sequence_length` in https://github.com/mrdbourke/tensorflow-deep-learning/blob/main/09_SkimLit_nlp_milestone_project_2.ipynb

In [6]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Layer that maps raw strings to integer token sequences.
# output_sequence_length=55 is a hand-picked value; the rationale is in the
# SkimLit notebook linked in the markdown cell for this section.
text_vectorizer = TextVectorization(
    max_tokens=max_tokens,
    output_sequence_length=55,
)

In [7]:
# Build the vectorizer's vocabulary from the training sentences
text_vectorizer.adapt(train_sentences)

In [8]:
# Sanity-check the text vectorizer on one randomly drawn training sentence
import random

target_sentence = random.choice(train_sentences)
vectorized_sentence = text_vectorizer([target_sentence])
n_chars = len(target_sentence)  # number of characters, not number of tokens

print(f"Text:\n{target_sentence}")
print(f"\nLength of text: {n_chars}")
print(f"\nVectorized text: {vectorized_sentence}")

Text:
Intragroup comparisons showed that CERO ( 2 ) increased by 14.3 % ( P = 0.009 , LR group ) and 13.2 % ( P = 0.032 , HES group ) , respectively , and SjvO ( 2 ) was decreased by 8.8 % ( P = 0.016 , LR group ) and 8.1 % ( P = 0.026 , HES group ) , respectively , after tumor removal , compared with baseline .

Length of text: 307

Vectorized text: [[ 7384  1444   158    28 30721    32   101    22  2149    14  2205  3277
     13     3  2167    14  5243  3953    13    86     3 28024    32    10
    218    22  1166    14  3189  3277    13     3  1017    14  4575  3953
     13    86    21   789  1400    34     7    51     0     0     0     0
      0     0     0     0     0     0     0]]


In [9]:
# Inspect the vocabulary the vectorizer learned from the training sentences
rct_20k_text_vocab = text_vectorizer.get_vocabulary()
vocab_size = len(rct_20k_text_vocab)
head_tokens = rct_20k_text_vocab[:5]   # first five vocabulary entries
tail_tokens = rct_20k_text_vocab[-5:]  # last five vocabulary entries

print(f"Number of words in vocab: {vocab_size}")
print(f"Most common words in the vocab: {head_tokens}")
print(f"Least common words in the vocab: {tail_tokens}")

Number of words in vocab: 68000
Most common words in the vocab: ['', '[UNK]', 'the', 'and', 'of']
Least common words in the vocab: ['httpsuploaduminacjpcgiopenbinctrctrcgifunctionbrowsactionbrowstypesummaryrecptnor000008238languagee', 'httpsregisterclinicaltrialsgov', 'httpsmartmicrosurgerycom', 'httpseudractemaeuropaeuindexhtml', 'httpseudractemaeuropaeu']


Get config of our text vectorizer

In [10]:
# Inspect the vectorizer's configuration (standardization, split mode, output length, ...)
text_vectorizer.get_config()

{'name': 'text_vectorization',
 'trainable': True,
 'batch_input_shape': (None, None),
 'dtype': 'string',
 'max_tokens': 68000,
 'standardize': 'lower_and_strip_punctuation',
 'split': 'whitespace',
 'ngrams': None,
 'output_mode': 'int',
 'output_sequence_length': 55,
 'pad_to_max_tokens': False,
 'sparse': False,
 'ragged': False,
 'vocabulary': None,
 'idf_weights': None}

### Turn token indices into embeddings

In [11]:
# Token-embedding layer: maps each integer token id produced by the text
# vectorizer to a 128-dimensional vector.
embedding = layers.Embedding(
    input_dim=max_tokens,  # tied to the vectorizer's vocabulary cap so the two cannot drift apart
    output_dim=128,  # size of each token's embedding vector
    mask_zero=True,  # mask token id 0 (padding) to handle variable-length sequences
    name="tokes_embedding",
)

In [12]:
# Embed one randomly chosen training sentence to inspect the resulting tensor
target_original_text = random.choice(train_sentences)
target_vec_text = text_vectorizer([target_original_text])
embedded_text = embedding(target_vec_text)

print(f"Original text: {target_original_text}")
print(embedded_text)

Original text: Following adjustment , mean total time was reduced 8.71 ( 95 % CI = 5.15-12 .26 ) minutes ( adjusted P < 0.001 ) in Restart participants and 2.31 ( -2.19 to 6.81 ) minutes ( adjusted P = 0.472 ) in New Start participants receiving video counseling .
tf.Tensor(
[[[-0.01454742 -0.04727423  0.02528096 ... -0.02653695  0.00135207
   -0.04716783]
  [-0.04480843 -0.00294306 -0.01004447 ... -0.01625295  0.01970846
   -0.04635986]
  [-0.01734294  0.01029035  0.01644957 ...  0.03022437 -0.01106698
   -0.04460226]
  ...
  [-0.04470002  0.04140927  0.03871037 ... -0.03882939 -0.03799467
   -0.03720605]
  [-0.04470002  0.04140927  0.03871037 ... -0.03882939 -0.03799467
   -0.03720605]
  [-0.04470002  0.04140927  0.03871037 ... -0.03882939 -0.03799467
   -0.03720605]]], shape=(1, 55, 128), dtype=float32)


In [13]:
# Inspect the embedding layer's configuration
embedding.get_config()

{'name': 'tokes_embedding',
 'trainable': True,
 'batch_input_shape': (None, None),
 'dtype': 'float32',
 'input_dim': 68000,
 'output_dim': 128,
 'embeddings_initializer': {'class_name': 'RandomUniform',
  'config': {'minval': -0.05, 'maxval': 0.05, 'seed': None}},
 'embeddings_regularizer': None,
 'activity_regularizer': None,
 'embeddings_constraint': None,
 'mask_zero': True,
 'input_length': None}

## Make numeric labels

Make one hot encoder.

In [31]:
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the training labels only, then reuse that fitted mapping for
# the test and validation splits so every split shares the same column order.
# Re-fitting on each split could silently reorder or drop columns whenever a split
# is missing a class; with `transform`, an unseen label raises an error instead.
# NOTE: scikit-learn >= 1.2 renames `sparse` to `sparse_output`; kept as-is for
# compatibility with the version this notebook was written against.
one_hot_encoder = OneHotEncoder(sparse=False)
train_lables_one_hot = one_hot_encoder.fit_transform(train_data["discourse_type"].to_numpy().reshape(-1,1))
test_lables_one_hot = one_hot_encoder.transform(test_data["discourse_type"].to_numpy().reshape(-1,1))
val_lables_one_hot = one_hot_encoder.transform(val_data["discourse_type"].to_numpy().reshape(-1,1))

Make the label encoder.

In [35]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training split only and apply that
# same mapping to the test and validation splits. Re-fitting per split could assign
# different integers to the same class if a split lacks one of the labels.
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_data["discourse_type"].to_numpy())
test_labels_encoded = label_encoder.transform(test_data["discourse_type"].to_numpy())
val_labels_encoded = label_encoder.transform(val_data["discourse_type"].to_numpy())

In [37]:
# Number of target classes and their names, as learned by the label encoder
class_names = label_encoder.classes_
num_class = len(class_names)
num_class, class_names

(5,
 array(['BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS'],
       dtype=object))

## Create datasets with tf.data API

The main step we apply to our data is to turn each split into a `PrefetchDataset` of batches.

In [45]:
# Pair each sentence with its one-hot label and wrap each split in a tf.data.Dataset
make_dataset = tf.data.Dataset.from_tensor_slices
train_dataset = make_dataset((train_sentences, train_lables_one_hot))
test_dataset = make_dataset((test_sentences, test_lables_one_hot))
val_dataset = make_dataset((val_sentences, val_lables_one_hot))

In [46]:
# Group each split into batches of 32 and let tf.data choose the prefetch buffer size.
# NOTE: this rebinds the same names, so re-running this cell without first re-running
# the dataset-creation cell would batch the already-batched datasets again.
train_dataset, test_dataset, val_dataset = (
    split_ds.batch(32).prefetch(tf.data.AUTOTUNE)
    for split_ds in (train_dataset, test_dataset, val_dataset)
)

In [47]:
# Confirm each split is now a batched, prefetched dataset of (text, one-hot label) pairs
train_dataset, test_dataset, val_dataset

(<PrefetchDataset shapes: ((None,), (None, 5)), types: (tf.string, tf.float64)>,
 <PrefetchDataset shapes: ((None,), (None, 5)), types: (tf.string, tf.float64)>,
 <PrefetchDataset shapes: ((None,), (None, 5)), types: (tf.string, tf.float64)>)

## Create Simple Conv1D model

In [49]:
# Conv1D baseline:
# raw string -> token ids -> embeddings -> 1D convolution -> average pool -> class probabilities
inputs = layers.Input(shape=(1,), dtype=tf.string)
token_ids = text_vectorizer(inputs)
token_embeddings = embedding(token_ids)
conv_features = layers.Conv1D(
    filters=64,
    kernel_size=5,
    padding="same",
    activation="relu",
)(token_embeddings)
pooled_features = layers.GlobalAveragePooling1D()(conv_features)
outputs = layers.Dense(num_class, activation="softmax")(pooled_features)
model1_Conv1D = tf.keras.Model(inputs=inputs, outputs=outputs, name="model1_Conv1D")

In [50]:
# Print the layer-by-layer architecture with output shapes and parameter counts
model1_Conv1D.summary()

Model: "model1_Conv1D"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 55)               0         
 torization)                                                     
                                                                 
 tokes_embedding (Embedding)  (None, 55, 128)          8704000   
                                                                 
 conv1d_3 (Conv1D)           (None, 55, 64)            41024     
                                                                 
 global_average_pooling1d_1   (None, 64)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_2 (Dense)             (None, 5)               

In [51]:
# Compile with categorical cross-entropy (the datasets carry one-hot labels),
# an Adam optimizer with default settings, and accuracy as the tracked metric.
adam_optimizer = tf.keras.optimizers.Adam()
model1_Conv1D.compile(
    optimizer=adam_optimizer,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

Fit the model.

In [54]:
# Use only 10% of the training batches per epoch and 10% of the validation batches
# per validation pass (presumably to keep experimentation fast).
train_steps = int(0.1 * len(train_dataset))
val_steps = int(0.1 * len(val_dataset))

model1_Conv1D.fit(
    train_dataset,
    epochs=3,
    steps_per_epoch=train_steps,
    validation_data=val_dataset,
    validation_steps=val_steps,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x17f26e554f0>

In [55]:
# Evaluate on the full validation set; returns the loss followed by the compiled accuracy metric
model1_Conv1D.evaluate(val_dataset)



[0.585313618183136, 0.7899178862571716]

In [56]:
# Predict class probabilities for every validation sentence (one softmax score per class)
model1_pred_probs = model1_Conv1D.predict(val_dataset)

In [57]:
# Peek at the predicted probability array
model1_pred_probs

array([[4.4027787e-01, 1.4899255e-01, 8.2412697e-02, 3.0343747e-01,
        2.4879478e-02],
       [4.3888324e-01, 2.7221543e-01, 1.5566239e-02, 2.6431456e-01,
        9.0205297e-03],
       [1.9784681e-01, 5.2143876e-03, 1.5086071e-03, 7.9539645e-01,
        3.3762430e-05],
       ...,
       [4.3376230e-07, 9.0905094e-05, 8.0912217e-04, 2.6279827e-07,
        9.9909925e-01],
       [1.9595772e-02, 4.2454383e-01, 7.0767745e-02, 2.4107980e-02,
        4.6098468e-01],
       [2.1675684e-01, 5.0665849e-01, 8.1012905e-02, 5.9875127e-02,
        1.3569674e-01]], dtype=float32)

In [60]:
# Convert prediction probabilities into class indices by taking the highest-scoring class in each row
model1_preds = tf.argmax(model1_pred_probs, axis = 1)

In [61]:
# Inspect the predicted class indices
model1_preds

<tf.Tensor: shape=(30212,), dtype=int64, numpy=array([0, 0, 3, ..., 4, 4, 1], dtype=int64)>

In [65]:
# Score the predicted class indices against the integer-encoded validation labels.
# calculate_results comes from the project-local helper_function module (not shown here);
# NOTE(review): its exact metric definitions should be confirmed against that module.
model1_results = calculate_results(y_true = val_labels_encoded,
                                  y_pred = model1_preds)

In [66]:
# Display the metrics computed for model 1
model1_results

{'accuracy': 78.99179134118893,
 'precision': 0.7842417701717078,
 'recall': 0.7899179134118893,
 'f1': 0.7862287791407456}