# Use a pretrained feature-extraction model from TensorFlow Hub

Import libraries

In [65]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_hub as hub
import random
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from helper_function import calculate_results

## Read data

In [24]:
# Load the train / test / dev splits from CSV into pandas DataFrames.
# The "dev" file is used as the validation split (val_data).
train_data, test_data, val_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        './dataset/csv/train.csv',
        './dataset/csv/test.csv',
        './dataset/csv/dev.csv',
    )
)

In [25]:
# Show the column names available in the training DataFrame.
print(train_data.columns)

Index(['ID', 'line_number', 'discourse_type', 'discourse_text', 'total_lines'], dtype='object')


In [26]:
# Pull the raw sentence text (the "discourse_text" column) out of each split;
# these Series are the model inputs.
train_sentences, test_sentences, val_sentences = (
    split_frame["discourse_text"]
    for split_frame in (train_data, test_data, val_data)
)

## Get the model from TensorFlow Hub

In [23]:
# Wrap the Universal Sentence Encoder (v4) from TensorFlow Hub as a Keras layer.
# trainable=False keeps the pretrained weights fixed, so the layer acts purely
# as a feature extractor.
use_model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
tf_hub_embedding_layer = hub.KerasLayer(
    use_model_url,
    name="universal_sentence_encoder",
    trainable=False,
)

Test the pretrained model on a random training sentence

In [36]:
# Sanity check: embed one randomly picked training sentence and inspect the result.
# The layer takes a batch (list) of strings, hence the single-element list.
# NOTE: no random seed is set, so a different sentence is picked on every run.
random_choice_sentence = random.choice(train_sentences)
embedded_random_sentence = tf_hub_embedding_layer([random_choice_sentence])
first_embedding = embedded_random_sentence[0]  # embedding of the only sentence in the batch

print(f"Original sentence :{random_choice_sentence}")
print(f"embeddded tensor : {first_embedding[:30]}")
print(f"Length of output: {len(first_embedding)}")

Original sentence :The sum of both the complete ( proctitis symptoms plus quality of life ) and partial ( proctitis symptoms ) scores of the EORTC QLQ-PRT23 ( European Organization for Research and Treatment of Cancer Quality of Life Module for Proctitis-23 items ) questionnaire were the main endpoints .
embeddded tensor : [ 0.00856564 -0.07985796  0.02184752  0.06190689  0.03349479 -0.08490837
 -0.03335121 -0.01556801 -0.0488999  -0.04151303  0.08705017 -0.01882007
  0.0509972  -0.007896    0.01913275 -0.0051905  -0.08388961  0.03110675
  0.07423677  0.04787413 -0.00290271  0.05419457 -0.01135959  0.01923758
  0.02862718 -0.02773495 -0.0398179   0.00015636  0.01413737  0.02854043]
Length of output: 512


## Build and compile the model

In [44]:
# Functional-API classifier on top of the pretrained sentence encoder:
#   raw string -> sentence embedding -> Dense(128, relu) -> Dense(5, softmax)
# shape=[] with dtype=tf.string: each example is a single scalar string.
# The 5 output units presumably correspond to the discourse_type classes -- TODO confirm.
inputs = layers.Input(shape=[], dtype=tf.string)
embedding = tf_hub_embedding_layer(inputs)
x = layers.Dense(units=128, activation="relu")(embedding)
outputs = layers.Dense(units=5, activation="softmax")(x)

model2 = tf.keras.Model(inputs=inputs, outputs=outputs)

In [45]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model2.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None,)]                 0         
                                                                 
 universal_sentence_encoder   (None, 512)              256797824 
 (KerasLayer)                                                    
                                                                 
 dense_2 (Dense)             (None, 128)               65664     
                                                                 
 dense_3 (Dense)             (None, 5)                 645       
                                                                 
Total params: 256,864,133
Trainable params: 66,309
Non-trainable params: 256,797,824
_________________________________________________________________


In [48]:
# Compile the model (the architecture itself is built in an earlier cell).
# The loss comes from tf.keras.losses, the namespace intended for training
# objectives, rather than from tf.keras.metrics. Categorical cross-entropy
# expects one-hot encoded targets, which is what the label pipeline produces.

model2.compile(loss = tf.keras.losses.CategoricalCrossentropy(),
              optimizer = tf.keras.optimizers.Adam(),
              metrics = ["accuracy"] )

## Create train, validation and test datasets

In [52]:
# Create one-hot encoded labels.
# The encoder is fitted on the TRAINING labels only and then re-used with
# transform() for the test and validation splits. Re-fitting on every split
# (fit_transform) would derive the category -> column mapping from that split
# alone, so a split with a missing or extra class would silently get a different
# column order/width than the one the model is trained on. With transform(), an
# unseen category raises an error instead (default handle_unknown='error').
# NOTE: `sparse` was renamed to `sparse_output` in scikit-learn 1.2; switch the
# keyword when upgrading scikit-learn.

one_hot_encoder = OneHotEncoder( sparse = False )
train_lables_one_hot = one_hot_encoder.fit_transform(train_data["discourse_type"].to_numpy().reshape(-1,1))
test_lables_one_hot = one_hot_encoder.transform(test_data["discourse_type"].to_numpy().reshape(-1,1))
val_lables_one_hot = one_hot_encoder.transform(val_data["discourse_type"].to_numpy().reshape(-1,1))

In [54]:
# Pair each split's sentences with its one-hot labels and wrap them in a
# tf.data.Dataset that yields one (sentence, label) example at a time.
train_dataset, test_dataset, val_dataset = (
    tf.data.Dataset.from_tensor_slices((sentences, one_hot_labels))
    for sentences, one_hot_labels in (
        (train_sentences, train_lables_one_hot),
        (test_sentences, test_lables_one_hot),
        (val_sentences, val_lables_one_hot),
    )
)

In [55]:
# Group the per-example datasets into batches of 32 and prefetch them
# (buffer size chosen by tf.data.AUTOTUNE).
# NOTE: this cell rebinds the dataset names to their batched versions, so
# re-running it without re-running the previous cell batches them a second time.
train_dataset, test_dataset, val_dataset = (
    unbatched.batch(32).prefetch(tf.data.AUTOTUNE)
    for unbatched in (train_dataset, test_dataset, val_dataset)
)

## Fit the model

In [59]:
# Train for 3 epochs, using only 10% of the training batches per epoch and
# 10% of the validation batches for each end-of-epoch validation pass.
train_steps = int(0.1 * len(train_dataset))
val_steps = int(0.1 * len(val_dataset))

model2.fit(
    train_dataset,
    epochs=3,
    steps_per_epoch=train_steps,
    validation_data=val_dataset,
    validation_steps=val_steps,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x188be4e0310>

In [60]:
# Evaluate on the full validation dataset; returns [loss, accuracy]
# (the loss plus the "accuracy" metric requested in compile()).
model2.evaluate(val_dataset)



[0.7160554528236389, 0.7245796322822571]

## Evaluate the model

In [62]:
# Per-class probabilities for every validation sentence (softmax output) ...
model2_pred_probs = model2.predict(val_dataset)
# ... reduced to the index of the most probable class for each sentence.
model2_pred = tf.math.argmax(model2_pred_probs, axis=1)

In [63]:
# Display the predicted class indices (one integer per validation sentence).
model2_pred

<tf.Tensor: shape=(30212,), dtype=int64, numpy=array([0, 1, 3, ..., 4, 4, 2], dtype=int64)>

In [67]:
# Integer-encode the validation labels so they can be compared with the argmax
# class indices predicted by the model.
# The encoder is fitted on the TRAINING labels and only applied (transform) to
# the validation labels: the class ids then come from the label set the model
# was trained on, instead of from whatever classes happen to occur in the
# validation split. An unseen validation label raises an error rather than
# silently shifting the ids.
label_encoder = LabelEncoder()
label_encoder.fit(train_data["discourse_type"].to_numpy())
val_label_encoded = label_encoder.transform(val_data["discourse_type"].to_numpy())

In [68]:
# Compare the integer-encoded validation labels with the predicted class indices
# using the project helper from helper_function.
# NOTE(review): judging by the displayed result, the helper reports accuracy as a
# percentage while precision/recall/f1 are fractions -- confirm in helper_function.
calculate_results(y_true = val_label_encoded,
                 y_pred = model2_pred)

{'accuracy': 72.45796372302397,
 'precision': 0.7253693680271917,
 'recall': 0.7245796372302397,
 'f1': 0.7220015062365588}