# 🔥🔥 Let's Make PubMed abstracts easier to read !! with : " Brief-Lit " 📄 🔥🔥


---



Confirming GPU

In [None]:
!nvidia-smi -L

## All Imports

In [None]:
!pip install tensorflow tensorflow-hub
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import tensorflow as tf # Importing tensorflow after reinstalling
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import itertools
from tensorflow.keras.layers import TextVectorization
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score  , accuracy_score
import tensorflow_hub as hub # Importing tensorflow_hub after reinstalling
import tf_keras

## Getting the Data

**PubMed 200k RCT dataset**

The PubMed 200k RCT dataset is described in Franck Dernoncourt, Ji Young Lee. PubMed 200k RCT: a Dataset for Sequential Sentence Classification in Medical Abstracts. International Joint Conference on Natural Language Processing (IJCNLP). 2017.

**Abstract:**

PubMed 200k RCT is a new dataset based on PubMed for sequential sentence classification. The dataset consists of approximately 200,000 abstracts of randomized controlled trials, totaling 2.3 million sentences. Each sentence of each abstract is labeled with its role in the abstract using one of the following classes: background, objective, method, result, or conclusion. The purpose of releasing this dataset is twofold. First, the majority of datasets for sequential short-text classification (i.e., classification of short texts that appear in sequences) are small: we hope that releasing a new large dataset will help develop more accurate algorithms for this task. Second, from an application perspective, researchers need better tools to efficiently skim through the literature. Automatically classifying each sentence in an abstract would help researchers read abstracts more efficiently, especially in fields where abstracts may be long, such as the medical field.

In [None]:
!git clone https://github.com/Franck-Dernoncourt/pubmed-rct.git

In [None]:
!ls pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign

In [None]:
data_dir = "/content/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign"
# os.path.join inserts the path separator between the directory and each file name;
# plain string concatenation would glue them together into a non-existent path.
filenames = [os.path.join(data_dir, filename) for filename in os.listdir(data_dir)]
filenames

## Preprocess Data

In [None]:
# Reading The Lines  Of The Document

def get_lines(filename):
  """Read the text file at `filename` and return its lines as a list of strings.

  Each returned string keeps its trailing newline character (if the line had one).
  """
  with open(filename , "r") as text_file :
    # Iterating a text file yields one line at a time, newline included.
    return list(text_file)

In [None]:
train_lines = get_lines(data_dir+"/train.txt")
train_lines[:10]

In [None]:
len(train_lines)

In [None]:
def preprocess_text_with_line_numbers(filename):
  """Parse a PubMed RCT text file into one dictionary per abstract sentence.

  The file is read with `get_lines`. A line starting with "###" opens a new
  abstract, a whitespace-only line closes the current one, and every other
  line is expected to look like "LABEL<TAB>sentence text".

  Args:
    filename: path of the text file to parse.

  Returns:
    A list of dicts, in file order, each with the keys:
      "target"      - the label before the first tab,
      "text"        - the lower-cased text between the first and second tab,
      "line_number" - 0-based position of the sentence inside its abstract,
      "total_lines" - number of sentences in the abstract minus one
                      (i.e. the largest "line_number" of that abstract).

  Raises:
    IndexError: if a sentence line contains no tab character.
  """
  abstract_samples = []
  pending_lines = []  # raw sentence lines of the abstract currently being read

  def flush_pending():
    # Turn the buffered abstract into sample dicts, then empty the buffer.
    abstract_line_split = "".join(pending_lines).splitlines()
    for abstract_line_number, abstract_line in enumerate(abstract_line_split):
      target_text_split = abstract_line.split("\t")
      abstract_samples.append({
          "target": target_text_split[0],
          "text": target_text_split[1].lower(),
          "line_number": abstract_line_number,
          "total_lines": len(abstract_line_split) - 1,
      })
    pending_lines.clear()

  for line in get_lines(filename):
    if line.startswith("###") or line.isspace():
      # Both an abstract header and a blank separator end the previous abstract.
      # Flushing on the header as well keeps an abstract that was not followed
      # by a blank line from being discarded.
      flush_pending()
    else:
      pending_lines.append(line)

  # The file may end without a trailing blank line; emit the last abstract too.
  flush_pending()
  return abstract_samples

In [None]:
train_samples = preprocess_text_with_line_numbers(data_dir + "/train.txt")
val_samples = preprocess_text_with_line_numbers(data_dir + "/dev.txt")
test_samples = preprocess_text_with_line_numbers(data_dir + "/test.txt")

In [None]:
print("Length Of Training Samples : " , len(train_samples))
print("Length Of Validation Samples : " , len(val_samples))
print("Length Of Test Samples : " , len(test_samples))

In [None]:
train_samples[:10]

In [None]:
train_df = pd.DataFrame(train_samples)
val_df = pd.DataFrame(val_samples)
test_df = pd.DataFrame(test_samples)
train_df.head(14)

In [None]:
train_df["target"].value_counts()

In [None]:
# Let's check the length of Different lines
train_df["total_lines"].plot.hist();

In [None]:
train_sentences = train_df["text"].tolist()
test_sentences = test_df["text"].tolist()
val_sentences = val_df["text"].tolist()

len(train_sentences) , len(test_sentences) , len(val_sentences)

Transforming the text labels into numeric form:

One Hot Encoding

In [None]:
ohe_encoder = OneHotEncoder(sparse_output=False)
train_labels_one_hot = ohe_encoder.fit_transform(train_df["target"].to_numpy().reshape(-1 , 1))
test_labels_one_hot = ohe_encoder.transform(test_df["target"].to_numpy().reshape(-1 , 1))
val_labels_one_hot = ohe_encoder.transform(val_df["target"].to_numpy().reshape(-1 , 1))

# Note : OneHotEncoder expects a 2D array of shape (n_samples, n_features). reshape(-1 , 1) turns the 1D target column into a 2D array with one row per sample and a single column, so the encoder treats each sample as a row and produces one output column per category.

Label Encoding

In [None]:
# LabelEncoder works on a 1-D array of labels, so the target column is passed as-is.
# (Passing a reshaped (n, 1) column vector makes scikit-learn emit a DataConversionWarning.)
label_encoder = LabelEncoder()
train_labels_encoder = label_encoder.fit_transform(train_df["target"].to_numpy())
test_labels_encoder = label_encoder.transform(test_df["target"].to_numpy())
val_labels_encoder = label_encoder.transform(val_df["target"].to_numpy())
train_labels_encoder

In [None]:
def evaluation_metrics_tf(model , test_pred , test_labels) :
    """Compute multi-class evaluation metrics with TensorFlow ops.

    Precision, recall and F1 are support-weighted averages of the per-class
    scores, derived from the confusion matrix. This is the same averaging that
    `evaluation_metrics_sklearn` requests with average='weighted', so results
    of the two helpers are comparable.

    The Keras `Precision` / `Recall` metrics are not used here: they are binary
    metrics that threshold predictions at 0.5 and therefore do not apply to
    integer class ids.

    Args:
        model: Unused; kept so existing call sites keep working.
        test_pred: integer class ids predicted by the model.
        test_labels: integer class ids of the ground truth.

    Returns:
        dict with keys "Accuracy", "Recall", "Precision", "F1-Score"
        (NumPy float32 scalars) and "Confusion Matrix" (tf.Tensor whose rows
        index the true class and whose columns index the predicted class).
    """
    # Flatten both inputs to 1-D int64 class ids so arrays and tensors are handled alike.
    true_ids = tf.reshape(tf.cast(test_labels , tf.int64) , [-1])
    pred_ids = tf.reshape(tf.cast(test_pred , tf.int64) , [-1])

    confusion_matrix_tf = tf.math.confusion_matrix(true_ids , pred_ids)
    counts = tf.cast(confusion_matrix_tf , tf.float32)

    true_positives = tf.linalg.diag_part(counts)
    support = tf.reduce_sum(counts , axis=1)           # samples per true class
    predicted_totals = tf.reduce_sum(counts , axis=0)  # samples per predicted class
    total = tf.reduce_sum(counts)

    # divide_no_nan returns 0 where the denominator is 0 (class absent / never predicted).
    class_precision = tf.math.divide_no_nan(true_positives , predicted_totals)
    class_recall = tf.math.divide_no_nan(true_positives , support)
    class_f1 = tf.math.divide_no_nan(2.0 * class_precision * class_recall ,
                                     class_precision + class_recall)
    class_weights = tf.math.divide_no_nan(support , total)

    # Results dictionary
    results = {
        "Accuracy" : tf.math.divide_no_nan(tf.reduce_sum(true_positives) , total).numpy() ,
        "Recall" : tf.reduce_sum(class_weights * class_recall).numpy() ,
        "Precision" : tf.reduce_sum(class_weights * class_precision).numpy() ,
        "F1-Score" : tf.reduce_sum(class_weights * class_f1).numpy() ,
        "Confusion Matrix" : confusion_matrix_tf
    }

    return results

## Experimentation Of Different Models :

### Model 0 : Getting a baseline model : " Machine Learning Based => Multinomial NB"

In [None]:
model_0 = Pipeline([
    ("tfid", TfidfVectorizer()),
    ("clf", MultinomialNB())
]
)


model_0.fit(train_sentences , train_labels_encoder)

In [None]:
model_0.score(X= val_sentences ,
                 y= val_labels_encoder)

In [None]:
model_0_preds = model_0.predict(val_sentences)
model_0_preds

In [None]:
def evaluation_metrics_sklearn(model , test_pred , test_labels) :
    """Score predictions against true labels with scikit-learn metrics.

    Precision, recall and F1 use average='weighted'. `model` is not used.

    Returns:
        dict with keys "Accuracy", "Recall", "Precision", "F1-Score" and
        "Confusion Matrix".
    """
    weighted = {"average" : "weighted"}
    return {
        "Accuracy" : accuracy_score(test_labels , test_pred) ,
        "Recall" : recall_score(test_labels , test_pred , **weighted) ,
        "Precision" : precision_score(test_labels , test_pred , **weighted) ,
        "F1-Score" : f1_score(test_labels , test_pred , **weighted) ,
        "Confusion Matrix" : confusion_matrix(test_labels , test_pred) ,
    }

In [None]:
model_0_results = evaluation_metrics_sklearn(test_labels= val_labels_encoder ,
                                    test_pred= model_0_preds , model=model_0)
model_0_results

In [None]:
# from helper_functions import calculate_results
# model_0_results = calculate_results(y_true= val_labels_encoder ,
#                                     y_pred= model_0_preds)
# model_0_results

### Model 1 : " Conv 1D  "

In [None]:
# Lengths of Each Sentences
sentence_len_list = [len(sentence.split()) for sentence in train_sentences]
max_len = int(np.max(sentence_len_list))
min_len = int(np.min(sentence_len_list))
avg_len = int(np.mean(sentence_len_list))
_95_percentile_len = int(np.percentile(sentence_len_list , 95))
avg_len , max_len , min_len , _95_percentile_len

In [None]:
# Distribution of the lengths of the sentences
plt.hist(sentence_len_list , bins = 20)

In [None]:
# Vocab length
sentences_list = [sentence.split() for sentence in train_sentences]
word_list = list(itertools.chain.from_iterable(sentences_list))
vocab_len = len(set(word_list))
vocab_len

#### Creating the Text Vectorizer

In [None]:
text_vectorizer = TextVectorization(max_tokens = vocab_len ,
                                    output_sequence_length = _95_percentile_len)

text_vectorizer.adapt(train_sentences)

In [None]:
vocab = text_vectorizer.get_vocabulary()
print(f"Length of vocab : {len(vocab)}")

In [None]:
text_vectorizer.get_config()

 #### Creating embedding Layer

In [None]:
embedding_layer = layers.Embedding(input_dim=len(vocab),
                                   output_dim = 512 ,
                                   mask_zero = True ,
                                   name = "embedding_layer",
                                   )

In [None]:
# Checking a sample Vectorized and embedded sentence

sample_sentence = train_sentences[np.random.randint(0 , len(train_sentences))]
sample_sentence_vectorized = text_vectorizer([sample_sentence])
sample_sentence_embedded = embedding_layer(sample_sentence_vectorized)
print(f"Sample Sentence : {sample_sentence}")
print(f"Sample Sentence Vectorized : {sample_sentence_vectorized}")
print(f"Sample Sentence Embedded : {sample_sentence_embedded}")

Turning Our Datasets  Into TensorFlow Datasets:  "TensorFlow tf.data API"

In [None]:
train_tf_dataset = tf.data.Dataset.from_tensor_slices((train_sentences , train_labels_one_hot))
val_tf_dataset = tf.data.Dataset.from_tensor_slices((val_sentences , val_labels_one_hot))
test_tf_dataset = tf.data.Dataset.from_tensor_slices((test_sentences , test_labels_one_hot))

In [None]:
# Batched and prefetched datasets
train_tf_dataset= train_tf_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
val_tf_dataset = val_tf_dataset.batch(32).prefetch(tf.data.AUTOTUNE)
test_tf_dataset = test_tf_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

In [None]:
train_tf_dataset

In [None]:
# Model Building
inputs = layers.Input(shape=(1,) , dtype=tf.string)
vectorized_text = text_vectorizer(inputs)
embedding_text = embedding_layer(vectorized_text)
x = layers.Conv1D(128 , kernel_size = 5 , padding="same" , activation ="relu")(embedding_text)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(train_df['target'].nunique() , activation="softmax")(x)
model_1 = tf.keras.Model(inputs , outputs)

In [None]:
# Compile Model
model_1.compile(loss="categorical_crossentropy" ,
                optimizer = tf.keras.optimizers.Adam() ,
                metrics = ["accuracy"])

In [None]:
model_1.summary()

In [None]:
# Fitting the model_1
model_1_history = model_1.fit(train_tf_dataset ,steps_per_epoch=int(0.1*len(train_tf_dataset)) ,  epochs = 5 , validation_data = val_tf_dataset , validation_steps = int(0.1*len(val_tf_dataset)))

In [None]:
model_1.evaluate(val_tf_dataset)

In [None]:
model_1_pred_probs = model_1.predict(val_tf_dataset)
model_1_pred_probs

In [None]:
model_1_preds = tf.argmax(model_1_pred_probs , axis=1)
model_1_preds

In [None]:
model_1_results = evaluation_metrics_tf(model= model_1 ,test_labels=val_labels_encoder , test_pred=model_1_preds)
model_1_results

### Model 2 : " Using Pretrained Token Embeddings  "

In [None]:
!pip install -q sentence-transformers

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
from sentence_transformers import SentenceTransformer
from tensorflow.keras.models import Model
import matplotlib.pyplot as plt

##### Creating Sentence Encoder

In [None]:
sentence_encoder = SentenceTransformer('all-MiniLM-L6-v2')

##### Preparing our dataset (embedding the train, validation and test datasets)

In [None]:
print("Encoding training sentences...")
train_embeddings = sentence_encoder.encode(train_sentences)
print("Encoding validation sentences...")
val_embeddings = sentence_encoder.encode(val_sentences)
print("Encoding test sentences...")
test_embeddings = sentence_encoder.encode(test_sentences)

In [None]:
embedding_dim = train_embeddings.shape[1]  # Typically 384 for all-MiniLM-L6-v2
print(f"Embedding dimension: {embedding_dim}")

In [None]:
# Convert to TensorFlow dataset
train_ds = tf.data.Dataset.from_tensor_slices((train_embeddings, train_labels_one_hot))
val_ds = tf.data.Dataset.from_tensor_slices((val_embeddings, val_labels_one_hot))
test_ds = tf.data.Dataset.from_tensor_slices((test_embeddings, test_labels_one_hot))

In [None]:
# Batch and prefetch
batch_size = 32
train_ds = train_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [None]:
# Build the model directly without using a function
num_classes = train_df['target'].nunique()
inputs = layers.Input(shape=(embedding_dim,))

In [None]:
# Dense layers with ReLU activation
x = layers.Dense(256, activation='relu')(inputs)
x = layers.Dropout(0.2)(x)
x = layers.Dense(128, activation='relu')(x)
x = layers.Dropout(0.2)(x)

In [None]:
# Output layer
outputs = layers.Dense(num_classes, activation='softmax')(x)

# Create the model
model_2 = Model(inputs=inputs, outputs=outputs)

In [None]:
# Compile the model
model_2.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
    metrics=['accuracy']
)

In [None]:
# Model summary
model_2.summary()

In [None]:
# Train the model
model_2_history = model_2.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        )
    ]
)

In [None]:
# Evaluate the model
test_loss, test_accuracy = model_2.evaluate(test_ds)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

In [None]:
model_2_pred_probs = model_2.predict(val_ds)
model_2_pred_probs

In [None]:
model_2_preds = tf.argmax(model_2_pred_probs , axis=1)
model_2_preds

In [None]:
model_2_results = evaluation_metrics_tf(model= model_2 ,test_labels=val_labels_encoder , test_pred=model_2_preds)
model_2_results

### Model 3 : Conv 1D with "Character Encoding"

#### Creating character Level Embeddings

In [None]:
def split_chars(text):
  """Return `text` with a single space placed between consecutive characters."""
  separator = " "
  return separator.join(character for character in text)

In [None]:
# Splitting sequence-level data splits into character-level data splits
train_chars = [split_chars(sentence) for sentence in train_sentences]
val_chars = [split_chars(sentence) for sentence in val_sentences]
test_chars = [split_chars(sentence) for sentence in test_sentences]
print(train_chars[0])

In [None]:
# What's the average character length?
char_lens = [len(sentence) for sentence in train_sentences]
mean_char_len = np.mean(char_lens)
mean_char_len

In [None]:
# Checking the distribution of our sequences at character-level
import matplotlib.pyplot as plt
plt.hist(char_lens, bins=7);

In [None]:
# Find what character length covers 95% of sequences
output_seq_char_len = int(np.percentile(char_lens, 95))
output_seq_char_len

In [None]:
# Get all keyboard characters for char-level embedding
import string
alphabet = string.ascii_lowercase + string.digits + string.punctuation
alphabet

In [None]:
# Create char-level token vectorizer instance
NUM_CHAR_TOKENS = len(alphabet) + 2
char_vectorizer = TextVectorization(max_tokens=NUM_CHAR_TOKENS,
                                    output_sequence_length=output_seq_char_len,
                                    standardize="lower_and_strip_punctuation",
                                    name="char_vectorizer")

# Adapt character vectorizer to training characters
char_vectorizer.adapt(train_chars)

In [None]:
# Check character vocabulary characteristics
char_vocab = char_vectorizer.get_vocabulary()
print(f"Number of different characters in character vocab: {len(char_vocab)}")
print(f"5 most common characters: {char_vocab[:5]}")
print(f"5 least common characters: {char_vocab[-5:]}")

In [None]:
# Create char embedding layer
char_embed = layers.Embedding(input_dim=NUM_CHAR_TOKENS,
                              output_dim=100,
                              mask_zero=False,
                              name="char_embed")

In [None]:
# Test out character embedding layer
print(f"Charified text (before vectorization and embedding):\n{train_chars[30]}\n")
char_embed_example = char_embed(char_vectorizer([train_chars[30]]))
print(f"Embedded chars (after vectorization and embedding):\n{char_embed_example}\n")
print(f"Character embedding shape: {char_embed_example.shape}")

#### Building The Model

In [None]:
inputs = layers.Input(shape=(1,) , dtype = "string")
char_vectors = char_vectorizer(inputs)
char_embeddings = char_embed(char_vectors)
x = layers.Conv1D(128 , kernel_size = 5 , padding="same" , activation ="relu")(char_embeddings)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(train_df['target'].nunique() , activation="softmax")(x)
model_3 = tf.keras.Model(inputs , outputs , name = "model_3_conv1d_char_embeddings")

In [None]:
model_3.compile(loss = "categorical_crossentropy" , optimizer = tf.keras.optimizers.Adam() , metrics = ["accuracy"])

In [None]:
model_3.summary()

In [None]:
# Create char datasets
train_char_dataset = tf.data.Dataset.from_tensor_slices((train_chars, train_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)
val_char_dataset = tf.data.Dataset.from_tensor_slices((val_chars, val_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)

In [None]:
model_3_history = model_3.fit(train_char_dataset,
                              steps_per_epoch=int(0.1 * len(train_char_dataset)),
                              epochs=10,
                              validation_data=val_char_dataset,
                              validation_steps=int(0.1 * len(val_char_dataset)))

#### Evaluating our model

In [None]:
model_3.evaluate(val_char_dataset)

In [None]:
model_3_pred_probs = model_3.predict(val_char_dataset)
model_3_pred_probs

In [None]:
model_3_preds = tf.argmax(model_3_pred_probs, axis=1)
model_3_preds

In [None]:
model_3_results = evaluation_metrics_tf(model = model_3 , test_labels=val_labels_encoder,test_pred=model_3_preds)
model_3_results

### Model4 : Combining Pretrained Embeddings + Characters Embeddings (hybrid Embedding)
- Create a token-level embedding model
- Create a character-level model
- Building a series of output layers
- Construct a model which takes character-level sequences as input and produces sequence label probabilities as output

#### Create a token-level embedding model

In [None]:
# # Define the custom Sentence Encoder Layer
# class SentenceEncoderLayer(tf.keras.layers.Layer):
#     def __init__(self, model_name='all-MiniLM-L6-v2', **kwargs):
#         super(SentenceEncoderLayer, self).__init__(**kwargs)
#         self.model_name = model_name
#         self.encoder = SentenceTransformer(model_name)
#         self.trainable = False  # Set to True if you want it to be trainable
#         self.output_dim = self.encoder.get_sentence_embedding_dimension()

#     def build(self, input_shape):
#         # This ensures the layer knows its output shape
#         self.built = True

#     def call(self, inputs):
#         # Remove extra dimension if needed: inputs shape might be (batch_size, 1)
#         inputs = tf.squeeze(inputs, axis=1)
#         return tf.py_function(
#             func=self._encode_sentences,
#             inp=[inputs],
#             Tout=tf.float32
#         )

#     def _encode_sentences(self, sentences):
#         # Convert tensor to list of strings
#         if isinstance(sentences, tf.Tensor):
#             sentences = [s.decode('utf-8') for s in sentences.numpy()]

#         # Get embeddings
#         embeddings = self.encoder.encode(sentences)
#         return tf.convert_to_tensor(embeddings, dtype=tf.float32)

#     def compute_output_shape(self, input_shape):
#         # Specify the output shape explicitly
#         return (input_shape[0], self.output_dim)

#     def get_config(self):
#         config = {
#             'model_name': self.model_name
#         }
#         base_config = super(SentenceEncoderLayer, self).get_config()
#         return dict(list(base_config.items()) + list(config.items()))

In [None]:
# # Setup token input branch
# token_inputs = layers.Input(shape=(1,), dtype=tf.string, name="token_input_main")
# sentence_encoder_layer = SentenceEncoderLayer()
# token_embeddings = sentence_encoder_layer(token_inputs)
# # Ensure the shape is set explicitly after processing
# # token_embeddings = tf.reshape(token_embeddings, [-1, sentence_encoder_layer.output_dim])
# token_output = layers.Dense(128, activation="relu")(token_embeddings)
# token_model = tf.keras.Model(inputs=token_inputs,
#                             outputs=token_output)

In [None]:
# token_model.summary()

#### Create a character-level model

In [None]:
# # 2. Setting up char inputs/model
# char_inputs = layers.Input(shape=(1,), dtype=tf.string, name="char_input")
# char_vectors = char_vectorizer(char_inputs)
# char_embeddings = char_embed(char_vectors)
# char_bi_lstm = layers.Bidirectional(layers.LSTM(25))(char_embeddings)
# char_model = tf.keras.Model(inputs=char_inputs,
#                             outputs=char_bi_lstm)

In [None]:
# char_model.summary()

#### Building a series of output layers

In [None]:
# # 3. Concatenating token and char inputs (create hybrid token embedding)
# token_char_concat = layers.Concatenate(name="token_char_hybrid")([token_model.output,
#                                                                   char_model.output])

In [None]:
# # 4. Creating output layers - adding of dropout
# combined_dropout = layers.Dropout(0.5)(token_char_concat)
# combined_dense = layers.Dense(200, activation="relu")(combined_dropout)
# final_dropout = layers.Dropout(0.5)(combined_dense)
# output_layer = layers.Dense(num_classes, activation="softmax")(final_dropout)

In [None]:
# # 5. Constructing the model with char and token inputs
# model_4 = tf.keras.Model(inputs=[token_model.input, char_model.input],
#                          outputs=output_layer,
#                          name="model_4_token_and_char_embeddings")

In [None]:
# model_4.summary()

In [None]:
# # Plot and visualizations
# from keras.utils import plot_model
# plot_model(model_4 , show_shapes=True)

#### Compiling the model

In [None]:
# # Combine Char + token model
# model_4.compile(loss = "categorical_crossentropy",
#                 optimizer=tf.keras.optimizers.Adam(),
#                 metrics =["accuracy"])

#### Combining token and character data into a tf.data Dataset

In [None]:

# # Combine chars and tokens into a dataset
# train_char_token_data = tf.data.Dataset.from_tensor_slices((train_sentences, train_chars)) # make data
# train_char_token_labels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot) # make labels
# train_char_token_dataset = tf.data.Dataset.zip((train_char_token_data, train_char_token_labels)) # combine data and labels

# # Prefetch and batch train data
# train_char_token_dataset = train_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

# # Repeat same steps validation data
# val_char_token_data = tf.data.Dataset.from_tensor_slices((val_sentences, val_chars))
# val_char_token_labels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot)
# val_char_token_dataset = tf.data.Dataset.zip((val_char_token_data, val_char_token_labels))
# val_char_token_dataset = val_char_token_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

#### Fitting the model

In [None]:
# model_4_history = model_4.fit(train_char_token_dataset, # train on dataset of token and characters
#                               steps_per_epoch=int(0.1 * len(train_char_token_dataset)),
#                               epochs=10,
#                               validation_data=val_char_token_dataset,
#                               validation_steps=int(0.1 * len(val_char_token_dataset)))

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model
from sentence_transformers import SentenceTransformer
import numpy as np

# For reproducibility (optional)
tf.random.set_seed(42)

# Define the number of classes for the final prediction (adjust as needed)
num_classes = train_labels_one_hot.shape[1]


In [None]:
# Initialize the SentenceTransformer model
sent_encoder = SentenceTransformer('all-MiniLM-L6-v2')
output_dim = sent_encoder.get_sentence_embedding_dimension()

# Precompute embeddings for training and validation sentences
train_sentence_embeddings = sent_encoder.encode(train_sentences, show_progress_bar=True)
val_sentence_embeddings = sent_encoder.encode(val_sentences, show_progress_bar=True)

# Convert the embeddings to float32 numpy arrays (or tensors later)
train_sentence_embeddings = np.array(train_sentence_embeddings, dtype=np.float32)
val_sentence_embeddings = np.array(val_sentence_embeddings, dtype=np.float32)


In [None]:
# Define the numeric input for precomputed sentence embeddings
token_input = layers.Input(shape=(output_dim,), dtype=tf.float32, name="token_input_main")
# Optionally, add a fully connected layer
token_dense = layers.Dense(128, activation="relu")(token_input)

# Create a model for the token branch
token_model = Model(inputs=token_input, outputs=token_dense, name="token_branch")


In [None]:
# Define character input: each example is a single scalar string (shape=())
char_input = layers.Input(shape=(), dtype=tf.string, name="char_input")

# Sequence length of the character branch. Reuse the 95th-percentile character length
# computed earlier in the notebook (output_seq_char_len) instead of a fixed 100, which
# would cut off every sentence longer than 100 characters.
max_chars = output_seq_char_len
char_vectorizer = layers.TextVectorization(max_tokens=2000, output_mode='int', output_sequence_length=max_chars)
# Learn the character vocabulary from the space-separated training characters
char_vectorizer.adapt(train_chars)

# Create an embedding layer for characters (input_dim matches the vectorizer's vocabulary size)
vocab_size = char_vectorizer.vocabulary_size()
char_embed = layers.Embedding(input_dim=vocab_size, output_dim=64)

# Process the char input through the vectorizer and embedding layers
char_vectors = char_vectorizer(char_input)
char_embeddings = char_embed(char_vectors)
char_bi_lstm = layers.Bidirectional(layers.LSTM(25))(char_embeddings)

# Build a model for the char branch
char_model = Model(inputs=char_input, outputs=char_bi_lstm, name="char_branch")


In [None]:
# Concatenate the outputs of the token branch and the char branch
combined = layers.Concatenate(name="token_char_hybrid")([token_model.output, char_model.output])

# Add dropout and dense layers as needed
x = layers.Dropout(0.5)(combined)
x = layers.Dense(200, activation="relu")(x)
x = layers.Dropout(0.5)(x)
output_layer = layers.Dense(num_classes, activation="softmax")(x)

# Construct the final multi-input model
model_4 = Model(inputs=[token_model.input, char_model.input], outputs=output_layer, name="model_4_token_and_char_embeddings")

# Compile the model
model_4.compile(loss="categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

model_4.summary()


In [None]:
# Build a tf.data.Dataset that provides the token embeddings and char inputs along with labels

# For training:
train_dataset = tf.data.Dataset.from_tensor_slices((
    {
        "token_input_main": train_sentence_embeddings,  # numeric embeddings
        "char_input": train_chars                        # raw text characters
    },
    train_labels_one_hot
))
train_dataset = train_dataset.batch(8).prefetch(tf.data.AUTOTUNE)

# For validation:
val_dataset = tf.data.Dataset.from_tensor_slices((
    {
        "token_input_main": val_sentence_embeddings,
        "char_input": val_chars
    },
    val_labels_one_hot
))
val_dataset = val_dataset.batch(8).prefetch(tf.data.AUTOTUNE)


In [None]:
# Train the model (adjust steps_per_epoch and validation_steps as appropriate)
model_4_history = model_4.fit(
    train_dataset,
    epochs=10,
    validation_data=val_dataset
)


### Model 5: Transfer Learning with pretrained token embeddings + character embeddings + positional embeddings

In [None]:

# Inspect training dataframe
train_df.head()