In [None]:
# Fetch the PubMed RCT dataset repository, then list its top-level contents.
!git clone https://github.com/Franck-Dernoncourt/pubmed-rct
!ls pubmed-rct

In [None]:
# Show the files inside the dataset variant directory that data_dir points to.
!ls pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/

In [None]:
import os

# Directory that holds the dataset split files (absolute path; keeps its trailing slash).
data_dir = "/content/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/"

# Full path of every entry found in the dataset directory.
filename = [os.path.join(data_dir, entry) for entry in os.listdir(data_dir)]
filename

In [None]:
def get_line(filename):
  """Read a text file and return its lines.

  Args:
    filename: Path of the text file to read.

  Returns:
    A list with one string per line; each string keeps its trailing newline.
  """
  with open(filename, "r") as f:
    return f.readlines()


def _abstract_to_samples(abstract_lines):
  """Convert the "LABEL<TAB>sentence" lines of ONE abstract into sample dicts."""
  samples = []
  last_index = len(abstract_lines) - 1  # stored as "total_lines" (0-indexed count)
  for line_number, labelled_line in enumerate(abstract_lines):
    target_text_split = labelled_line.split("\t")
    samples.append({
        "target": target_text_split[0],
        "text": target_text_split[1].lower(),
        "line_number": line_number,
        "total_lines": last_index,
    })
  return samples


def preprocess_text_with_line_number(filename):
  """Parse a dataset file into one dict per sentence.

  The file is read as a sequence of abstracts: a line starting with "###"
  opens an abstract, each following non-blank line is "LABEL<TAB>sentence",
  and a whitespace-only line closes the abstract.

  Args:
    filename: Path of the dataset file to parse.

  Returns:
    A list of dicts with the keys:
      "target":      label before the first tab,
      "text":        lower-cased sentence after the first tab,
      "line_number": 0-based position of the sentence inside its abstract,
      "total_lines": number of sentences in the abstract minus one.

  Raises:
    IndexError: if a sentence line contains no tab character.
  """
  abstract_samples = []
  current_lines = []  # sentence lines of the abstract currently being read

  for line in get_line(filename):
    if line.startswith("###") or line.isspace():
      # Abstract boundary: emit what was collected and start from a clean
      # buffer, so sentences can never leak from one abstract into the next.
      abstract_samples.extend(_abstract_to_samples(current_lines))
      current_lines = []
    else:
      current_lines.append(line.rstrip("\n"))

  # Flush the last abstract when the file does not end with a blank line.
  abstract_samples.extend(_abstract_to_samples(current_lines))
  return abstract_samples

In [None]:
%%time
# Parse each split file into a list of per-sentence sample dicts
# ("dev.txt" is used as the validation split), then show the sample counts.
train_samples = preprocess_text_with_line_number(data_dir+"train.txt")
val_samples = preprocess_text_with_line_number(data_dir+"dev.txt")
test_samples = preprocess_text_with_line_number(data_dir+"test.txt")
len(train_samples), len(val_samples), len(test_samples)

In [None]:
import pandas as pd

# One DataFrame per split; every row is one sentence sample dict.
train_df, val_df, test_df = (
    pd.DataFrame(split_samples)
    for split_samples in (train_samples, val_samples, test_samples)
)
train_df.head()

In [None]:
# Plain Python lists of the sentence strings stored in each split's "text" column.
train_sentences = train_df["text"].tolist()
val_sentences = val_df["text"].tolist()
test_sentences = test_df["text"].tolist()
tuple(len(split) for split in (train_sentences, val_sentences, test_sentences))

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Dense one-hot encoding of the "target" column. The categories are learned
# from the training split only and then reused for validation and test.
# Selecting with [["target"]] already yields a 2-D (n_samples, 1) array.
ohe = OneHotEncoder(sparse_output=False).fit(train_df[["target"]].to_numpy())
train_labels_one_hot = ohe.transform(train_df[["target"]].to_numpy())
val_labels_one_hot = ohe.transform(val_df[["target"]].to_numpy())
test_labels_one_hot = ohe.transform(test_df[["target"]].to_numpy())
train_labels_one_hot

In [None]:
# Label names learned by the one-hot encoder for its first (and only) feature;
# Skimlit indexes this array with the argmax of the model predictions.
class_names = ohe.categories_[0]
class_names

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer label encoding; the label -> integer mapping is fitted on the
# training split only and then applied unchanged to every split.
le = LabelEncoder().fit(train_df["target"].to_numpy())
train_labels_encoded = le.transform(train_df["target"].to_numpy())
val_labels_encoded = le.transform(val_df["target"].to_numpy())
test_labels_encoded = le.transform(test_df["target"].to_numpy())
train_labels_encoded

In [None]:
# Number of whitespace-separated words in every training sentence.
sen_lens = [len(words) for words in map(str.split, train_sentences)]

# Summary statistics of the word counts.
avg_sent = np.mean(sen_lens)
max_seq_len = max(sen_lens)
# Sequence length that covers 95% of the training sentences.
output_seq_len = int(np.percentile(sen_lens, 95))
max_seq_len, output_seq_len

In [None]:
# Cap on the size of the word-level vocabulary passed to TextVectorization.
# NOTE(review): 68000 is a hard-coded value; its origin is not shown in this notebook.
max_token = 68000

from tensorflow.keras.layers import TextVectorization

# Word-level vectorizer: maps a raw sentence to a fixed-length sequence of
# token ids, padded or truncated to output_seq_len (95th percentile of the
# training sentence lengths computed in the previous cell).
text_vectorizer = TextVectorization(max_tokens=max_token,
                                    output_sequence_length=output_seq_len)

In [None]:
# Build the word vocabulary from the training sentences only.
text_vectorizer.adapt(train_sentences)

In [None]:
# Vocabulary learned by text_vectorizer.adapt(); its length sizes the embedding table.
rct_20k_text_vocab = text_vectorizer.get_vocabulary()
# Trainable 128-dimensional token embedding. mask_zero=True makes index 0 a
# masked (padding) position. This single layer object is reused by several
# models below, so they all share (and keep training) the same weights.
token_embed = layers.Embedding(input_dim=len(rct_20k_text_vocab),
                               output_dim=128,
                               mask_zero=True,
                               name="token_embedding")

In [None]:
# (sentence, one-hot label) pipelines for each split: batches of 32 with
# prefetching tuned automatically by tf.data.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_sentences, train_labels_one_hot))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)
val_dataset = (
    tf.data.Dataset.from_tensor_slices((val_sentences, val_labels_one_hot))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)
test_dataset = (
    tf.data.Dataset.from_tensor_slices((test_sentences, test_labels_one_hot))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

train_dataset

In [None]:
# model_1: word-level Conv1D classifier.
# raw sentence string -> token ids -> token embeddings -> Conv1D -> global max pool -> 5-way softmax.
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = token_embed(x)
x = layers.Conv1D(64, kernel_size=5, padding="same", activation="relu")(x)
x = layers.GlobalMaxPool1D()(x)
outputs = layers.Dense(5, activation="softmax")(x)
model_1 = tf.keras.Model(inputs, outputs)
# Labels are one-hot encoded, hence categorical cross-entropy.
model_1.compile(loss="categorical_crossentropy",
              optimizer=tf.keras.optimizers.Adam(),
              metrics=["accuracy"])
model_1.summary()

In [None]:
#model_1.fit(train_dataset, epochs=5, validation_data=val_dataset)

In [None]:
# Load the Universal Sentence Encoder (v4) from TF Hub; it is used below to
# pre-compute sentence embeddings for the training, validation and test splits.
import numpy as np
import tensorflow_hub as hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
# Input layer for model_2: takes pre-computed numeric embeddings instead of raw strings.
# NOTE: this rebinds the module-level name `inputs` used by the previous model cell.
inputs = layers.Input(shape=(512,), dtype=tf.float32) # one 512-float embedding vector per sentence
# Embed sentences batch by batch so the encoder never sees the whole corpus at once.
def create_use_embeddings(sentences, batch_size=32):
    """Encode sentences with the module-level `embed` model.

    Args:
        sentences: Sequence of sentence strings to encode.
        batch_size: Number of sentences passed to `embed` per call.

    Returns:
        A NumPy array with the per-batch embeddings stacked along axis 0,
        in the same order as `sentences`.
    """
    batches = tf.data.Dataset.from_tensor_slices(sentences)
    batches = batches.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    # Pin the encoder calls to the CPU device.
    with tf.device('/cpu:0'):
        encoded = [embed(batch).numpy() for batch in batches]
    return np.concatenate(encoded, axis=0)

# Pre-compute the sentence embeddings once per split (this encodes every
# sentence, so the cell is expensive to re-run).
train_embeddings = create_use_embeddings(train_sentences)
val_embeddings = create_use_embeddings(val_sentences)
test_embeddings = create_use_embeddings(test_sentences)

print(f"Shape of train_embeddings: {train_embeddings.shape}")
print(f"Shape of val_embeddings: {val_embeddings.shape}")
print(f"Shape of test_embeddings: {test_embeddings.shape}")



# model_2: small dense classifier on top of the pre-computed embeddings.
# `inputs` is the (512,) float32 input layer defined at the top of this cell.
x = layers.Dense(128, activation="relu")(inputs) # embeddings are fed in directly; no embedding layer inside the model
outputs = layers.Dense(5, activation="softmax")(x)
model_2 = tf.keras.Model(inputs, outputs)
model_2.compile(loss="categorical_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])
model_2.summary()

In [None]:
# Training of model_2 is disabled: the call below is wrapped in a string
# literal, so it is not executed (the cell only displays the string).
"""# Train model_2 on pre-computed embeddings
history_model_2 = model_2.fit(train_embeddings, train_labels_one_hot,
                              epochs=5,
                              validation_data=(val_embeddings, val_labels_one_hot))"""

In [None]:
#model_1.evaluate(test_dataset)

In [None]:
# Dense variant of the word-level model: the Conv1D of model_1 is replaced by a
# Dense layer applied to every token embedding before global max pooling.
# NOTE: `inputs` is rebound here to a string input, and text_vectorizer /
# token_embed are the same layer objects used by model_1 (shared weights).
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = token_embed(x)
x = layers.Dense(64, activation="relu")(x)
x = layers.GlobalMaxPool1D()(x)
outputs = layers.Dense(5, activation="softmax")(x)
model = tf.keras.Model(inputs, outputs)
# Categorical cross-entropy with label smoothing of 0.2 on the one-hot targets.
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=["accuracy"])

In [None]:
# Train the Dense word-level model for 5 epochs, validating on the validation split.
model.fit(train_dataset,
            epochs=5,
            validation_data=val_dataset)

In [None]:
# Loss and accuracy of the Dense word-level model on the test split.
model.evaluate(test_dataset)

In [None]:
# One-hot encode each sentence's position inside its abstract (depth 15).
# tf.one_hot encodes indices at or beyond the depth as all-zero rows.
train_line_one_hot = tf.one_hot(train_df["line_number"].to_numpy(), depth=15)
val_line_one_hot = tf.one_hot(val_df["line_number"].to_numpy(), depth=15)
test_line_one_hot = tf.one_hot(test_df["line_number"].to_numpy(), depth=15)

# One-hot encode the abstract length stored in "total_lines" (depth 20).
train_total_lines_one_hot = tf.one_hot(train_df["total_lines"].to_numpy(), depth=20)
val_total_lines_one_hot = tf.one_hot(val_df["total_lines"].to_numpy(), depth=20)
test_total_lines_one_hot = tf.one_hot(test_df["total_lines"].to_numpy(), depth=20)

In [None]:
# Define inputs for each branch
token_inputs = layers.Input(shape=(1,), dtype=tf.string, name="token_inputs_embedding")
line_number_inputs = layers.Input(shape=(15,), dtype=tf.float32, name="line_number_inputs")  # one-hot line position (depth 15)
total_lines_inputs = layers.Input(shape=(20,), dtype=tf.float32, name="total_number_inputs")  # one-hot abstract length (depth 20)

# Token branch: raw sentence -> token ids -> embeddings -> Conv1D -> max pool -> 5-way softmax.
# text_vectorizer and token_embed are the layer objects shared with the earlier
# models in this notebook, so the embedding weights are shared with them.
token_x = text_vectorizer(token_inputs)
token_x = token_embed(token_x)
token_x = layers.Conv1D(64, kernel_size=5, padding="same", activation="relu")(token_x)
token_x = layers.GlobalMaxPool1D()(token_x)
token_outputs = layers.Dense(5, activation="softmax")(token_x)
# NOTE: this rebinds the module-level name `model` to the token sub-model.
model = tf.keras.Model(token_inputs, token_outputs)

# The Dense/Dropout stack that used to follow here was removed: it was applied
# to the global `inputs` left over from an earlier cell (a string input on a
# fresh run) and its result was never connected to this model.

# Line number branch: Dense layer
line_number_output = layers.Dense(32, activation="relu")(line_number_inputs)

# Total lines branch: Dense layer
total_lines_output = layers.Dense(32, activation='relu')(total_lines_inputs)

# Combine the token sub-model output with the two positional branches
tribrid_embeddings = layers.Concatenate(name="char_token_positional_embedding")([line_number_output, total_lines_output, model.output])

# Output layer
output_layer = layers.Dense(5, activation="softmax", name="output_layer")(tribrid_embeddings)

# Create the final model; the input order must match the dataset element order
# (line number one-hot, total lines one-hot, sentence).
model_5 = tf.keras.Model(inputs=[line_number_inputs, total_lines_inputs, token_inputs],
                         outputs=output_layer)

model_5.compile(loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [None]:
# Create training dataset for model_5.
# Feature order matches the model's inputs: (line number one-hot, total lines one-hot, sentence).
train_dataset_complex = tf.data.Dataset.from_tensor_slices((train_line_one_hot,
                                                            train_total_lines_one_hot,
                                                            train_sentences)) # raw sentence strings; vectorization happens inside the model
train_dataset_clabels = tf.data.Dataset.from_tensor_slices(train_labels_one_hot)

# Pair every feature tuple with its one-hot label.
train_dataset_model_5 = tf.data.Dataset.zip((train_dataset_complex, train_dataset_clabels))

# Create validation dataset for model_5
val_dataset_complex = tf.data.Dataset.from_tensor_slices((val_line_one_hot,
                                                          val_total_lines_one_hot,
                                                          val_sentences)) # raw sentence strings; vectorization happens inside the model

v_dataset_clabels = tf.data.Dataset.from_tensor_slices(val_labels_one_hot) # validation labels

val_dataset_model_5 = tf.data.Dataset.zip((val_dataset_complex, v_dataset_clabels))

# Batch and prefetch the datasets
train_dataset_model_5 = train_dataset_model_5.batch(32).prefetch(tf.data.AUTOTUNE)
val_dataset_model_5 = val_dataset_model_5.batch(32).prefetch(tf.data.AUTOTUNE)

print("Datasets for model_5 created:")
print(train_dataset_model_5.element_spec)
print(val_dataset_model_5.element_spec)

In [None]:
# Train the tribrid Conv1D model for 5 epochs on the three-input datasets.
model_5.fit(train_dataset_model_5, epochs=5, validation_data=val_dataset_model_5)

In [None]:
# Test dataset for the three-input models, built the same way as the
# training/validation ones: ((line one-hot, total lines one-hot, sentence), label).
test_dataset_complex = tf.data.Dataset.from_tensor_slices((test_line_one_hot,
                                                           test_total_lines_one_hot,
                                                           test_sentences)) # raw sentence strings; vectorization happens inside the model
test_dataset_clabels = tf.data.Dataset.from_tensor_slices(test_labels_one_hot) # test labels
test_dataset_model_5 = tf.data.Dataset.zip((test_dataset_complex, test_dataset_clabels))
test_dataset_model_5 = test_dataset_model_5.batch(32).prefetch(tf.data.AUTOTUNE)
# Loss and accuracy of model_5 on the test split.
model_5.evaluate(test_dataset_model_5)

In [None]:
# Define inputs for each branch
token_inputs = layers.Input(shape=(1,), dtype=tf.string, name="token_inputs_embedding")
line_number_inputs = layers.Input(shape=(15,), dtype=tf.float32, name="line_number_inputs")  # one-hot line position (depth 15)
total_lines_inputs = layers.Input(shape=(20,), dtype=tf.float32, name="total_number_inputs")  # one-hot abstract length (depth 20)

# Token branch: raw sentence -> token ids -> embeddings -> per-token Dense -> max pool -> 5-way softmax.
# text_vectorizer and token_embed are the layer objects shared with the earlier
# models in this notebook, so the embedding weights are shared with them.
token_x = text_vectorizer(token_inputs)
token_x = token_embed(token_x)
token_x = layers.Dense(64, activation="relu")(token_x)
token_x = layers.GlobalMaxPool1D()(token_x)
token_outputs = layers.Dense(5, activation="softmax")(token_x)
# NOTE: this rebinds the module-level name `model` to the token sub-model.
model = tf.keras.Model(token_inputs, token_outputs)

# The Dense/Dropout stack that used to follow here was removed: it was applied
# to the global `inputs` left over from an earlier cell (a string input on a
# fresh run) and its result was never connected to this model.

# Line number branch: Dense layer
line_number_output = layers.Dense(32, activation="relu")(line_number_inputs)

# Total lines branch: Dense layer
total_lines_output = layers.Dense(32, activation='relu')(total_lines_inputs)

# Combine the token sub-model output with the two positional branches
tribrid_embeddings = layers.Concatenate(name="char_token_positional_embedding")([line_number_output, total_lines_output, model.output])

# Output layer
output_layer = layers.Dense(5, activation="softmax", name="output_layer")(tribrid_embeddings)

# Create the final model; the input order must match the dataset element order
# (line number one-hot, total lines one-hot, sentence).
model_4 = tf.keras.Model(inputs=[line_number_inputs, total_lines_inputs, token_inputs],
                         outputs=output_layer)

model_4.compile(loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [None]:
# Train model_4 on the same three-input datasets that were built for model_5
# (both models take identical inputs, hence the "_model_5" dataset names).
model_4.fit(train_dataset_model_5, epochs=5, validation_data=val_dataset_model_5)

In [None]:
# Loss and accuracy of model_4 on the shared three-input test dataset.
model_4.evaluate(test_dataset_model_5)

Creating a use case for my model

In [None]:
abstract = "Scaling the input image resolution is essential for enhancing the performance of Vision Language Models (VLMs), particularly in text-rich image understanding tasks. However, popular visual encoders such as ViTs become inefficient at high resolutions due to the large number of tokens and high encoding latency. At different operational resolutions, the vision encoder of a VLM can be optimized along two axes: reducing encoding latency and minimizing the number of visual tokens passed to the LLM, thereby lowering overall latency. Based on a comprehensive efficiency analysis of the interplay between image resolution, vision latency, token count, and LLM size, we introduce FastVLM—a model that achieves an optimized tradeoff between resolution, latency, and accuracy. FastVLM incorporates FastViTHD, a novel hybrid vision encoder designed to output fewer tokens and significantly reduce encoding time for high-resolution images. Unlike previous methods, FastVLM achieves the optimal balance between visual token count and image resolution solely by scaling the input image, eliminating the need for additional token pruning and simplifying the model design. In the LLaVA1.5 setup, FastVLM achieves 3.2× improvement in time-tofirst-token (TTFT) while maintaining similar performance on VLM benchmarks compared to prior works. Compared to LLaVa-OneVision at the highest resolution (1152×1152), FastVLM achieves better performance on key benchmarks like SeedBench, MMMU and DocVQA, using the same 0.5B LLM, but with 85× faster TTFT and a vision encoder that is 3.4× smaller. Code and models are availab"

In [None]:
def split_sentence(text):
  """Split free text into sentences.

  A sentence boundary is any run of whitespace (spaces or newlines) that
  directly follows ".", "!" or "?". The punctuation stays attached to its
  sentence, and periods inside tokens such as "1.3" or "0.01" do not split.
  This is a simple heuristic: abbreviations followed by a space (e.g. "e.g. ")
  are also treated as boundaries.

  Args:
    text: The text (for example a whole abstract) to split.

  Returns:
    A list of sentence strings in their original order. Text without any
    boundary, including the empty string, yields a one-element list.
  """
  import re  # local import: keeps this cell self-contained

  return re.split(r"(?<=[.!?])\s+", text.strip())

def preprocess_text(text):
  """Turn raw text into a batched dataset for the three-input models.

  The text is split with split_sentence(); every sentence gets its 0-based
  position and the 0-indexed sentence count, both one-hot encoded with the
  depths used for training (15 and 20).

  Args:
    text: The raw text to classify sentence by sentence.

  Returns:
    A tf.data.Dataset of dict features keyed by the model's input layer names
    ("line_number_inputs", "total_number_inputs", "token_inputs_embedding"),
    batched by 32 and prefetched. It carries no labels.
  """
  sentences = split_sentence(text)
  sentence_count = len(sentences)

  # Same column layout as the training frames (total_lines is 0-indexed).
  frame = pd.DataFrame({
      'line_number': range(sentence_count),
      'total_lines': sentence_count - 1,
      'text': sentences,
  })

  # One entry per model input, keyed by the input layer name.
  features = {
      "line_number_inputs": tf.one_hot(frame.line_number.to_numpy(), depth=15),
      "total_number_inputs": tf.one_hot(frame.total_lines.to_numpy(), depth=20),
      "token_inputs_embedding": tf.constant(frame.text.tolist(), dtype=tf.string),
  }

  return (
      tf.data.Dataset.from_tensor_slices(features)
      .batch(32)
      .prefetch(tf.data.AUTOTUNE)
  )

def Skimlit(text):
  """Label every sentence of `text` with the class predicted by model_5.

  Args:
    text: Raw text (for example an abstract) to classify sentence by sentence.

  Returns:
    A DataFrame with one row per sentence, in order, and the columns
    'Sentence' and 'Label'. Repeated sentences each keep their own row.
  """
  sentences = split_sentence(text)
  predictions = model_5.predict(preprocess_text(text))
  predicted_classes = np.argmax(predictions, axis=1)
  predicted_labels = [class_names[i] for i in predicted_classes]
  # Build the frame from the two parallel lists. A dict keyed by sentence
  # would collapse repeated sentences into a single row.
  return pd.DataFrame({'Sentence': sentences, 'Label': predicted_labels})


Skimlit(abstract)

In [None]:
split_sentence(abstract)

In [None]:
Test = """This study investigated the safety and immunological impact of a third mRNA vaccine dose among immunocompromised adults in a multi-center cohort. Over a six-month period, more than one thousand participants from five tertiary hospitals were enrolled and followed after receiving a booster dose. Data were collected through patient-maintained diaries and weekly telehealth check-ins to track the incidence and severity of local and systemic adverse reactions. Blood samples were obtained before and after vaccination to measure immune response indicators, including IgG antibody titers, cytokine levels, and CD4/CD8 ratios. Participants included individuals with various immunosuppressive conditions such as cancer, organ transplants, and autoimmune disorders. Safety was assessed using standardized adverse event grading scales, while immunogenicity was analyzed through lab-based assays. The findings aim to inform public health guidelines on booster vaccination strategies for high-risk populations."""
Skimlit(Test)

In [None]:
last_test = """Hypertension continues to be a major contributor to cardiovascular disease, affecting nearly 1.3 billion individuals globally. Despite numerous therapeutic options, effective long-term blood pressure control remains suboptimal in many populations.
This study aimed to evaluate the effectiveness of a daily low-dose combination of amlodipine and hydrochlorothiazide compared to monotherapy in achieving target blood pressure levels in adults aged 40–65 years with stage 1 hypertension.
We conducted a 12-week, double-blind randomized clinical trial involving 480 participants across four urban medical centers. Patients were assigned to receive either the combination therapy or amlodipine alone. Blood pressure readings were taken weekly, and adverse effects were monitored via telehealth consultations and clinic visits.
At the end of the trial, 72.3% of patients in the combination group achieved target systolic pressure, compared to 51.7% in the monotherapy group (p < 0.01). Reported side effects were mild and evenly distributed between groups.
These findings suggest that low-dose combination therapy may provide superior outcomes in early-stage hypertension management, with minimal safety concerns and improved adherence potential."""
Skimlit(last_test)

In [None]:
# Define inputs for each branch
token_inputs = layers.Input(shape=(1,), dtype=tf.string, name="token_inputs_embedding")
line_number_inputs = layers.Input(shape=(15,), dtype=tf.float32, name="line_number_inputs")  # one-hot line position (depth 15)
total_lines_inputs = layers.Input(shape=(20,), dtype=tf.float32, name="total_number_inputs")  # one-hot abstract length (depth 20)

# Token branch: raw sentence -> token ids -> embeddings -> Conv1D -> max pool -> 5-way softmax.
# text_vectorizer and token_embed are the layer objects shared with the earlier
# models in this notebook, so the embedding weights are shared with them.
token_x = text_vectorizer(token_inputs)
token_x = token_embed(token_x)
token_x = layers.Conv1D(64, kernel_size=5, padding="same", activation="relu")(token_x)
token_x = layers.GlobalMaxPool1D()(token_x)
token_outputs = layers.Dense(5, activation="softmax")(token_x)
# NOTE: this rebinds the module-level name `model` to the token sub-model.
model = tf.keras.Model(token_inputs, token_outputs)

# The Dense/Dropout stack that used to follow here was removed: it was applied
# to the global `inputs` left over from an earlier cell (a string input on a
# fresh run) and its result was never connected to this model.

# Line number branch: Dense layer
line_number_output = layers.Dense(32, activation="relu")(line_number_inputs)

# Total lines branch: Dense layer
total_lines_output = layers.Dense(32, activation='relu')(total_lines_inputs)

# Combine the token sub-model output with the two positional branches
tribrid_embeddings = layers.Concatenate(name="char_token_positional_embedding")([line_number_output, total_lines_output, model.output])

# Output layer
output_layer = layers.Dense(5, activation="softmax", name="output_layer")(tribrid_embeddings)

# Create the final model; the input order must match the dataset element order
# (line number one-hot, total lines one-hot, sentence).
model_6 = tf.keras.Model(inputs=[line_number_inputs, total_lines_inputs, token_inputs],
                         outputs=output_layer)

model_6.compile(loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [None]:

import string


def split_char(text):
  """Return `text` with a single space inserted between all of its characters."""
  return " ".join(list(text))


# Character inventory used to size the character-level vocabulary.
alphabet = string.ascii_lowercase + string.digits + string.punctuation
Num_char_token = len(alphabet) + 2  # presumably +2 for the padding and unknown tokens

# Character-level length statistics of the training sentences.
# NOTE: this rebinds output_seq_len / max_seq_len / avg_sent, which held the
# word-level statistics until this cell runs.
char_lens = [len(sentence) for sentence in train_sentences]
output_seq_len = int(np.percentile(char_lens, 95))
max_seq_len = max(char_lens)
avg_sent = np.mean(char_lens)

# Space-separated characters, so the vectorizer's whitespace split yields one
# token per character.
train_chars = [split_char(sentence) for sentence in train_sentences]
val_chars = [split_char(sentence) for sentence in val_sentences]
test_chars = [split_char(sentence) for sentence in test_sentences]

# Build the vectorizer only after the character-level output_seq_len is known;
# otherwise it would pad/truncate to the much shorter word-level length.
Char_Vectorizer = TextVectorization(max_tokens=Num_char_token,
                                    output_sequence_length=output_seq_len,
                                    standardize="lower_and_strip_punctuation")
Char_Vectorizer.adapt(train_chars)

# Read the vocabulary only after adapt(): before it, the learned character
# tokens are not there yet and the embedding table would be too small for the
# ids the vectorizer produces.
char_vocab = Char_Vectorizer.get_vocabulary()
char_embed = layers.Embedding(input_dim=len(char_vocab),
                              output_dim=25,
                              mask_zero=True,
                              name="char_embedding")

In [None]:
# Character-level classifier. Bidirectional and LSTM are imported but not used
# below; the sequence layer is a Conv1D.
from tensorflow.keras.layers import Bidirectional, LSTM

char_inputs = layers.Input(shape=(1,), dtype=tf.string, name="char_inputs")
char_vector = Char_Vectorizer(char_inputs)
char_embeddings = char_embed(char_vector) # char_embed is the Embedding layer created in the previous cell
char_bi_lstm = layers.Conv1D(64, kernel_size=5, padding="same", activation="relu")(char_embeddings) # despite the name, this is a Conv1D over the character embeddings
char_pooled = layers.GlobalMaxPool1D()(char_bi_lstm)

# Output layer
output = layers.Dense(5, activation='softmax')(char_pooled)
# NOTE: rebinds model_6, replacing the model previously stored under that name.
model_6 = tf.keras.Model(char_inputs, outputs=output)
model_6.compile(loss=tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.2),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# (space-separated characters, one-hot label) pipelines, batched by 32.
train_char_dataset = tf.data.Dataset.from_tensor_slices((train_chars, train_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)
val_char_dataset = tf.data.Dataset.from_tensor_slices((val_chars, val_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)
test_char_dataset = tf.data.Dataset.from_tensor_slices((test_chars, test_labels_one_hot)).batch(32).prefetch(tf.data.AUTOTUNE)

# Train for 5 epochs, validating on the character-level validation split.
model_6.fit(train_char_dataset, epochs=5, validation_data=(val_char_dataset))