In [33]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel
from datasets import load_dataset
import os
from PIL import Image
import numpy as np
from tqdm import tqdm

# Define paths and parameters
image_folder_path = '/Selected_LG'
vector_model_name = 'openpecha/tibetan_RoBERTa_S_e6'
dataset_name = 'ta4tsering/Lhasa_kanjur_transcription_datasets'
image_height = 64
image_width = 2048

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(vector_model_name)
model = TFAutoModel.from_pretrained(vector_model_name)

def load_transcription_vector(transcription):
    inputs = tokenizer(transcription, return_tensors="tf", padding=True, truncation=True, max_length=512)
    outputs = model(inputs)
    return tf.reduce_mean(outputs.last_hidden_state, axis=1).numpy().squeeze()

def load_image(filename):
    img_path = os.path.join(image_folder_path, filename)
    img = Image.open(img_path).convert('L')  # Convert to grayscale
    img = img.resize((image_width, image_height))
    img = np.array(img) / 255.0
    img = np.expand_dims(img, axis=-1)  # Add channel dimension
    return img.astype(np.float32)

def process_example(example):
    transcription = example['label']
    filename = example['filename']
    if filename in local_filenames:
        vector = load_transcription_vector(transcription)
        image = load_image(filename)
        return vector, image
    else:
        return None

# Get the list of local filenames
local_filenames = set(os.listdir(image_folder_path))

# Load the dataset
dataset = load_dataset(dataset_name, split='test')

# Process the dataset using a for-loop
vectors = []
images = []

for example in tqdm(dataset):
    result = process_example(example)
    if result is not None:
        vector, image = result
        vectors.append(vector)
        images.append(image)

# Convert lists to numpy arrays
vectors = np.array(vectors, dtype=np.float32)
images = np.array(images, dtype=np.float32)

# Create TensorFlow datasets
tf_dataset = tf.data.Dataset.from_tensor_slices((vectors, images)).batch(32).cache().prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# Display the dataset structure
for vec, img in tf_dataset.take(1):
    print("Vectors shape:", vec.shape)
    print("Images shape:", img.shape)

# Print a few samples from the dataset
print("Samples from the dataset:")
for vec, img in tf_dataset.take(1):
    for i in range(3):
        print(f"Sample {i+1} - Vector:", vec[i].numpy()[:5], "Image shape:", img[i].numpy().shape)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 16640/16640

Vectors shape: (32, 768)
Images shape: (32, 64, 2048, 1)
Samples from the dataset:


2024-07-20 08:55:27.477445: W tensorflow/core/kernels/data/cache_dataset_ops.cc:913] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


Sample 1 - Vector: [-0.09002203 -0.19136767 -0.1210061   0.0845647   0.33303985] Image shape: (64, 2048, 1)
Sample 2 - Vector: [-0.5607847   0.01361169 -0.246679    0.2227611   0.15535183] Image shape: (64, 2048, 1)
Sample 3 - Vector: [-0.20265871  0.03736514  0.02197978  0.05226846 -0.09203821] Image shape: (64, 2048, 1)


2024-07-20 08:55:27.722918: W tensorflow/core/kernels/data/cache_dataset_ops.cc:913] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [2]:
import torch
import tensorflow as tf
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import os
from PIL import Image
import numpy as np
from tqdm import tqdm

# Define paths and parameters
image_folder_path = '/Selected_LG'
vector_model_name = 'openpecha/tibetan_RoBERTa_S_e6'
dataset_name = 'ta4tsering/Lhasa_kanjur_transcription_datasets'
image_height = 64
image_width = 2048

# Load the tokenizer and model (PyTorch)
tokenizer = AutoTokenizer.from_pretrained(vector_model_name)
model = AutoModel.from_pretrained(vector_model_name).to('cuda')

def load_transcription_vector(transcription):
    inputs = tokenizer(transcription, return_tensors="pt", padding=True, truncation=True, max_length=512).to('cuda')
    with torch.no_grad():
        outputs = model(**inputs)
    vector = torch.mean(outputs.last_hidden_state, dim=1).squeeze()
    return vector.cpu().numpy()

def load_image(filename):
    img_path = os.path.join(image_folder_path, filename)
    img = Image.open(img_path).convert('L')  # Convert to grayscale
    img = img.resize((image_width, image_height))
    img = np.array(img) / 255.0
    img = np.expand_dims(img, axis=-1)  # Add channel dimension
    return img.astype(np.float32)

def process_example(example):
    transcription = example['label']
    filename = example['filename']
    if filename in local_filenames:
        vector = load_transcription_vector(transcription)
        image = load_image(filename)
        return vector, image
    else:
        return None

# Get the list of local filenames
local_filenames = set(os.listdir(image_folder_path))

# Load the dataset
dataset = load_dataset(dataset_name, split='test')

# Process the dataset using a for-loop
vectors = []
images = []

for example in tqdm(dataset):
    result = process_example(example)
    if result is not None:
        vector, image = result
        vectors.append(vector)
        images.append(image)

# Convert lists to numpy arrays
vectors = np.array(vectors, dtype=np.float32)
images = np.array(images, dtype=np.float32)

# Convert numpy arrays to TensorFlow tensors
tf_vectors = tf.convert_to_tensor(vectors)
tf_images = tf.convert_to_tensor(images)

# Create TensorFlow datasets
tf_dataset = tf.data.Dataset.from_tensor_slices((tf_vectors, tf_images)).batch(32).cache().prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# Display the dataset structure
for vec, img in tf_dataset.take(1):
    print("Vectors shape:", vec.shape)
    print("Images shape:", img.shape)

# Print a few samples from the dataset
print("Samples from the dataset:")
for vec, img in tf_dataset.take(1):
    for i in range(3):
        print(f"Sample {i+1} - Vector:", vec[i].numpy()[:5], "Image shape:", img[i].numpy().shape)

Some weights of RobertaModel were not initialized from the model checkpoint at openpecha/tibetan_RoBERTa_S_e6 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 16640/16640 [00:05<00:00, 2880.63it/s]


Vectors shape: (32, 768)
Images shape: (32, 64, 2048, 1)
Samples from the dataset:


2024-07-20 09:12:47.652246: W tensorflow/core/kernels/data/cache_dataset_ops.cc:913] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2024-07-20 09:12:47.851498: W tensorflow/core/kernels/data/cache_dataset_ops.cc:913] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


Sample 1 - Vector: [-0.09011503 -0.19129862 -0.12103254  0.08446781  0.3330148 ] Image shape: (64, 2048, 1)
Sample 2 - Vector: [-0.56067747  0.01370564 -0.24657749  0.2228653   0.15539485] Image shape: (64, 2048, 1)
Sample 3 - Vector: [-0.20247729  0.03741506  0.02215673  0.05223634 -0.09218432] Image shape: (64, 2048, 1)


2024-07-20 09:12:47.855514: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [19]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel
from datasets import load_dataset
import os
from PIL import Image
import numpy as np
import time
from tqdm import tqdm

# Define paths and parameters
image_folder_path = '/Selected_LG'
vector_model_name = 'openpecha/tibetan_RoBERTa_S_e6'
dataset_name = 'ta4tsering/Lhasa_kanjur_transcription_datasets'
image_height = 64
image_width = 2048
batch_size = 32

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(vector_model_name)
model = TFAutoModel.from_pretrained(vector_model_name)

def load_transcription_vector(transcription):
    inputs = tokenizer(transcription, return_tensors="tf", padding=True, truncation=True, max_length=512)
    outputs = model(inputs)
    return tf.reduce_mean(outputs.last_hidden_state, axis=1).numpy().squeeze()

def load_image(filename):
    img_path = os.path.join(image_folder_path, filename)
    img = Image.open(img_path).convert('L')  # Convert to grayscale
    img = img.resize((image_width, image_height))
    img = np.array(img) / 255.0
    img = np.expand_dims(img, axis=-1)  # Add channel dimension
    return img.astype(np.float32)

# Get the list of local filenames
local_filenames = set(os.listdir(image_folder_path))

# Load the dataset
dataset = load_dataset(dataset_name, split='test')

# Create a dictionary for quick lookup of Hugging Face filenames
hf_filenames = {example['filename']: example['label'] for example in dataset}

# Create lists to store the vectors and images
vectors_list = []
images_list = []

start_time = time.time()

for filename in tqdm(local_filenames):
    if filename in hf_filenames:
        transcription = hf_filenames[filename]
        vector = load_transcription_vector(transcription)
        image = load_image(filename)
        vectors_list.append(vector)
        images_list.append(image)

end_time = time.time()

print("Data loading and processing took {:.2f} seconds".format(end_time - start_time))

# Convert lists to TensorFlow dataset
vectors_array = np.array(vectors_list)
images_array = np.array(images_list)

tf_dataset = tf.data.Dataset.from_tensor_slices((vectors_array, images_array))
tf_dataset = tf_dataset.batch(batch_size)
tf_dataset = tf_dataset.cache()
tf_dataset = tf_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# Display the dataset structure
for vectors, images in tf_dataset.take(1):
    print("Vectors shape:", vectors.shape)
    print("Images shape:", images.shape)

# Print a few samples from the dataset
print("Samples from the dataset:")
for vectors, images in tf_dataset.take(1):
    for i in range(3):
        print(f"Sample {i+1} - Vector:", vectors[i].numpy()[:5], "Image shape:", images[i].numpy().shape)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1000/1000 [

Data loading and processing took 68.26 seconds
Vectors shape: (32, 768)
Images shape: (32, 64, 2048, 1)
Samples from the dataset:
Sample 1 - Vector: [ 0.25963187 -0.25010753 -0.10972076 -0.03690102  0.16365188] Image shape: (64, 2048, 1)
Sample 2 - Vector: [ 0.33421978 -0.2964837  -0.42488924 -0.04383376  0.4691976 ] Image shape: (64, 2048, 1)
Sample 3 - Vector: [ 0.02504182 -0.29210714 -0.37100416 -0.244986    0.64311194] Image shape: (64, 2048, 1)


2024-07-20 07:33:12.849346: W tensorflow/core/kernels/data/cache_dataset_ops.cc:913] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.
2024-07-20 07:33:13.032653: W tensorflow/core/kernels/data/cache_dataset_ops.cc:913] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [None]:
from tensorflow.keras.layers import Dense, Reshape, Input
from tensorflow.keras.models import Model

# Define the autoencoder model
def build_autoencoder(input_shape, vector_shape):
    # Encoder
    text_input = Input(shape=vector_shape)
    x = Dense(1024, activation='relu')(text_input)
    x = Dense(512, activation='relu')(x)
    encoded = Dense(256, activation='relu')(x)

    # Decoder
    x = Dense(512, activation='relu')(encoded)
    x = Dense(1024, activation='relu')(x)
    x = Dense(input_shape[0] * input_shape[1], activation='sigmoid')(x)
    decoded = Reshape(input_shape)(x)

    # Autoencoder model
    autoencoder = Model(text_input, decoded)
    autoencoder.compile(optimizer='adam', loss='mse')

    return autoencoder

# Define the input shape and vector shape
input_shape = (image_height, image_width, 1)
vector_shape = (768,)  # Adjust based on the actual vector size from your text embeddings

# Build the autoencoder model
autoencoder = build_autoencoder(input_shape, vector_shape)
autoencoder.summary()

# Train the autoencoder
autoencoder.fit(tf_dataset, epochs=50, verbose=1)

In [6]:
def compare_datasets(ds1, ds2):
    # Compare the types and element specs
    print("Dataset 1 type:", type(ds1))
    print("Dataset 2 type:", type(ds2))
    print("Dataset 1 element_spec:", ds1.element_spec)
    print("Dataset 2 element_spec:", ds2.element_spec)
    
    # Compare attributes
    ds1_attributes = dir(ds1)
    ds2_attributes = dir(ds2)
    
    for attr in ds1_attributes:
        if attr in ds2_attributes:
            try:
                val1 = getattr(ds1, attr)
                val2 = getattr(ds2, attr)
                try:
                    if val1 != val2:
                        print(f"Difference in attribute '{attr}':")
                        print(f"Dataset1: {val1}")
                        print(f"Dataset2: {val2}")
                except:
                    print(f"Difference in attribute '{attr}': unable to directly compare values")
            except AttributeError as e:
                print(f"Error while comparing attribute {attr}: {e}")

# Compare the datasets
compare_datasets(tf_dataset_for_loop, tf_dataset_generator)

Dataset 1 type: <class 'tensorflow.python.data.ops.prefetch_op._PrefetchDataset'>
Dataset 2 type: <class 'tensorflow.python.data.ops.prefetch_op._PrefetchDataset'>
Dataset 1 element_spec: (TensorSpec(shape=(None, 768), dtype=tf.float32, name=None), TensorSpec(shape=(None, 64, 2048, 1), dtype=tf.float32, name=None))
Dataset 2 element_spec: (TensorSpec(shape=(None, 768), dtype=tf.float32, name=None), TensorSpec(shape=(None, 64, 2048, 1), dtype=tf.float32, name=None))
Difference in attribute '__bool__':
Dataset1: <bound method DatasetV2.__bool__ of <_PrefetchDataset element_spec=(TensorSpec(shape=(None, 768), dtype=tf.float32, name=None), TensorSpec(shape=(None, 64, 2048, 1), dtype=tf.float32, name=None))>>
Dataset2: <bound method DatasetV2.__bool__ of <_PrefetchDataset element_spec=(TensorSpec(shape=(None, 768), dtype=tf.float32, name=None), TensorSpec(shape=(None, 64, 2048, 1), dtype=tf.float32, name=None))>>
Difference in attribute '__debug_string__':
Dataset1: <bound method DatasetV2.