In [1]:
import torch
import tensorflow as tf
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import os
from PIL import Image
import numpy as np
from tqdm import tqdm

# --- Configuration -----------------------------------------------------------
# Target raster size: load_image resizes every image to this height x width.
image_height, image_width = 64, 2048

# Hugging Face Hub identifiers handed to load_dataset / from_pretrained below.
dataset_name = 'ta4tsering/Lhasa_kanjur_transcription_datasets'
vector_model_name = 'openpecha/tibetan_RoBERTa_S_e6'

# Directory that is listed and read for the local image files.
image_folder_path = '/Selected_LG'

# Load the tokenizer and model.
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing inside `.to('cuda')`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained(vector_model_name)
model = AutoModel.from_pretrained(vector_model_name).to(device)

def load_transcription_vector(transcription):
    """Embed a transcription as the mean of the model's last hidden states.

    Args:
        transcription: Text handed to the tokenizer. Inputs longer than
            512 tokens are truncated.

    Returns:
        A NumPy array (moved to the CPU) holding the last hidden state
        averaged over the token axis, with singleton dimensions squeezed out.
    """
    # Send the inputs to whatever device the model lives on, rather than a
    # hard-coded 'cuda', so inputs and weights can never end up on
    # different devices.
    inputs = tokenizer(
        transcription,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    ).to(model.device)
    with torch.no_grad():  # inference only; no autograd graph is needed
        outputs = model(**inputs)
    # Mean-pool over the token axis (dim=1), then drop the batch axis.
    vector = torch.mean(outputs.last_hidden_state, dim=1).squeeze()
    return vector.cpu().numpy()

def load_image(filename):
    """Load one image from ``image_folder_path`` as a normalized grayscale array.

    Args:
        filename: File name relative to ``image_folder_path``.

    Returns:
        A float32 array with a trailing channel axis of size 1, holding the
        image converted to 8-bit grayscale, resized to
        ``image_width`` x ``image_height`` and divided by 255.
    """
    img_path = os.path.join(image_folder_path, filename)
    # The context manager closes the underlying file deterministically
    # instead of leaving it to garbage collection; `convert` returns an
    # independent in-memory image, so it stays usable after the file closes.
    with Image.open(img_path) as source:
        img = source.convert('L')  # Convert to grayscale
    # PIL's resize takes (width, height).
    img = img.resize((image_width, image_height))
    img = np.array(img) / 255.0
    img = np.expand_dims(img, axis=-1)  # Add channel dimension
    return img.astype(np.float32)

def process_example(example):
    """Turn one dataset record into a ``(vector, image)`` pair.

    Args:
        example: Mapping that provides the keys ``'label'`` (the
            transcription text) and ``'filename'``.

    Returns:
        ``(vector, image)`` when the record's file name is present in
        ``local_filenames``; ``None`` otherwise.
    """
    text = example['label']
    name = example['filename']
    # Guard clause: records without a matching local image are skipped.
    if name not in local_filenames:
        return None
    # Tuple elements evaluate left to right: embedding first, then image.
    return load_transcription_vector(text), load_image(name)

# Get the list of local filenames (a set, for O(1) membership tests)
local_filenames = set(os.listdir(image_folder_path))

# Load the dataset
dataset = load_dataset(dataset_name, split='test')

# Collect a (vector, image) pair for every example whose image exists locally.
# The lists get their own names so `vectors` / `images` only ever refer to
# the final NumPy arrays.
vector_list = []
image_list = []

for example in tqdm(dataset):
    result = process_example(example)
    if result is not None:
        vector, image = result
        vector_list.append(vector)
        image_list.append(image)

# Fail here with a clear message instead of silently building an empty
# dataset that only breaks later, inside model training.
if not vector_list:
    raise RuntimeError(
        f"No dataset example matched a file in {image_folder_path!r}; "
        "check image_folder_path and the dataset's 'filename' column."
    )

# Convert lists to numpy arrays
vectors = np.array(vector_list, dtype=np.float32)
images = np.array(image_list, dtype=np.float32)
# The arrays now hold the data; release the per-example lists.
del vector_list, image_list

# Create TensorFlow datasets directly from NumPy arrays
tf_dataset = (
    tf.data.Dataset.from_tensor_slices((vectors, images))
    .batch(32)
    .cache()
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

2024-07-20 09:24:09.553981: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-20 09:24:09.568562: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-20 09:24:09.573123: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-20 09:24:09.587957: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of RobertaModel were not initialized fro

In [None]:
from tensorflow.keras.layers import Dense, Reshape, Input
from tensorflow.keras.models import Model

# Define the autoencoder model
def build_autoencoder(input_shape, vector_shape):
    """Build and compile a dense network mapping a vector to an image tensor.

    The network narrows the input through Dense layers of 1024, 512 and 256
    units, widens again through 512 and 1024 units, and ends in a sigmoid
    Dense layer that is reshaped to ``input_shape``.

    Args:
        input_shape: Shape of the output tensor produced by the model (the
            value passed to the final ``Reshape``), without the batch axis.
        vector_shape: Shape of the model's input vector, without the batch
            axis.

    Returns:
        A Keras ``Model`` compiled with the Adam optimizer and MSE loss.
    """
    # Size the output layer from *every* dimension of input_shape. Using only
    # height * width silently assumed a single channel, which makes the
    # Reshape below fail for any other channel count.
    output_units = 1
    for dim in input_shape:
        output_units *= int(dim)

    # Encoder
    text_input = Input(shape=vector_shape)
    x = Dense(1024, activation='relu')(text_input)
    x = Dense(512, activation='relu')(x)
    encoded = Dense(256, activation='relu')(x)

    # Decoder
    x = Dense(512, activation='relu')(encoded)
    x = Dense(1024, activation='relu')(x)
    # Sigmoid keeps every output value in [0, 1].
    x = Dense(output_units, activation='sigmoid')(x)
    decoded = Reshape(input_shape)(x)

    # Autoencoder model
    autoencoder = Model(text_input, decoded)
    autoencoder.compile(optimizer='adam', loss='mse')

    return autoencoder

# Define the input shape and vector shape
input_shape = (image_height, image_width, 1)
# Read the embedding size from the dataset itself rather than hard-coding 768,
# so the model input always matches the vectors that were actually produced.
vector_spec, _ = tf_dataset.element_spec
vector_shape = tuple(vector_spec.shape.as_list()[1:])  # drop the batch axis

# Build the autoencoder model
autoencoder = build_autoencoder(input_shape, vector_shape)
autoencoder.summary()

# Train the autoencoder
autoencoder.fit(tf_dataset, epochs=50, verbose=1)

Epoch 1/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 57ms/step - loss: 0.1667
Epoch 2/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.1089
Epoch 3/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.1083
Epoch 4/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.1083
Epoch 5/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.1084
Epoch 6/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.1084
Epoch 7/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.1083
Epoch 8/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.1081
Epoch 9/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.1080
Epoch 10/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 0.1080