# Dependencies

In [1]:
import tensorflow as tf
print(tf.__version__)
import keras
from keras import layers
from keras import models
from tensorflow.keras.layers import Input, Concatenate, Dense, LSTM, Embedding
from tensorflow.keras.models import Model
import pandas as pd
import numpy as np

2.15.0


# Import Data

In [None]:
# Mount drive and make folders

from google.colab import drive
drive.mount('/content/drive')

training_csv_path =  '/content/drive/MyDrive/6.8301 Final Project/data/datasets/transcription/train.csv'
test_csv_path = '/content/drive/MyDrive/6.8301 Final Project/data/datasets/transcription/test.csv'
val_csv_path = '/content/drive/MyDrive/6.8301 Final Project/data/datasets/transcription/val.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read CSV file
train_df = pd.read_csv(training_csv_path)

# Extract labels from DataFrame
target_latex_string = target_latex_string = np.expand_dims(train_df['latex'].values, axis=1)

# Find the maximum sequence length in the 'visible_latex_chars' column
max_sequence_length = max(len(eval(seq)) for seq in train_df['visible_char_map'])

# Initialize an empty numpy arrays to store inputs
visible_chars_map  = np.zeros((len(train_df), max_sequence_length))
xmins = np.zeros((len(train_df), max_sequence_length))
xmaxs = np.zeros((len(train_df), max_sequence_length))
ymins = np.zeros((len(train_df), max_sequence_length))
ymaxs = np.zeros((len(train_df), max_sequence_length))

# Iterate over each row in the DataFrame and unpack arrays
for i, latex_num in enumerate(train_df['visible_char_map']):
    # Split the string into individual characters
    latex_chars = eval(latex_num)
    # Fill the array with the characters, truncating or padding as needed
    visible_chars_map[i, :len(latex_chars)] = latex_chars[:max_sequence_length]


# Iterate over each row in the CSV and fill the numpy array
for i, row in enumerate(train_df['xmins']):
    # Parse the array from the string
    array_values = eval(row)
    # Fill the numpy array with the array values
    xmins[i, :len(array_values)] = array_values

for i, row in enumerate(train_df['xmaxs']):
    # Parse the array from the string
    array_values = eval(row)
    # Fill the numpy array with the array values
    xmaxs[i, :len(array_values)] = array_values

for i, row in enumerate(train_df['ymins']):
    # Parse the array from the string
    array_values = eval(row)
    # Fill the numpy array with the array values
    ymins[i, :len(array_values)] = array_values

for i, row in enumerate(train_df['ymaxs']):
    # Parse the array from the string
    array_values = eval(row)
    # Fill the numpy array with the array values
    ymaxs[i, :len(array_values)] = array_values

unique_chars = set()
for chars in visible_chars_map:
    unique_chars.update(chars)
num_chars = len(unique_chars)

unique_labels = np.unique(target_latex_string)
num_classes = len(unique_labels)

print("visible_chars_map:", visible_chars_map.shape)
print("xmins_array:", xmins.shape)
print("xmaxs_array:", xmaxs.shape)
print("ymins_array:", ymins.shape)
print("ymaxs_array:", ymaxs.shape)
print("target_latex_string:", target_latex_string.shape)

print("visible_chars_map:", visible_chars_map[:5])
print("xmins:", xmins[:5])
print("xmaxs:", xmaxs[:5])
print("ymins:", ymins[:5])
print("ymaxs:", ymaxs[:5])
print("target_latex_string:", target_latex_string[:5])


visible_chars_map: (8000, 64)
xmins_array: (8000, 64)
xmaxs_array: (8000, 64)
ymins_array: (8000, 64)
ymaxs_array: (8000, 64)
target_latex_string: (8000, 1)
visible_chars_map: [[43. 71. 68. 75. 33. 26.  7.  7. 54. 54. 68. 12. 37. 26. 28. 33. 80. 34.
  68. 21.  7. 54. 54. 68. 12. 78. 68. 26. 68. 76. 33. 68. 21.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [71. 25. 75. 28. 34.  7. 28. 40. 25. 31. 26. 35. 25. 37. 26. 28. 25. 28.
  34. 25. 35. 26. 28. 33. 25. 37.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [55. 71. 68. 75. 35. 26.  7.  7. 38. 68. 26. 34. 78. 68.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [71. 58. 75. 32. 26. 55. 79.

# Model Architecture

In [None]:
# Input layers for visible characters and bounding box coordinates
visible_chars_input = Input(shape=(64,))  # Assuming each visible character is represented as a sequence of floats

# Process bounding box coordinates
xmins_input = Input(shape=(64,))
xmaxs_input = Input(shape=(64,))
ymins_input = Input(shape=(64,))
ymaxs_input = Input(shape=(64,))
bbox_features = Concatenate()([xmins_input, xmaxs_input, ymins_input, ymaxs_input])
bbox_features = Dense(64, activation='relu')(bbox_features)

# Combine visible characters and bbox features
combined_features = Concatenate()([visible_chars_input, bbox_features])

# Embedding layer for visible characters
embedding_layer = Embedding(input_dim=num_chars, output_dim=64)(combined_features)  # Adjust output_dim based on your requirements

# RNN for sequence modeling
rnn_output = LSTM(128, return_sequences=True)(embedding_layer)  # Return sequences

# Output layer
output = Dense(num_chars, activation='softmax')(rnn_output)  # Output at each timestep

# Create the model

In [None]:
# Define the model
model = Model(inputs=[visible_chars_input, xmins_input, xmaxs_input, ymins_input, ymaxs_input], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

Model: "model_16"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_2 (InputLayer)        [(None, 64)]                 0         []                            
                                                                                                  
 input_3 (InputLayer)        [(None, 64)]                 0         []                            
                                                                                                  
 input_4 (InputLayer)        [(None, 64)]                 0         []                            
                                                                                                  
 input_5 (InputLayer)        [(None, 64)]                 0         []                            
                                                                                           

# Train the model


In [None]:
# Train the model
model.fit([visible_chars_map, xmins, xmaxs, ymins, ymaxs], visible_chars_map, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10


ValueError: in user code:

    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1150, in train_step
        y_pred = self(x, training=True)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "model_15" is incompatible with the layer: expected shape=(None, 64), found shape=(32, 2304)
