In [1]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Load CSV file
csv_file = './test/test.csv'
data = pd.read_csv(csv_file, index_col=0)
data["path"] = [f"test\\{i+1}.png" for i in range(data.shape[0])]
data["lilypond"] = [' '.join(x) for x in data.drop(columns=['path']).values.tolist()]


# Load images and convert to grayscale
def load_images(image_folder):
    images = []
    for filename in os.listdir(image_folder):
        if filename.endswith('.png'):
            img = Image.open(os.path.join(image_folder, filename)).convert('L')
            img = img.resize((128, 128))  # Resize images to a fixed size
            images.append(np.array(img))
    return np.array(images)

image_folder = './test'
images = load_images(image_folder)

# Normalize the images
images = images / 255.0

# Tokenize the LilyPond data
# Assuming the 'annotations.csv' has columns 'filename' and 'lilypond'
tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(data['lilypond'])

# Convert text to sequences of tokens
sequences = tokenizer.texts_to_sequences(data['lilypond'])

# Pad sequences to a fixed length
max_length = max(len(seq) for seq in sequences)
sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_length, padding='post')

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(images, sequences, test_size=0.2, random_state=42)


In [10]:
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,path,lilypond
0,f,g,f,"b,",|,"g,",e,"g,",f,|,...,f,d,|,f,g,c,"a,",|,test\1.png,"f g f b, | g, e g, f | g f c g, | a d a g | g,..."
1,d,"b,",g,f,|,"b,",c,"g,",g,|,...,"g,",e,|,"b,",g,e,d,|,test\2.png,"d b, g f | b, c g, g | a f c g, | a f d e | d ..."
2,"b,","a,",d,c,|,"b,","a,",g,e',|,...,"b,","g,",|,g,a,d,"b,",|,test\3.png,"b, a, d c | b, a, g e' | c' e c a, | c g, g d ..."
3,a,f,g,d,|,"a,",g,e,c,|,...,a,g,|,d',a,g,e,|,test\4.png,"a f g d | a, g e c | a g e c | g, a, g e | c d..."
4,d,e,d,"b,",|,a,f,d,g,|,...,d,e,|,"b,",e,"b,","a,",|,test\5.png,"d e d b, | a f d g | d a c b, | c a, d b, | g ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,f,c,a,f,|,"a,","g,",g,c',|,...,g,"b,",|,a,b,a,b,|,test\121.png,"f c a f | a, g, g c' | f c d e | d c g, c | a,..."
121,d',g',f',d',|,b,e,a,f,|,...,c,f,|,"a,",d,"g,",f,|,test\122.png,"d' g' f' d' | b e a f | e b, a e | c b, a, g, ..."
122,"a,",g,e,d,|,"a,",g,f,d,|,...,g,e,|,f,"g,","a,",e,|,test\123.png,"a, g e d | a, g f d | b, a, f e | f e a, g | d..."
123,d,"a,","b,",a,|,d,"b,","g,",f,|,...,e,c,|,"b,",c,"b,",g,|,test\124.png,"d a, b, a | d b, g, f | c g, a f | g, d a b | ..."


In [2]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, LSTM, Embedding, RepeatVector, TimeDistributed, Add

# Define constants
MAX_SEQ_LENGTH = max_length

# Encoder
image_input = Input(shape=(128, 128, 1))
x = Conv2D(32, (3, 3), activation='relu', padding='same')(image_input)
x = MaxPooling2D((2, 2))(x)
x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
x = MaxPooling2D((2, 2))(x)
x = Flatten()(x)
encoded = Dense(256, activation='relu')(x)

# Repeat vector to match sequence length
repeat = RepeatVector(MAX_SEQ_LENGTH)(encoded)

# Decoder
sequence_input = Input(shape=(MAX_SEQ_LENGTH,))
embedding = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=256, input_length=MAX_SEQ_LENGTH)(sequence_input)
lstm = LSTM(256, return_sequences=True)(embedding)

# Add the repeated vector directly to the LSTM output
add = Add()([repeat, lstm])
decoder_output = TimeDistributed(Dense(len(tokenizer.word_index) + 1, activation='softmax'))(add)

# Model
model = Model([image_input, sequence_input], decoder_output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 128, 128, 1  0           []                               
                                )]                                                                
                                                                                                  
 conv2d (Conv2D)                (None, 128, 128, 32  320         ['input_1[0][0]']                
                                )                                                                 
                                                                                                  
 max_pooling2d (MaxPooling2D)   (None, 64, 64, 32)   0           ['conv2d[0][0]']                 
                                                                                              

In [3]:
# Prepare the input sequences
y_train_input = y_train
y_train_target = np.expand_dims(y_train, -1)

y_test_input = y_test
y_test_target = np.expand_dims(y_test, -1)

# Training
history = model.fit(
    [X_train, y_train_input],
    y_train_target,
    validation_data=([X_test, y_test_input], y_test_target),
    batch_size=32,
    epochs=30
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [4]:
def preprocess_image(image_path):
    img = Image.open(image_path).convert('L')  # Convert to grayscale
    img = img.resize((128, 128))  # Resize to the same size as training images
    img_array = np.array(img) / 255.0  # Normalize the image
    img_array = np.expand_dims(img_array, axis=-1)  # Add the channel dimension
    img_array = np.expand_dims(img_array, axis=0)  # Add the batch dimension
    return img_array

image_path = 'test/1.png'
image = preprocess_image(image_path)

In [5]:
# Assuming tokenizer and MAX_SEQ_LENGTH are already defined
initial_sequence = np.zeros((1, MAX_SEQ_LENGTH))


In [6]:
predicted_sequence = model.predict([image, initial_sequence])

# Convert the predicted sequence back to text
predicted_sequence = np.argmax(predicted_sequence, axis=-1)

# Remove padding and convert indices back to characters
predicted_text = ''.join(tokenizer.index_word[idx] for idx in predicted_sequence[0] if idx != 0)

print(predicted_text)

                                                                                                


In [9]:
tokenizer.index_word

{1: ' ',
 2: ',',
 3: '|',
 4: 'a',
 5: 'g',
 6: 'c',
 7: 'b',
 8: 'd',
 9: 'e',
 10: 'f',
 11: "'"}