In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import tensorflow as tf

In [2]:
training = pd.read_csv('../data/emnist-letters-train.csv')
test = pd.read_csv('../data/emnist-letters-test.csv')

In [3]:
mappings = {}

with open('../data/emnist-letters-mapping.txt') as file:
    for line in file:
        arr = line.strip().split(' ')
        mappings[int(arr[0])] = chr(int(arr[1])) + ' / ' + chr(int(arr[2]))
        
mappings        

{1: 'A / a',
 2: 'B / b',
 3: 'C / c',
 4: 'D / d',
 5: 'E / e',
 6: 'F / f',
 7: 'G / g',
 8: 'H / h',
 9: 'I / i',
 10: 'J / j',
 11: 'K / k',
 12: 'L / l',
 13: 'M / m',
 14: 'N / n',
 15: 'O / o',
 16: 'P / p',
 17: 'Q / q',
 18: 'R / r',
 19: 'S / s',
 20: 'T / t',
 21: 'U / u',
 22: 'V / v',
 23: 'W / w',
 24: 'X / x',
 25: 'Y / y',
 26: 'Z / z'}

### Preprocessing the data

1. Merging the data so we can do repetetive sampling
2. Splitting up labels and images
3. Preprocessing the images

In [4]:
training.columns = ['label'] +[f'feature_{i}' for i in range(1, training.shape[1])]
test.columns = ['label'] +[f'feature_{i}' for i in range(1, test.shape[1])]

In [5]:
data = pd.concat([training, test], axis=0)
training.shape, test.shape, data.shape

((88799, 785), (14799, 785), (103598, 785))

In [6]:
labels = data['label']
images = data.drop('label', axis=1)

In [7]:
images = images.values.reshape(-1, 28, 28, 1)
images = images / 255.0

In [14]:
from sklearn.model_selection import train_test_split

X_train_full, X_test, y_train_full, y_test = train_test_split(images, labels, test_size=0.2, random_state=253)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=253)

y_train -= 1
y_val -= 1
y_test -= 1

In [15]:
print("Unique training labels:", np.unique(y_train))
print("Unique validation labels:", np.unique(y_val))
print("Unique test labels:", np.unique(y_test))

Unique training labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Unique validation labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]
Unique test labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]


In [16]:
from tensorflow.keras.utils import to_categorical

y_train = to_categorical(y_train, 26)
y_val = to_categorical(y_val, 26)
y_test = to_categorical(y_test, 26)

In [17]:
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam

def create_model():
    model = models.Sequential([
        layers.InputLayer(shape=(28, 28, 1)),
        layers.Conv2D(32, (2, 2), activation='relu', input_shape=(28, 28, 1)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(26, activation='softmax')
    ])

    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [18]:
model = create_model()

history = model.fit(X_train, y_train, 
                    epochs=20,  
                    batch_size=32, 
                    validation_data=(X_val, y_val), 
                    verbose=1)  

test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy:.4f}")     

Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2072/2072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.5291 - loss: 1.5704 - val_accuracy: 0.8731 - val_loss: 0.3716
Epoch 2/20
[1m2072/2072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 6ms/step - accuracy: 0.8267 - loss: 0.5417 - val_accuracy: 0.9083 - val_loss: 0.2849
Epoch 3/20
[1m2072/2072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 7ms/step - accuracy: 0.8607 - loss: 0.4381 - val_accuracy: 0.9151 - val_loss: 0.2639
Epoch 4/20
[1m2072/2072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 9ms/step - accuracy: 0.8805 - loss: 0.3741 - val_accuracy: 0.9180 - val_loss: 0.2536
Epoch 5/20
[1m2072/2072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 8ms/step - accuracy: 0.8935 - loss: 0.3312 - val_accuracy: 0.9259 - val_loss: 0.2310
Epoch 6/20
[1m2072/2072[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 8ms/step - accuracy: 0.8997 - loss: 0.3021 - val_accuracy: 0.9310 - val_loss: 0.2151
Epoch 7/20
[1m2072/2

In [21]:
model.save('../model/letter_predicter.keras')