## Import Libraries

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras import backend as K
from keras.utils import np_utils
from sklearn.model_selection import train_test_split

In [None]:
# Silence every warning category for the rest of the session.
import warnings

warnings.simplefilter('ignore')

In [None]:
# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()

In [None]:
# Load the CSV into a DataFrame and cast every column to float32.
dataset = pd.read_csv("handwritten_data_785.csv").astype('float32')

In [None]:
# Rename the column called '0' to 'label'; the cells below use it as the target.
dataset.rename(columns={'0':'label'}, inplace=True)

In [None]:
# Separate the target column (y) from the remaining feature columns (X).
y = dataset['label']
X = dataset.drop(columns='label')

In [None]:
import string

In [None]:
# Replace the numeric class ids (0-25) in the 'label' column with letters A-Z.
# The mapping is applied to the numeric series `y` rather than to
# dataset['label'] so this cell is idempotent: re-running it on the
# already-converted letters would look them up in the dict and produce NaN.
alphabets_map = dict(enumerate(string.ascii_uppercase, 0))
dataset['label'] = y.map(alphabets_map)

## Data Exploration

In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
X.info()

In [None]:
# Preview the first five rows of the feature matrix.
X.head(5)

In [None]:
from sklearn.utils import shuffle
# Randomly reorder the rows of X so the image grid below shows a random sample.
X_shuffled = shuffle(X)

In [None]:
# Show the first 16 shuffled samples as 28x28 images on a 4x4 grid.
# plt.subplot expects (nrows, ncols, index); the loop bound is derived from
# the grid size so the two cannot drift apart.
plt.figure(figsize=(10, 10))
row, columns = 4, 4
for i in range(row * columns):
    plt.subplot(row, columns, i + 1)
    plt.imshow(X_shuffled.iloc[i].values.reshape(28, 28), interpolation='nearest', cmap='Greys')
plt.show()

In [None]:
# Horizontal bar chart of how many rows carry each label.
label_size = dataset.groupby('label').size()

ax = label_size.plot.barh(figsize=(10, 10))
ax.set_xlabel("Number of images")
ax.set_ylabel("Alphabets")
ax.set_title("Number of images per alphabet")
plt.show()

In [None]:
# Report the counts of the three letters called out in the message.
print("We have very less number of images for F, I and V ")
for letter in ("F", "I", "V"):
    print(f"{letter} count:", label_size[letter])

## Data Preparation

In [None]:
# Splits the data: 60% train; the remaining 40% is divided 40/60 into
# validation (16% of the total) and test (24% of the total).
# Stratifying keeps the per-letter proportions the same in all three subsets
# (the label counts are uneven), and a fixed seed makes the split repeatable.
SPLIT_SEED = 42

X_train, XX, y_train, yy = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=SPLIT_SEED)
X_valid, X_test, y_valid, y_test = train_test_split(
    XX, yy, test_size=0.6, shuffle=True, stratify=yy, random_state=SPLIT_SEED)

In [None]:
# Report the shape of every split, features and labels alike.
for split_name, split_data in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_valid', X_valid),
    ('y_valid', y_valid),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print(f'{split_name} shape: {split_data.shape}')

In [None]:
# Scales the data
#
# Pixel intensities are divided by a fixed 255 so training uses exactly the
# same normalisation that preprocess_image() applies at prediction time.
# A MinMaxScaler fitted on the training set rescales every pixel column by its
# own observed range, which does not match that /255 scaling.
# NOTE(review): assumes raw pixel values lie in 0-255 - confirm against the CSV.
PIXEL_MAX = 255.0

X_train = np.asarray(X_train, dtype='float32') / PIXEL_MAX
X_valid = np.asarray(X_valid, dtype='float32') / PIXEL_MAX
X_test = np.asarray(X_test, dtype='float32') / PIXEL_MAX

In [None]:
# Data after scaling: show 16 randomly ordered training samples on a 4x4 grid.

X_shuffled = shuffle(X_train)

# plt.subplot expects (nrows, ncols, index); the loop bound is derived from
# the grid size so the two cannot drift apart.
plt.figure(figsize=(10, 10))
row, columns = 4, 4
for i in range(row * columns):
    plt.subplot(row, columns, i + 1)
    plt.imshow(X_shuffled[i].reshape(28, 28), interpolation='nearest', cmap='Greys')
plt.show()

In [None]:
# Reshape the flat pixel rows into (samples, 28, 28, 1) float32 image tensors.
X_train = X_train.reshape(X_train.shape[0], 28, 28, 1).astype('float32')
X_valid = X_valid.reshape(X_valid.shape[0], 28, 28, 1).astype('float32')
X_test = X_test.reshape(X_test.shape[0], 28, 28, 1).astype('float32')

# One-hot encode the labels with an explicit width. Without num_classes,
# to_categorical sizes each matrix from the largest label present in that
# particular split, so a split missing the highest class would come out
# narrower than the other splits and than the model's output layer.
num_classes = len(y.unique())
y_train = np_utils.to_categorical(y_train, num_classes=num_classes)
y_valid = np_utils.to_categorical(y_valid, num_classes=num_classes)
y_test = np_utils.to_categorical(y_test, num_classes=num_classes)

## **Build the Model**

In [None]:
# CNN: 5x5 convolution -> 2x2 max-pool -> dropout -> flatten -> dense(128)
# -> softmax with one output per distinct label in y.
model = Sequential()
model.add(Conv2D(32, (5, 5), input_shape=(28, 28, 1), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.3))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(len(y.unique()), activation='softmax'))

In [None]:
# Categorical cross-entropy matches the one-hot labels and softmax output;
# accuracy is tracked as an extra metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train for 4 epochs in batches of 128, with the validation split passed as
# validation_data; the returned object is kept in `detail` for the loss plot.
detail = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=4, batch_size=128, verbose=2)

In [None]:
# Plot training and validation loss per epoch.
# 'val_loss' comes from the validation split passed to fit(), not from the
# test set, so the legend labels it 'Validation'.
plt.plot(detail.history['loss'])
plt.plot(detail.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
# Evaluate on the test split; index 1 of the result is the accuracy metric
# requested in compile().
scores = model.evaluate(X_test, y_test, verbose=0)
test_accuracy = scores[1]
print("CNN Score:", test_accuracy)

In [None]:
# Save the trained model to 'ocr_model.h5' so it can be reloaded for prediction.
model.save('ocr_model.h5')

In [None]:
from keras.models import load_model
from google.colab.patches import cv2_imshow
import cv2
import os

def load_ocr_model(model_path):
    """Load the saved OCR model stored at ``model_path`` and return it."""
    return load_model(model_path)

def preprocess_image(img):
    """Convert a BGR image into a normalised ``(1, 28, 28, 1)`` float32 array.

    The image is converted to grayscale, resized to 28x28 with area
    interpolation, colour-inverted, and finally divided by 255.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    small = cv2.resize(gray, (28, 28), interpolation=cv2.INTER_AREA)
    inverted = cv2.bitwise_not(small)

    # Add the batch and channel axes, then normalise the pixel values.
    batch = inverted.reshape(1, 28, 28, 1)
    return batch.astype('float32') / 255

def predict_from_folder(model, folder_path):
    """Predict and print the letter for every JPG/PNG image in a folder.

    For each image file, the predicted uppercase letter is printed next to the
    file name and the original image is displayed. Files that cannot be
    decoded as images are reported and skipped.

    Args:
        model: Object whose ``predict`` method returns per-class scores for a
            preprocessed image batch.
        folder_path: Directory scanned (non-recursively) for image files.
    """
    image_suffixes = ('.jpg', '.png')

    # Sorted so the output order does not depend on the OS directory listing.
    for filename in sorted(os.listdir(folder_path)):
        # Compare case-insensitively so '.JPG' / '.PNG' files are included too.
        if not filename.lower().endswith(image_suffixes):
            continue

        image_path = os.path.join(folder_path, filename)
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread returns None instead of raising when it cannot read
            # or decode the file; preprocessing would crash on it.
            print(filename, "could not be read as an image; skipped")
            continue

        prediction = model.predict(preprocess_image(img))

        # Class index 0 corresponds to 'A'.
        print(filename, chr(int(prediction.argmax()) + ord('A')))
        cv2_imshow(img)

In [None]:
# Reload the model from 'ocr_model.h5', replacing the in-memory `model`.
model = load_ocr_model('ocr_model.h5')
