In [None]:
# Import necessary libraries
import os
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import kagglehub

# Download the latest dataset version (kagglehub returns the local directory it was stored in)
path = kagglehub.dataset_download("nibinv23/iam-handwriting-word-database")
print("Path to dataset files:", path)
# Constants
IMAGE_SIZE = (128, 132)  # unpacked as (w, h) by distortion_free_resize
BATCH_SIZE = 20  # batch size applied in prepare_dataset
EPOCHS = 75  # NOTE(review): not referenced by any visible training call in this cell
PADDING_TOKEN = 99  # value used to right-pad encoded labels in vectorize_label
# DATA_INPUT_PATH = "/content/drive/MyDrive/iam-handwriting-word-database"
DATA_INPUT_PATH = os.path.join(path, "iam_words")  # directory that holds words.txt
# Module-level accumulators filled by preprocess_dataset() and read by split_dataset()
images_path = []
labels = []

# Function to preprocess the dataset
def preprocess_dataset():
    """Parse ``words.txt`` and collect usable samples plus the label vocabulary.

    Every non-comment, non-blank line is split on whitespace; the first field
    is the word id (used to derive the image path) and the last field is the
    transcription. Samples whose image file is missing or empty are skipped.

    Side effects:
        Appends to the module-level ``images_path`` and ``labels`` lists, so
        calling this twice duplicates their contents.

    Returns:
        tuple: ``(characters, char_to_num, num_to_char, max_len)`` where
        ``characters`` is the sorted list of distinct label characters, the two
        lookups are ``StringLookup`` layers (forward and inverted) and
        ``max_len`` is the length of the longest label.
    """
    characters = set()
    max_len = 0
    index_path = os.path.join(DATA_INPUT_PATH, 'words.txt')
    with open(index_path, 'r') as file:
        # Stream the file; the index is large, so neither the raw lines nor
        # each word id are echoed to stdout.
        for line in file:
            if line.startswith('#') or line.strip() == '':
                continue

            parts = line.split()
            word_id = parts[0]
            id_fields = word_id.split("-")
            if len(id_fields) < 2:
                # Malformed id: the folder layout cannot be derived, skip it.
                continue

            first_folder = id_fields[0]
            second_folder = first_folder + '-' + id_fields[1]
            image_filename = f"{word_id}.png"
            image_path = os.path.join(DATA_INPUT_PATH, 'iam_words', 'words', first_folder, second_folder, image_filename)

            # Keep only samples backed by a non-empty image file.
            if os.path.isfile(image_path) and os.path.getsize(image_path):
                images_path.append(image_path)
                label = parts[-1].strip()
                characters.update(label)
                max_len = max(max_len, len(label))
                labels.append(label)
        print(characters)

    characters = sorted(characters)
    char_to_num = tf.keras.layers.StringLookup(vocabulary=list(characters), mask_token=None)
    num_to_char = tf.keras.layers.StringLookup(vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True)
    return characters, char_to_num, num_to_char, max_len

# Parse the dataset index once; this also fills the module-level images_path/labels lists.
characters, char_to_num, num_to_char, max_len = preprocess_dataset()

# Function for distortion-free image resizing
def distortion_free_resize(image, img_size):
    """Resize ``image`` to fit ``img_size`` without distortion, then pad and reorient.

    The image is scaled with its aspect ratio preserved so that it fits inside
    ``(h, w)``, padded to exactly ``(h, w)``, transposed (height and width
    swapped) and flipped left-to-right.

    Args:
        image: Image tensor with a trailing channel axis.
        img_size: ``(w, h)`` target size.

    Returns:
        Tensor whose first two axes are ``(w, h)`` after the final transpose.
    """
    w, h = img_size
    image = tf.image.resize(image, size=(h, w), preserve_aspect_ratio=True)

    # Amount of padding still needed along each axis to reach the target size.
    pad_height = h - tf.shape(image)[0]
    pad_width = w - tf.shape(image)[1]

    # Split each amount into two parts that SUM to the full padding. Using the
    # division remainder as the second part would leave the image short.
    # When the amount is odd the extra pixel goes to the top / left side.
    pad_height_bottom = pad_height // 2
    pad_height_top = pad_height - pad_height_bottom
    pad_width_right = pad_width // 2
    pad_width_left = pad_width - pad_width_right

    image = tf.pad(image, paddings=[[pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0, 0]])
    image = tf.transpose(image, perm=[1, 0, 2])
    image = tf.image.flip_left_right(image)
    return image

# Image and label preprocessing functions
def preprocess_image(image_path, img_size):
    """Read a PNG file and return it as a resized float32 tensor divided by 255.

    Args:
        image_path: Path of the PNG file to load.
        img_size: Target size forwarded to ``distortion_free_resize``.
    """
    raw_bytes = tf.io.read_file(image_path)
    # Decode with a single channel (grayscale).
    decoded = tf.image.decode_png(raw_bytes, 1)
    resized = distortion_free_resize(decoded, img_size)
    return tf.cast(resized, tf.float32) / 255.0

def vectorize_label(label):
    """Encode a label string as character ids, right-padded to ``max_len``.

    Characters are mapped through ``char_to_num``; the tail is filled with
    ``PADDING_TOKEN``.
    """
    chars = tf.strings.unicode_split(label, input_encoding="UTF-8")
    encoded = char_to_num(chars)
    # Number of padding slots needed to reach the longest label in the dataset.
    missing = max_len - tf.shape(encoded)[0]
    return tf.pad(encoded, paddings=[[0, missing]], constant_values=PADDING_TOKEN)

def process_images_labels(image_path, label):
    """Map one ``(image_path, label)`` pair to the dict of model inputs."""
    return {
        "image": preprocess_image(image_path, IMAGE_SIZE),
        "label": vectorize_label(label),
    }

# Prepare dataset
def prepare_dataset(image_paths, labels):
    """Build a batched, cached and prefetched ``tf.data`` pipeline.

    Args:
        image_paths: Sequence of image file paths.
        labels: Sequence of label strings aligned with ``image_paths``.
    """
    autotune = tf.data.AUTOTUNE
    pairs = tf.data.Dataset.from_tensor_slices((image_paths, labels))
    samples = pairs.map(process_images_labels, num_parallel_calls=autotune)
    batched = samples.batch(BATCH_SIZE)
    return batched.cache().prefetch(autotune)

def split_dataset():
    """Split the collected samples 80/10/10 and wrap each part in a pipeline.

    Returns:
        tuple: ``(train_set, val_set, test_set)`` datasets.
    """
    # First cut: 80% train, 20% held out.
    train_images, holdout_images, train_labels, holdout_labels = train_test_split(
        images_path, labels, test_size=0.2, random_state=42
    )
    # Second cut: the held-out part is halved into validation and test.
    val_images, test_images, val_labels, test_labels = train_test_split(
        holdout_images, holdout_labels, test_size=0.5, random_state=42
    )
    return (
        prepare_dataset(train_images, train_labels),
        prepare_dataset(val_images, val_labels),
        prepare_dataset(test_images, test_labels),
    )

# Build the three tf.data pipelines (80/10/10 split of the collected samples).
train_set, val_set, test_set = split_dataset()

# CTC Layer for loss calculation
class CTCLayer(tf.keras.layers.Layer):
    """Pass-through layer that registers the CTC loss on the model.

    The layer returns ``y_pred`` unchanged and adds the batch CTC cost through
    ``add_loss``, so the model can be compiled without an explicit loss.
    """

    def __init__(self, name=None, **kwargs):
        # Forward the standard Layer kwargs (e.g. ``trainable``, ``dtype``) so
        # the layer can be rebuilt from its config when a saved model is loaded.
        super().__init__(name=name, **kwargs)
        self.loss_fn = tf.keras.backend.ctc_batch_cost

    def call(self, y_true, y_pred):
        """Add the CTC loss for this batch and return ``y_pred`` untouched."""
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")
        # Every sample in the batch is given the full time-step count and the
        # full (padded) label width as its lengths.
        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)
        return y_pred

# Model building function
def build_model():
    """Assemble and compile the CNN + BiLSTM model with the CTC loss layer.

    Two conv/pool stages feed a reshape into a sequence, followed by a dense
    projection, two bidirectional LSTMs and a softmax over the vocabulary
    (plus two extra classes). The summary is printed before returning.

    Returns:
        The compiled ``tf.keras`` model taking ``image`` and ``label`` inputs.
    """
    keras_layers = tf.keras.layers

    input_img = tf.keras.Input(shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 1), name="image")
    label_input = keras_layers.Input(name="label", shape=(None,))

    # Two identical conv + 2x2 max-pool stages with 32 then 64 filters.
    features = input_img
    for filters in (32, 64):
        features = keras_layers.Conv2D(
            filters, (3, 3), activation="relu", kernel_initializer="he_normal", padding="same"
        )(features)
        features = keras_layers.MaxPooling2D((2, 2))(features)

    # Both spatial axes were halved twice; fold the second axis and the 64
    # channels together so the first axis becomes the sequence dimension.
    sequence_shape = ((IMAGE_SIZE[0] // 4), (IMAGE_SIZE[1] // 4) * 64)
    features = keras_layers.Reshape(target_shape=sequence_shape)(features)
    features = keras_layers.Dense(64, activation="relu")(features)
    features = keras_layers.Dropout(0.2)(features)

    for units in (128, 64):
        features = keras_layers.Bidirectional(
            keras_layers.LSTM(units, return_sequences=True, dropout=0.25)
        )(features)

    class_count = len(char_to_num.get_vocabulary()) + 2
    probabilities = keras_layers.Dense(class_count, activation="softmax", name="dense2")(features)
    output = CTCLayer(name="ctc_loss")(label_input, probabilities)

    model = tf.keras.models.Model(inputs=[input_img, label_input], outputs=output)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))
    model.summary()
    return model

# Build (and compile) the CTC model defined above.
model = build_model()

# Save the model to Google Drive
# NOTE(review): no fit() call appears between build_model() and save() in this
# cell, so the saved model holds freshly initialized weights — confirm intent.
from google.colab import drive
drive.mount('/content/drive')
model.save('/content/drive/MyDrive/handwriting_recognition_model')


In [None]:
from keras import layers
from keras.models import Model

from mltu.tensorflow.model_utils import residual_block


def train_model(input_dim, output_dim, activation="leaky_relu", dropout=0.2):
    """Build the residual-CNN + BiLSTM recognition model (not compiled, not trained).

    Args:
        input_dim: Shape passed to the ``Input`` layer.
        output_dim: Number of vocabulary symbols; the softmax has one extra unit.
        activation: Activation name forwarded to every residual block.
        dropout: Dropout rate used in the residual blocks and after the BiLSTM.

    Returns:
        A Keras ``Model`` mapping the image input to per-step class probabilities.
    """
    inputs = layers.Input(shape=input_dim, name="input")

    # normalize images here instead in preprocessing step
    normalized = layers.Lambda(lambda x: x / 255)(inputs)

    # (filters, skip_conv, strides) for each residual block, in order.
    block_specs = (
        (16, True, 1),
        (16, True, 2),
        (16, False, 1),
        (32, True, 2),
        (32, False, 1),
        (64, True, 2),
        (64, True, 1),
        (64, False, 1),
        (64, False, 1),
    )
    features = normalized
    for filters, skip_conv, strides in block_specs:
        features = residual_block(
            features, filters, activation=activation, skip_conv=skip_conv, strides=strides, dropout=dropout
        )

    # Flatten the two spatial axes into one sequence axis, keeping channels.
    sequence_length = features.shape[-3] * features.shape[-2]
    squeezed = layers.Reshape((sequence_length, features.shape[-1]))(features)

    blstm = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(squeezed)
    blstm = layers.Dropout(dropout)(blstm)

    output = layers.Dense(output_dim + 1, activation="softmax", name="output")(blstm)

    return Model(inputs=inputs, outputs=output)

In [None]:
import tensorflow as tf
import tensorflow as tf

# Report whether TensorFlow can see at least one GPU device
print("GPU is available" if tf.config.list_physical_devices('GPU') else "GPU is not available")


from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard

from mltu.preprocessors import ImageReader
from mltu.transformers import ImageResizer, LabelIndexer, LabelPadding, ImageShowCV2
from mltu.augmentors import RandomBrightness, RandomRotate, RandomErodeDilate, RandomSharpen
from mltu.annotations.images import CVImage

from mltu.tensorflow.dataProvider import DataProvider
from mltu.tensorflow.losses import CTCloss
from mltu.tensorflow.callbacks import Model2onnx, TrainLogger
from mltu.tensorflow.metrics import CWERMetric


# from configs import ModelConfigs

import os
import tarfile
from tqdm import tqdm
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Set the path where you want to save the files on Google Drive
drive_path = '/content/drive/MyDrive/MLTU_Models'

# Create the directory; exist_ok makes this a no-op when it is already there
# and avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(drive_path, exist_ok=True)


def download_and_unzip(url, extract_to="Datasets", chunk_size=1024*1024):
    """Download a zip archive into memory and extract it.

    Args:
        url: Location of the zip archive.
        extract_to: Directory the archive is extracted into.
        chunk_size: Number of bytes requested per read.

    The download is read until the stream is exhausted instead of relying on
    the advertised length, which is not always available (e.g. chunked
    responses). The response and the archive are both closed on exit.
    """
    buffer = BytesIO()
    with urlopen(url) as http_response:
        # May be None when the server does not announce a length; tqdm then
        # shows a running byte count without a total.
        total_bytes = getattr(http_response, "length", None)
        with tqdm(total=total_bytes, unit="B", unit_scale=True) as progress:
            while True:
                chunk = http_response.read(chunk_size)
                if not chunk:
                    break
                # Writing into a buffer avoids re-copying the data on every chunk.
                buffer.write(chunk)
                progress.update(len(chunk))

    buffer.seek(0)
    with ZipFile(buffer) as archive:
        archive.extractall(path=extract_to)

# Download and unpack the IAM words archive only when it is not already on disk
dataset_path = os.path.join("Datasets", "IAM_Words")
if not os.path.exists(dataset_path):
    download_and_unzip("https://git.io/J0fjL", extract_to="Datasets")

    # The context manager closes the tar handle once extraction finishes.
    with tarfile.open(os.path.join(dataset_path, "words.tgz")) as words_archive:
        words_archive.extractall(os.path.join(dataset_path, "words"))

dataset, vocab, max_len = [], set(), 0

# Preprocess the dataset by the specific IAM_Words dataset file structure
# Read the index through a context manager so the file handle is released.
with open(os.path.join(dataset_path, "words.txt"), "r") as words_file:
    words = words_file.readlines()

for line in tqdm(words):
    if line.startswith("#"):
        continue

    line_split = line.split(" ")
    # Blank or truncated lines have no status field; skip them together with
    # the entries flagged "err".
    if len(line_split) < 2 or line_split[1] == "err":
        continue

    # Images are stored as words/<first 3 chars of id>/<first two id fields>/<id>.png
    folder1 = line_split[0][:3]
    folder2 = "-".join(line_split[0].split("-")[:2])
    file_name = line_split[0] + ".png"
    label = line_split[-1].rstrip("\n")

    rel_path = os.path.join(dataset_path, "words", folder1, folder2, file_name)
    if not os.path.exists(rel_path):
        print(f"File not found: {rel_path}")
        continue

    dataset.append([rel_path, label])
    vocab.update(list(label))
    max_len = max(max_len, len(label))

# Create a ModelConfigs object to store model configurations
# NOTE(review): the `from configs import ModelConfigs` import is commented out
# earlier in this cell, so ModelConfigs must already be defined in the session.
configs = ModelConfigs()

# Save vocab and maximum text length to configs
# NOTE(review): vocab is a set, so the character order of the joined string is
# whatever order the set iterates in — confirm a stable order is not required.
configs.vocab = "".join(vocab)
configs.max_text_length = max_len
configs.save()

# Create a data provider for the dataset
data_provider = DataProvider(
    dataset=dataset,
    skip_validation=True,
    batch_size=configs.batch_size*8,
    data_preprocessors=[ImageReader(CVImage)],
    transformers=[
        # Resize to the configured size without keeping the aspect ratio
        ImageResizer(configs.width, configs.height, keep_aspect_ratio=False),
        LabelIndexer(configs.vocab),
        # Pad labels to the longest label; the pad value is one past the last vocab index
        LabelPadding(max_word_length=configs.max_text_length, padding_value=len(configs.vocab)),
        ],
)

# Split the dataset into training and validation sets
train_data_provider, val_data_provider = data_provider.split(split = 0.9)

# Augment training data only: random brightness, erode/dilate, sharpen and rotation
train_data_provider.augmentors = [
    RandomBrightness(),
    RandomErodeDilate(),
    RandomSharpen(),
    RandomRotate(angle=10),
    ]

# Create the TensorFlow model architecture (3-channel input at the configured size)
model = train_model(
    input_dim = (configs.height, configs.width, 3),
    output_dim = len(configs.vocab),
)

# Compile the model and print summary
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=configs.learning_rate),
    loss=CTCloss(),
    # The metric is given the same padding value that LabelPadding uses: len(configs.vocab)
    metrics=[CWERMetric(padding_token=len(configs.vocab))],
)
model.summary(line_length=110)
!pip install tf2onnx
# Define callbacks (stopping, checkpointing and LR reduction all monitor val_CER)
earlystopper = EarlyStopping(monitor="val_CER", patience=20, verbose=1, mode='min') # lower CER is better
checkpoint = ModelCheckpoint(f"{configs.model_path}/model.keras", monitor="val_CER", verbose=1, save_best_only=True, mode="min")
trainLogger = TrainLogger(configs.model_path)
tb_callback = TensorBoard(f"{configs.model_path}/logs", update_freq=1)
reduceLROnPlat = ReduceLROnPlateau(monitor="val_CER", factor=0.9, min_delta=1e-10, patience=10, verbose=1, mode="auto")
model2onnx = Model2onnx(f"{configs.model_path}/model.keras")
# Save training and validation datasets as csv files next to the model
train_data_provider.to_csv(os.path.join(configs.model_path, "train.csv"))
val_data_provider.to_csv(os.path.join(configs.model_path, "val.csv"))
# Train the model
model.fit(
    train_data_provider,
    validation_data=val_data_provider,
    epochs=configs.train_epochs,
    callbacks=[earlystopper, checkpoint, trainLogger, reduceLROnPlat, tb_callback, model2onnx],
    # workers=configs.train_workers
)

# Copy the training artifacts to Google Drive

# Save the model in TensorFlow SavedModel format in Google Drive
saved_model_path = os.path.join(drive_path, 'saved_model')
model.save(saved_model_path, save_format='tf')
print(f"Model saved in TensorFlow format at {saved_model_path}")

# Save the training and validation datasets as CSV in Google Drive
# (second copies; the same CSVs were already written under configs.model_path)
train_csv_path = os.path.join(drive_path, 'train.csv')
val_csv_path = os.path.join(drive_path, 'val.csv')
train_data_provider.to_csv(train_csv_path)
val_data_provider.to_csv(val_csv_path)
print(f"Training and validation datasets saved as CSV files at {train_csv_path} and {val_csv_path}")

# Save the model in ONNX format in Google Drive
# NOTE(review): this callback instance was never passed to fit() and is given
# the .onnx path — confirm Model2onnx.on_train_end works without an attached
# model and that its constructor expects this path.
onnx_path = os.path.join(drive_path, 'model.onnx')
model2onnx = Model2onnx(onnx_path)
model2onnx.on_train_end(None)
print(f"Model saved in ONNX format at {onnx_path}")



In [None]:
!pip install flask-ngrok

In [None]:
!ngrok

In [None]:
!pip install flask pyngrok

In [None]:
from flask import Flask
from pyngrok import ngrok

from flask import Flask, request, jsonify, render_template
from PIL import Image
from transformers import AutoTokenizer, VisionEncoderDecoderModel, GPT2Tokenizer, TFGPT2Model
import tensorflow as tf
from flask_ngrok import run_with_ngrok



from torchvision import transforms
import torch
import cv2
import numpy as np
import io
import tempfile
import os
import google.generativeai as genai

# Read the Gemini API key from the environment so the secret is never stored
# in the notebook; fail early with a clear message when it is missing.
_gemini_api_key = os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before starting the app")
genai.configure(api_key=_gemini_api_key)
modelg = genai.GenerativeModel("gemini-1.5-flash")
# response = model.generate_content("Explain how AI works")
# print(response.text)
# Initialize Flask app
app = Flask(__name__)
run_with_ngrok(app)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
# Load GPT-2 tokenizer and model
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")
gpt2_model = TFGPT2Model.from_pretrained("gpt2-large")
# Ensure the device is set
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define preprocessing for the image
preprocess = transforms.Compose([
    transforms.Resize((384, 384)),  # Resize image to expected dimensions
    transforms.ToTensor(),  # Convert PIL Image to PyTorch tensor
    transforms.Normalize(mean=[0.5], std=[0.5])  # Normalize to match model expectations
])

# Define the prediction function
def predict_text(image):
    """Run the OCR model on one PIL image and return the decoded text."""
    # Preprocess, add the batch dimension and move to the model's device.
    batch = preprocess(image).unsqueeze(0).to(device)
    generated_ids = model.generate(batch)
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

def extract_sentences(image_path):
    """Split a page image into per-line crops, ordered top to bottom.

    Args:
        image_path: Path of the image file to segment.

    Returns:
        list: RGB ``numpy`` arrays, one per detected text line.

    Raises:
        ValueError: If the image cannot be loaded.
    """
    # Read the image with 3 channels
    loaded = cv2.imread(image_path, cv2.IMREAD_COLOR)

    # Check if the image was loaded correctly
    if loaded is None:
        raise ValueError(f"Unable to load image at {image_path}")

    # OpenCV decodes to BGR, while callers wrap the crops with
    # Image.fromarray, which interprets 3-channel arrays as RGB. Convert once
    # here so the returned crops have the channel order PIL expects.
    image = cv2.cvtColor(loaded, cv2.COLOR_BGR2RGB)

    # Resize image to standardize dimensions (scaling for consistency)
    height, width = image.shape[:2]
    scaling_factor = 1000 / width  # Scale width to 1000 pixels
    image = cv2.resize(image, (int(width * scaling_factor), int(height * scaling_factor)))

    # Convert image to grayscale for better processing
    gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

    # Apply GaussianBlur to reduce noise
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)

    # Use adaptive thresholding for more robust binarization
    binary = cv2.adaptiveThreshold(
        blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2
    )

    # Wide, 1-pixel-tall kernel: closing with it merges characters on the
    # same row into one connected line region
    kernel_width = max(50, int(image.shape[1] * 0.05))
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_width, 1))

    # Use morphological operations to detect text lines
    detected_lines = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)

    # Find contours of the lines
    contours, _ = cv2.findContours(detected_lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Sort contours top-to-bottom
    contours = sorted(contours, key=lambda ctr: cv2.boundingRect(ctr)[1])

    sentence_images = []  # Crops of each accepted line

    # Loop through each contour and extract sentences (lines of text)
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)

        # Filter out small boxes to avoid noise
        if h > 10 and w > 50:  # Adjust these thresholds as needed
            sentence_images.append(image[y:y+h, x:x+w])

    return sentence_images
# Function to process multi-sentence input and return result
def process_multisentence_image(image):
    """Return the recognized text of every line found in ``image``.

    ``image`` is passed straight to ``extract_sentences``; each crop is
    wrapped as a PIL image and sent through ``predict_text``.
    """
    line_crops = extract_sentences(image)
    return [predict_text(Image.fromarray(line_crop)) for line_crop in line_crops]


# Define GPT-2 processing function
def process_with_gpt2(text):
    """Tokenize ``text`` and return the raw GPT-2 model output."""
    tokens = gpt2_tokenizer(text, return_tensors='tf')
    # Raw model output for now (can refine based on use case)
    return gpt2_model(tokens)

# Flask route to process GPT-2 text input with custom prompt
@app.route('/process_gpt2', methods=['POST'])
def process_gpt2():
    """Run the posted ``text`` through GPT-2 and return the stringified output.

    Expects a JSON object with a ``text`` key. Responds with 400 when the body
    is missing, not JSON, or lacks ``text``; with 500 when processing fails.
    """
    # silent=True yields None for a missing/invalid JSON body instead of
    # aborting the request, so the handler's own 400 response is returned.
    data = request.get_json(silent=True)
    # Only a JSON object can carry the "text" key; reject lists, strings, etc.
    if not isinstance(data, dict) or 'text' not in data:
        return jsonify({'error': 'No text input provided'}), 400

    text = data['text']
    try:
        # Process the text with GPT-2
        gpt2_output = process_with_gpt2(text)
        return jsonify({'gpt2_output': str(gpt2_output)})
    except Exception as e:
        return jsonify({'error': str(e)}), 500

@app.route('/')
def index():
    """Serve the landing page from the ``index.htm`` template."""
    # Render HTML form for file upload and text input
    return render_template('index.htm')


@app.route('/extract_text', methods=['POST'])
def extract_text():
    """OCR an uploaded image line by line and ask Gemini to grade the answer.

    Expects a multipart form with an ``image`` file and an optional
    ``question`` field. Responds with 400 when the image part is missing or
    unnamed, 500 when processing fails, and otherwise a JSON object with
    ``predictions`` (the joined OCR text) and ``gpt2_output`` (the text
    returned by the Gemini model).
    """
    if 'image' not in request.files:
        return jsonify({'error': 'No image part'}), 400

    file = request.files['image']
    if file.filename == '':
        return jsonify({'error': 'No selected file'}), 400

    # Retrieve the custom question from the form
    question = request.form.get('question', '')

    # Reserve a temporary path and release our own handle before anything
    # else opens it; writing to a still-open temp file fails on platforms
    # that lock open files.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        file_path = temp_file.name

    try:
        # Saving inside the try block guarantees the temp file is cleaned up
        # even when the upload cannot be written.
        file.save(file_path)

        # Process the image to extract sentences and predict text
        sentence_images = extract_sentences(file_path)
        results = [predict_text(Image.fromarray(sentence)) for sentence in sentence_images]
        print(results)

        # Join the extracted text and create the grading prompt
        extracted_text_joined = " ".join(results)
        prompt = f"{question} if this is a question and this is an answer {extracted_text_joined} then how much will you give mark"
        print(prompt)
        output_text = modelg.generate_content(prompt)
        print(output_text.text)

        return jsonify({
            'predictions': extracted_text_joined,
            'gpt2_output': output_text.text
        })

    except Exception as e:
        return jsonify({'error': str(e)}), 500

    finally:
        # Clean up temporary file
        if os.path.exists(file_path):
            try:
                os.remove(file_path)
            except PermissionError:
                print(f"Could not delete {file_path} because it is in use.")



if __name__ == '__main__':
    # Read the ngrok auth token from the environment so the secret is never
    # stored in the notebook; fail early with a clear message when it is missing.
    ngrok_token = os.environ.get("NGROK_AUTHTOKEN")
    if not ngrok_token:
        raise RuntimeError("Set the NGROK_AUTHTOKEN environment variable before starting the tunnel")
    ngrok.set_auth_token(ngrok_token)
    # NOTE(review): the app is also wrapped with run_with_ngrok earlier in
    # this cell — confirm that opening a second tunnel here is intended.
    ngrok_tunnel = ngrok.connect(5000)
    print('Public URL:', ngrok_tunnel.public_url)
    app.run()