<a href="https://colab.research.google.com/github/TheSkyAboveTheSky/Projet-PFA/blob/main/PFA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Import**

In [41]:
import os
import cv2
import tensorflow as tf
from tensorflow import keras
import csv
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

# **Data**

In [42]:
def removeImages(data_dir):
    """Delete every regular file located directly inside ``data_dir``.

    Sub-directories, and whatever they contain, are left untouched.

    Args:
        data_dir: Path of the folder to empty.
    """
    # The directory listing is taken once, before anything is removed.
    entry_paths = (
        os.path.join(data_dir, entry_name) for entry_name in os.listdir(data_dir)
    )
    for entry_path in entry_paths:
        # Anything that is not a plain file (e.g. a nested folder) is kept.
        if not os.path.isfile(entry_path):
            continue
        os.remove(entry_path)

# Data folder path
data_dir = Path("./data/")

# Collect the PNG paths ordered by the number embedded in each file name, so
# that e.g. "2.png" comes before "10.png". Every file name must contain at
# least one digit, otherwise int("") raises ValueError.
images = [
    str(png_path)
    for png_path in sorted(
        data_dir.glob("*.png"),
        key=lambda png_path: int("".join(filter(str.isdigit, png_path.stem))),
    )
]

# Read labels from the csv file. newline="" is what the csv module requires
# for files handed to csv.reader; the explicit encoding keeps the result
# independent of the platform's default locale.
with open('label.csv', 'r', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    # Skip the column names; the default avoids StopIteration on an empty file.
    next(reader, None)
    # Each label is kept as the full csv row (a list of strings).
    labels = list(reader)

# Get the sorted list of all unique characters that occur in any label field
characters = sorted({char for row in labels for field in row for char in field})

print("Number of images found:", len(images))
print("Number of labels found:", len(labels))
print("Number of unique characters:", len(characters))
print("Characters present:", characters)


Number of images found: 10
Number of labels found: 115
Number of unique characters: 64
Characters present: [' ', '#', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'V', 'W', 'Y', 'Z', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


# **Define Our Variables**

In [37]:
# Target image size in pixels (width, height).
img_width, img_height = 600, 200

# Not referenced in the cells visible here; presumably an upper bound on the
# label length used further down (TODO confirm).
max_length = 50

# Number of samples per batch.
batch_size = 1

# Height x width x 3 channels.
input_shape = (img_height, img_width, 3)

# One class per distinct label character plus one extra; presumably the
# blank / out-of-vocabulary slot (TODO confirm against the model definition).
num_classes = len(characters) + 1

# **Transform characters into numbers and numbers into characters**

In [38]:
# Character -> integer index, built from the characters found in the labels.
# mask_token=None means no mask entry is reserved in the vocabulary.
char_to_num = keras.layers.StringLookup(vocabulary=characters, mask_token=None)

# Integer index -> character: the same vocabulary, looked up in reverse.
num_to_char = keras.layers.StringLookup(
    invert=True,
    mask_token=None,
    vocabulary=char_to_num.get_vocabulary(),
)

# **Split Data**

In [45]:
def split_data(images, labels, train_size, shuffle=True):
    """Split images and labels into a training part and a validation part.

    Only the length of ``images`` determines the dataset size; ``labels`` is
    indexed with the same positions.

    Args:
        images: Samples; the first dimension defines the dataset size.
        labels: Labels, gathered with the same indices as ``images``.
        train_size: Fraction of the samples assigned to the training part.
        shuffle: When true, the sample order is randomised before splitting.

    Returns:
        ``(x_train, x_valid, y_train, y_valid)``. Both validation entries are
        ``None`` when no sample is left over for validation.
    """
    total = tf.shape(images)[0]
    order = tf.range(total)
    if shuffle:
        order = tf.random.shuffle(order)

    # The float -> int cast truncates, so a fractional sample count rounds down.
    n_train = tf.cast(tf.cast(total, tf.float32) * train_size, tf.int32)
    train_idx = order[:n_train]
    x_train = tf.gather(images, train_idx)
    y_train = tf.gather(labels, train_idx)

    n_valid = total - n_train
    if not n_valid > 0:
        return x_train, None, y_train, None

    # Everything after the training slice belongs to the validation part.
    valid_idx = order[n_train:]
    x_valid = tf.gather(images, valid_idx)
    y_valid = tf.gather(labels, valid_idx)
    return x_train, x_valid, y_train, y_valid
# Use 99% of the samples for training; whatever remains is the validation part.
x_train, x_valid, y_train, y_valid = split_data(
    images, labels, train_size=0.99
)

# **Create Datasets**

In [48]:
def encode_single_sample(img_path, label):
    """Load one image from disk and encode its label as integer indices.

    Args:
        img_path: Path of the PNG file to read.
        label: String tensor holding the label text for that image.

    Returns:
        A dict with the float32 image resized to ``(img_height, img_width)``
        under ``"image"`` and the flattened 1-D encoded label under
        ``"label"``.
    """
    raw_bytes = tf.io.read_file(img_path)
    # Decode as a 3-channel image, then convert the pixels to float32.
    pixels = tf.image.convert_image_dtype(
        tf.image.decode_png(raw_bytes, channels=3), tf.float32
    )
    pixels = tf.image.resize(pixels, [img_height, img_width])

    # Break the label into single characters and map each one to its index.
    split_chars = tf.strings.unicode_split(label, input_encoding="UTF-8")
    encoded = tf.reshape(char_to_num(split_chars), shape=(-1,))
    encoded = tf.ensure_shape(encoded, (None,))
    return {"image": pixels, "label": encoded}

def _build_dataset(image_paths, image_labels):
    """Return a batched, prefetching dataset of encoded (image, label) samples.

    Args:
        image_paths: Image file paths, one per sample.
        image_labels: Labels, index-aligned with ``image_paths``.

    Returns:
        A ``tf.data.Dataset`` whose elements are the dicts produced by
        ``encode_single_sample``, grouped into batches of ``batch_size``.
    """
    return (
        tf.data.Dataset.from_tensor_slices((image_paths, image_labels))
        .map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)
        # group the samples into batches of our batch_size
        .batch(batch_size)
        .prefetch(buffer_size=tf.data.AUTOTUNE)
    )


# create the datasets
train_dataset = _build_dataset(x_train, y_train)
# split_data returns None for the validation part when no sample is left over;
# in that case there is nothing to build a dataset from.
validation_dataset = (
    _build_dataset(x_valid, y_valid) if x_valid is not None else None
)
