# Handwritten Recognition

## Initialization

### Imports

In [33]:
import os
import numpy as np

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from build_db import build_db
from data_augmentation import data_augmentation
from labelisation import labelize_data
from utils.extract_features import features_extraction

### Global variables

In [34]:
# all the variable related to the path.
# to read the csv for example.

# path of the current folder.
SCRIPT_DIR = os.getcwd()
# path of the parent folder.
PARENT_DIR = os.path.dirname(SCRIPT_DIR)
DATA_FOLDER = os.path.join(PARENT_DIR,  'data')
CSV_LABEL_PATH = os.path.join(DATA_FOLDER, "labels.csv")
MATRIX_FOLDER = os.path.join(DATA_FOLDER, "matrix")
RESULT_FOLDER = os.path.join(DATA_FOLDER, "results")

## Part I : DB build 

In [6]:
# Create the DB from the raw data
# remove the actuals files in the DB.
# create new DB content from the raw data.
build_db()

Files from c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\characters/a removed succefully
Files from c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\characters/b removed succefully
Files from c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\characters/c removed succefully
Files from c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\characters/d removed succefully
Files from c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\characters/e removed succefully
Files from c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\characters/f removed succefully
Files from c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\characters/g removed succefully
Files from c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\characters/h removed succefully


KeyboardInterrupt: 

**Note :** Clean the database manually can be useful. Especially to remove bad data, and have a clean database of handwritten letters.

### DB augmentation

In [None]:
# DB augmentation
# augment the DB size using rotation, blur and other transformations on current images.
data_augmentation()
# store the data information into a csv file : useful for sampling 
labelize_data()

Début de l'augmentation des données...
Augmentation done for letter : a
Augmentation done for letter : b
Augmentation done for letter : c
Augmentation done for letter : d
Augmentation done for letter : e
Augmentation done for letter : f
Augmentation done for letter : g
Augmentation done for letter : h
Augmentation done for letter : i
Augmentation done for letter : j
Augmentation done for letter : k
Augmentation done for letter : l
Augmentation done for letter : m
Augmentation done for letter : n
Augmentation done for letter : o
Augmentation done for letter : p
Augmentation done for letter : q
Augmentation done for letter : r
Augmentation done for letter : s
Augmentation done for letter : t
Augmentation done for letter : u
Augmentation done for letter : v
Augmentation done for letter : w
Augmentation done for letter : x
Augmentation done for letter : y
Augmentation done for letter : z
Terminé ! Chaque lettre a maintenant environ 498 exemples.
CSV created


## Part II : Features extraction

In [35]:
# read the csv file
data_filepath = pd.read_csv(CSV_LABEL_PATH)

### Features extraction

In [None]:
# choose the method of features extraction and apply it.
methods = ["HOG", "HU", "GEOMETRIC"]

for method in methods:
    features_extraction(df_labels=data_filepath, method=method)


The df is complete (data + labels) : True
c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\characters/a\char_100.png
c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\characters/a\char_100_aug_0.png
c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\characters/a\char_100_aug_1.png
c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\characters/a\char_100_aug_2.png
c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\characters/a\char_100_aug_3.png
c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\characters/a\char_100_aug_4.png
c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\characters/a\char_101.png
c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\characters/a\char_101_aug_0.png
c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handw

## Part III : Classifier preparation

### Model choice and training

In [42]:
## Imports 
# classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Utils
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

**Utils**

In [43]:
# --------------------------------------------------
# Stratified cross-validation
# --------------------------------------------------
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42
)


SCALER = StandardScaler()

# Cross-validated accuracy
def cross_validation_report(classifier, classifier_name, method, X_train, y_train, y_test, y_pred):
    cv_scores = cross_val_score(classifier, X_train, y_train, cv=cv)

    # save the results
    with open (f"{RESULT_FOLDER}/cv_report_{classifier_name}_{method}.txt", "w") as f:
        f.write("Cross-validated accuracy scores: " + str(cv_scores) + "\n")
        f.write("Mean accuracy: " + str(np.mean(cv_scores)) + "\n\n")
        f.write("Classification report (CV):\n")
        f.write(classification_report(y_test, y_pred))


# To plot the confusion matrix
def confusion_matrix_construction(classifier_name, method, y_pred, y_test):
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.unique(y_test))
    disp.plot(cmap=plt.cm.Blues, values_format='d')
    plt.title(f"Matrice de confusion - {classifier_name} - {method}")
    plt.savefig(f"{MATRIX_FOLDER}/matrix_{classifier_name}_{method}.png", dpi=300, bbox_inches='tight')  # Save in PNG

#### Random Forest

In [44]:
# --------------------------------------------------
# Random Forest classifier
# --------------------------------------------------
rf_clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_leaf=2,
    class_weight="balanced",
    random_state=42
)

#### KNN

In [45]:
# --------------------------------------------------
# k-NN classifier
# --------------------------------------------------
knn_clf = KNeighborsClassifier(
    n_neighbors=3,        # small value for small dataset
    metric='euclidean',   # standard distance
    weights='distance'    # closer neighbors matter more
)

#### SVM

In [46]:
# --------------------------------------------------
# SVM classifier
# --------------------------------------------------
svm_clf = SVC(
    kernel='rbf', 
    C=1.0, 
    gamma='scale')  # Base parameters

## PART IV : Main loop - Classifier evaluation

In [None]:
def model_evaluation(method):
    print(f"--- Evaluation for method: {method} ---")

    ### Data loading
    DATA_DIR = os.path.join(DATA_FOLDER, "features", method)
    X = np.load(os.path.join(DATA_DIR, f"features_{method}.npy"))
    y = np.load(os.path.join(DATA_DIR, f"labels_{method}.npy"), allow_pickle=True)


    ### Normalisation
    X_scaled = SCALER.fit_transform(X)


    ### Sampling
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, random_state=42, stratify=y
    )


    ### Evaluation
    classifiers = {
        "Random Forest": rf_clf,
        "k-NN": knn_clf,
        "SVM": svm_clf
    }

    for name, clf in classifiers.items():
        print(f"Evaluating {name} classifier with {method} features.")
        # Train the model on the entire training set
        clf.fit(X_train, y_train)

        # Predict on the test set
        y_pred = clf.predict(X_test)

        # Save results for evaluation        
        cross_validation_report(classifier=clf, classifier_name=name, method=method, X_train=X_train, y_train=y_train, y_test=y_test, y_pred=y_pred)
        confusion_matrix_construction(classifier_name=name, method=method, y_pred=y_pred, y_test=y_test)

In [None]:
for method in methods:
    model_evaluation(method=method)

### Model Choice


**Mean accuracy** table : 

|       |   HOG   |   HU    |GEOMETRIC| Eighteen |
|:------|:-------:|:-------:|:-------:|:--------:|
|**RF** |0.934    |0.238    |0.520    |0.536     |
|**KNN**|0.980    |0.192    |0.489    |0.482     | 
|**SVM**|0.982    |0.222    |0.541    |0.550     |

Based on the previous table, the best method of features extraction is, without any doubt, the HOG method.

Regarding the model choice, Random Forest is less efficient than other models. The two other models have approximately the same efficacy, based on the mean accuracy table and the confusion matrix.

## Part V : Model application

### Intialization

In [47]:
# Optional : re-train the best model on the entire dataset and predict new data
# Here we assume SVM with HOG features is the best model
method = "HOG"
DATA_DIR = os.path.join(DATA_FOLDER, "features", method)
X = np.load(os.path.join(DATA_DIR, f"features_{method}.npy"))
y = np.load(os.path.join(DATA_DIR, f"labels_{method}.npy"), allow_pickle=True)

X_scaled = SCALER.fit_transform(X)

### Training of the model

In [48]:
svm_clf.fit(X, y)

0,1,2
,"C  C: float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. For an intuitive visualization of the effects of scaling the regularization parameter C, see :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.",1.0
,"kernel  kernel: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf' Specifies the kernel type to be used in the algorithm. If none is given, 'rbf' will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape ``(n_samples, n_samples)``. For an intuitive visualization of different kernel types see :ref:`sphx_glr_auto_examples_svm_plot_svm_kernels.py`.",'rbf'
,"degree  degree: int, default=3 Degree of the polynomial kernel function ('poly'). Must be non-negative. Ignored by all other kernels.",3
,"gamma  gamma: {'scale', 'auto'} or float, default='scale' Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - if ``gamma='scale'`` (default) is passed then it uses  1 / (n_features * X.var()) as value of gamma, - if 'auto', uses 1 / n_features - if float, must be non-negative. .. versionchanged:: 0.22  The default value of ``gamma`` changed from 'auto' to 'scale'.",'scale'
,"coef0  coef0: float, default=0.0 Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'.",0.0
,"shrinking  shrinking: bool, default=True Whether to use the shrinking heuristic. See the :ref:`User Guide `.",True
,"probability  probability: bool, default=False Whether to enable probability estimates. This must be enabled prior to calling `fit`, will slow down that method as it internally uses 5-fold cross-validation, and `predict_proba` may be inconsistent with `predict`. Read more in the :ref:`User Guide `.",False
,"tol  tol: float, default=1e-3 Tolerance for stopping criterion.",0.001
,"cache_size  cache_size: float, default=200 Specify the size of the kernel cache (in MB).",200
,"class_weight  class_weight: dict or 'balanced', default=None Set the parameter C of class i to class_weight[i]*C for SVC. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``.",


In [49]:
knn_clf.fit(X, y)

0,1,2
,"n_neighbors  n_neighbors: int, default=5 Number of neighbors to use by default for :meth:`kneighbors` queries.",3
,"weights  weights: {'uniform', 'distance'}, callable or None, default='uniform' Weight function used in prediction. Possible values: - 'uniform' : uniform weights. All points in each neighborhood  are weighted equally. - 'distance' : weight points by the inverse of their distance.  in this case, closer neighbors of a query point will have a  greater influence than neighbors which are further away. - [callable] : a user-defined function which accepts an  array of distances, and returns an array of the same shape  containing the weights. Refer to the example entitled :ref:`sphx_glr_auto_examples_neighbors_plot_classification.py` showing the impact of the `weights` parameter on the decision boundary.",'distance'
,"algorithm  algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto' Algorithm used to compute the nearest neighbors: - 'ball_tree' will use :class:`BallTree` - 'kd_tree' will use :class:`KDTree` - 'brute' will use a brute-force search. - 'auto' will attempt to decide the most appropriate algorithm  based on the values passed to :meth:`fit` method. Note: fitting on sparse input will override the setting of this parameter, using brute force.",'auto'
,"leaf_size  leaf_size: int, default=30 Leaf size passed to BallTree or KDTree. This can affect the speed of the construction and query, as well as the memory required to store the tree. The optimal value depends on the nature of the problem.",30
,"p  p: float, default=2 Power parameter for the Minkowski metric. When p = 1, this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. This parameter is expected to be positive.",2
,"metric  metric: str or callable, default='minkowski' Metric to use for distance computation. Default is ""minkowski"", which results in the standard Euclidean distance when p = 2. See the documentation of `scipy.spatial.distance `_ and the metrics listed in :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric values. If metric is ""precomputed"", X is assumed to be a distance matrix and must be square during fit. X may be a :term:`sparse graph`, in which case only ""nonzero"" elements may be considered neighbors. If metric is a callable function, it takes two arrays representing 1D vectors as inputs and must return one value indicating the distance between those vectors. This works for Scipy's metrics, but is less efficient than passing the metric name as a string.",'euclidean'
,"metric_params  metric_params: dict, default=None Additional keyword arguments for the metric function.",
,"n_jobs  n_jobs: int, default=None The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. Doesn't affect :meth:`fit` method.",


In [52]:
rf_clf.fit(X, y)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",300
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",10
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


### Main function

In [53]:
from new_data_preparation import prepare_new_data
from glob import glob

words_files = glob(os.path.join(DATA_FOLDER, "words", "*.jpg"))

def identify_words(classifier):
    results = []
    for file in words_files:
        file = os.path.basename(file)
        print(f"Processing file: {file}")
        ## The new_data have to follow the same pipeline as the training data
        # The file to process is supposed to be in the right folder : data/words/
        new_data = prepare_new_data(filename=file, features_extraction_method="HOG", prepare_new_data= True)

        # Normalisation
        X_new = SCALER.fit_transform(new_data)

        # Prediction
        predictions = classifier.predict(X_new)
        result = "".join(predictions)
        results.append((file, result))

    return results


results_knn = identify_words(knn_clf)
results_svm = identify_words(svm_clf)
results_rf = identify_words(rf_clf)

Processing file: lancegoat.jpg
Files from c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\words\characters removed succefully
This word contains 9 characters.
Extraction finished !

Dataframe created !

The df is complete (data + labels) : False
c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\words\characters\char_00.png
c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\words\characters\char_01.png
c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\words\characters\char_02.png
c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\words\characters\char_04.png
c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\words\characters\char_06.png
c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_handwritten_recognition\data\words\characters\char_07.png
c:\Users\tv68e\Documents\INSA_Lyon\Image_analysis\M1_h

### Results

In [56]:
print("Final results on new data for SVM:")
for file, result in results_svm:
    print(f"File: {file}, Result: {result}")    

print("\n")
print("Final results on new data for KNN:")
for file, result in results_knn:
    print(f"File: {file}, Result: {result}")

print("\n")
print("Final results on new data for Random Forest:")
for file, result in results_rf:
    print(f"File: {file}, Result: {result}")    

Final results on new data for SVM:
File: lancegoat.jpg, Result: fffffffff
File: results.jpg, Result: fffffff
File: xylophone.jpg, Result: fffffffff


Final results on new data for KNN:
File: lancegoat.jpg, Result: lancegoat
File: results.jpg, Result: resalts
File: xylophone.jpg, Result: xglophone


Final results on new data for Random Forest:
File: lancegoat.jpg, Result: famccgoaf
File: results.jpg, Result: rcsadus
File: xylophone.jpg, Result: xyyozhono


Here, the bests results are for the K-NN classifier.

We can suppose that, with a larger volume of data, the efficacy of the results would be better.

Regarding the SVM model, the results are surprising but I don't have any explaination about it.