# Models for plate characters recognition

In [1]:
from sklearn import svm
import numpy as np
import sklearn
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn import neighbors
from sklearn import tree
import os
import pandas as pd
from xgboost import XGBClassifier

## General utils

In [4]:
class Utils(object):

    def func_overX(self, X):
        Y = []
        for element in X:
            Y.append(sum(element.flatten()) > element.flatten().shape[0] // 2)
        return np.asarray(Y)

    def testing(self):

        ##### SVM Testing #####
        X = np.random.rand(200, 60, 30)
        Y = func_overX(X)

        ### GridSearch ###
        model = SVMCHandler(X, Y)
        model.fit(with_score=True, with_grid=True)

    def _ensure_dimensionalit(self, arr):
        return arr if len(arr[0].shape) == 1 else [x.flatten() for x in arr]

    def _acc(self, y_pred, y_target):

        if type(y_pred) == np.array and type(y_target) == np.array:
            assert(y_pred.shape == y_target.shape)
            mask = y_pred == y_target

        else:
            assert(len(y_pred) == len(y_target))
            mask = [x == y for x, y in zip(y_pred, y_target)]
        return sum(mask)/len(mask)

    def do_scaling(self, X):
        Scaler = StandardScaler()
        return Scaler.fit_transform(X)

    def _gen_gridSearch(self, model, hyperparams, n_splits=5):
        cv = StratifiedShuffleSplit(
            n_splits=n_splits, test_size=0.2, random_state=42)
        grid = GridSearchCV(model, param_grid=hyperparams,
                            cv=cv, n_jobs=6, verbose=3)
        return grid

    def df_Grid(self):
        if self.grid_flag:
            c = self.grid.__dict__['cv_results_']['params']
            a = ['params'] + \
                [f'split{n}_test_score' for n in range(self.n_splits)]
            dic = {h: i for h, i in zip(
                a, (c, *[self.grid.__dict__['cv_results_'][f'split{n}_test_score'] for n in range(self.n_splits)]))}
            return pd.DataFrame(dic)
        else:
            print('Grid has not been calculated')

# Models

## SVM

In [5]:
class SVMCHandler(Utils):

    def __init__(self, X, Y, **kwargs):
        super().__init__()
        hyperparams = {
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'C': np.logspace(-2, 10, 5),
            'gamma': np.logspace(-9, 3, 5),
        }
        self.n_splits = kwargs['n_splits'] if 'n_splits' in kwargs else 5
        if 'n_splits' in kwargs:
            del kwargs['n_splits']
        self.model = svm.SVC(**kwargs)
        self.X = self.do_scaling(self._ensure_dimensionalit(X))
        self.Y = Y
        assert(len(Y.shape) == 1)
        self.grid = self._gen_gridSearch(
            self.model, hyperparams, self.n_splits)
        self.grid_flag = False

    def fit(self, with_score=True, with_grid=True):
        if with_grid:
            self.grid.fit(self.X, self.Y)
            print(
                f"The best parameters are {self.grid.best_params_} and the best score is {self.grid.best_score_}")
            self.model = self.model.__class__(**self.grid.best_params_)
            self.model.fit(self.X, self.Y)
            self.grid_flag = True
        else:
            self.model.fit(self.X, self.Y)
        if with_score:
            pred = self.predict(self.X)
            print(f"Train acc  is : {self._acc(pred, self.Y)}")

    def predict(self, X):
        X = self._ensure_dimensionalit(X)
        return self.model.predict(X)

## KNN model

In [6]:
class KNNhandler(Utils):

    def __init__(self, X, Y, **kwargs):
        super(Utils, self).__init__()
        hyperparams = {
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'n_neighbors': np.arange(3, 10, 2),
            'p': np.arange(1, 3),
        }
        self.n_splits = kwargs['n_splits'] if 'n_splits' in kwargs else 5
        if 'n_splits' in kwargs:
            del kwargs['n_splits']
        self.model = neighbors.KNeighborsClassifier(**kwargs)
        self.X = self.do_scaling(self._ensure_dimensionalit(X))
        self.Y = Y
        assert(len(Y.shape) == 1)
        self.grid = self._gen_gridSearch(
            self.model, hyperparams, self.n_splits)
        self.grid_flag = False

    def fit(self, with_score=True, with_grid=True):
        if with_grid:
            self.grid.fit(self.X, self.Y)
            print(
                f"The best parameters are {self.grid.best_params_} and the best score is {self.grid.best_score_}")
            self.model = self.model.__class__(**self.grid.best_params_)
            self.model.fit(self.X, self.Y)
            self.grid_flag = True
        else:
            self.model.fit(self.X, self.Y)
        if with_score:
            pred = self.predict(self.X)
            print(f"Train acc  is : {self._acc(pred, self.Y)}")

    def predict(self, X):
        X = self._ensure_dimensionalit(X)
        return self.model.predict(X)

# Decision tree

In [7]:
class DTCHandler(Utils):

    def __init__(self, X, Y, **kwargs):
        super(Utils, self).__init__()
        hyperparams = {
            'criterion': ['gini', 'entropy'],
            'max_features': ['auto', 'sqrt', 'log2'],
            'splitter': ['best', 'random'],
        }
        self.n_splits = kwargs['n_splits'] if 'n_splits' in kwargs else 5
        if 'n_splits' in kwargs:
            del kwargs['n_splits']
        self.model = tree.DecisionTreeClassifier(**kwargs)
        self.X = self.do_scaling(self._ensure_dimensionalit(X))
        self.Y = Y
        assert(len(Y.shape) == 1)
        self.grid = self._gen_gridSearch(
            self.model, hyperparams, self.n_splits)
        self.grid_flag = False

    def fit(self, with_score=True, with_grid=True):
        if with_grid:
            self.grid.fit(self.X, self.Y)
            print(
                f"The best parameters are {self.grid.best_params_} and the best score is {self.grid.best_score_}")
            self.model = self.model.__class__(**self.grid.best_params_)
            self.model.fit(self.X, self.Y)
            self.grid_flag = True

        else:
            self.model.fit(self.X, self.Y)
        if with_score:
            pred = self.predict(self.X)
            print(f"Train acc  is : {self._acc(pred, self.Y)}")

    def predict(self, X):
        X = self._ensure_dimensionalit(X)
        return self.model.predict(X)

## XGBoost

In [8]:
class XGBHandler(Utils):

    def __init__(self, X, Y, **kwargs):
        super(Utils,self).__init__()
        hyperparams = {
            'min_child_weight': [1, 5, 10],
            'gamma': [0.5, 1, 1.5, 2, 5],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0],
            'max_depth': [3, 4, 5]
        }
        self.n_splits  = kwargs['n_splits'] if 'n_splits' in kwargs else 5
        if 'n_splits' in kwargs: del kwargs['n_splits']
        self.model = XGBClassifier(**kwargs)
        self.X = self.do_scaling(self._ensure_dimensionalit(X))
        self.Y = Y
        assert(len(Y.shape) ==1 )
        self.grid = self._gen_gridSearch(self.model, hyperparams, self.n_splits)
        self.grid_flag= False
    
    def fit(self, with_score = True, with_grid=True):
        if with_grid:
            self.grid.fit(self.X, self.Y)
            print(f"The best parameters are {self.grid.best_params_} and the best score is {self.grid.best_score_}")
            self.model = self.model.__class__(**self.grid.best_params_)
            self.model.fit(self.X,self.Y)
            self.grid_flag = True

        else : 
            self.model.fit(self.X, self.Y )
        if with_score:
            pred = self.predict(self.X)
            print(f"Train acc  is : {self._acc(pred, self.Y)}")

    def predict(self, X):
        X = self._ensure_dimensionalit(X)
        return self.model.predict(X)


# Test

In [9]:
from detector import PlateLocator
def get_data(path: str) -> list:
    """Returns an iterator with paths to each image to be detected"""

    imgs = os.listdir(path)
    return sorted([f"{path}/{img}" for img in imgs])

In [10]:
locator = PlateLocator()
dataset, names = locator.get_dataset(get_data('data'), 56)
idx_to_letters = {x: y for x, y in enumerate('WLGVJNBX7853MSDH6C421R90YTFK')}
X_data = []
y_data = []
n_Test = 10
i = 0
for element,name in zip(dataset[:len(dataset) - n_Test],names):
    name_ = name.split('/')[1].replace('.jpg','')
    for detection in element:
        if len(detection) == 7:    
            for image,letter in zip(detection, name_) :
                if letter == 'P':                   
                    X_data.append(image)
                    y_data.append(letter)
                X_data.append(image)
                y_data.append(letter)

  0%|          | 0/56 [00:00<?, ?it/s]

[INFO] Loaded 56 images
[INFO] Located 87 plates
[INFO] Segmented 633 characters


In [11]:
# SVC
svc = SVMCHandler(np.array(X_data), np.array(y_data), n_splits = 3)
svc.fit(True, True)
svc.df_Grid()

Fitting 3 folds for each of 100 candidates, totalling 300 fits
The best parameters are {'C': 10.0, 'gamma': 0.001, 'kernel': 'sigmoid'} and the best score is 0.9753086419753085
Train acc  is : 0.9875311720698254


Unnamed: 0,params,split0_test_score,split1_test_score,split2_test_score
0,"{'C': 0.01, 'gamma': 1e-09, 'kernel': 'linear'}",0.962963,0.987654,0.962963
1,"{'C': 0.01, 'gamma': 1e-09, 'kernel': 'poly'}",0.086420,0.086420,0.086420
2,"{'C': 0.01, 'gamma': 1e-09, 'kernel': 'rbf'}",0.086420,0.086420,0.086420
3,"{'C': 0.01, 'gamma': 1e-09, 'kernel': 'sigmoid'}",0.086420,0.086420,0.086420
4,"{'C': 0.01, 'gamma': 1e-06, 'kernel': 'linear'}",0.962963,0.987654,0.962963
...,...,...,...,...
95,"{'C': 10000000000.0, 'gamma': 1.0, 'kernel': '...",0.407407,0.407407,0.283951
96,"{'C': 10000000000.0, 'gamma': 1000.0, 'kernel'...",0.962963,0.987654,0.962963
97,"{'C': 10000000000.0, 'gamma': 1000.0, 'kernel'...",0.876543,0.950617,0.901235
98,"{'C': 10000000000.0, 'gamma': 1000.0, 'kernel'...",0.543210,0.506173,0.518519


In [10]:
# KNN
knn = KNNhandler(np.array(X_data), np.array(y_data), n_splits = 3)
knn.fit(True, True)
knn.df_Grid()

Fitting 3 folds for each of 32 candidates, totalling 96 fits
The best parameters are {'algorithm': 'auto', 'n_neighbors': 3, 'p': 1} and the best score is 0.9382716049382717
Train acc  is : 0.972568578553616


Unnamed: 0,params,split0_test_score,split1_test_score,split2_test_score
0,"{'algorithm': 'auto', 'n_neighbors': 3, 'p': 1}",0.938272,0.938272,0.938272
1,"{'algorithm': 'auto', 'n_neighbors': 3, 'p': 2}",0.938272,0.950617,0.925926
2,"{'algorithm': 'auto', 'n_neighbors': 5, 'p': 1}",0.925926,0.950617,0.91358
3,"{'algorithm': 'auto', 'n_neighbors': 5, 'p': 2}",0.925926,0.950617,0.901235
4,"{'algorithm': 'auto', 'n_neighbors': 7, 'p': 1}",0.901235,0.962963,0.901235
5,"{'algorithm': 'auto', 'n_neighbors': 7, 'p': 2}",0.901235,0.925926,0.901235
6,"{'algorithm': 'auto', 'n_neighbors': 9, 'p': 1}",0.91358,0.938272,0.888889
7,"{'algorithm': 'auto', 'n_neighbors': 9, 'p': 2}",0.901235,0.901235,0.888889
8,"{'algorithm': 'ball_tree', 'n_neighbors': 3, '...",0.938272,0.938272,0.938272
9,"{'algorithm': 'ball_tree', 'n_neighbors': 3, '...",0.938272,0.950617,0.925926


In [131]:
# DTC
dtc = DTCHandler(np.array(X_data), np.array(y_data), n_splits = 3)
dtc.fit(True, True)
dtc.df_Grid()

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


The best parameters are {'criterion': 'gini', 'max_features': 'sqrt', 'splitter': 'random'} and the best score is 0.9254901960784313
Train acc  is : 1.0


[Parallel(n_jobs=6)]: Done  36 out of  36 | elapsed:    0.2s finished


Unnamed: 0,params,split0_test_score,split1_test_score,split2_test_score
0,"{'criterion': 'gini', 'max_features': 'auto', ...",0.929412,0.882353,0.847059
1,"{'criterion': 'gini', 'max_features': 'auto', ...",0.882353,0.905882,0.882353
2,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.870588,0.905882,0.894118
3,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.905882,0.952941,0.917647
4,"{'criterion': 'gini', 'max_features': 'log2', ...",0.917647,0.823529,0.917647
5,"{'criterion': 'gini', 'max_features': 'log2', ...",0.870588,0.811765,0.917647
6,"{'criterion': 'entropy', 'max_features': 'auto...",0.882353,0.882353,0.858824
7,"{'criterion': 'entropy', 'max_features': 'auto...",0.905882,0.905882,0.847059
8,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.882353,0.941176,0.905882
9,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.870588,0.929412,0.894118


## XGBoost

In [138]:
xgb = XGBHandler(np.array(X_data), np.array(y_data), n_splits = 3)
xgb.fit(True,True)
xgb.pd_Grid()

Fitting 3 folds for each of 405 candidates, totalling 1215 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:  6.1min


KeyboardInterrupt: 