In [12]:
import os
import numpy as np
from keras.preprocessing.image import load_img, img_to_array
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from skimage.feature import hog
from sympy import sympify
import cv2
import matplotlib.pyplot as plt
import joblib
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import joblib
from sklearn.ensemble import ExtraTreesClassifier

## Dataset Preparation

In [2]:
data_dir = "data"
img_size = 45

# Collect labels
symbol_labels = sorted([d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))])

# Label mappings
symbol_to_index = {label: idx for idx, label in enumerate(symbol_labels)}
index_to_label = {v: k for k, v in symbol_to_index.items()}

X_symbols, y_symbols = [], []

# Load dataset images
for label in symbol_labels:
    folder_path = os.path.join(data_dir, label)
    for file in os.listdir(folder_path):
        if file.endswith(".jpg"):
            try:
                img_path = os.path.join(folder_path, file)
                img = load_img(img_path, color_mode="grayscale", target_size=(img_size, img_size))
                img = img_to_array(img) / 255.0
                X_symbols.append(img)
                y_symbols.append(symbol_to_index[label])
            except Exception as e:
                print(f"Error loading {img_path}: {e}")

# Convert to numpy arrays
X_all = np.array(X_symbols)
y_all = np.array(y_symbols)

print("Dataset loaded:", X_all.shape, y_all.shape)
print("Classes:", index_to_label)



Dataset loaded: (186134, 45, 45, 1) (186134,)
Classes: {0: '(', 1: ')', 2: '+', 3: '-', 4: '0', 5: '1', 6: '2', 7: '3', 8: '4', 9: '5', 10: '6', 11: '7', 12: '8', 13: '9', 14: 'div', 15: 'times'}


## Training with train, test and validation

In [3]:
# Train + temp set split
X_train, X_temp, y_train, y_temp = train_test_split(
    X_all, y_all, test_size=0.3, random_state=42, stratify=y_all
)

# Validation + Test split (from temp)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print("Train:", X_train.shape, y_train.shape)
print("Val:", X_val.shape, y_val.shape)
print("Test:", X_test.shape, y_test.shape)

Train: (130293, 45, 45, 1) (130293,)
Val: (27920, 45, 45, 1) (27920,)
Test: (27921, 45, 45, 1) (27921,)


## Feature Extraction

In [4]:
def extract_hog_features(images):
    features = []
    for img in images:
        img = img.squeeze()
        hog_feat = hog(
            img,
            orientations=9,
            pixels_per_cell=(4, 4),
            cells_per_block=(2, 2),
            block_norm="L2-Hys"
        )
        features.append(hog_feat)
    return np.array(features)

X_train_hog = extract_hog_features(X_train)
X_val_hog   = extract_hog_features(X_val)
X_test_hog  = extract_hog_features(X_test)

## Model Training

In [5]:
# Train baseline ExtraTrees
et_hog = ExtraTreesClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)
et_hog.fit(X_train_hog, y_train)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,False


## Hyperparameter Tuning

In [7]:
y_pred = et_hog.predict(X_test_hog)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(
    y_test, y_pred, target_names=[index_to_label[i] for i in sorted(index_to_label)]
))

Test Accuracy: 0.9619999283693278

Classification Report:
               precision    recall  f1-score   support

           (       0.98      0.97      0.98      2144
           )       0.97      0.99      0.98      2154
           +       0.96      0.99      0.97      3767
           -       1.00      1.00      1.00      5100
           0       0.98      0.98      0.98      1037
           1       0.93      0.97      0.95      3978
           2       0.94      0.99      0.96      3921
           3       0.90      0.96      0.93      1637
           4       0.95      0.88      0.92      1110
           5       0.98      0.82      0.89       532
           6       0.96      0.89      0.93       468
           7       1.00      0.84      0.91       436
           8       1.00      0.81      0.89       460
           9       0.98      0.78      0.87       560
         div       1.00      0.60      0.75       130
       times       1.00      0.94      0.97       487

    accuracy         

In [6]:
param_dist = {
    "n_estimators": randint(100, 400),
    "max_depth": [None, 20, 40],
    "min_samples_split": randint(2, 10),
    "min_samples_leaf": randint(1, 5),
    "max_features": ["sqrt", "log2"]
}

search = RandomizedSearchCV(
    et_hog,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)
search.fit(X_val_hog, y_val)

et_hog = search.best_estimator_
print("Best params:", search.best_params_)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best params: {'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 149}


In [13]:
# Retrain ExtraTrees with the best hyperparams
et_final = ExtraTreesClassifier(
    n_estimators=et_hog.n_estimators,
    max_depth=et_hog.max_depth,
    min_samples_split=et_hog.min_samples_split,
    min_samples_leaf=et_hog.min_samples_leaf,
    max_features=et_hog.max_features,
    random_state=42,
    n_jobs=-1
)
et_final.fit(X_train_hog, y_train)

y_pred = et_final.predict(X_test_hog)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(
    y_test, y_pred, target_names=[index_to_label[i] for i in sorted(index_to_label)]
))

Test Accuracy: 0.9970273271014648

Classification Report:
               precision    recall  f1-score   support

           (       1.00      0.99      1.00      2144
           )       1.00      1.00      1.00      2154
           +       1.00      1.00      1.00      3767
           -       1.00      1.00      1.00      5100
           0       1.00      1.00      1.00      1037
           1       0.99      1.00      0.99      3978
           2       1.00      1.00      1.00      3921
           3       1.00      1.00      1.00      1637
           4       0.99      0.99      0.99      1110
           5       1.00      0.99      1.00       532
           6       1.00      1.00      1.00       468
           7       1.00      0.97      0.98       436
           8       1.00      0.99      0.99       460
           9       1.00      0.99      0.99       560
         div       1.00      0.98      0.99       130
       times       1.00      1.00      1.00       487

    accuracy         

## Evaluation

In [8]:
X_trainval = np.concatenate([X_train, X_val], axis=0)
y_trainval = np.concatenate([y_train, y_val], axis=0)

# Extract HOF features again for combined set
X_trainval_hog = extract_hog_features(X_trainval)
X_test_hog     = extract_hog_features(X_test)

et_final.fit(X_trainval_hog, y_trainval)

y_pred_test = et_final.predict(X_test_hog)

print("Final Test Accuracy:", accuracy_score(y_test, y_pred_test))
print("\nFinal Classification Report:\n", classification_report(
    y_test, y_pred_test,
    target_names=[index_to_label[i] for i in sorted(index_to_label)]
))

Final Test Accuracy: 0.9983883098742882

Final Classification Report:
               precision    recall  f1-score   support

           (       1.00      0.99      1.00      2144
           )       1.00      1.00      1.00      2154
           +       1.00      1.00      1.00      3767
           -       1.00      1.00      1.00      5100
           0       1.00      1.00      1.00      1037
           1       0.99      1.00      1.00      3978
           2       1.00      1.00      1.00      3921
           3       1.00      1.00      1.00      1637
           4       1.00      1.00      1.00      1110
           5       1.00      1.00      1.00       532
           6       1.00      1.00      1.00       468
           7       1.00      0.99      0.99       436
           8       1.00      0.99      0.99       460
           9       1.00      1.00      1.00       560
         div       1.00      1.00      1.00       130
       times       1.00      1.00      1.00       487

    accur

## Saving Final Model + Mappings

In [9]:
joblib.dump(et_final, "models/et_final_hog.pkl")

joblib.dump(index_to_label, "models/index_to_label.pkl")
joblib.dump(symbol_to_index, "models/symbol_to_index.pkl")

['models/symbol_to_index.pkl']

In [10]:
# Load model
rf_final = joblib.load("models/et_final_hog.pkl")

# Load mappings
index_to_label = joblib.load("models/index_to_label.pkl")
symbol_to_index = joblib.load("models/symbol_to_index.pkl")

print("Model and mappings loaded successfully.")

Model and mappings loaded successfully.


In [11]:
# Evaluate ExtraTrees
y_pred_et = et_final.predict(X_test_hog)
acc_et = accuracy_score(y_test, y_pred_et)

print("ExtraTrees Final Test Accuracy:", acc_et)

print("\n ExtraTrees Classification Report:\n")
print(classification_report(y_test, y_pred_et, target_names=[index_to_label[i] for i in sorted(index_to_label)]))

FileNotFoundError: [Errno 2] No such file or directory: 'models/rf_final_hog.pkl'