# Histogram of Oriented Gradients

HOG (Histogram of Oriented Gradients) — это дескриптор признаков, который часто используется для извлечения признаков из изображений и широко применяется в задачах компьютерного зрения. Метод подсчитывает число вхождений ориентаций градиента в локальных участках изображения.

HOG-дескриптор фокусируется на структуре или форме объекта. Для обычных признаков границ мы только отмечаем, является ли данный пиксель границей или нет; HOG же способен дополнительно предоставить направление границы. Это делается путём извлечения величины и ориентации градиента (иначе говоря, силы и направления) границ.


In [39]:
%pip install python-dotenv boto3 pillow pandas matplotlib catboost

Defaulting to user installation because normal site-packages is not writeable
Collecting python-dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Collecting catboost
  Obtaining dependency information for catboost from https://files.pythonhosted.org/packages/15/95/951b51229a9c8af767e48b1f9ef7baa87279b1f5847d2f85de0855578e5d/catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl.metadata
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-dotenv, catboost
[0mSuccessfully installed catboost-1.2.2 python-dotenv-1.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To

In [1]:
%load_ext dotenv
%dotenv
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import boto3
import catboost as cb
from warnings import filterwarnings
filterwarnings("ignore")
import os
os.environ["PYTHONWARNINGS"] = "ignore"

# Object-storage layout: the bucket holding the project data and the key
# prefix that is prepended to every image path when downloading.
BUCKET_NAME = 'tnn-hse-medtech'
DATASET_DIR = 'datasets/'

# S3-compatible client pointed at the Yandex Object Storage endpoint.
# Credentials are read from the environment; a missing variable yields None.
s3_client = boto3.client(
    's3',
    endpoint_url='https://storage.yandexcloud.net',
    aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'),
)


cannot find .env file


Считываем таблицу с обработанными данными на этапе [EDA](../../EDA.md)

In [None]:
data = pd.read_csv('normalized_data.csv')


Перед извлечением HOG-дескриптора приводим изображение к оттенкам серого и уменьшаем до размера 128x128.

In [None]:
import io
from skimage.feature import hog
from PIL import Image

def hog_picture(image: io.IOBase):
    """Compute the HOG descriptor of an image together with its visualisation.

    The image is opened with Pillow, converted to grayscale (mode ``'L'``)
    and resized to 128x128 before the descriptor is extracted.

    Args:
        image: Binary file-like object containing the encoded image.

    Returns:
        A ``(vector, hog_image)`` pair as produced by ``skimage.feature.hog``
        called with ``feature_vector=True`` and ``visualize=True``.
    """
    grayscale = Image.open(image).convert('L')
    pixels = np.asarray(grayscale.resize((128, 128)))
    hog_settings = dict(
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        feature_vector=True,
        visualize=True,
    )
    descriptor, rendering = hog(pixels, **hog_settings)
    return descriptor, rendering


Запускаем сбор дескрипторов

In [None]:
import io
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor
from ipywidgets import IntProgress
from IPython.display import display

@dataclass
class Metadata:
    """Per-image record produced while collecting HOG descriptors."""

    # Severity label copied from the source table row.
    severity: int
    # Image path relative to the dataset prefix in the bucket.
    relative_path: str
    # HOG feature vector computed for the image.
    hog: np.ndarray
    # HOG visualisation image computed alongside the vector.
    hog_image: np.ndarray

def handle_file(item) -> Metadata:
    """Download one image from the bucket and compute its HOG descriptor.

    Args:
        item: Row-like object exposing ``relative_path`` and ``severity``
            attributes.

    Returns:
        A ``Metadata`` record carrying the row's severity and path plus the
        HOG vector and HOG visualisation returned by ``hog_picture``.
    """
    object_key = f'{DATASET_DIR}{item.relative_path}'
    stream = io.BytesIO()
    s3_client.download_fileobj(BUCKET_NAME, object_key, stream)
    # Rewind so the image is decoded from the first byte of the download.
    stream.seek(0)
    descriptor, rendering = hog_picture(stream)
    return Metadata(
        severity=item.severity,
        relative_path=item.relative_path,
        hog=descriptor,
        hog_image=rendering,
    )

# Progress bar sized to the number of rows that will be processed.
progress = IntProgress(min=0, max=len(data))
display(progress)

raw_data = []
with ThreadPoolExecutor(max_workers=100) as pool:
    # Results are consumed in input order; the bar advances once per result.
    results = pool.map(handle_file, data.itertuples(), chunksize=1)
    for done, item in enumerate(results, start=1):
        raw_data.append(item)
        progress.value = done


Создаем датасеты

In [None]:
# One row per collected record, in the order of raw_data: X_data holds the
# HOG vectors, y_target the matching severity labels.
X_data = pd.DataFrame([record.hog for record in raw_data])
y_target = pd.DataFrame([record.severity for record in raw_data])


In [None]:
X_data.info()


Посмотрим на пример полученного дескриптора

In [None]:
from skimage.exposure import rescale_intensity

hog_rescaled = rescale_intensity(raw_data[0].hog_image)
plt.imshow(hog_rescaled, cmap=plt.cm.gray)


Сохраняем датасет дескрипторов в S3

In [None]:
# Features and labels are stored together in one table; the label column is
# named 'severity'.
hog_data = X_data.assign(severity=y_target)

# Serialise to gzip-compressed CSV in memory, then upload from the start of
# the buffer.
buffer = io.BytesIO()
hog_data.to_csv(buffer, index=False, compression='gzip')
buffer.seek(0)
s3_client.upload_fileobj(buffer, BUCKET_NAME, 'csv/train_data.csv.gz')


Если нужно быстро получить hog_data, достаточно выполнить только этот блок

In [2]:
hog_data = pd.read_csv(f'https://storage.yandexcloud.net/{BUCKET_NAME}/csv/train_data.csv.gz', compression='gzip')
X_data = hog_data.drop(columns=['severity'])
y_target = hog_data['severity']


In [3]:
from sklearn.model_selection import train_test_split

# Stratified 70/30 hold-out split. The seed is fixed so that the split, and
# with it every metric, saved model and persisted tuning trial computed from
# it, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_target,
    test_size=0.3,
    stratify=y_target,
    random_state=42,
)


Подбираем гиперпараметры для модели SVC

In [None]:
%pip install scikit-learn-intelex

In [None]:
from sklearnex import patch_sklearn

# The patch has to be applied BEFORE the scikit-learn estimators are imported:
# `from sklearn.svm import SVC` binds whatever class is current at import
# time, so importing first leaves the stock (non-accelerated) SVC in use.
patch_sklearn()

import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# NOTE(review): per the scikit-learn docs `decision_function_shape` only
# changes the shape of `decision_function` output, and `coef0` is only used by
# the 'poly' and 'sigmoid' kernels, so part of this grid is probably redundant
# work - confirm before trimming it.
params = {
    'C': np.linspace(1, 100, 3),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'coef0': np.linspace(0, 10, 3),
    'class_weight': [None, 'balanced'],
    'decision_function_shape': ['ovo', 'ovr'],
}

# The target is multiclass, so the plain 'f1' scorer (average='binary') is
# not applicable: depending on the scikit-learn version it either raises or
# produces NaN scores for every candidate. 'f1_macro' matches the
# `f1_score(..., average='macro')` used for the hold-out evaluation.
gs_svc_model = GridSearchCV(
    SVC(probability=True),
    params,
    n_jobs=4,
    verbose=2,
    scoring='f1_macro',
    cv=3,
)
gs_svc_model.fit(X_train, y_train)
gs_svc_model.best_params_, gs_svc_model.best_score_


In [None]:
gs_svc_model.best_estimator_


In [None]:
from sklearn.metrics import f1_score, roc_auc_score
y_pred_proba = gs_svc_model.best_estimator_.predict_proba(X_test)
y_pred = gs_svc_model.best_estimator_.predict(X_test)
print('roc_auc:', roc_auc_score(y_test, y_pred_proba, multi_class='ovo'))
print('f1:', f1_score(y_test, y_pred, average='macro'))


Модель показывает неплохие метрики качества. Попробуем использовать градиентный бустинг

In [41]:
%pip install catboost ipywidgets


Defaulting to user installation because normal site-packages is not writeable
Collecting ipykernel>=4.5.1 (from ipywidgets)
  Downloading ipykernel-5.1.4-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.8/116.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting ipython>=4.0.0 (from ipywidgets)
  Downloading ipython-7.13.0-py3-none-any.whl (780 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.3/780.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting notebook>=4.4.1 (from widgetsnbextension~=3.6.0->ipywidgets)
  Downloading notebook-6.1.1-py3-none-any.whl (9.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ipython, ipykernel, notebook
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency 

In [None]:
!jupyter labextension install @jupyter-widgets/jupyterlab-manager jupyter-leaflet

In [4]:
#!g1.1
import catboost as cb

# Baseline CatBoost classifier: 1500 iterations on GPU device 0 with
# Bernoulli row subsampling.
cb_model = cb.CatBoostClassifier(
    iterations=1500,
    learning_rate=0.1,
    one_hot_max_size=50,
    bootstrap_type='Bernoulli',
    subsample=0.5,
    task_type="GPU",
    devices='0',
)
# The hold-out split is passed as the evaluation set for the training plot.
cb_model.fit(
    X=X_train,
    y=y_train,
    eval_set=(X_test, y_test),
    verbose=False,
    plot=True,
)


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7fdfccf4f1c0>

In [None]:
from sklearn.metrics import roc_auc_score, f1_score

y_pred_proba = cb_model.predict_proba(X_test)
y_pred = cb_model.predict(X_test)
print('roc_auc:', roc_auc_score(y_test, y_pred_proba, multi_class='ovo'))
print('f1:', f1_score(y_test, y_pred, average='macro'))


In [None]:
cb_model.get_params()


In [None]:
cb_model.save_model('hog_cat_boost.cbm', format="cbm")
s3_client.upload_file('hog_cat_boost.cbm', BUCKET_NAME, 'models/hog_cat_boost.cbm')


In [42]:
%pip install hyperopt


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [59]:
#!g1.1
import numpy as np
from hyperopt import hp, fmin, tpe
from catboost.utils import eval_metric

val_pool = cb.Pool(X_test, y_test)
train_pool = cb.Pool(X_train, y_train)

def hyperopt_objective(params):
    """Train a CatBoost model with the given hyperparameters and score it.

    Args:
        params: Mapping of keyword arguments forwarded to
            ``cb.CatBoostClassifier`` in addition to the fixed settings below.

    Returns:
        The negated first value returned by ``eval_metric(..., 'AUC')`` for
        the predictions on ``val_pool``, so that minimising the objective
        maximises AUC.
    """
    fixed_settings = dict(
        loss_function='MultiClass',
        n_estimators=150,
        bootstrap_type='Bernoulli',
        subsample=0.5,
        task_type="GPU",
        devices='0',
    )
    classifier = cb.CatBoostClassifier(**fixed_settings, **params)
    classifier.fit(train_pool, verbose=0, eval_set=val_pool)
    probabilities = classifier.predict_proba(val_pool)
    auc_values = eval_metric(val_pool.get_label(), probabilities, 'AUC')
    return -auc_values[0]

space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.1),
    'depth': hp.randint('depth', 3, 10),
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 1, 100),
}

best = fmin(
    hyperopt_objective,
    space=space,
    algo=tpe.suggest,
    max_evals=1000,
    trials_save_file='hog_cb_trials.bin',
)


{'depth': 5, 'l2_leaf_reg': 73.77222926204922, 'learning_rate': 0.059359155096410834}
{'depth': 3, 'l2_leaf_reg': 74.09293730714172, 'learning_rate': 0.030494912907392334}
{'depth': 8, 'l2_leaf_reg': 82.15404635401315, 'learning_rate': 0.06187703516500753}
{'depth': 9, 'l2_leaf_reg': 82.28373152367715, 'learning_rate': 0.044050836926450426}
{'depth': 3, 'l2_leaf_reg': 56.42794221209653, 'learning_rate': 0.04186862187068092}
{'depth': 3, 'l2_leaf_reg': 85.90737981856576, 'learning_rate': 0.021467178171600067}
{'depth': 6, 'l2_leaf_reg': 71.32840088761914, 'learning_rate': 0.09613430159570349}
{'depth': 4, 'l2_leaf_reg': 4.635688466880607, 'learning_rate': 0.06409904106006564}
{'depth': 7, 'l2_leaf_reg': 40.78211566858993, 'learning_rate': 0.05001806271867163}
{'depth': 5, 'l2_leaf_reg': 4.983856895446685, 'learning_rate': 0.07685080620059809}
{'depth': 4, 'l2_leaf_reg': 64.60251798517407, 'learning_rate': 0.0919239686832635}
{'depth': 6, 'l2_leaf_reg': 58.82857189476912, 'learning_rate'

In [60]:
best

{'depth': 9,
 'l2_leaf_reg': 1.0373073118412859,
 'learning_rate': 0.08469523739540981}

In [6]:
best = {'depth': 9,
 'l2_leaf_reg': 1.0373073118412859,
 'learning_rate': 0.07}

In [None]:
#!g1.1
# Wrap both splits in CatBoost pools; the hold-out pool serves as eval_set.
train_pool = cb.Pool(X_train, y_train)
val_pool = cb.Pool(X_test, y_test)

# Final model: the tuned hyperparameters from `best` with ten times the
# number of trees used during the search.
# NOTE(review): the search objective fixed subsample=0.5, while here the
# library default is used - confirm this difference is intended.
model = cb.CatBoostClassifier(
    **best,
    loss_function='MultiClass',
    n_estimators=1500,
    bootstrap_type='Bernoulli',
    task_type="GPU",
    devices='0',
)
model.fit(train_pool, eval_set=val_pool, verbose=0, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [9]:
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score

y_pred_proba = model.predict_proba(X_test)
y_pred = model.predict(X_test)
print('roc_auc:', roc_auc_score(y_test, y_pred_proba, multi_class='ovo'))
print('f1:', f1_score(y_test, y_pred, average='macro'))
print('accuracy:', accuracy_score(y_test, y_pred))

roc_auc: 0.989593025030785
f1: 0.8935586817109332
accuracy: 0.891260162601626


После подбора параметров удалось немного увеличить качество предсказаний. Сохраним эту модель в S3, чтобы в будущем можно было использовать в сервисе

In [10]:
model.save_model('hog_cat_boost_v3.cbm', format="cbm")
s3_client.upload_file('hog_cat_boost_v3.cbm', BUCKET_NAME, 'models/hog_cat_boost_v3.cbm')

Проверим скорость предсказания модели на одном изображении

In [11]:
from skimage.feature import hog
from PIL import Image
import io

def hog_picture(image: io.IOBase):
    """Compute the HOG feature vector of an image (no visualisation).

    The image is opened with Pillow, converted to grayscale (mode ``'L'``)
    and resized to 128x128 before the descriptor is extracted.

    Args:
        image: Binary file-like object containing the encoded image.

    Returns:
        The value returned by ``skimage.feature.hog`` called with
        ``feature_vector=True``.
    """
    grayscale = Image.open(image).convert('L')
    pixels = np.asarray(grayscale.resize((128, 128)))
    hog_settings = dict(
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        feature_vector=True,
    )
    return hog(pixels, **hog_settings)

def predict(image: io.IOBase):
    """Predict the severity for a single encoded image.

    Args:
        image: Binary file-like object containing the encoded image.

    Returns:
        Whatever ``model.predict`` returns for the image's HOG vector.
    """
    return model.predict(hog_picture(image))

Для этого используем спорные изображения, которые получили разные оценки у экспертов и не использовались при обучении

In [12]:
buffer = io.BytesIO()
s3_client.download_fileobj(BUCKET_NAME, f'{DATASET_DIR}MedicalExpert-II/2Mild/MildG2 (110).png', buffer)
buffer.seek(0)

0

In [13]:
%%timeit
predict(buffer)

12.8 ms ± 61.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Модель предсказывает достаточно быстро для встраивания в API
