In [268]:
#import polars as pl
import numpy as np
import os
import torch
import cv2
from PIL import Image
from skimage.io import imread, imshow
from sklearn import svm
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import piq
import piqa

import pickle
import joblib

### Data preprocessing + feature extraction

In [381]:
import zipfile
from io import BytesIO

data = {}
fid_metric = piqa.FID()
list_feats = []
list_labels = []
# Путь к ZIP-файлу
data_dir = 'data'
for category in os.listdir(data_dir):
    zip_file_path = os.path.join(data_dir, category)
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        for file_info in zip_ref.infolist():
            with zip_ref.open(file_info) as file:
                image_data = BytesIO(file.read())
                img_array = imread(image_data, as_gray=False)
                if len(img_array.shape) != 3 or img_array.shape[2] != 3:
                    img_array =  cv2.cvtColor(img_array, cv2.COLOR_BGR2RGB)
                size = (250, 250)
                resized_img = cv2.resize(img_array, size)
                img_tensor = torch.tensor(resized_img).permute(2, 0, 1)[None, ...] / 255
                img_feats = fid_metric.features(img_tensor).reshape(-1)
                list_feats.append(img_feats)
                list_labels.append(category.rsplit('.', 1)[0])

data['features'] = list_feats
data['labels'] = list_labels
                    

### Splitting the dataset into training, validation, and test sets

In [382]:

X_train, X_test, y_train, y_test = train_test_split(data['features'], data['labels'], test_size=0.2, shuffle=True, stratify=data['labels'])
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, shuffle=True, stratify=y_train)


### RandomForest

In [383]:
model = RandomForestClassifier()

In [384]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy', 'log_loss']
}

In [385]:
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [386]:
best_params = grid_search.best_params_
print(f'Best params: {best_params}')

best_randforest = grid_search.best_estimator_

Best params: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 10, 'n_estimators': 200}


Evaluation (RandomForest)

In [387]:
val_preds = best_randforest.predict(X_val)
print(f'Accuracy: {accuracy_score(y_val, val_preds)}')

Accuracy: 0.9099099099099099


In [388]:
test_preds = best_randforest.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, test_preds)}')
print('Precision:', precision_score(y_test, test_preds, average='weighted'))
print('Recall:', recall_score(y_test, test_preds, average='weighted'))
print('F1:', f1_score(y_test, test_preds, average='weighted'))

Accuracy: 0.9147982062780269
Precision: 0.9165028626912035
Recall: 0.9147982062780269
F1: 0.914876993075608


In [389]:
test_preds = best_randforest.predict(X_train)
test_accuracy = accuracy_score(y_train, test_preds)
print(f'Accuracy: {test_accuracy}')

Accuracy: 1.0


In [366]:
# saving the model
with open('final_model.p', 'wb') as f:
  pickle.dump(best_randforest, f)
  f.close()

### SVM

In [390]:
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}

In [391]:
svc = svm.SVC()
model = GridSearchCV(svc, parameters, cv=3)
model.fit(X_train, y_train)

In [392]:
best_params = model.best_params_

best_svm = svm.SVC(**best_params, probability=True)
best_svm.fit(X_train, y_train)

Evaluation (svm)

In [393]:
y_val_pred = best_svm.predict(X_val)
print("Accuracy: {:.2f}".format(accuracy_score(y_val, y_val_pred)))

Accuracy: 0.94


In [394]:
y_test_pred = best_svm.predict(X_test)
print("Точность модели на тестовом наборе данных: {:.2f}".format(accuracy_score(y_test, y_test_pred)))
print('Precision:', precision_score(y_test, y_test_pred, average='weighted'))
print('Recall:', recall_score(y_test, y_test_pred, average='weighted'))
print('F1:', f1_score(y_test, y_test_pred, average='weighted'))

Точность модели на тестовом наборе данных: 0.94
Precision: 0.9430776443936904
Recall: 0.9417040358744395
F1: 0.9414044886235059


In [None]:
# saving the model
joblib.dump(best_svm, 'final_model.joblib', compress=9)