In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score, hamming_loss

In [2]:
# Synthetic data: 300 samples x 5000 columns, every cell drawn from {0, 1, 2}.
np.random.seed(42)
df = (
    pd.DataFrame(np.random.randint(0, 3, size=(300, 5000)))
    # Encode the categories of each column as integer codes.
    .apply(LabelEncoder().fit_transform)
)

# Convert the DataFrame into a PyTorch tensor of integer category codes.
data_tensor = torch.tensor(df.to_numpy(), dtype=torch.long)

In [None]:
# Preview the synthetic data frame built in the previous cell.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,2,0,2,2,0,0,2,1,2,2,...,1,2,1,2,1,1,1,2,2,0
1,2,2,2,2,0,2,1,1,2,2,...,1,1,2,0,0,0,2,1,2,2
2,1,0,0,2,0,2,2,1,1,1,...,1,0,1,1,0,2,0,0,1,1
3,1,1,2,2,2,1,1,1,2,1,...,0,2,0,0,1,0,2,1,2,1
4,0,1,1,0,0,2,2,0,0,0,...,0,0,1,1,0,0,1,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,1,2,1,0,2,1,2,2,0,...,0,1,2,0,2,2,1,2,2,1
96,2,1,2,0,0,0,1,1,1,1,...,0,0,1,2,2,0,2,1,1,2
97,2,1,0,0,1,2,2,2,2,2,...,2,0,1,0,2,2,2,1,2,1
98,1,1,1,1,1,2,1,0,0,0,...,2,2,2,1,1,2,1,0,0,0


In [3]:
# Hyperparameters
num_categories = 3  # Number of categories (0, 1, 2)
embedding_dim = 4   # Dimensionality of the per-category embedding vectors
input_dim = df.shape[1]  # Number of columns in the data frame
encoder_dims = [512, 256, 128, 64]  # Widths of the hidden layers

In [4]:
# Autoencoder for categorical data
class CategoricalAutoencoder(nn.Module):
    """Autoencoder over integer-coded categorical features.

    Every input cell is looked up in one shared embedding table, the
    embeddings of a row are flattened and passed through an MLP encoder,
    and a mirrored MLP decoder emits one score per category for every
    input column.

    Args:
        input_dim: Number of categorical columns per sample.
        num_categories: Number of distinct category codes; inputs must lie
            in ``[0, num_categories)``.
        embedding_dim: Size of the embedding vector of each category.
        encoder_dims: Sequence (list or tuple) of hidden-layer widths for
            the encoder; the decoder uses the same widths in reverse order.
    """

    def __init__(self, input_dim, num_categories, embedding_dim, encoder_dims):
        super().__init__()

        # Keep the shape parameters on the instance so that forward() does
        # not depend on same-named notebook globals.
        self.input_dim = input_dim
        self.num_categories = num_categories

        # Embedding layer shared by all columns
        self.embedding = nn.Embedding(num_categories, embedding_dim)

        # Accept tuples as well as lists.
        encoder_dims = list(encoder_dims)

        # ENCODER: Linear + ReLU for every requested width
        encoder_layers = []
        current_dim = input_dim * embedding_dim
        for dim in encoder_dims:
            encoder_layers.append(nn.Linear(current_dim, dim))
            encoder_layers.append(nn.ReLU())
            current_dim = dim
        self.encoder = nn.Sequential(*encoder_layers)

        # DECODER: mirrored widths, followed by the output projection
        decoder_dims = encoder_dims[::-1] + [input_dim * num_categories]
        decoder_layers = []
        current_dim = decoder_dims[0]
        last_index = len(decoder_dims) - 1
        for index, dim in enumerate(decoder_dims[1:], start=1):
            decoder_layers.append(nn.Linear(current_dim, dim))
            # The output layer is identified by position, not by width: a
            # hidden layer whose width happens to equal
            # input_dim * num_categories must still get its ReLU.
            decoder_layers.append(nn.Identity() if index == last_index else nn.ReLU())
            current_dim = dim
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Reconstruct category scores for a batch.

        Args:
            x: Integer tensor of shape ``(batch, input_dim)`` holding
                category codes.

        Returns:
            Tensor of shape ``(batch, input_dim, num_categories)`` with raw
            (un-normalised) scores; no softmax is applied.
        """
        embedded = self.embedding(x)
        # Flatten (batch, input_dim, embedding_dim) -> (batch, input_dim * embedding_dim)
        embedded = embedded.view(x.size(0), -1)
        encoded = self.encoder(embedded)
        decoded = self.decoder(encoded)
        return decoded.view(x.size(0), self.input_dim, self.num_categories)

In [5]:
# Model initialisation
# Seed PyTorch's global RNG before the layers are created: only NumPy is
# seeded elsewhere in the notebook, so without this the initial weights
# (and therefore every downstream number) differ from run to run.
torch.manual_seed(42)
model = CategoricalAutoencoder(input_dim, num_categories, embedding_dim, encoder_dims)
criterion = nn.CrossEntropyLoss()  # applied to the per-column category scores
optimizer = optim.Adam(model.parameters(), lr=0.005)

In [6]:
# Model training: each epoch is a single full-batch optimizer step on data_tensor.
num_epochs = 500
model.train()  # mode does not change inside the loop, so set it once up front
for epoch in range(num_epochs):
    optimizer.zero_grad()

    output = model(data_tensor)
    # The model returns (batch, columns, categories); CrossEntropyLoss wants
    # the class axis in position 1, so swap the last two axes.
    loss = criterion(output.transpose(1, 2), data_tensor)
    loss.backward()
    optimizer.step()

    # Report the loss on every 50th epoch only.
    if (epoch + 1) % 50 != 0:
        continue
    print(f"Эпоха [{epoch+1}/{num_epochs}], Ошибка: {loss.item():.4f}")

Эпоха [50/500], Ошибка: 1.0737
Эпоха [100/500], Ошибка: 1.0569
Эпоха [150/500], Ошибка: 1.0395
Эпоха [200/500], Ошибка: 1.0306
Эпоха [250/500], Ошибка: 1.0288
Эпоха [300/500], Ошибка: 1.0196
Эпоха [350/500], Ошибка: 1.0410
Эпоха [400/500], Ошибка: 1.0273
Эпоха [450/500], Ошибка: 1.0416
Эпоха [500/500], Ошибка: 1.0238


In [7]:
# Reconstruct the categorical data from the trained model
model.eval()
with torch.no_grad():
    output = model(data_tensor)

# Most likely category per cell: argmax over the trailing (category) axis.
predicted_classes = torch.argmax(output, dim=-1)

decoded_df = pd.DataFrame(data=predicted_classes.numpy(), columns=df.columns)
print("Восстановленный df:")
decoded_df

Восстановленный df:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,2,1,1,2,2,1,2,0,1,2,...,0,0,0,0,2,0,2,0,0,0
1,2,1,0,1,0,0,2,1,1,2,...,0,2,2,0,1,0,0,1,0,2
2,1,2,0,2,2,2,2,1,1,1,...,0,0,2,2,1,0,0,1,0,2
3,0,1,0,0,2,0,1,2,1,0,...,1,0,2,1,1,1,1,2,1,2
4,1,2,1,0,2,0,2,1,1,0,...,0,2,0,0,2,1,1,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,0,2,1,2,1,2,2,2,1,2,...,1,0,1,0,0,0,0,2,2,1
296,1,2,1,2,2,0,2,1,1,0,...,0,2,0,0,2,1,1,2,1,1
297,1,2,2,2,0,1,0,0,1,0,...,2,1,0,2,1,0,0,1,2,2
298,2,1,0,1,0,0,2,1,1,2,...,0,2,2,0,1,0,0,1,0,2


In [8]:
# Show the original frame again for comparison with decoded_df.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,2,0,2,2,0,0,2,1,2,2,...,0,0,1,1,0,0,1,2,1,0
1,2,0,2,0,2,2,0,2,1,0,...,2,2,0,1,1,2,0,1,1,2
2,2,1,1,0,1,1,2,1,2,1,...,0,2,1,0,1,2,0,1,0,0
3,1,0,1,0,0,0,2,2,1,0,...,0,0,1,0,1,0,2,0,1,0
4,1,1,2,1,2,2,0,2,1,1,...,0,2,0,2,2,0,2,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,2,1,2,1,2,2,2,0,0,2,...,1,0,2,2,2,2,0,1,1,0
296,1,2,1,0,1,2,2,2,0,0,...,2,2,2,0,1,2,2,2,1,0
297,0,1,2,2,2,0,1,1,2,1,...,2,2,0,2,0,2,0,0,0,2
298,1,1,0,2,0,0,1,1,0,0,...,2,1,1,0,1,0,1,1,0,2


In [9]:
# Model evaluation
# Switch to inference mode here as well, so this cell does not rely on an
# earlier cell having called model.eval().
model.eval()
with torch.no_grad():
    # NOTE: despite the name, these are raw scores (the decoder ends in
    # Identity, no softmax); argmax over them gives the predicted category.
    reconstructed_probs = model(data_tensor)
    reconstructed_labels = torch.argmax(reconstructed_probs, dim=-1)

# Flatten to one label per cell so the sklearn metrics treat every cell as a sample.
true_labels = data_tensor.numpy().flatten()
predicted_labels = reconstructed_labels.numpy().flatten()

# Compute the metrics (measured on the same data the model was trained on)
accuracy = accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels, average='macro')
kappa = cohen_kappa_score(true_labels, predicted_labels)
hamming = hamming_loss(true_labels, predicted_labels)

print("\nМетрики качества автоэнкодера:")
print(f'Accuracy: {accuracy:.4f}')
print(f'F1-score: {f1:.4f}')
print(f'Cohen’s Kappa: {kappa:.4f}')
print(f'Hamming Loss: {hamming:.4f}')



Метрики качества автоэнкодера:
Accuracy: 0.4532
F1-score: 0.4532
Cohen’s Kappa: 0.1799
Hamming Loss: 0.5468
