In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score, hamming_loss

In [2]:
# Synthetic (dummy) data
np.random.seed(42)
# 300 samples x 5000 columns of random category codes drawn from {0, 1, 2};
# every column is then passed through a label encoder to get integer codes.
df = pd.DataFrame(
    np.random.randint(0, 3, size=(300, 5000))
).apply(LabelEncoder().fit_transform)

# Turn the DataFrame into a PyTorch tensor of integer (long) category codes
data_tensor = torch.tensor(df.to_numpy(), dtype=torch.long)

In [None]:
# Show the label-encoded DataFrame as this cell's output.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,2,0,2,2,0,0,2,1,2,2,...,1,2,1,2,1,1,1,2,2,0
1,2,2,2,2,0,2,1,1,2,2,...,1,1,2,0,0,0,2,1,2,2
2,1,0,0,2,0,2,2,1,1,1,...,1,0,1,1,0,2,0,0,1,1
3,1,1,2,2,2,1,1,1,2,1,...,0,2,0,0,1,0,2,1,2,1
4,0,1,1,0,0,2,2,0,0,0,...,0,0,1,1,0,0,1,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,1,2,1,0,2,1,2,2,0,...,0,1,2,0,2,2,1,2,2,1
96,2,1,2,0,0,0,1,1,1,1,...,0,0,1,2,2,0,2,1,1,2
97,2,1,0,0,1,2,2,2,2,2,...,2,0,1,0,2,2,2,1,2,1
98,1,1,1,1,1,2,1,0,0,0,...,2,2,2,1,1,2,1,0,0,0


In [3]:
# Hyperparameters
num_categories = 3                  # number of categories (0, 1, 2)
embedding_dim = 4                   # size of the embedding vector of each category
input_dim = len(df.columns)         # number of columns (features) in df
encoder_dims = [512, 256, 128, 64]  # widths of the hidden layers

In [4]:
# Autoencoder for categorical data
class CategoricalAutoencoder(nn.Module):
    """Autoencoder over rows of integer-coded categorical features.

    Every feature value is looked up in one embedding table shared by all
    features. The per-feature embeddings are flattened, compressed by an MLP
    encoder, and expanded by a mirrored MLP decoder into ``num_categories``
    raw scores (logits) per feature.

    Args:
        input_dim: Number of categorical features (columns) per sample.
        num_categories: Size of the embedding table; inputs must be integer
            indices in ``[0, num_categories)``.
        embedding_dim: Length of the embedding vector of each category.
        encoder_dims: Widths of the encoder's linear layers, in order (list or
            tuple). The last entry is the bottleneck width; the decoder uses
            the same widths in reverse.
    """

    def __init__(self, input_dim, num_categories, embedding_dim, encoder_dims):
        super().__init__()

        # Keep the dimensions on the instance so forward() reshapes with the
        # values this model was built with, not with same-named globals.
        self.input_dim = input_dim
        self.num_categories = num_categories

        # Embedding layer for the categories
        self.embedding = nn.Embedding(num_categories, embedding_dim)

        # ENCODER: Linear + ReLU for each entry of encoder_dims
        encoder_layers = []
        current_dim = input_dim * embedding_dim
        for dim in encoder_dims:
            encoder_layers.append(nn.Linear(current_dim, dim))
            encoder_layers.append(nn.ReLU())
            current_dim = dim
        self.encoder = nn.Sequential(*encoder_layers)

        # DECODER: encoder widths reversed, then one score per (feature, category)
        decoder_dims = list(encoder_dims[::-1]) + [input_dim * num_categories]
        decoder_layers = []
        current_dim = decoder_dims[0]
        last_idx = len(decoder_dims) - 1
        for idx in range(1, len(decoder_dims)):
            dim = decoder_dims[idx]
            decoder_layers.append(nn.Linear(current_dim, dim))
            # Only the final layer is left without a non-linearity. Deciding by
            # position (not by width) keeps the ReLU on a hidden layer whose
            # width happens to equal input_dim * num_categories.
            decoder_layers.append(nn.Identity() if idx == last_idx else nn.ReLU())
            current_dim = dim
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Reconstruct category scores for a batch.

        Args:
            x: Integer tensor of category indices; the code flattens everything
                after the first (batch) axis, and the final reshape expects
                ``input_dim`` features per row.

        Returns:
            Tensor of shape ``(x.size(0), input_dim, num_categories)`` holding
            the decoder's raw outputs (no softmax is applied here).
        """
        embedded = self.embedding(x)
        embedded = embedded.view(x.size(0), -1)  # concatenate the per-feature embeddings
        encoded = self.encoder(embedded)
        decoded = self.decoder(encoded)
        return decoded.view(x.size(0), self.input_dim, self.num_categories)

In [5]:
# Model initialisation
# NumPy is seeded where the data is generated, but PyTorch has its own RNG.
# Seed it before the layers are created so the initial weights (and with them
# the training log and metrics) repeat on a fresh Restart & Run All on the same setup.
torch.manual_seed(42)
model = CategoricalAutoencoder(input_dim, num_categories, embedding_dim, encoder_dims)
criterion = nn.CrossEntropyLoss()  # takes raw scores plus integer class targets
optimizer = optim.Adam(model.parameters(), lr=0.005)

In [6]:
# Model training (full batch: each step feeds the whole data_tensor)
num_epochs = 500
model.train()  # nothing inside the loop changes the mode, so set it once
for epoch in range(num_epochs):
    # Forward pass; the model returns scores laid out as (sample, feature, category)
    output = model(data_tensor)
    # CrossEntropyLoss expects the class axis second, so swap the last two axes
    loss = criterion(output.transpose(1, 2), data_tensor)

    # Backward pass and parameter update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report the loss every 50th epoch
    if not (epoch + 1) % 50:
        print(f"–≠–ø–æ—Ö–∞ [{epoch+1}/{num_epochs}], –û—à–∏–±–∫–∞: {loss.item():.4f}")

–≠–ø–æ—Ö–∞ [50/500], –û—à–∏–±–∫–∞: 1.0737
–≠–ø–æ—Ö–∞ [100/500], –û—à–∏–±–∫–∞: 1.0569
–≠–ø–æ—Ö–∞ [150/500], –û—à–∏–±–∫–∞: 1.0395
–≠–ø–æ—Ö–∞ [200/500], –û—à–∏–±–∫–∞: 1.0306
–≠–ø–æ—Ö–∞ [250/500], –û—à–∏–±–∫–∞: 1.0288
–≠–ø–æ—Ö–∞ [300/500], –û—à–∏–±–∫–∞: 1.0196
–≠–ø–æ—Ö–∞ [350/500], –û—à–∏–±–∫–∞: 1.0410
–≠–ø–æ—Ö–∞ [400/500], –û—à–∏–±–∫–∞: 1.0273
–≠–ø–æ—Ö–∞ [450/500], –û—à–∏–±–∫–∞: 1.0416
–≠–ø–æ—Ö–∞ [500/500], –û—à–∏–±–∫–∞: 1.0238


In [7]:
# Reconstruct the categorical data
model.eval()
with torch.no_grad():
    output = model(data_tensor)
    # The highest-scoring category along the last axis is the reconstruction
    predicted_classes = torch.argmax(output, dim=-1)

# Same column labels as the input frame so the two can be compared side by side
decoded_df = pd.DataFrame(data=predicted_classes.numpy(), columns=df.columns)
print("–í–æ—Å—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–Ω—ã–π df:")
decoded_df

–í–æ—Å—Å—Ç–∞–Ω–æ–≤–ª–µ–Ω–Ω—ã–π df:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,2,1,1,2,2,1,2,0,1,2,...,0,0,0,0,2,0,2,0,0,0
1,2,1,0,1,0,0,2,1,1,2,...,0,2,2,0,1,0,0,1,0,2
2,1,2,0,2,2,2,2,1,1,1,...,0,0,2,2,1,0,0,1,0,2
3,0,1,0,0,2,0,1,2,1,0,...,1,0,2,1,1,1,1,2,1,2
4,1,2,1,0,2,0,2,1,1,0,...,0,2,0,0,2,1,1,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,0,2,1,2,1,2,2,2,1,2,...,1,0,1,0,0,0,0,2,2,1
296,1,2,1,2,2,0,2,1,1,0,...,0,2,0,0,2,1,1,2,1,1
297,1,2,2,2,0,1,0,0,1,0,...,2,1,0,2,1,0,0,1,2,2
298,2,1,0,1,0,0,2,1,1,2,...,0,2,2,0,1,0,0,1,0,2


In [8]:
# Show the original (input) DataFrame as this cell's output.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,2,0,2,2,0,0,2,1,2,2,...,0,0,1,1,0,0,1,2,1,0
1,2,0,2,0,2,2,0,2,1,0,...,2,2,0,1,1,2,0,1,1,2
2,2,1,1,0,1,1,2,1,2,1,...,0,2,1,0,1,2,0,1,0,0
3,1,0,1,0,0,0,2,2,1,0,...,0,0,1,0,1,0,2,0,1,0
4,1,1,2,1,2,2,0,2,1,1,...,0,2,0,2,2,0,2,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,2,1,2,1,2,2,2,0,0,2,...,1,0,2,2,2,2,0,1,1,0
296,1,2,1,0,1,2,2,2,0,0,...,2,2,2,0,1,2,2,2,1,0
297,0,1,2,2,2,0,1,1,2,1,...,2,2,0,2,0,2,0,0,0,2
298,1,1,0,2,0,0,1,1,0,0,...,2,1,1,0,1,0,1,1,0,2


In [9]:
# Model evaluation
# Set inference mode here instead of relying on an earlier cell having done it.
model.eval()
with torch.no_grad():
    # NOTE: despite the name, these are the model's raw scores (logits); no
    # softmax is applied. argmax is unaffected by that.
    reconstructed_probs = model(data_tensor)
    reconstructed_labels = torch.argmax(reconstructed_probs, dim=-1)

# Flatten to 1-D so every (sample, feature) cell counts as one prediction
true_labels = data_tensor.numpy().flatten()
predicted_labels = reconstructed_labels.numpy().flatten()

# Compute the metrics.
# These are measured on the same data_tensor the model was trained on, so they
# describe reconstruction of the training data, not generalisation.
accuracy = accuracy_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels, average='macro')
kappa = cohen_kappa_score(true_labels, predicted_labels)
hamming = hamming_loss(true_labels, predicted_labels)

print("\n–ú–µ—Ç—Ä–∏–∫–∏ –∫–∞—á–µ—Å—Ç–≤–∞ –∞–≤—Ç–æ—ç–Ω–∫–æ–¥–µ—Ä–∞:")
print(f'Accuracy: {accuracy:.4f}')
print(f'F1-score: {f1:.4f}')
print(f'Cohen‚Äôs Kappa: {kappa:.4f}')
print(f'Hamming Loss: {hamming:.4f}')



–ú–µ—Ç—Ä–∏–∫–∏ –∫–∞—á–µ—Å—Ç–≤–∞ –∞–≤—Ç–æ—ç–Ω–∫–æ–¥–µ—Ä–∞:
Accuracy: 0.4532
F1-score: 0.4532
Cohen‚Äôs Kappa: 0.1799
Hamming Loss: 0.5468
