In [None]:
!pip install pytorch_metric_learning torchmetrics torch clearml tqdm pandas numpy==1.26.4

In [None]:
import os

import pandas as pd

# Location of the labelled training CSV. Set the TRAIN_CSV environment variable to
# run the notebook on another machine; the default is the original local path.
TRAIN_CSV_PATH = os.environ.get("TRAIN_CSV", "/Users/rleontiev/Downloads/labeled_overall_train.csv")
df_train = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
from sklearn.model_selection import train_test_split

# A fixed random_state keeps the hold-out set (and every metric computed on it)
# identical between runs.
# NOTE: this cell rebinds df_train, so re-running it without re-running the load
# cell splits the already reduced frame again.
df_train, df_test = train_test_split(df_train, test_size=0.2, random_state=42)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
from clearml import Task, Logger
from torch.optim.lr_scheduler import CosineAnnealingLR
from pytorch_metric_learning.losses import ProxyAnchorLoss
from pytorch_metric_learning.trainers import MetricLossOnly
from pytorch_metric_learning.utils import common_functions as c_f
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.optim import AdamW
import torchmetrics
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np

# === Settings ===
# Module-level hyper-parameters read by the trainer class and by main() below.
batch_size = 16
embedding_size = 128  # Dimensionality of the embeddings
num_epochs = 200
# Used both as the number of ProxyAnchorLoss proxies and as num_labels of the
# torchmetrics multilabel metrics.
num_classes = 3

# === ClearML initialisation ===
# Task.init runs as soon as this cell is executed; `logger` is the module-level
# object the trainer reports its scalars to.
task = Task.init(project_name="Tabular_Metric_Learning", task_name="MLP_ProxyAnchor")
logger = task.get_logger()
# Prints the link to the task page in ClearML.
print("Ссылка на задачу в ClearML:", task.get_output_log_web_page())

# === MLP model ===
class TabularMLP(nn.Module):
    """Feed-forward network that turns a row of tabular features into an embedding.

    Three hidden stages (512, 512, 256 units), each Linear -> BatchNorm1d -> ReLU
    -> Dropout(0.2), followed by a final Linear projection to ``embedding_dim``.

    Args:
        input_dim: number of input features per sample.
        embedding_dim: size of the produced embedding vector.
    """

    def __init__(self, input_dim, embedding_dim=128):
        super().__init__()
        stack = []
        width_in = input_dim
        # Modules are appended in the same order as a hand-written Sequential,
        # so state_dict keys stay model.0 ... model.12.
        for width_out in (512, 512, 256):
            stack.extend(
                [
                    nn.Linear(width_in, width_out),
                    nn.BatchNorm1d(width_out),
                    nn.ReLU(),
                    nn.Dropout(0.2),
                ]
            )
            width_in = width_out
        # Output layer: embedding of size embedding_dim.
        stack.append(nn.Linear(width_in, embedding_dim))
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Return the embeddings produced by the layer stack for input ``x``."""
        return self.model(x)

# === Trainer ===
class TabularTrainer(MetricLossOnly):
    """MetricLossOnly trainer with a validation pass and multilabel metrics.

    Predictions come from the cosine similarity between trunk embeddings and the
    proxies of ``loss_funcs["metric_loss"]``; column ``i`` of that similarity
    matrix is compared against label column ``i``. Metric values are reported to
    the module-level ClearML ``logger`` once per epoch.

    Args:
        val_dataset: dataset used by ``validate``; batches built from it are
            unpacked as ``(features, labels)``.
        **kwargs: forwarded unchanged to ``MetricLossOnly``.
    """

    # Cosine similarity above which a label is predicted as positive.
    prediction_threshold = 0.001

    def __init__(self, val_dataset, **kwargs):
        super().__init__(**kwargs)
        self.val_dataset = val_dataset

        # Embeddings/labels of the batch most recently processed by calculate_loss,
        # kept so training metrics describe the batch that was actually trained on.
        self._last_embeddings = None
        self._last_labels = None

        # Metrics
        self.train_auc = torchmetrics.AUROC(task="multilabel", num_labels=num_classes, average="macro")
        self.val_auc = torchmetrics.AUROC(task="multilabel", num_labels=num_classes, average="macro")

        self.train_mcc = torchmetrics.MatthewsCorrCoef(task="multilabel", num_labels=num_classes)
        self.val_mcc = torchmetrics.MatthewsCorrCoef(task="multilabel", num_labels=num_classes)

        self.train_acc = torchmetrics.Accuracy(task="multilabel", num_labels=num_classes, average="macro")
        self.val_acc = torchmetrics.Accuracy(task="multilabel", num_labels=num_classes, average="macro")

        self.train_ap = torchmetrics.AveragePrecision(task="multilabel", num_labels=num_classes, average="macro")
        self.val_ap = torchmetrics.AveragePrecision(task="multilabel", num_labels=num_classes, average="macro")

    def initialize_val_dataloader(self):
        """Build ``self.val_dataloader`` with the trainer's batch size, workers and collate_fn."""
        self.val_dataloader = c_f.get_eval_dataloader(
            self.val_dataset,
            self.batch_size,
            self.dataloader_num_workers,
            self.collate_fn,
        )

    def _metrics(self, split):
        """Return the four metric objects of ``split`` ("train" or "val") keyed by short name."""
        return {
            "auc": getattr(self, f"{split}_auc"),
            "acc": getattr(self, f"{split}_acc"),
            "ap": getattr(self, f"{split}_ap"),
            "mcc": getattr(self, f"{split}_mcc"),
        }

    def _reset_metrics(self, split):
        """Clear the accumulated state of every metric of ``split``."""
        for metric in self._metrics(split).values():
            metric.reset()

    def _update_metrics(self, split, embeddings, labels):
        """Accumulate one batch into the metrics of ``split``.

        AUROC and AveragePrecision are ranking metrics, so they receive the
        continuous similarity scores; Accuracy and MCC receive the thresholded
        predictions.
        """
        with torch.no_grad():
            proxies = self.loss_funcs["metric_loss"].proxies
            cos_sim = F.cosine_similarity(embeddings.unsqueeze(1), proxies.unsqueeze(0), dim=-1)
            hard_preds = (cos_sim > self.prediction_threshold).float()
            # Rescale [-1, 1] to [0, 1] (clamped against rounding) so the scores are
            # always treated as probabilities and never passed through a sigmoid
            # for some batches only.
            scores = ((cos_sim + 1.0) / 2.0).clamp(0.0, 1.0)

        # torchmetrics classification metrics expect integer targets.
        targets = labels.long()
        metrics = self._metrics(split)
        metrics["auc"].update(scores, targets)
        metrics["ap"].update(scores, targets)
        metrics["mcc"].update(hard_preds, targets)
        metrics["acc"].update(hard_preds, targets)

    def _compute_metrics(self, split):
        """Return the current value of every metric of ``split`` as a plain float."""
        return {name: metric.compute().item() for name, metric in self._metrics(split).items()}

    def _log_metrics(self, prefix, values):
        """Report one point per metric to ClearML, using the current epoch as x-value."""
        logger.report_scalar(f"{prefix}_AUC", "epoch", values["auc"], self.epoch)
        logger.report_scalar(f"{prefix}_Accuracy", "epoch", values["acc"], self.epoch)
        logger.report_scalar(f"{prefix}_AP", "epoch", values["ap"], self.epoch)
        logger.report_scalar(f"{prefix}_MCC", "epoch", values["mcc"], self.epoch)

    def train(self, start_epoch=1, num_epochs=1):
        """Run the training loop, validating after every epoch.

        Args:
            start_epoch: number of the first epoch.
            num_epochs: number of the last epoch (inclusive).

        Training stops early, before that epoch's validation, when
        ``end_of_epoch_hook`` returns ``False``.
        """
        self.initialize_dataloader()
        self.initialize_val_dataloader()
        for self.epoch in range(start_epoch, num_epochs + 1):
            self.set_to_train()
            c_f.LOGGER.info(f"TRAINING EPOCH {self.epoch}")
            pbar = tqdm(range(self.iterations_per_epoch))

            self._reset_metrics("train")
            epoch_values = None

            for self.iteration in pbar:
                # forward_and_backward() draws its own batch and routes it through
                # calculate_loss, which stores the embeddings/labels used below.
                # No second batch is fetched and no second forward pass is run.
                self.forward_and_backward()
                self.end_of_iteration_hook(self)

                self._update_metrics("train", self._last_embeddings, self._last_labels)
                epoch_values = self._compute_metrics("train")

                pbar.set_description(
                    f"AUC={epoch_values['auc']:.4f}, ACC={epoch_values['acc']:.4f}, "
                    f"MCC={epoch_values['mcc']:.4f}, AP={epoch_values['ap']:.4f}"
                )

            # Log to ClearML once per epoch: the x-value is the epoch number, so
            # per-iteration reports would only overwrite the same point.
            if epoch_values is not None:
                self._log_metrics("Train", epoch_values)

            self.step_lr_schedulers(end_of_epoch=True)
            self.zero_losses()

            if self.end_of_epoch_hook(self) is False:
                break

            self.validate()

    def validate(self):
        """Evaluate the trunk on the validation dataloader, then log and print the metrics."""
        self.set_to_eval()
        val_pbar = tqdm(self.val_dataloader, desc="Validating")

        self._reset_metrics("val")

        with torch.no_grad():
            for input_ids, labels in val_pbar:
                embeddings = self.models["trunk"](input_ids)
                self._update_metrics("val", embeddings, labels)

        # Compute every metric once and reuse the values for logging and printing.
        values = self._compute_metrics("val")

        # Log the validation metrics to ClearML
        self._log_metrics("Val", values)

        print(f"Validation - AUC: {values['auc']:.4f}, MCC: {values['mcc']:.4f}, Acc: {values['acc']:.4f}, AP: {values['ap']:.4f}")

    def calculate_loss(self, curr_batch):
        """Accumulate the metric loss of one batch, averaged over the label columns.

        Args:
            curr_batch: ``(features, labels)`` pair; ``labels`` is indexed as a
                2-D tensor with one column per label.
        """
        input_ids, labels = curr_batch
        embeddings = self.models["trunk"](input_ids)  # Compute embeddings

        # Keep a graph-free copy so train() can update the metrics on this batch.
        self._last_embeddings = embeddings.detach()
        self._last_labels = labels

        # NOTE(review): each column's values are passed to the loss as class ids.
        # If the columns are 0/1 indicators, every column is trained against
        # proxies 0 and 1 only, while the metrics read proxy i as the detector
        # for column i. Confirm this is the intended formulation.
        num_label_columns = labels.size(1)
        for i in range(num_label_columns):  # One loss term per label column
            # The loss is given integer class ids, matching the .long() cast used for the metrics.
            curr_labels = labels[:, i].long()
            indices_tuple = self.maybe_mine_embeddings(embeddings, curr_labels)
            self.losses["metric_loss"] += self.maybe_get_metric_loss(embeddings, curr_labels, indices_tuple)

        self.losses["metric_loss"] /= num_label_columns  # Normalize loss across labels


class TabularDataset(Dataset):
    """Dataset over a DataFrame that serves (features, labels) tensor pairs.

    Args:
        df: source DataFrame.
        feature_columns: columns used as model inputs.
        target_columns: columns used as labels.

    Both column groups are stored as float32 numpy arrays in ``self.features``
    and ``self.labels``.
    """

    def __init__(self, df, feature_columns, target_columns):
        feature_frame = df[feature_columns]
        target_frame = df[target_columns]
        self.features = feature_frame.to_numpy().astype(np.float32)
        self.labels = target_frame.to_numpy().astype(np.float32)

    def __len__(self):
        """Number of rows in the dataset."""
        return self.features.shape[0]

    def __getitem__(self, idx):
        """Return the feature and label tensors of row ``idx`` (fresh copies)."""
        row = self.features[idx]
        target = self.labels[idx]
        return torch.tensor(row), torch.tensor(target)

# === Training entry point ===
def main(save_path=None):
    """Train a TabularMLP with ProxyAnchorLoss on the notebook-level frames.

    Reads the module-level ``df_train`` / ``df_test`` DataFrames and the
    module-level settings (``batch_size``, ``embedding_size``, ``num_epochs``,
    ``num_classes``).

    Args:
        save_path: optional file path. When given, the trained trunk's
            ``state_dict`` is written there with ``torch.save`` so it can later
            be restored with ``load_state_dict``. Nothing is written by default.

    Returns:
        The trained ``TabularMLP`` trunk.
    """
    # === Data preparation ===
    # Every column that is not a target is treated as a feature.
    target_columns = ["IL-4 release", "IL-10 release", "IFNg release"]
    feature_columns = [col for col in df_train.columns if col not in target_columns]

    # The trainer builds its own dataloaders from these datasets, so no
    # DataLoader objects are created here.
    train_dataset = TabularDataset(df_train, feature_columns, target_columns)
    val_dataset = TabularDataset(df_test, feature_columns, target_columns)

    # === Model ===
    trunk = TabularMLP(input_dim=len(feature_columns), embedding_dim=embedding_size)
    embedder = torch.nn.Identity()  # Placeholder for pytorch-metric-learning

    # === Loss ===
    loss_fn = ProxyAnchorLoss(num_classes=num_classes, embedding_size=embedding_size)

    # === Optimizers ===
    # The proxies live inside the loss, so the loss gets an optimizer of its own.
    trunk_optimizer = AdamW(trunk.parameters(), lr=1e-2, weight_decay=1e-5)
    metric_loss_optimizer = AdamW(loss_fn.parameters(), lr=1e-2, weight_decay=1e-5)

    # === Trainer ===
    trainer = TabularTrainer(
        label_hierarchy_level="all",
        models={"trunk": trunk, "embedder": embedder},
        loss_funcs={"metric_loss": loss_fn},
        optimizers={"trunk_optimizer": trunk_optimizer, "metric_loss_optimizer": metric_loss_optimizer},
        batch_size=batch_size,
        dataloader_num_workers=0,  # IMPORTANT: num_workers=0 avoids the dataloader problem on Windows/macOS
        dataset=train_dataset,
        val_dataset=val_dataset
    )

    # === Start training ===
    trainer.train(num_epochs=num_epochs)
    print("Training complete!")

    if save_path is not None:
        torch.save(trunk.state_dict(), save_path)
    return trunk

# === Run training ===
# Calls the training main() defined above when this code runs as __main__.
if __name__ == "__main__":
    main()

In [None]:
import torch
import pandas as pd
from torch.utils.data import DataLoader

# === Load a trained model ===
def load_model(model_path, input_dim, embedding_dim=128):
    """Rebuild a TabularMLP and restore its weights from ``model_path``.

    Args:
        model_path: file holding a ``state_dict`` saved with ``torch.save``.
        input_dim: number of input features the model was trained with.
        embedding_dim: embedding size the model was trained with.

    Returns:
        The model on CPU, switched to eval mode.
    """
    model = TabularMLP(input_dim, embedding_dim)
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a state_dict holds, so an untrusted file cannot run code.
    state_dict = torch.load(model_path, map_location=torch.device("cpu"), weights_only=True)
    model.load_state_dict(state_dict)
    model.eval()
    return model

# === Run inference ===
def inference(model, test_loader):
    """Run ``model`` over every batch of ``test_loader`` and stack the outputs.

    Args:
        model: callable mapping a float32 feature tensor to an embedding tensor.
        test_loader: iterable of batches. A batch is either a tensor or a
            list/tuple whose first element is the feature tensor, which is what
            a DataLoader over a TensorDataset yields.

    Returns:
        numpy array with the embeddings of all batches stacked row-wise.

    Raises:
        ValueError: raised by ``np.vstack`` when ``test_loader`` yields no batches.
    """
    embeddings = []
    with torch.no_grad():
        for batch in test_loader:
            # A TensorDataset-backed loader wraps each batch in a sequence, so
            # unwrap it before calling tensor methods.
            features = batch[0] if isinstance(batch, (list, tuple)) else batch
            features = features.to(torch.float32)
            emb = model(features)
            embeddings.append(emb.cpu().numpy())
    return np.vstack(embeddings)

# === Data preparation ===
def prepare_dataloader(csv_path, feature_columns, batch_size=16):
    """Read a CSV and wrap the selected columns in a non-shuffling DataLoader.

    Args:
        csv_path: path of the CSV file to read.
        feature_columns: columns to feed to the model, in this order.
        batch_size: number of rows per batch.

    Returns:
        DataLoader over a TensorDataset holding one float32 feature tensor.
    """
    frame = pd.read_csv(csv_path)
    feature_matrix = frame[feature_columns].to_numpy().astype(np.float32)
    feature_tensor = torch.tensor(feature_matrix)
    return DataLoader(
        torch.utils.data.TensorDataset(feature_tensor),
        batch_size=batch_size,
        shuffle=False,
    )

# === Inference entry point ===
# NOTE: this definition rebinds the name main used by the training cell above.
def main(model_path, csv_path, feature_columns, output_csv):
    """Embed every row of a CSV with a saved model and write the result to CSV.

    Args:
        model_path: path of the saved model weights.
        csv_path: path of the CSV with the rows to embed.
        feature_columns: feature columns to read from the CSV.
        output_csv: destination file; one ``emb_<i>`` column per embedding dimension.
    """
    model = load_model(model_path, len(feature_columns))
    embeddings = inference(model, prepare_dataloader(csv_path, feature_columns))

    column_names = [f"emb_{i}" for i in range(embeddings.shape[1])]
    pd.DataFrame(embeddings, columns=column_names).to_csv(output_csv, index=False)
    print(f"Инференс завершен. Результаты сохранены в {output_csv}")

# === Run ===
# The values below are placeholders to be replaced before running inference.
if __name__ == "__main__":
    model_path = "model.pth"  # Path to the trained model
    csv_path = "test.csv"  # Path to the CSV with the test data
    feature_columns = ["feature1", "feature2", "feature3"]  # List of feature columns
    output_csv = "embeddings.csv"  # File the embeddings are saved to
    
    main(model_path, csv_path, feature_columns, output_csv)
