In [6]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel
from torch.optim import AdamW 
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# Для трекинга
import mlflow
import wandb
from clearml import Task

## Загрузка и подготовка данных


In [None]:
pip install torch transformers pandas scikit-learn mlflow wandb clearml

In [7]:
data = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")  
texts = data["review"].values
labels = data["sentiment"].map({"positive": 1, "negative": 0}).values

# Разделение на train/test
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Токенизация BERT
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def encode_texts(texts, max_len=128):
    input_ids = []
    attention_masks = []
    for text in texts:
        encoded = tokenizer.encode_plus(
            text,
            max_length=max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        input_ids.append(encoded["input_ids"])
        attention_masks.append(encoded["attention_mask"])
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

X_train_ids, X_train_mask = encode_texts(X_train)
X_test_ids, X_test_mask = encode_texts(X_test)

# Создание DataLoader
train_dataset = TensorDataset(X_train_ids, X_train_mask, torch.tensor(y_train))
test_dataset = TensorDataset(X_test_ids, X_test_mask, torch.tensor(y_test))
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

## Модель BERT для классификации

In [12]:
class SentimentClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(768, 1)  # BERT возвращает 768-мерные эмбеддинги
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output  # [CLS] токен
        return torch.sigmoid(self.fc(self.dropout(pooled_output))).squeeze(1)

In [None]:
def train_model():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SentimentClassifier().to(device)
    optimizer = AdamW(model.parameters(), lr=2e-5)
    criterion = nn.BCELoss()

    # Инициализация трекеров
    mlflow.set_experiment("IMDb_Sentiment_MLflow")
    wandb.init(project="imdb-sentiment-wandb")
    task = Task.init(project_name="IMDb_Sentiment_ClearML", task_name="BERT_Training")

    # Параметры для логирования
    config = {
        "batch_size": 16,
        "lr": 2e-5,
        "epochs": 3
    }
    wandb.config.update(config)
    task.connect(config)
    mlflow.log_params(config)

    # Обучение
    for epoch in range(config["epochs"]):
        model.train()
        total_loss = 0
        for batch in train_loader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels.float())
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        
        # Валидация
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in test_loader:
                input_ids, attention_mask, labels = [x.to(device) for x in batch]
                outputs = model(input_ids, attention_mask)
                predicted = (outputs > 0.5).int()
                correct += (predicted == labels).sum().item()
                total += labels.size(0)
        val_accuracy = correct / total

        # Логирование метрик
        print(f"Epoch {epoch + 1}: Loss = {avg_train_loss:.4f}, Accuracy = {val_accuracy:.4f}")
        
        # MLflow
        mlflow.log_metrics({
            "train_loss": avg_train_loss,
            "val_accuracy": val_accuracy
        }, step=epoch)
        
        # W&B
        wandb.log({
            "train_loss": avg_train_loss,
            "val_accuracy": val_accuracy
        })
        
        # ClearML
        task.get_logger().report_scalar("loss", "train", value=avg_train_loss, iteration=epoch)
        task.get_logger().report_scalar("accuracy", "val", value=val_accuracy, iteration=epoch)

    # Сохранение модели
    torch.save(model.state_dict(), "bert_sentiment.pt")
    
    # Загрузка артефактов
    mlflow.pytorch.log_model(model, "model")
    wandb.save("bert_sentiment.pt")
    task.upload_artifact("model_weights", "bert_sentiment.pt")

    # Завершение сессий
    wandb.finish()
    task.close()

if __name__ == "__main__":
    train_model()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

2025/06/03 20:20:47 INFO mlflow.tracking.fluent: Experiment with name 'IMDb_Sentiment_MLflow' does not exist. Creating a new experiment.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

In [None]:
!mlflow ui --backend-store-uri sqlite:///mlflow.db

2025/06/03 20:34:03 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/06/03 20:34:03 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  