In [1]:
!pip install optuna transformers torch pandas scikit-learn

Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch)
  Downloading nvidia_nvjitlink_cu12-1

In [2]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset
import optuna
from optuna.trial import TrialState
import numpy as np

2025-06-04 21:21:06.405445: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749072066.626485      18 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749072066.694683      18 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Определяем модель

In [3]:
class SentimentClassifier(nn.Module):
    """BERT encoder followed by dropout and a single-unit sigmoid head.

    Args:
        dropout_rate: Dropout probability applied to the pooled encoder output.
        model_name: Name or path of the pretrained checkpoint passed to
            ``BertModel.from_pretrained``. Defaults to "bert-base-uncased".
    """

    def __init__(self, dropout_rate=0.1, model_name="bert-base-uncased"):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        # Size the head from the encoder's config instead of hard-coding 768,
        # so checkpoints with a different hidden width work unchanged.
        self.fc = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one sigmoid-activated score per input row.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.

        Returns:
            Tensor of scores in [0, 1] with the trailing unit dimension removed.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return torch.sigmoid(self.fc(self.dropout(pooled_output))).squeeze(1)


## Загружаем данные

In [4]:
def load_data(
    path="/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv",
    test_size=0.2,
    random_state=42,
):
    """Read the review CSV and split it into train and test parts.

    Args:
        path: CSV file with a "review" text column and a "sentiment" column
            holding "positive" / "negative".
        test_size: Fraction of rows assigned to the test split.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        The ``train_test_split`` result: texts_train, texts_test,
        labels_train, labels_test (labels are 1 for positive, 0 for negative).

    Raises:
        ValueError: If the "sentiment" column contains a value other than
            "positive" or "negative".
    """
    data = pd.read_csv(path)
    texts = data["review"].values
    labels = data["sentiment"].map({"positive": 1, "negative": 0})
    # .map() turns unknown categories into NaN; fail loudly instead of
    # letting NaN labels flow into training.
    unmapped = labels.isna()
    if unmapped.any():
        unexpected = sorted(data.loc[unmapped, "sentiment"].astype(str).unique())
        raise ValueError(f"Unexpected sentiment values: {unexpected}")
    return train_test_split(
        texts, labels.values, test_size=test_size, random_state=random_state
    )

# Split once at module level; later cells tokenize X_train/X_test and build label tensors from y_train/y_test.
X_train, X_test, y_train, y_test = load_data()


## Токенизация

In [5]:
# Shared tokenizer instance; encode_texts() reads it as a module-level global.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def encode_texts(texts, max_len=128):
    """Tokenize texts into fixed-length model inputs.

    Args:
        texts: Iterable of strings to encode.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of tensors, each with one row
        per text and ``max_len`` columns.
    """
    texts = list(texts)
    if not texts:
        # Nothing to tokenize: return explicitly empty, correctly shaped
        # tensors rather than failing on an empty batch.
        empty = torch.empty((0, max_len), dtype=torch.long)
        return empty, empty.clone()
    # One batched tokenizer call replaces the per-text encode_plus loop
    # (encode_plus is deprecated in favour of calling the tokenizer) and
    # avoids building and concatenating one small tensor per text.
    encoded = tokenizer(
        texts, max_length=max_len, padding="max_length",
        truncation=True, return_tensors="pt"
    )
    return encoded["input_ids"], encoded["attention_mask"]

# Tokenize both splits once up front; objective() wraps these tensors in datasets on every trial.
print("Encoding data...")
X_train_ids, X_train_mask = encode_texts(X_train)
X_test_ids, X_test_mask = encode_texts(X_test)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Encoding data...


In [6]:
def evaluate(model, data_loader, device, threshold=0.5):
    """Compute classification accuracy of ``model`` over ``data_loader``.

    Args:
        model: Callable module taking ``(input_ids, attention_mask)`` and
            returning one score per example.
        data_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        device: Device every batch tensor is moved to before the forward pass.
        threshold: Scores strictly above this value are predicted as class 1.

    Returns:
        Fraction of correctly classified examples, or 0.0 when the loader
        yields no examples.

    Note:
        The model is switched to eval mode and left there; callers that keep
        training must call ``model.train()`` again.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            outputs = model(input_ids, attention_mask)
            # Flatten both sides so (batch,) predictions are never broadcast
            # against (batch, 1) labels, which would inflate the match count.
            predicted = (outputs > threshold).int().reshape(-1)
            labels = labels.reshape(-1)
            correct += (predicted == labels).sum().item()
            total += labels.numel()
    if total == 0:
        # Empty loader: avoid ZeroDivisionError.
        return 0.0
    return correct / total

## Optuna

In [7]:
def objective(trial):
    """Optuna objective: fine-tune a SentimentClassifier and score it.

    Args:
        trial: Optuna trial used to sample hyperparameters, report the
            per-epoch accuracy, and decide on pruning.

    Returns:
        The highest accuracy observed on the evaluation loader across epochs.

    Raises:
        optuna.exceptions.TrialPruned: When the pruner stops the trial early.
    """
    # Hyperparameters to optimize
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 5e-5, log=True),
        'batch_size': trial.suggest_categorical('batch_size', [16, 32]),
        'dropout_rate': trial.suggest_float('dropout_rate', 0.1, 0.5),
        'epochs': trial.suggest_int('epochs', 2, 4)
    }

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Build the DataLoaders
    train_dataset = TensorDataset(X_train_ids, X_train_mask, torch.tensor(y_train))
    train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)

    # NOTE(review): accuracy is measured on the X_test/y_test split, so the
    # study selects hyperparameters on the same data whose score it reports.
    # Consider carving a validation split out of the training data instead.
    test_dataset = TensorDataset(X_test_ids, X_test_mask, torch.tensor(y_test))
    test_loader = DataLoader(test_dataset, batch_size=params['batch_size'])

    # Model, optimizer and loss
    model = SentimentClassifier(dropout_rate=params['dropout_rate']).to(device)
    optimizer = AdamW(model.parameters(), lr=params['learning_rate'])
    criterion = nn.BCELoss()

    # Training
    best_accuracy = 0.0
    for epoch in range(params['epochs']):
        model.train()
        # Accumulate the loss on-device; calling .item() on every step would
        # force a device-to-host transfer per batch.
        running_loss = torch.zeros((), device=device)
        for batch in train_loader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels.float())
            loss.backward()
            optimizer.step()
            running_loss += loss.detach()

        # Keep the mean training loss with the trial instead of discarding it.
        mean_loss = running_loss.item() / max(len(train_loader), 1)
        trial.set_user_attr(f"train_loss_epoch_{epoch}", mean_loss)

        # Validation
        val_accuracy = evaluate(model, test_loader, device)
        trial.report(val_accuracy, epoch)

        # Early stopping via the study's pruner
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy

    return best_accuracy

In [8]:
# Seed the sampler so its random draws are reproducible across kernel restarts.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=1)
)

print("Starting optimization...")
# NOTE(review): in the recorded run a single trial took about 73 minutes,
# longer than timeout=3600, and only that one trial appears in the log.
# Raise the timeout or reduce the per-trial cost (fewer epochs, a data
# subsample) if several configurations are meant to be compared.
# gc_after_trial runs a garbage collection pass after every trial so the
# previous trial's model can be reclaimed before the next one is built.
study.optimize(objective, n_trials=10, timeout=3600, gc_after_trial=True)

[I 2025-06-04 21:24:42,013] A new study created in memory with name: no-name-36a6ebd2-83db-44c5-b664-eebf0dd2754c


Starting optimization...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

[I 2025-06-04 22:37:27,716] Trial 0 finished with value: 0.8929 and parameters: {'learning_rate': 4.015093216004315e-05, 'batch_size': 32, 'dropout_rate': 0.2023223846587023, 'epochs': 4}. Best is trial 0 with value: 0.8929.


## Результаты

In [9]:
# Report the winning trial: its objective value, then every sampled hyperparameter.
print("\nBest trial:")
trial = study.best_trial
print("  Accuracy: {:.4f}".format(trial.value))
print("  Params: ")
for param_name, param_value in trial.params.items():
    print("    " + f"{param_name}: {param_value}")


Best trial:
  Accuracy: 0.8929
  Params: 
    learning_rate: 4.015093216004315e-05
    batch_size: 32
    dropout_rate: 0.2023223846587023
    epochs: 4


In [10]:
# Plotting is best-effort: a missing plotting backend must not fail the notebook.
try:
    fig = optuna.visualization.plot_optimization_history(study)
    fig.show()
except Exception as exc:
    # Catch Exception rather than using a bare except, so KeyboardInterrupt and
    # SystemExit still propagate, and show the real cause instead of hiding it.
    print(f"Could not display plots in Kaggle environment: {exc!r}")