# Импорт библиотек

In [20]:
# %load_ext autoreload
# %autoreload 2

In [1]:
import torch
import pandas as pd
import numpy as np
from pathlib import Path

from transformers import BertTokenizer
from torch.optim import AdamW

from sklearn.metrics import cohen_kappa_score

from src.models.bert import BERTFinetune, BERTDataset
from src.models.wrapper import ScoreRegressor
from src.models.visualization import plot_bert_architecture


  from .autonotebook import tqdm as notebook_tqdm


## Константы

In [2]:
BASE_DIR = Path(r"E:\projects\AutoGrade-ENG-Writing")

TEXT = "Text"
TARGET = "Solving a communicative task"

# Загрузка данных

In [3]:
train_data = pd.read_excel(BASE_DIR.joinpath("email_train_data_neznaika.xlsx"), index_col=0)
test_data = pd.read_excel(BASE_DIR.joinpath("email_test_data_neznaika.xlsx"), index_col=0)

# Инициализация модели

In [4]:
class CFG:
    model_name = "bert-base-uncased"  # Model name for reference
    learning_rate = 0.000016 # Learning rate
    epsilon = 1e-6
    weight_decay = 0.03  # Weight decay (L2 regularization)
    hidden_dropout_prob = 0.007  # Dropout probability for hidden layers
    attention_probs_dropout_prob = 0.007  # Dropout probability for attention layers
    num_train_epochs = 1  # Number of training epochs
    n_splits = 4  # Number of splits for cross-validation
    batch_size = 16  # Batch size for training data
    random_seed = 42  # Random seed for reproducibility
    save_steps = 50  # Number of steps before saving model checkpoints
    max_length = 100  # Maximum sequence length for input data
    runs_folder = "data/runs/"

In [5]:
def cohence_kappa_metric(labels: torch.tensor, predictions: torch.tensor) -> float:
    if torch.is_tensor(labels):
        labels = labels.numpy()
    if torch.is_tensor(predictions):
        predictions = predictions.numpy()
    labels = np.round(labels).astype(int)
    predictions = np.round(predictions).astype(int)
    return cohen_kappa_score(labels, predictions)

In [9]:
bert_model = BERTFinetune()
bert_optimizer = AdamW(bert_model.parameters(), lr=CFG.learning_rate, eps=CFG.epsilon)
bert_tokenizer = BertTokenizer.from_pretrained(bert_model.model_name)

train_dataset = BERTDataset(input=train_data[TEXT].to_list(), labels=train_data[TARGET].to_list(), tokenizer=bert_tokenizer, max_length=CFG.max_length)
test_dataset = BERTDataset(input=test_data[TEXT].to_list(), labels=test_data[TARGET].to_list(), tokenizer=bert_tokenizer)

regressor = ScoreRegressor(
    model=bert_model,
    optimizer=bert_optimizer,
    train_dataset=train_dataset,
    test_dataset=test_dataset,
    metric_function=cohence_kappa_metric,
    runs_folder=BASE_DIR.joinpath(CFG.runs_folder),
    epochs=CFG.num_train_epochs
)

Using device: cpu


In [7]:
train_results = regressor.train()

Start training...

 Epoch  |  Batch  |  Train Loss |  Val Loss  |  Elapsed 
------------------------------------------------------------------------------------------------------------------------------------------------------
   1    |   20    |   0.398071   |     -      |     -      |     -      |     -      |  104.95  
   1    |   40    |   0.401158   |     -      |     -      |     -      |     -      |   98.59  
   1    |   60    |   0.351425   |     -      |     -      |     -      |     -      |   98.53  
   1    |   80    |   0.354865   |     -      |     -      |     -      |     -      |   99.18  
   1    |   100   |   0.317263   |     -      |     -      |     -      |     -      |   98.94  
   1    |   101   |   0.430567   |     -      |     -      |     -      |     -      |   4.96   
------------------------------------------------------------------------------------------------------------------------------------------------------
Evaluation Metric:  0.0
Saving best mode

Unnamed: 0,train_loss,test_loss
0,0.365532,0.804748
