In [None]:
from IPython.display import display, Javascript

# Inject a JavaScript snippet into the notebook front end. The snippet defines
# `saveNotebook()`, which logs a message to the browser console and calls
# `IPython.notebook.save_checkpoint()`, and schedules it with `setInterval`
# every 60000 ms.
# NOTE(review): the interval handle is never stored or cleared, so every
# re-run of this cell registers one more timer.
# NOTE(review): `IPython.notebook` looks like the classic Notebook JS object;
# the `/content/...` path used later suggests Colab, where it may not exist — confirm.
display(Javascript('''
    function saveNotebook() {
        console.log("Автосохранение ноутбука...");
        IPython.notebook.save_checkpoint();
    }
    setInterval(saveNotebook, 60000);  // Сохранение каждые 60 секунд
'''))

<IPython.core.display.Javascript object>

In [None]:
!pip install datasets



In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import numpy as np

In [None]:
# Load the data: per the author's note, a dataset with 11 genres.
df = pd.read_excel(
    '/content/RuFoLa_new texts_cleaned.xlsx',
)

# Model input is the `text` column; the target is the `text_genre` column.
X, y = df['text'], df['text_genre']

In [None]:
# Split the data: 20% of the samples are held out as the test set, and the
# fixed random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Encode the genre names as integer class ids.
# The encoder is fitted on the full label column `y` rather than on `y_train`
# alone: a genre that happens to land only in the test split would otherwise
# make `transform(y_test)` raise ValueError. LabelEncoder sorts its classes,
# so the ids are unchanged whenever the training split already contains
# every genre.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [None]:
# Convert both splits into the Hugging Face `Dataset` format expected by the
# Trainer, with a `text` column and an integer `label` column.
# `preserve_index=False` stops the shuffled pandas index left over from
# `train_test_split` from being stored as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(
    pd.DataFrame({'text': X_train, 'label': y_train_encoded}),
    preserve_index=False,
)
test_dataset = Dataset.from_pandas(
    pd.DataFrame({'text': X_test, 'label': y_test_encoded}),
    preserve_index=False,
)

# Bundle the splits so they can be processed together.
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
})

In [None]:
# Pretrained checkpoint to fine-tune (noted by the author as one of the largest models).
model_name = "joeddav/xlm-roberta-large-xnli"

# Tokenizer loaded from the same checkpoint.
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)

# Sequence-classification model with one output per encoded genre.
# ignore_mismatched_sizes=True: do not fail on weights whose shape differs
# from the checkpoint.
model = XLMRobertaForSequenceClassification.from_pretrained(
    model_name,
    ignore_mismatched_sizes=True,
    num_labels=len(label_encoder.classes_),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to 

In [None]:
# Tokenization
def preprocess_function(examples):
    """Tokenize the ``text`` entries of ``examples``.

    Passes ``examples['text']`` to the module-level ``tokenizer`` with
    truncation enabled and padding to a fixed ``max_length`` of 128, and
    returns the tokenizer's result unchanged.
    """
    texts = examples['text']
    return tokenizer(
        texts,
        max_length=128,
        padding='max_length',
        truncation=True,
    )

# Run the tokenizer over every split, handing it examples in batches.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/1788 [00:00<?, ? examples/s]

Map:   0%|          | 0/448 [00:00<?, ? examples/s]

In [None]:
# Metric computation function
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation.

    Args:
        p: object exposing ``predictions`` (per-class scores; the arg-max over
            axis 1 is taken as the predicted class) and ``label_ids`` (the
            reference class ids).

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    true_ids = p.label_ids

    results = {'accuracy': accuracy_score(true_ids, predicted_ids)}
    weighted_scores = precision_recall_fscore_support(
        true_ids, predicted_ids, average='weighted'
    )
    # The fourth element returned (support) is not reported; zip stops after three names.
    results.update(zip(('precision', 'recall', 'f1'), weighted_scores))
    return results

In [None]:
training_args = TrainingArguments(
    # Output locations and logging frequency.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate and save a checkpoint once per epoch, and reload the best
    # checkpoint when training ends.
    # NOTE(review): newer transformers releases spell the first argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=4,
    # Small per-device batches.
    # NOTE(review): the original comment says this is to speed things up;
    # small batches usually reduce memory use rather than run time — confirm intent.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Use mixed precision training to save memory.
    fp16=True,
)



In [None]:
# Initialization: wire the model, training arguments, tokenized splits and
# metric function into a Trainer.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],  # the test split doubles as the evaluation set
)

# Training. As the last expression of the cell, the return value of
# `train()` is displayed as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss


Обучение этой модели на имеющемся датасете из 2236 образцов (1788 обучающих и 448 тестовых) могло бы занять 100 часов. Её применение целесообразно при наличии достаточного времени или более мощного оборудования.

In [None]:
# Evaluation: run the trainer's evaluation pass (on the `eval_dataset` given
# to the Trainer, i.e. the tokenized test split) and print the returned metrics.
metrics = trainer.evaluate()
print(f"Metrics: {metrics}")

In [None]:
# Predictions: run the model on the tokenized test split.
predictions = trainer.predict(
    tokenized_datasets['test'],
)

# Take the highest-scoring class per example, then map the class ids back to
# the original genre names.
preds = np.argmax(predictions.predictions, axis=1)
y_pred = label_encoder.inverse_transform(preds)

In [None]:
# Report: per-class summary comparing the true genre names with the predicted ones.
classification_report_str = classification_report(y_test, y_pred)

# Print the heading and the report on separate lines.
print('Classification Report:', classification_report_str, sep='\n')