### Установка зависимостей

In [None]:
!pip install transformers
!pip install torch
!pip install datasets
!pip install sentencepiece
!pip install tiktoken

In [39]:
import torch

In [47]:
# MODEL_NAME = "slone/nllb-210-v1"
# MODEL_NAME = "Qwen/Qwen2.5-3B"
MODEL_NAME = "DeepPavlov/rubert-base-cased"


### Скачивание модели с huggingface

In [48]:
from huggingface_hub import snapshot_download

snapshot_download(MODEL_NAME, local_dir=f"./models/{MODEL_NAME}")

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

README.md:   0%|          | 0.00/584 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

flax_model.msgpack:   0%|          | 0.00/714M [00:00<?, ?B/s]

'/Users/olegslepcov/Projects/Studying/huggingface/models/DeepPavlov/rubert-base-cased'

### Загрузка модели

In [36]:
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM
)

model_path = f"models/{MODEL_NAME}"

# Загрузка токенизатора используемого при создании модели
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Загрузка модели
model = AutoModelForCausalLM.from_pretrained(model_path)

print(model.config)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Qwen2Config {
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "f

### Подготовка данных

In [37]:

texts = [
    "I love machine learning!",
    "This is not good at all.",
    "The weather is nice today."
]
# text = "Договор между Ивановым Иваном Ивановичем и Петровым Петром Петровичем."


# Токенизация
inputs = tokenizer(
    texts,
    padding=True,        # Добиваем до одинаковой длины
    truncation=True,     # Обрезаем слишком длинные тексты
    max_length=512,      # Максимальная длина
    return_tensors="pt"  # Возвращаем тензоры PyTorch
)

print("Input shapes:", inputs['input_ids'].shape)

Input shapes: torch.Size([3, 7])


### Запуск модели

In [41]:
# Переводим модель в режим оценки
model.eval()

# Переносим на GPU если доступно
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Получаем предсказания
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

# Интерпретируем результаты
predicted_classes = torch.argmax(predictions, dim=1)
print("Predictions:", predicted_classes)
print("Confidence scores:", predictions)

Predictions: tensor([[3, 0, 6,  ..., 1, 1, 1],
        [5, 5, 5,  ..., 0, 0, 0],
        [4, 3, 3,  ..., 0, 0, 0]])
Confidence scores: tensor([[[1.0731e-06, 3.5383e-05, 1.2774e-05,  ..., 5.4083e-12,
          5.4083e-12, 5.4083e-12],
         [3.0870e-05, 1.0055e-05, 1.3733e-07,  ..., 5.0468e-10,
          5.0468e-10, 5.0469e-10],
         [3.4299e-05, 6.4636e-07, 7.0810e-08,  ..., 2.5902e-11,
          2.5902e-11, 2.5902e-11],
         ...,
         [6.3756e-06, 1.6727e-07, 4.5533e-05,  ..., 8.4607e-12,
          8.4607e-12, 8.4607e-12],
         [1.6162e-04, 1.9000e-07, 5.0877e-05,  ..., 4.3288e-12,
          4.3288e-12, 4.3288e-12],
         [6.0461e-04, 1.7242e-06, 3.5297e-04,  ..., 1.6939e-11,
          1.6939e-11, 1.6939e-11]],

        [[1.2136e-06, 1.9201e-07, 8.8113e-07,  ..., 2.3913e-10,
          2.3913e-10, 2.3913e-10],
         [6.0037e-07, 2.4078e-06, 1.1083e-07,  ..., 2.9706e-11,
          2.9706e-11, 2.9706e-11],
         [1.0572e-05, 8.8316e-06, 1.0251e-07,  ..., 4.981

### Пост-обработка результатов

In [43]:
# # Если у модели есть метки классов
# if hasattr(model.config, 'id2label'):
#     labels = [model.config.id2label[idx] for idx in predicted_classes.tolist()]
#     print("Predicted labels:", labels)
# else:
#     print("Raw predictions:", predicted_classes.tolist())

# Для каждой строки выводим результат
for i, text in enumerate(texts):
    print(f"Text: {text}")
    print(f"Prediction: {labels[i] if 'labels' in locals() else predicted_classes[i]}")
    print(f"Confidence: {predictions[i].max().item():.4f}")
    print("-" * 50)


Text: I love machine learning!
Prediction: tensor([3, 0, 6,  ..., 1, 1, 1])
Confidence: 0.8881
--------------------------------------------------
Text: This is not good at all.
Prediction: tensor([5, 5, 5,  ..., 0, 0, 0])
Confidence: 0.9505
--------------------------------------------------
Text: The weather is nice today.
Prediction: tensor([4, 3, 3,  ..., 0, 0, 0])
Confidence: 0.6654
--------------------------------------------------


### Настройка обучения

In [None]:
from transformers import TrainingArguments, Trainer

# Аргументы обучения
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",  # Директория для сохранения
    num_train_epochs=3,              # Количество эпох
    per_device_train_batch_size=8,   # Размер батча
    per_device_eval_batch_size=8,
    learning_rate=2e-5,              # Скорость обучения
    weight_decay=0.01,               # Вес decay
    logging_dir='./logs',            # Директория для логов
    evaluation_strategy="epoch",     # Стратегия оценки
    save_strategy="epoch",
    load_best_model_at_end=True,
)

### Создание и запуск тренера

In [None]:
from transformers import DataCollatorWithPadding

# Data collator для автоматического padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Создаем тренер
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # В реальности нужен отдельный eval set
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Запускаем обучение
trainer.train()

# Сохраняем дообученную модель
trainer.save_model("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

### Тестирование дообученной модели

In [None]:
# Загружаем дообученную модель
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained("./fine_tuned_model")
fine_tuned_model.to(device)

# Тестируем на новых данных
test_text = "This product exceeded my expectations!"
test_input = tokenizer(test_text, return_tensors="pt").to(device)

with torch.no_grad():
    output = fine_tuned_model(**test_input)
    prediction = torch.softmax(output.logits, dim=-1)
    
print(f"Text: {test_text}")
print(f"Prediction: {prediction.argmax().item()}")
print(f"Confidence: {prediction.max().item():.4f}")

In [44]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = model_path
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

article = """
[Your article text here]
"""

prompt = f"""
Extract the key points from the following article:

{article}
"""

inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_length=1024)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(response)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Both `max_new_tokens` (=2048) and `max_length`(=1024) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Extract the key points from the following article:


[Your article text here]

Here are the key points from the article:

1. **Introduction to the Problem**: The article discusses the challenges faced by individuals with disabilities in accessing public transportation, specifically focusing on the need for accessible and inclusive systems.

2. **Current State of Accessibility**: It highlights the current state of public transportation accessibility, noting that while some improvements have been made, there is still a significant gap in meeting the needs of people with disabilities.

3. **Barriers to Access**: The article identifies several barriers that prevent people with disabilities from using public transportation effectively, including physical barriers, lack of information, and inadequate support services.

4. **Case Studies**: It presents case studies of individuals who have faced difficulties due to these barriers, emphasizing the personal impact on their lives and well-being.

In [None]:

from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = model_path
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

article = """
[Your article text here]
"""

text = "Договор между Ивановым Иваном Ивановичем и Петровым Петром Петровичем."

inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_length=1024)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(response)
