# НИР

В работе дообучим модель bert-base-multilingual-cased для распознавания текста

In [1]:
import json
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split

In [2]:
import zipfile

with zipfile.ZipFile('data/train.json.zip', 'r') as zip_ref:
    zip_ref.extractall('data')

FileNotFoundError: [Errno 2] No such file or directory: 'data/train.json.zip'

In [3]:
data = []
with open('data/qas/combined_dataset_with_responses_and_classification.json', 'r') as file:
    for line in file:
        try:
            data.append(json.loads(line.strip()))
        except json.JSONDecodeError:
            continue

# Преобразование данных в DataFrame для удобства работы с ними
df = pd.DataFrame(data)
df.head(5)

Unnamed: 0,Question,Answer,ModelResponse,Classification
0,Was Abraham Lincoln the sixteenth President of...,yes,"Yes, Abraham Lincoln was indeed the 16th Presi...",yes
1,Did Lincoln sign the National Banking Act of 1...,yes,"No, Abraham Lincoln did not sign the National ...",no
2,Did his mother die of pneumonia?,no,"I apologize, but there is no information provi...",neither
3,Did Lincoln beat John C. Breckinridge in the 1...,yes,"Actually, Abraham Lincoln did not face John C....",no
4,Was Abraham Lincoln the first President of the...,No,"No, Abraham Lincoln was not the first Presiden...",no


# Функция предобработки текстовых данных

In [4]:
def preprocess_data(text_list, tokenizer, max_length=128):
    input_ids = []
    attention_masks = []

    for text in text_list:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True, 
            max_length=max_length,
            padding='max_length',  # Дополнение последовательности до максимальной длины
            truncation=True,  # Усечение длинных последовательностей
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

# Подготовка данных к обучению и валидации
Кодирование текста с помощью токенизатора BERT.
Разделение данных на обучающую и валидационную выборки.
Создание датасетов для обучения и валидации


In [5]:
# Загрузка предобученной модели и токенизатора BERT
model_name = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3) # Настройка количества меток (классов)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available. Training on GPU.")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Training on CPU.")
model.to(device)


# Предобработка текстовых данных и подготовка входных данных для модели
input_ids, attention_masks = preprocess_data(df['ModelResponse'].to_list(), tokenizer)
# Преобразование меток в числовой формат
labels = df['Classification'].apply(lambda x: 0 if x == 'neither' else 1 if x == 'yes' else 2).values
labels = torch.tensor(labels).to(device)

train_size = int(0.8 * len(df))
val_size = len(df) - train_size

train_dataset, val_dataset = random_split(TensorDataset(input_ids, attention_masks, labels), [train_size, val_size])

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CUDA is available. Training on GPU.


# Функция для обучения модели

In [6]:
def train(model, train_dataloader, val_dataloader, epochs=4):
    optimizer = AdamW(model.parameters(), lr=2e-5)  # Оптимизатор AdamW с заданной скоростью обучения

    for epoch in range(epochs):
        model.train()  # Переключение модели в режим обучения
        total_loss = 0  # Переменная для хранения общей потери

        # Обучение на обучающей выборке
        for batch in train_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            
            model.zero_grad()  # Обнуление градиентов
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss  # Получение значения потери
            total_loss += loss.item()  # Добавление потери к общей сумме
            loss.backward()  # Обратное распространение ошибки
            optimizer.step()  # Обновление параметров модели

        avg_train_loss = total_loss / len(train_dataloader)
    
        print(f"Training loss: {total_loss / len(train_dataloader)}")

        model.eval()  # Переключение модели в режим оценки
        val_accuracy = 0  # Переменная для хранения общей точности на валидационной выборке
        val_loss = 0  # Переменная для хранения общей потери на валидационной выборке

        # Оценка на валидационной выборке
        for batch in val_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            
            with torch.no_grad():
                outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

            loss = outputs.loss  # Получение значения потери
            logits = outputs.logits  # Получение логитов
            val_loss += loss.item()  # Добавление потери к общей сумме
            preds = torch.argmax(logits, dim=1).flatten()  # Получение предсказаний
            accuracy = (preds == b_labels).cpu().numpy().mean() * 100  # Вычисление точности
            val_accuracy += accuracy

        avg_val_accuracy = val_accuracy / len(val_dataloader)  # Средняя точность на валидационной выборке
        avg_val_loss = val_loss / len(val_dataloader)  # Средняя потеря на валидационной выборке

        # Вывод результатов текущей эпохи
        print(f'Epoch {epoch+1}/{epochs}')
        print(f'Training Loss: {avg_train_loss:.2f}')
        print(f'Validation Loss: {avg_val_loss:.2f}')
        print(f'Validation Accuracy: {avg_val_accuracy:.2f}%')


# Выполнение обучения модели

In [7]:
train(model, train_dataloader, val_dataloader, epochs=20)

model.save_pretrained('rusentiment_bert_model')
tokenizer.save_pretrained('rusentiment_bert_model')




Training loss: 0.6075519683305174
Epoch 1/20
Training Loss: 0.61
Validation Loss: 0.33
Validation Accuracy: 87.81%
Training loss: 0.305311486730352
Epoch 2/20
Training Loss: 0.31
Validation Loss: 0.32
Validation Accuracy: 88.44%
Training loss: 0.22287968659074978
Epoch 3/20
Training Loss: 0.22
Validation Loss: 0.31
Validation Accuracy: 88.12%
Training loss: 0.1720857791369781
Epoch 4/20
Training Loss: 0.17
Validation Loss: 0.41
Validation Accuracy: 87.50%
Training loss: 0.12582770636072382
Epoch 5/20
Training Loss: 0.13
Validation Loss: 0.44
Validation Accuracy: 88.37%
Training loss: 0.13034197927918284
Epoch 6/20
Training Loss: 0.13
Validation Loss: 0.46
Validation Accuracy: 87.19%
Training loss: 0.05664774967881385
Epoch 7/20
Training Loss: 0.06
Validation Loss: 0.54
Validation Accuracy: 84.93%
Training loss: 0.05183425411814824
Epoch 8/20
Training Loss: 0.05
Validation Loss: 0.56
Validation Accuracy: 87.81%
Training loss: 0.06156612955091987
Epoch 9/20
Training Loss: 0.06
Validation

('rusentiment_bert_model/tokenizer_config.json',
 'rusentiment_bert_model/special_tokens_map.json',
 'rusentiment_bert_model/vocab.txt',
 'rusentiment_bert_model/added_tokens.json')

# Функция для предсказания класса текста

In [8]:
def predict(text, model, tokenizer):
    model.eval()  # Переключение модели в режим оценки
    inputs = tokenizer.encode_plus(text, return_tensors='pt', max_length=128, truncation=True, padding='max_length')
    input_ids = inputs['input_ids'].to(device)  # Получение идентификаторов токенов
    attention_mask = inputs['attention_mask'].to(device)  # Получение масок внимания

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)  # Получение выходов модели
        logits = outputs.logits  # Получение логитов
        predicted_class = torch.argmax(logits, dim=1).item()  # Определение предсказанного класса

    return predicted_class

In [9]:
# Тестирование функции предсказания на примере текста
test_text = "No"
prediction = predict(test_text, model, tokenizer)
print(f'Text: {test_text}')
print(f'Prediction: {prediction} {"Neither" if prediction == 0 else "Yes" if prediction == 1 else "No"}')


Text: No
Prediction: 2 No


In [10]:
test_text = """
No, Anders Celsius was not born in Uppsala, Sweden. He was actually born on November 27, 1701, in Uppsala, Sweden, but he spent most of his life in the southern part of Sweden.
"""
prediction = predict(test_text, model, tokenizer)
print(f'Text: {test_text}')
print(f'Prediction: {prediction} {"Neither" if prediction == 0 else "Yes" if prediction == 1 else "No"}')

Text: 
No, Anders Celsius was not born in Uppsala, Sweden. He was actually born on November 27, 1701, in Uppsala, Sweden, but he spent most of his life in the southern part of Sweden.

Prediction: 2 No


In [11]:
test_text = """
I apologize, but there isn't enough information provided to answer this question accurately. Could you please provide more context or clarify which "his father" you are referring to?
"""
prediction = predict(test_text, model, tokenizer)
print(f'Text: {test_text}')
print(f'Prediction: {prediction} {"Neither" if prediction == 0 else "Yes" if prediction == 1 else "No"}')

Text: 
I apologize, but there isn't enough information provided to answer this question accurately. Could you please provide more context or clarify which "his father" you are referring to?

Prediction: 0 Neither


In [12]:
test_text = """
The modern hi-hat, also known as the cymbal pair, did not directly evolve from clash cymbals. Clash cymbals were used in the early days of drumming to create a loud, sharp sound by striking two cymbals together.

The modern hi-hat, on the other hand, is a specific type of cymbal pair that consists of two cymbals mounted on a pedal-controlled mechanism. This design allows for the cymbals to be opened and closed quickly, creating a distinctive "chick" sound.

While clash cymbals were used in early drumming, the modern hi-hat as we know it today was developed later, in the mid-20th century. The first patent for a pedal-controlled cymbal pair was granted to William F. Ludwig Sr. in 1939, and his company, W.F.Ludwig & Co., began manufacturing and selling the design.

So while clash cymbals may have influenced the development of drumming techniques and sounds, they did not directly contribute to the evolution of the modern hi-hat as we know it today.
"""
prediction = predict(test_text, model, tokenizer)
print(f'Text: {test_text}')
print(f'Prediction: {prediction} {"Neither" if prediction == 0 else "Yes" if prediction == 1 else "No"}')

Text: 
The modern hi-hat, also known as the cymbal pair, did not directly evolve from clash cymbals. Clash cymbals were used in the early days of drumming to create a loud, sharp sound by striking two cymbals together.

The modern hi-hat, on the other hand, is a specific type of cymbal pair that consists of two cymbals mounted on a pedal-controlled mechanism. This design allows for the cymbals to be opened and closed quickly, creating a distinctive "chick" sound.

While clash cymbals were used in early drumming, the modern hi-hat as we know it today was developed later, in the mid-20th century. The first patent for a pedal-controlled cymbal pair was granted to William F. Ludwig Sr. in 1939, and his company, W.F.Ludwig & Co., began manufacturing and selling the design.

So while clash cymbals may have influenced the development of drumming techniques and sounds, they did not directly contribute to the evolution of the modern hi-hat as we know it today.

Prediction: 2 No


In [13]:
test_text = """
The question implies that we are discussing someone who made a significant discovery or created something notable. To answer your question, yes, many scientists and researchers publish their work to share it with others in the field and make it accessible for further study and potential applications.

Publishing research involves submitting an article or paper to a reputable journal or conference proceedings, where it undergoes peer review before being accepted for publication. This process helps ensure that the work is of high quality, accurate, and contributes meaningfully to the existing body of knowledge.

In addition to publishing in traditional journals, researchers may also share their findings through online platforms, blogs, or social media. Some may present their work at conferences or workshops, either in person or virtually."""
prediction = predict(test_text, model, tokenizer)
print(f'Text: {test_text}')
print(f'Prediction: {prediction} {"Neither" if prediction == 0 else "Yes" if prediction == 1 else "No"}')

Text: 
The question implies that we are discussing someone who made a significant discovery or created something notable. To answer your question, yes, many scientists and researchers publish their work to share it with others in the field and make it accessible for further study and potential applications.

Publishing research involves submitting an article or paper to a reputable journal or conference proceedings, where it undergoes peer review before being accepted for publication. This process helps ensure that the work is of high quality, accurate, and contributes meaningfully to the existing body of knowledge.

In addition to publishing in traditional journals, researchers may also share their findings through online platforms, blogs, or social media. Some may present their work at conferences or workshops, either in person or virtually.
Prediction: 1 Yes
