In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

train_data = pd.read_json('/kaggle/input/sentiment-analysis-in-russian/train.json')
train_data['sentiment'] = train_data['sentiment'].map({'positive': 0, 'neutral': 1, 'negative': 2})
train_data, test_data = train_test_split(train_data, test_size=0.2, random_state=42)

In [2]:
!pip install datasets



In [3]:
from transformers import BertTokenizer
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForPreTraining

tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")

model = AutoModelForPreTraining.from_pretrained("cointegrated/rubert-tiny2")

train_input_ids = []
train_attention_masks = []

for text in train_data['text']:
    encoding = tokenizer.encode_plus(text, 
                         add_special_tokens=True, 
                         max_length=1024, 
                         padding='max_length', 
                         truncation=True, 
                         return_attention_mask=True,
                         return_tensors='pt')
    
    train_input_ids.append(encoding['input_ids'])
    
    train_attention_masks.append(encoding['attention_mask'])

train_input_ids = torch.cat(train_input_ids, dim = 0)
train_attention_masks = torch.cat(train_attention_masks, dim = 0)

test_input_ids = []
test_attention_masks = []

for text in test_data['text']:
    encoding = tokenizer.encode_plus(text, 
                         add_special_tokens=True, 
                         max_length=1024, 
                         padding='max_length', 
                         truncation=True, 
                         return_attention_mask=True,
                         return_tensors='pt')
    
    test_input_ids.append(encoding['input_ids'])
    
    test_attention_masks.append(encoding['attention_mask'])

test_input_ids = torch.cat(test_input_ids, dim = 0)
test_attention_masks = torch.cat(test_attention_masks, dim = 0)


tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

In [4]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

train_labels = torch.tensor(train_data.sentiment.values)
test_labels = torch.tensor(test_data.sentiment.values)

train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)

batch_size = 16

train_loader = DataLoader(
            train_dataset, 
            sampler = RandomSampler(train_dataset), 
            batch_size = batch_size 
        )
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [10]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score
def train_cls(model, train_loader, optimizer):
    model.train()
    total_loss = 0
    train_preds = []
    train_labels = []

    for input_ids, attention_mask, labels in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

        train_preds.extend(torch.argmax(logits, dim=1).tolist())
        train_labels.extend(labels.tolist())

    train_accuracy = accuracy_score(train_labels, train_preds)
    train_loss = total_loss / len(train_loader)
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    return train_loss, train_accuracy

In [11]:
def val_cls(model, val_loader):
    model.eval()
    val_preds = []
    val_labels = []
    val_loss = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in tqdm(val_loader):
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            val_loss += loss.item()

            val_preds.extend(torch.argmax(logits, dim=1).tolist())
            val_labels.extend(labels.tolist())
    val_accuracy = accuracy_score(val_labels, val_preds)
    val_loss /= len(val_loader)
    print(f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")
    return val_loss, val_accuracy

In [5]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [23]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    "cointegrated/rubert-tiny2", 
    num_labels = 3,  
    output_hidden_states = False, 
)
opt = torch.optim.AdamW(model.parameters(), lr=5e-5)
model.cuda()

num_epochs = 3
for epoch in range(num_epochs):
    _,cls_acc = train_cls(model, train_loader, opt)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 414/414 [01:32<00:00,  4.48it/s]


Train Loss: 0.8109, Train Accuracy: 0.6139


100%|██████████| 414/414 [01:32<00:00,  4.48it/s]


Train Loss: 0.6067, Train Accuracy: 0.7352


100%|██████████| 414/414 [01:32<00:00,  4.48it/s]

Train Loss: 0.4194, Train Accuracy: 0.8304





In [24]:
_,val_cls_acc = val_cls(model, test_loader)

100%|██████████| 104/104 [00:07<00:00, 13.69it/s]

       Val Loss: 0.7059, Val Accuracy: 0.7042





In [25]:
val_cls_acc

0.7041742286751361

In [6]:
def train_mean(model, train_loader, opt):
    train_preds = []
    train_labels = []
    model.train()
    for input_ids, attention_mask, labels in tqdm(train_loader):
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels,
                        return_dict=True)

        mean = torch.mean(outputs.hidden_states[-1][:, 1:], dim=1)
        logits = model.classifier(mean)
        loss = torch.nn.functional.cross_entropy(logits, labels)
        loss.backward()

        opt.step()
        opt.zero_grad()

        logits = outputs.logits
        train_preds.extend(torch.argmax(logits, dim=-1).tolist())
        train_labels.extend(labels.tolist())
        train_accuracy = accuracy_score(train_labels, train_preds)
    print(f"Train Accuracy: {train_accuracy:.4f}")
        
    return train_accuracy

In [7]:
def eval_mean(model, test_loader):
    val_preds = []
    val_labels = []
    model.eval()
    for input_ids, attention_mask, labels in tqdm(test_loader):
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        with torch.no_grad():
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels,
                            return_dict=True)
            mean = torch.mean(outputs.hidden_states[-1][:, 1:], dim=1)
            logits = model.classifier(mean)
        predictions = torch.argmax(logits, dim=-1)
        val_preds.extend(torch.argmax(logits, dim=-1).tolist())
        val_labels.extend(labels.tolist())
        val_accuracy = accuracy_score(val_labels, val_preds)

    return val_accuracy

In [12]:
from transformers import BertForSequenceClassification

# Определение модели, оптимизатора и других параметров
model = BertForSequenceClassification.from_pretrained(
    "cointegrated/rubert-tiny2", 
    num_labels=3,
    output_hidden_states=True
)
model.cuda()

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Запуск обучения и валидации
num_epochs = 3
for epoch in range(num_epochs):
    train_accuracy = train_mean(model, train_loader, optimizer)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 414/414 [01:33<00:00,  4.41it/s]


Train Accuracy: 0.6083


100%|██████████| 414/414 [01:33<00:00,  4.45it/s]


Train Accuracy: 0.7315


100%|██████████| 414/414 [01:32<00:00,  4.45it/s]

Train Accuracy: 0.8440





In [13]:
eval_mean(model, test_loader)

100%|██████████| 104/104 [00:07<00:00, 13.53it/s]


0.7011494252873564