## **Установка требуемых библиотек**

In [None]:
!pip install transformers datasets torch pymorphy3 gensim stop_words



### **Загрузка библиотек**

In [None]:
import nltk
import re
import pymorphy3
import torch
import numpy as np
import pandas as ps
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, f1_score, accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from gensim.models import Word2Vec

nltk.download('stopwords')
nltk.download('punkt_tab')
RUSSIAN_STOP_WORDS = set(stopwords.words('russian'))

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

### **Анализ датасета**

In [5]:
dataset = load_dataset("MonoHime/ru_sentiment_dataset")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train.csv:   0%|          | 0.00/276M [00:00<?, ?B/s]

valid.csv:   0%|          | 0.00/32.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/189891 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/21098 [00:00<?, ? examples/s]

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'sentiment'],
        num_rows: 189891
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'text', 'sentiment'],
        num_rows: 21098
    })
})

In [7]:
dataset['train'][0]

{'Unnamed: 0': 21098,
 'text': '.с.,и спросил его:  о Посланник Аллаха!Ты порицаешь что-то из слушания?  Он ответил: я не порицаю ничего из него,но передай им,чтобы они открывали свои собрания Кораном и закрывали их Кораном ...........Это дошедшие до нас мнения и тот кто находится в поисках истины,по мере изучения этого вопроса будет сталкиваться с разногласиями и будет оставаться в растерянности или склонится к мнению одной из сторон по своему желанию.Но всего этого недостаточно,потому что он сам должен найти истину,подробно изучив вопросы запретного и разрешённого.|||||||||||||||||||||||||||||||||||||Обрати внимание:основатели всех четырёх мазхабов осуждали песни и поэззию.И этим часто грешат заблудшие суфии.Исключение делается для исламской тематики  Сахих БухариО проявлении радости во время праздника.498 (949). Сообщается, что ‘Аиша, да будет доволен ею Аллах, сказала: (Однажды) посланник Аллаха, да благословит его Аллах и приветствует, вошёл ко мне в то время, когда у меня находил

In [None]:
train_test = dataset['train'].train_test_split(test_size=0.2)
train_dataset = train_test['train']
test_dataset = train_test['test']

## **Лемматизация и очистка текста**

In [None]:
stop_words = set(stopwords.words('russian'))
morph = pymorphy3.MorphAnalyzer()

def preproc_nltk(text):
    tokens = word_tokenize(text.lower())

    lemmatized_tokens = [
        morph.parse(word)[0].normal_form
        for word in tokens
        if word.isalnum()
        and word.isalpha()
        and word not in stop_words
        and not word.startswith('@')]

    preprocessed_text = ' '.join(lemmatized_tokens)
    return preprocessed_text

# **Мешок слов**

In [None]:
bow_vectorizer = CountVectorizer(
    preprocessor=preproc_nltk,
    min_df=300,
    max_df=10000,
    binary=False)

bow_vectors = bow_vectorizer.fit_transform(train_dataset['text'][:10000])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(bow_vectors, train_dataset['sentiment'][:10000], test_size=0.2, random_state=42)
X_train.shape

In [None]:
clf = LogisticRegression(max_iter=500)
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

## **Наивный байесовский классификатор на мешке слов** 

In [None]:
nb = BernoulliNB(binarize=True)
nb.fit(X_train, y_train)

In [None]:
y_pred = nb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

## **TF-IDF**

In [None]:
vectorizer = TfidfVectorizer(preprocessor=preproc_nltk, min_df=50, max_df=100000)
vectors = vectorizer.fit_transform(train_dataset['text'][:10000])

In [None]:
dense_vectors = vectors.todense()
dense_vectors.shape

In [None]:
X_test = np.asarray(X_test)
y_test = np.asarray(y_test)

y_pred = nb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

In [None]:
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

nb = BernoulliNB()
nb.fit(X_train, y_train)

In [None]:
clf = LogisticRegression(max_iter=500)
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

## **TF-IDF (N-грамм)**

In [None]:
vectorizer = TfidfVectorizer(preprocessor=preproc_nltk,
                             min_df=300,
                             max_df=10000,
                             ngram_range=(1, 2),
                             binary=True)
vectors = vectorizer.fit_transform(train_dataset['text'][:10000])

In [None]:
vectors = vectorizer.fit_transform(train_dataset['text'][:10000])
dense_vectors = vectors.todense()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    dense_vectors,
    train_dataset['sentiment'][:10000],
    test_size=0.2,
    random_state=0)

In [None]:
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

nb = BernoulliNB()
nb.fit(X_train, y_train)

In [None]:
X_test = np.asarray(X_test)
y_test = np.asarray(y_test)

y_pred = nb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

In [None]:
clf = LogisticRegression(max_iter=100)
clf.fit(X_train, y_train)

In [None]:
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

## Использование эмбеддингов
#### Oбучение модели на основе самого текста

In [None]:
proc_words = [preproc_nltk(text).split() for text in tqdm(train_dataset['text'][:10000], desc="Tokenizing")]

In [None]:
model = Word2Vec(
    sentences=proc_words,
    vector_size=60,
    window=3,
    min_count=50,
    workers=4,
    epochs=10)

In [None]:
def vectorize_sum(comment, embeddings, max_length=60):
    embedding_dim = embeddings.vector_size
    features = np.zeros([max_length, embedding_dim], dtype='float32')
    words = preproc_nltk(comment).split()[:max_length]

    for i, word in enumerate(words):
        if word in embeddings.wv:
            features[i] = embeddings.wv[word]

    return features

In [None]:
X_wv = np.stack([vectorize_sum(text, model) for text in train_dataset['text'][:10000]])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_wv, train_dataset['sentiment'][:10000], test_size=0.3, random_state=0)

In [None]:
X_train.shape

In [None]:
X_train_flattened = X_train.reshape(X_train.shape[0], -1)
X_test_flattened = X_test.reshape(X_test.shape[0], -1)
clf.fit(X_train_flattened, y_train[:10000])
y_pred = clf.predict(X_test_flattened)
accuracy = accuracy_score(y_test[:10000], y_pred)
print(accuracy)

In [None]:
embeddings_pretrained = api.load('word2vec-ruscorpora-300')

In [None]:
word2vec_model = Word2Vec(proc_words, vector_size=60, window=3, workers=4)
X_train_w2v = np.array([vectorize_sum(text, word2vec_model) for text in train_dataset['text'][:10000]])

In [None]:
dataset = load_dataset("MonoHime/ru_sentiment_dataset")
train_texts = dataset["train"]["text"][:10000]
train_labels = dataset["train"]["sentiment"][:10000]

In [None]:
MAX_LEN = 40
EMBED_DIM = 256
NHEAD = 8
FFN_HID_DIM = 256
NUM_LAYERS = 8
BATCH_SIZE = 16
EPOCHS = 15
LR = 1e-4
num_classes = 3

In [None]:
def preprocess_text(text):
    if not isinstance(text, str):
        text = ""

    text = re.sub(r'@\w+\b', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'[^а-яё\s]', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'\s+', ' ', text).strip()

    tokens = word_tokenize(text, language='russian')
    lemmas = [
        morph.parse(t)[0].normal_form for t in tokens
        if t not in RUSSIAN_STOP_WORDS and len(t) >= 2
    ]

    return ' '.join(lemmas)

In [None]:
morph = MorphAnalyzer()
train_texts = [preprocess_text(t) for t in train_texts]

In [None]:
train_texts = [preprocess_text(t) for t in train_texts]

In [None]:
def build_vocab(texts, min_df=2):
    vectorizer = CountVectorizer(
        min_df=min_df,
        tokenizer=lambda x: x.split(),
        token_pattern=None
    )
    vectorizer.fit(texts)
    vocab = {word: idx+2 for idx, word in enumerate(vectorizer.get_feature_names_out())}
    vocab['<pad>'] = 0
    vocab['<unk>'] = 1
    return vocab

vocab = build_vocab(train_texts)
VOCAB_SIZE = len(vocab)

In [None]:
class CommentDataset(Dataset):
    def __init__(self, texts, labels, vocab, max_len):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.texts[idx].split()
        token_ids = [self.vocab.get(t, self.vocab['<unk>']) for t in tokens[:self.max_len]]

        if len(token_ids) < self.max_len:
            token_ids += [self.vocab['<pad>']] * (self.max_len - len(token_ids))

        return {
            'input_ids': torch.tensor(token_ids, dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }


texts_train, texts_val, labels_train, labels_val = train_test_split(
    train_texts, train_labels, test_size=0.1, random_state=42
)

train_data = CommentDataset(texts_train, labels_train, vocab, MAX_LEN)
val_data = CommentDataset(texts_val, labels_val, vocab, MAX_LEN)

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)


### **Возьмем для начала датасет поменьше**

In [9]:
train_dataset = train_dataset.select(range(10000))
test_dataset = test_dataset.select(range(2000))

In [10]:
print(train_dataset.shape, test_dataset.shape)

(10000, 3) (2000, 3)


In [12]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

## **Создадим фукнцию**

In [None]:
test_texts = test_dataset['text']
test_labels = test_dataset['sentiment']

In [62]:
def test_model(model_name, test_texts, test_labels, batch_size):
  try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=3,
        )
        model.to(device)
        model.eval()
        predictions = []
        for i in tqdm(range(0, len(test_texts), batch_size)):
            batch_texts = test_texts[i:i + batch_size]

            inputs = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors="pt"
            ).to(device)

            with torch.no_grad():
                outputs = model(**inputs)
                batch_predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
                predictions.extend(batch_predictions)

        predictions = np.array(predictions)
        acc = accuracy_score(test_labels, predictions)
        f1_m = f1_score(test_labels, predictions, average='macro')
        f1_w = f1_score(test_labels, predictions, average='weighted')
        print(classification_report(
            test_labels,
            predictions,
            target_names=['Позитивный', 'Нейтральный', 'Негативный'],
            digits=4
        ))
        return {
            'model': model_name,
            'accuracy': acc,
            'f1_macro': f1_m,
            'f1_weighted': f1_weighted,
            'predictions': f1_w
        }

  except Exception as e:
    print(f"{model}: {e}")
    return None

In [63]:
models = {
    "BERT Мультиязычный": "bert-base-multilingual-cased",
    "RuBERT": "DeepPavlov/rubert-base-cased",
    "RuBERT на диалогах": "DeepPavlov/rubert-base-cased-conversational"
}

In [64]:
results = {}

for model_label, model_name in models.items():
    result = test_model(model_name, test_texts, test_labels, batch_size=32)
    if result:
        results[model_label] = result

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 63/63 [00:18<00:00,  3.45it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

  Позитивный     0.3030    0.0188    0.0353       533
 Нейтральный     0.0000    0.0000    0.0000       957
  Негативный     0.2552    0.9843    0.4053       510

    accuracy                         0.2560      2000
   macro avg     0.1861    0.3344    0.1469      2000
weighted avg     0.1458    0.2560    0.1128      2000



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 63/63 [00:16<00:00,  3.83it/s]


              precision    recall  f1-score   support

  Позитивный     0.2614    0.8818    0.4033       533
 Нейтральный     0.5824    0.0554    0.1011       957
  Негативный     0.3153    0.0686    0.1127       510

    accuracy                         0.2790      2000
   macro avg     0.3864    0.3353    0.2057      2000
weighted avg     0.4288    0.2790    0.1846      2000



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 63/63 [00:16<00:00,  3.82it/s]


              precision    recall  f1-score   support

  Позитивный     0.3949    0.2045    0.2695       533
 Нейтральный     0.5046    0.8652    0.6374       957
  Негативный     0.5783    0.0941    0.1619       510

    accuracy                         0.4925      2000
   macro avg     0.4926    0.3879    0.3563      2000
weighted avg     0.4942    0.4925    0.4181      2000



In [65]:
results_data = {
    model: {
        'acc': info['accuracy'],
        'f1_m': info['f1_macro'],
        'f1_w': info['f1_weighted']
    }
    for model, info in results.items()
}

In [71]:
results_data

{'BERT Мультиязычный': {'acc': 0.256,
  'f1_m': 0.14688823869826195,
  'f1_w': 0.35692397090388744},
 'RuBERT': {'acc': 0.279,
  'f1_m': 0.20570895283252844,
  'f1_w': 0.35692397090388744},
 'RuBERT на диалогах': {'acc': 0.4925,
  'f1_m': 0.3562568586804417,
  'f1_w': 0.35692397090388744}}

## **Квантованные версии моделей**

In [111]:
!pip install bitsandbytes accelerate



In [112]:
from transformers import BitsAndBytesConfig

def test_quantized_model(model_name, test_texts, test_labels, batch_size=16):

    try:
        quantization_config = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_threshold=6.0
        )

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=3,
            quantization_config=quantization_config,
            device_map="auto",
            ignore_mismatched_sizes=True
        )
        model.eval()

        predictions = []

        for i in tqdm(range(0, len(test_texts), batch_size)):
            batch_texts = test_texts[i:i + batch_size]

            inputs = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors="pt"
            )

            inputs = {k: v.to(model.device) for k, v in inputs.items()}

            with torch.no_grad():
                outputs = model(**inputs)
                batch_predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
                predictions.extend(batch_predictions)

        predictions = np.array(predictions)

        accuracy = accuracy_score(test_labels, predictions)
        f1_macro = f1_score(test_labels, predictions, average='macro')
        f1_weighted = f1_score(test_labels, predictions, average='weighted')

        print(classification_report(
            test_labels,
            predictions,
            target_names=['Позитивный', 'Нейтральный', 'Негативный'],
            digits=4
        ))

        if torch.cuda.is_available():
            memory_allocated = torch.cuda.memory_allocated() / 1024**3

        del model
        torch.cuda.empty_cache()

        return {
            'model': model_name,
            'accuracy': accuracy,
            'f1_macro': f1_macro,
            'f1_weighted': f1_weighted,
            'predictions': predictions
        }

    except Exception as e:
        print(f"Ошибка при тестировании {model_name}: {e}")
        import traceback
        traceback.print_exc()
        return None

In [113]:
models = {
    "BERT Мультиязычный": "bert-base-multilingual-cased",
    "RuBERT": "DeepPavlov/rubert-base-cased",
    "RuBERT на диалогах": "DeepPavlov/rubert-base-cased-conversational"
}

In [114]:
test_texts = test_dataset['text']
test_labels = test_dataset['sentiment']

results = {}

for model_label, model_name in models.items():
    result = test_quantized_model(model_name, test_texts, test_labels, batch_size=16)
    if result:
        results[model_label] = result

Ошибка при тестировании bert-base-multilingual-cased: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`


Traceback (most recent call last):
  File "/tmp/ipython-input-3327285778.py", line 12, in test_quantized_model
    model = AutoModelForSequenceClassification.from_pretrained(
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/models/auto/auto_factory.py", line 604, in from_pretrained
    return model_class.from_pretrained(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 277, in _wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 4881, in from_pretrained
    hf_quantizer, config, dtype, device_map = get_hf_quantizer(
                                              ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/quantizers/auto.py", line 319, in get_hf_quantizer
    hf_quantizer.validate_environment(
  

Ошибка при тестировании DeepPavlov/rubert-base-cased: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`


Traceback (most recent call last):
  File "/tmp/ipython-input-3327285778.py", line 12, in test_quantized_model
    model = AutoModelForSequenceClassification.from_pretrained(
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/models/auto/auto_factory.py", line 604, in from_pretrained
    return model_class.from_pretrained(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 277, in _wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 4881, in from_pretrained
    hf_quantizer, config, dtype, device_map = get_hf_quantizer(
                                              ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/quantizers/auto.py", line 319, in get_hf_quantizer
    hf_quantizer.validate_environment(
  

Ошибка при тестировании DeepPavlov/rubert-base-cased-conversational: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`


Traceback (most recent call last):
  File "/tmp/ipython-input-3327285778.py", line 12, in test_quantized_model
    model = AutoModelForSequenceClassification.from_pretrained(
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/models/auto/auto_factory.py", line 604, in from_pretrained
    return model_class.from_pretrained(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 277, in _wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py", line 4881, in from_pretrained
    hf_quantizer, config, dtype, device_map = get_hf_quantizer(
                                              ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/quantizers/auto.py", line 319, in get_hf_quantizer
    hf_quantizer.validate_environment(
  

In [115]:
results_data = {
    model: {
        'acc': info['accuracy'],
        'f1_m': info['f1_macro'],
        'f1_w': info['f1_weighted']
    }
    for model, info in results.items()
}

In [116]:
results_data

{}