<a href="https://colab.research.google.com/github/Vedernikov1/ML-DL.models/blob/main/RuBERT_end.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install transformers

In [34]:
import torch
import pandas as pd
import numpy as np

# Библиотека с предобученными трансформерами
from transformers import AutoTokenizer, AutoModel

# Для разбиения train выборки
from sklearn.model_selection import train_test_split

# Библиотеки для классификации
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Для подсчета метрик
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score

In [9]:
# Константы

RANDOM_SEED = 42
FILENAME = 'train.tsv'
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Считывание данных

In [15]:
def read_dataset(filename):
  df = pd.read_csv("train.tsv", sep="\t")
  return df['title'], df['is_fake']

In [16]:
X, y = read_dataset(FILENAME)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

### Токенизация данных

In [17]:
tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/sbert_large_nlu_ru")

tokenized_train = tokenizer(list(X_train), padding=True, truncation=True, max_length=24, return_tensors='pt')
tokenized_train = tokenized_train.to(DEVICE)

tokenized_test = tokenizer(list(X_test), padding=True, truncation=True, max_length=24, return_tensors='pt')
tokenized_test = tokenized_test.to(DEVICE)

### Получение эмбеддингов предложений

In [None]:
# Скачивание предобученной модели
model_RuBERT = AutoModel.from_pretrained("sberbank-ai/sbert_large_nlu_ru").to(DEVICE)

In [21]:
''' Данная функция взята с https://huggingface.co '''


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

In [46]:
with torch.no_grad():
    model_output_train = model_RuBERT(**tokenized_train)
with torch.no_grad():
    model_output_test = model_RuBERT(**tokenized_test)

train_embeddings = mean_pooling(model_output_train, tokenized_train['attention_mask'])
test_embeddings = mean_pooling(model_output_test, tokenized_test['attention_mask'])

### Построение моделей классификации

In [None]:
classification_models = dict()

clf = make_pipeline(StandardScaler(), SVC(kernel='rbf', gamma='scale'))
clf.fit(train_embeddings.cpu(), y_train)
classification_models['SVM'] = clf

clf = make_pipeline(StandardScaler(), RandomForestClassifier())
clf.fit(train_embeddings.cpu(), y_train)
classification_models['RandomForestClassifier'] = clf

clf = make_pipeline(StandardScaler(), LogisticRegression())
clf.fit(train_embeddings.cpu(), y_train)
classification_models['LogisticRegeression'] = clf

clf = make_pipeline(StandardScaler(), xgb.XGBClassifier())
clf.fit(train_embeddings.cpu(), y_train)
classification_models['GBM'] = clf

### Подсчет метрик

In [43]:
def compute_metrics(y_true, y_pred):
  metric_scores = dict()

  metric_scores['f1_score'] = f1_score(y_true, y_pred)
  metric_scores['roc-auc'] = roc_auc_score(y_true, y_pred)
  metric_scores['accuracy'] = accuracy_score(y_true, y_pred)

  return metric_scores

In [44]:
for model_name, class_model in classification_models.items():
  y_pred = np.array(class_model.predict(test_embeddings.cpu()))
  metrics = compute_metrics(y_test, y_pred)

  print(f'{model_name}:')
  for score_name, score in metrics.items():
    print(f'{score_name}: {score}')
  print()

SVM:
f1_score: 0.9084687767322498
roc-auc: 0.9072953736654805
accuracy: 0.9071180555555556

RandomForestClassifier:
f1_score: 0.860802732707088
roc-auc: 0.8586133059895046
accuracy: 0.8585069444444444

LogisticRegeression:
f1_score: 0.874251497005988
roc-auc: 0.8725526268170577
accuracy: 0.8723958333333334

GBM:
f1_score: 0.8688245315161841
roc-auc: 0.866367090898124
accuracy: 0.8663194444444444

