# Installing

In [2]:
!pip install transformers
!pip install pymorphy2
!pip install pymystem3
!pip install transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m99.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m89.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2
Looking in in

# Import Biblio

In [3]:
import pandas as pd
from google.colab import drive
import transformers
import lightgbm as lgbm
import pymorphy2
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd

df = pd.read_excel("/content/drive/MyDrive/1 плох раб.xlsx")

In [6]:
df['target'] = [1 if elem == 'Неисправность инфраструктуры/подвижного состава' \
                else 0 for elem in df['Причина возникновения общая']]

In [7]:
df = df[['Примечание', 'target']]
# df['Примечание'] = df['Примечание'].map(str.lower)
df = df.rename(columns={'Примечание': 'text'})
df.head()

Unnamed: 0,text,target
0,Не идет на ход. Вагон(Витязь) 2020г.в. (экспл....,1
1,Скопление воды . Сообщил 3-30125 в 11-05. пере...,0
2,В салоне трамвая стало плохо женщине.103 в 10:...,0
3,При следовании в депо у вагона 31346 произошел...,0
4,"Сход вагона тягач № 323,3223, аварийная бригад...",1


#preproc

In [7]:
class Preprossesing():
    def __init__(self):
        self.pymorphy = pymorphy2.MorphAnalyzer()
    
    def lematize(self, text):
        result = []
        try:
            for w in text.split(' '):
                parsed_word = self.pymorphy.parse(w)[0]
                if 'Name' in parsed_word.tag or 'Surn' in parsed_word.tag \
                             or 'Patr' in parsed_word.tag:
                    result.append(' ')
                else:
                    result.append(parsed_word.normal_form)
            return " ".join(result) 
        except:
            return " "
    
    def delete_space(self, string):
        return string.strip()

    def clear_oracle(self, string):
        string = re.sub("[A-za-z<>,.()-:0-9]", "", string)
        string = re.sub(" +", " ", string)
        return self.delete_space(string)
   
    def main(self, df):
        df_clear = df.copy()
        df_clear['Примечание'] = [self.lematize(self.clear_oracle(string)) \
                                  for string in df_clear['Примечание']]
        return df_clear

In [8]:
df_clear = Preprossesing().main(df)

In [9]:
df_clear.head()

Unnamed: 0,Примечание,target
0,не идти на ход вагонвитязь гв экспл то дтп вод...,1
1,скопление вода сообщить в передать в сп в мо...,0
2,в салон трамвай стать плохо женщина в наряд на...,0
3,при следование в депо у вагон произойти сход з...,0
4,сход вагон тягач № аварийный бригада выехать в...,1


In [10]:
df_clear = df_clear[df_clear['Примечание'].astype(str) != '']
word_vect = TfidfVectorizer(ngram_range=(3, 3), min_df=3)
train_word_features = word_vect.fit_transform(df_clear['Примечание'])
print(train_word_features.shape)

(5778, 5069)


# lgbm

In [None]:
lgb_model = lgbm.LGBMClassifier()
lgb_model.fit(train_word_features, df_clear.target)

feature_importance = lgb_model.feature_importances_

df_feature_importances_ = pd.DataFrame(list(zip(word_vect.get_feature_names_out(), \
                                                lgb_model.feature_importances_)), \
                                                columns=['feature', 'importance']).\
                                        sort_values(by='importance', ascending=False)
df_feature_importances_[:30]

Unnamed: 0,feature,importance
4628,тягач на место,53
2639,не идти на,52
3379,продолжить движение по,48
4719,угибдд на место,44
919,гв эксплуатация то,42
2510,на трамвайный путь,38
4396,тот неисправность вагон,36
1271,депо проверка подвижный,35
2139,место движение восстановить,33
3191,по сообщение водитель,33


# transformers

In [10]:
from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          TrainingArguments, Trainer)
from torch.utils.data import Dataset

import torch
import evaluate
import pandas as pd
from tqdm.auto import tqdm
from datasets import Dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer, 
                          AutoModelForSequenceClassification, get_scheduler)


In [24]:
class Dataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __getitem__(self, index):
        sample = self.data[index]
        return sample

    def __len__(self):
        return len(self.data)

In [9]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [48]:
from datasets import Dataset

In [11]:
# Конвертируем датафрейм в Dataset
train, test = train_test_split(df, test_size=0.2)
train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)

In [12]:
# Выполняем предобработку текста
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')  # case с регистром, uncase - все в нижнем
def tokenize_function(df):
	return tokenizer(df['text'], padding='max_length', truncation=True)


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [16]:
model = AutoModelForSequenceClassification.from_pretrained(
	'bert-base-multilingual-cased',
	num_labels=2)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [17]:
def ds_preproc(ds):
	ds = ds.map(tokenize_function)
	ds = ds.remove_columns(['text', '__index_level_0__'])
	ds = ds.rename_column('target', 'labels')
	ds.set_format('torch')
	return ds

tokenized_train = ds_preproc(train)
tokenized_test = ds_preproc(test)

# Создаем даталоадер
train_dataloader = DataLoader(tokenized_train, shuffle=True, batch_size=64)
test_dataloader = DataLoader(tokenized_test, batch_size=64)

# Задаем оптимайзер и шедулер
optimizer = AdamW(model.parameters(), lr=5e-6)

num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)

lr_scheduler = get_scheduler(
	name='linear',
	optimizer=optimizer,
	num_warmup_steps=0,
	num_training_steps=num_training_steps)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Обучение
for epoch in tqdm(range(num_epochs)):

  model.train()
  for batch in tqdm(train_dataloader, leave=False):

      batch = {k: v.to(device) for k, v in batch.items()}
      outputs = model(**batch)
      loss = outputs.loss
      loss.backward()
      optimizer.step()
      lr_scheduler.step()
      optimizer.zero_grad()

  metric = evaluate.load('f1')

  # Валидация
  model.eval()
  for batch in tqdm(test_dataloader, leave=False):
      batch = {k: v.to(device) for k, v in batch.items()}
      with torch.no_grad():
          outputs = model(**batch)
      logits = outputs.logits
      predictions = torch.argmax(logits, dim=-1)

      metric.add_batch(predictions=predictions, references=batch['labels'])

  print(f'epoch {epoch} -', metric.compute())

Map:   0%|          | 0/4622 [00:00<?, ? examples/s]

Map:   0%|          | 0/1156 [00:00<?, ? examples/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/578 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

  0%|          | 0/145 [00:00<?, ?it/s]

epoch 0 - {'f1': 0.8773584905660377}


  0%|          | 0/578 [00:00<?, ?it/s]

  0%|          | 0/145 [00:00<?, ?it/s]

epoch 1 - {'f1': 0.8919667590027701}


  0%|          | 0/578 [00:00<?, ?it/s]

  0%|          | 0/145 [00:00<?, ?it/s]

epoch 2 - {'f1': 0.934010152284264}


  0%|          | 0/578 [00:00<?, ?it/s]

  0%|          | 0/145 [00:00<?, ?it/s]

epoch 3 - {'f1': 0.9347258485639687}


  0%|          | 0/578 [00:00<?, ?it/s]

  0%|          | 0/145 [00:00<?, ?it/s]

epoch 4 - {'f1': 0.9365079365079365}


In [20]:
dd = 'Не идет на ход. Вагон(Витязь) 2020г.в. (экспл.16.03.2020), ТО-21.06.2021, ДТП-1. Водитель: Коршунова Л.Г., РФ, 50лет, разряд 4, стаж 7/6, ДТП-0. Буксир 7/31096 до к-ца МЦД Каланчевская. Направлен тягач 320. Движение восстановлено в 18-30.'

In [24]:
inputs = tokenizer.encode_plus(dd, return_tensors='pt')
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
model = model.cpu()
with torch.no_grad():
  outputs = model(input_ids=input_ids, attention_mask=attention_mask,  output_hidden_states=True)
features = outputs.hidden_states[-1]

features

tensor([[[-0.0683,  0.8626,  0.3236,  ...,  1.0997, -1.5528,  0.1656],
         [ 0.0408,  0.8173,  0.4834,  ...,  1.1726, -1.4229,  0.2378],
         [ 0.1636,  0.5834,  0.5909,  ...,  0.9722, -1.4329,  0.1880],
         ...,
         [-0.1765,  0.9126,  0.3088,  ...,  1.2445, -1.2169,  0.0483],
         [ 0.1166,  0.8939,  0.3786,  ...,  0.8996, -1.0474, -0.2226],
         [ 0.1068,  1.0300,  0.1702,  ...,  1.0360, -1.4431,  0.2019]]])

In [25]:
features.shape

torch.Size([1, 122, 768])

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
import pandas as pd

# load data
data = df.copy().iloc[:10, :]

# set up NLP pipeline
nlp = pipeline("feature-extraction", model="bert-base-multilingual-uncased")

# generate embeddings for the text data
embeddings = nlp(list(data["Примечание"]))

# set up TF-IDF vectorizer
vectorizer = TfidfVectorizer(analyzer="word", ngram_range=(3, 3))

# fit TF-IDF vectorizer to the data
X = vectorizer.fit_transform(list(data["Примечание"]))

# get feature names and corresponding scores
feature_names = vectorizer.get_feature_names_out()
scores = X.sum(axis=0).A1

# sort feature names by their corresponding scores
sorted_indices = scores.argsort()[::-1]
sorted_features = [feature_names[idx] for idx in sorted_indices]

# print top 10 most important n-grams
print(sorted_features[:10])

32

In [42]:
tokenizer.encode_plus(dd, return_tensors='pt')

{'input_ids': tensor([[   101,  21124,    549,  82635,  10122, 105569,    119,    511,  51229,
          11579,    113,    511,  15811,  87118,    114,  23607,  10823,    119,
            543,    119,    113,    570,  18705,  11078,  10517,    119,  10250,
            119,  10907,    119,  23607,    114,    117,    527,  18002,    118,
          10296,    119,  10719,    119,  67267,    117,    513,  20411,  16027,
            118,    122,    119,  12624,  96655,  17371,    131,    519,  13097,
          27176,  36481,    520,    119,    512,    119,    117,  12068,    117,
          10462,  35025,    117,  17257,  49105,    125,    117,  15888,  55522,
            128,    120,    127,    117,    513,  20411,  16027,    118,    121,
            119,  81478,  18705,  23312,    128,    120,  23993,  11373,  11211,
          10344,    551,    118,  43418,    521,  47450,  22681,    519,  15522,
          60745, 102209,    119,  10778, 108877,  37813,  11347,  11746,  18920,
            11

In [43]:
def extract_keyphrases(model, tokenizer, input_text):
    # input_ids = torch.tensor([tokenizer.encode(input_text)]).to("cpu"
    inputs = tokenizer.encode_plus(dd, return_tensors='pt')
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    # target_id = torch.tensor([tokenizer.encode(target)]).to(device)
    model = model.cpu()
    with torch.no_grad():
      outputs = model(input_ids=input_ids, attention_mask=attention_mask,  output_hidden_states=True)
    logits = outputs.hidden_states[-1]
    
    keyphrase_ids = torch.argmax(logits, dim=1)
    print(keyphrase_ids)
    # keyphrases = tokenizer.decode(keyphrase_ids)
    return keyphrases

In [44]:
extract_keyphrases(model, tokenizer, dd)

tensor([[116,  31, 118,  55,  58,  53,  37, 112, 118,  54,  52,  58, 113, 112,
          28,  31,  58,  20, 110,  38,  50, 118,  49,  28, 117, 117,  58, 118,
          29,  28,  53,  51,  14,  25, 112,  51, 115,  28,  58, 116, 118,  54,
         110, 120, 119, 114, 118, 118, 117, 109, 119,  55, 118, 118, 117, 114,
          31,  58, 117,  89, 114,  92,  25,  50, 112,  51,  27,  98, 117,  29,
          28, 118,  45, 115,  26, 114, 118,  26,  87,  40,  49, 114,  28, 118,
          28, 118, 117, 114, 112, 112, 114,  57, 110, 117, 106, 118, 118,  56,
         119, 118,  79,  54,  38, 116,  26,  86, 119,  26, 116, 112,  28,  58,
          56, 113,  54, 115, 120,  56,  57, 119, 118, 116,  28, 110, 115, 114,
          26,  41, 112, 116,  41,  12,  26, 111,  22, 117,  57, 111,  59, 119,
          54, 112,  58, 120,  27,  37,  58, 112, 112, 103,  57,  58, 118,  56,
          28, 117,  28, 111,  54, 117,  49, 114, 120,  54, 114,  58, 111,  99,
         116,  56,  51,  55,  23,  50,  17,  53,  54

NameError: ignored

https://habr.com/ru/articles/704592/

In [None]:
clf = pipeline(
    task = 'sentiment-analysis', 
    model = 'SkolkovoInstitute/russian_toxicity_classifier')

text = ['Дурачок ты мой']

clf(text, top_k=None)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/712M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/585 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


[[{'label': 'toxic', 'score': 0.9748858213424683},
  {'label': 'neutral', 'score': 0.025114139541983604}]]

In [None]:
# Для генерации поштучно
"""def data(text):
    for row in text:
        yield row

for out in clf(data(text)):
    print(out)"""

In [None]:
# Для подгрузки из торча
# GPT2ForSequenceClassification, GPT2Tokenizer 
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained(
    'SkolkovoInstitute/russian_toxicity_classifier')
model = AutoModelForSequenceClassification.from_pretrained(
    'SkolkovoInstitute/russian_toxicity_classifier')

# PyTorch самый простой Trainer

In [None]:
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import warnings
warnings.filterwarnings("ignore")

WHITESPACE_HANDLER = lambda k: re.sub('\s+', ' ', re.sub('\n+', ' ', k.strip()))

article_text = """Еда в магазине вкуснее домашней, вообще кайф
                  """

model_name = "csebuetnlp/mT5_multilingual_XLSum"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

input_ids = tokenizer(
    [WHITESPACE_HANDLER(article_text)],
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=512
)["input_ids"]

output_ids = model.generate(
    input_ids=input_ids,
    max_length=84,
    no_repeat_ngram_size=2,
    num_beams=4
)[0]

summary = tokenizer.decode(
    output_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False
)

print(summary)

Что делать, если вы любите еду?


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Define the texts to be classified
texts = ['This is a positive text', 'This is a negative text']

# Preprocess and tokenize the texts
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

# Set the model to evaluate mode
model.eval()

# Forward pass through the model to get the logits
with torch.no_grad():
    logits = model(**inputs)[0]

# Get the index of the label with the highest logits value
pred_labels = torch.argmax(logits, dim=1)

# Get the importance score for each token using the attribution method, e.g. Integrated Gradients or LIME
# Here's an example of using Integrated Gradients with Captum library
from captum.attr import IntegratedGradients
ig = IntegratedGradients(model)
input_ids = inputs['input_ids']
attributions = ig.attribute(inputs['input_ids'], internal_batch_size=4)

# Print the predicted labels and importance scores for each token
for text, pred, attribution in zip(texts, pred_labels, attributions): 
    print('Text:', text)
    print('Predicted Label:', pred.item())
    print('Importance Scores:', attribution)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
from transformers import pipeline

model = pipeline('feature-extraction', model='bert-base-multilingual-uncased')

texts = ['This is the first text.', 'This is the second text.', 'This is the third text.']

embeddings = model(texts)
print(embeddings)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[[[[-0.10332181304693222, -0.028739335015416145, 0.04597948491573334, 0.06866433471441269, -0.357668399810791, 0.004001149907708168, -0.054379288107156754, -0.04002738744020462, -1.9381558895111084, -0.051035575568675995, -0.010675886645913124, -0.20984694361686707, 0.025374364107847214, 0.009641601704061031, 0.1272483915090561, 0.06392766535282135, 0.12325163185596466, -0.020203959196805954, -0.08458087593317032, -0.024887241423130035, -0.02219691500067711, 0.10794238746166229, -0.002284721937030554, -0.20956754684448242, 0.5416153073310852, 0.0021895889658480883, 0.00528379762545228, 0.02060137875378132, -2.181143283843994, -0.0010455301962792873, -0.1836225539445877, 0.08173767477273941, -0.038336027413606644, 0.15162785351276398, -0.10087408870458603, -0.0642482340335846, 0.04190075024962425, 1.6710593700408936, -0.06943222880363464, -0.06782996654510498, -0.0885917991399765, 0.1034233570098877, -0.048678845167160034, 0.1125992089509964, -0.0073766643181443214, -0.16438497602939606

In [None]:
# To implement this in Python using transformers, you can use the Hugging Face library which provides pre-trained models for text classification and NER. Here is an example of training a text classification model on the IMDb movie review dataset using the DistilBert transformer model:

from transformers import DistilBertTokenizerFast, TFTrainer, TFTrainingArguments
from datasets import load_dataset

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

train_dataset, test_dataset = load_dataset('imdb', split=['train', 'test'])

train_dataset = train_dataset.map(tokenize_function, batched=True)
train_dataset.set_format('tensorflow', columns=['input_ids', 'attention_mask', 'label'])

test_dataset = test_dataset.map(tokenize_function, batched=True)
test_dataset.set_format('tensorflow', columns=['input_ids', 'attention_mask', 'label'])

model_checkpoint = 'distilbert-base-uncased'
training_args = TFTrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = TFTrainer(
    model=lambda: TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=lambda data: {'input_ids': tf.stack([item[0] for item in data]),
                                'attention_mask': tf.stack([item[1] for item in data]),
                                'labels': tf.stack([item[2] for item in data])}
)

trainer.train()



  0%|          | 0/2 [00:00<?, ?it/s]



Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")

text = "John works at Google and lives in San Francisco"
inputs = tokenizer(text, return_tensors="pt")

outputs = model(**inputs).logits
predictions = torch.argmax(outputs, dim=-1)

predictions






from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Fine-tune the model on a dataset of text with a target
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)
# train the model on labeled dataset

# Use the model to analyze new text
text = "Some example text about the target."
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
predictions = outputs.logits.argmax(dim=1)
if predictions[0] == 1:
  # The text is about the target
  # Use NER or keyword extraction techniques to extract important phrases related to the TrainingArguments

tensor([[0, 4, 0, 0, 6, 0, 0, 0, 8, 8, 0]])

In [None]:
bbfrom transformers import BertTokenizer, BertForSequenceClassification
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Fine-tune on dataset with target and keyphrases
train_dataset = ...

# Prepare optimizer and schedule (linear warmup and decay)
optimizer = ...

for epoch in range(num_epochs):
    for batch in train_dataset:
        inputs = batch['input_ids']
        labels = batch['labels']
        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Use model to extract keyphrases from new input text with target
def extract_keyphrases(model, tokenizer, input_text, target):
    input_ids = torch.tensor([tokenizer.encode(input_text)]).to(device)
    target_id = torch.tensor([tokenizer.encode(target)]).to(device)
    outputs = model(input_ids, labels=target_id)
    logits = outputs.logits
    keyphrase_ids = torch.argmax(logits, dim=1)
    keyphrases = tokenizer.decode(keyphrase_ids)
    return keyphrases

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
df.columns()

Index(['Примечание', 'target'], dtype='object')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
import pandas as pd

# load data
data = df.copy().iloc[:10, :]

# set up NLP pipeline
nlp = pipeline("feature-extraction", model="bert-base-multilingual-uncased")

# generate embeddings for the text data
embeddings = nlp(list(data["Примечание"]))

# set up TF-IDF vectorizer
vectorizer = TfidfVectorizer(analyzer="word", ngram_range=(3, 3))

# fit TF-IDF vectorizer to the data
X = vectorizer.fit_transform(list(data["Примечание"]))

# get feature names and corresponding scores
feature_names = vectorizer.get_feature_names_out()
scores = X.sum(axis=0).A1

# sort feature names by their corresponding scores
sorted_indices = scores.argsort()[::-1]
sorted_features = [feature_names[idx] for idx in sorted_indices]

# print top 10 most important n-grams
print(sorted_features[:10])

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


['71 931м выпуск', '06 2021 дтп', 'москва разряд стаж', 'на автономном ходу', 'на месте 10', 'витязь 71 931м', 'дтп вагон витязь', 'вагон витязь 71', 'востановленно вода сошла', 'сп рыбалко 11']


In [None]:
from transformers import pipeline

classifier = pipeline("feature-extraction", model="nlptown/bert-base-multilingual-uncased-sentiment")

result = classifier("вкусный обед, но обслуживание медленное", return_all_scores=True)

print(result[0][0][0])

Some weights of the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


-0.14540168642997742


In [None]:
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, Trainer, TrainingArguments
import pandas as pd

# Load the dataset (assuming it's in a CSV file)


# Initialize the tokenizer and add special tokens
tokenizer = RobertaTokenizerFast.from_pretrained('xlm-roberta-base')
tokenizer.add_tokens(['<target>'])

# Encode the input sequences and add the target variable
encoded_data = tokenizer(df['Примечание'].tolist(), padding=True, truncation=True, max_length=512, return_tensors='pt')
encoded_data['labels'] = df['target'].tolist()

# Load the RoBERTa model and set the number of labels
model = RobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=2)


print(encoded_data['input_ids'].shape)

# Fine-tune the model on the encoded dataset
training_args = TrainingArguments('output_dir', 
                                  evaluation_strategy='epoch', 
                                  num_train_epochs=3, 
                                  per_device_train_batch_size=16, 
                                  per_device_eval_batch_size=64, 
                                  logging_steps=10, 
                                  eval_steps=50, 
                                  save_steps=500, 
                                  learning_rate=2e-5, 
                                  weight_decay=0.01)
trainer = Trainer(model=model, train_dataset=encoded_data)
trainer.train()

# # Extract the learned weights for the RoBERTa embeddings
# weights = model.roberta.embeddings.weight.detach().numpy()

# # Extract the most important phrases using the TF-IDF algorithm and the learned weights
# from sklearn.feature_extraction.text import TfidfVectorizer
# tfidf = TfidfVectorizer(ngram_range=(3,3), max_features=100000, analyzer='word')
# tfidf.fit(df['text'].to_list())
# feature_names = tfidf.get_feature_names()
# phrase_scores = {}
# for i, phrase in enumerate(feature_names):
#     if len(phrase.split()) < 2:
#         continue
#     phrase_embedding = weights[tokenizer.encode(phrase, add_special_tokens=False)[0], :]
#     phrase_scores[phrase] = tfidf.idf_[i] * (phrase_embedding ** 2).sum()
    
# # Print out the


# https://towardsdatascience.com/fine-tuning-pretrained-nlp-models-with-huggingfaces-trainer-6326a4456e7b

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLMRobertaTokenizer'. 
The class this function is called from is 'RobertaTokenizerFast'.
You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


Downloading pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.weight',

torch.Size([5778, 512])
