In [84]:
# import json
# import pandas as pd

# # Load the JSON file
# with open('data/telegram_full/result.json', encoding='utf-8') as f:
#     data = json.load(f)

# # Extract just the 'messages' list
# messages = data["messages"]

# # Load into a DataFrame
# df = pd.DataFrame(messages)

# df = df[df["action"].isna()]

# print(df["text"].dtypes)

# # df.to_csv("data/messages.csv")


In [85]:
from datasets import load_dataset

# Read the exported messages from CSV; the rows are accessed below under the
# 'train' key of the loaded dataset.
dataset = load_dataset(
    'csv',
    data_files='data/messages.csv',
)

# Hold out 20% of the rows as a test split. The fixed seed keeps the shuffle
# (and therefore the split) the same on every run.
split_dataset = dataset['train'].train_test_split(
    test_size=0.2,
    shuffle=True,
    seed=1,
)

In [86]:
from transformers import AutoTokenizer

# Tokenizer for the same RuBERT checkpoint that is loaded as the model later on.
tokenizer = AutoTokenizer.from_pretrained(
    "DeepPavlov/rubert-base-cased",
)

In [87]:
def get_max_length(dataset, sample_size=None, tokenize=None):
    """
    Return the longest tokenized length among the dataset's texts.

    Args:
        dataset: Mapping-like object whose ``"text"`` entry is an iterable of
            strings. ``None`` entries are skipped.
        sample_size: When a positive integer, only the first ``sample_size``
            entries are inspected (``None`` entries count toward the limit).
            ``None``, ``0`` or a negative value scans everything.
        tokenize: Optional callable turning one string into a sequence of
            tokens. Defaults to the notebook-level ``tokenizer.tokenize``.

    Returns:
        The largest token count seen, or ``0`` when no text was inspected
        (empty dataset or only ``None`` entries).
    """
    from itertools import islice

    if tokenize is None:
        # Looked up at call time so the notebook's global tokenizer is used.
        tokenize = tokenizer.tokenize

    limit = sample_size if sample_size and sample_size > 0 else None
    # islice caps the scan at exactly `limit` entries; a post-processing
    # `i >= sample_size` check would let one extra entry through.
    texts = islice(dataset["text"], limit)

    # default=0 avoids a ValueError from max() when nothing was tokenized.
    return max((len(tokenize(text)) for text in texts if text is not None), default=0)

# Longest tokenized message in the training split (full scan, no sampling).
# NOTE(review): the recorded result (875) is well above the max_length=128 used
# when tokenizing, so the longest messages are truncated — confirm that is intended.
max_len = get_max_length(split_dataset["train"])
max_len

875

In [88]:
def preprocess_function(examples):
    """
    Tokenize the non-empty texts of one batch.

    Entries of ``examples["text"]`` that are ``None`` or the empty string are
    dropped, so the returned batch can hold fewer rows than the input batch.
    Every remaining text is truncated or padded to exactly 128 tokens.
    """
    non_empty = [
        text
        for text in examples["text"]
        if text is not None and text != ""
    ]
    return tokenizer(non_empty, max_length=128, padding="max_length", truncation=True)

In [89]:
# Tokenize both splits in batches of 4 rows across 4 worker processes. Every
# original column is removed, so only the tokenizer's outputs remain.
source_columns = split_dataset["train"].column_names
tokenized_dataset = split_dataset.map(
    preprocess_function,
    batched=True,
    batch_size=4,
    num_proc=4,
    remove_columns=source_columns,
)

In [90]:
# Sanity check: look for test samples that contain no real tokens.
# 203 = 101 + 102, which look like the leading/trailing special-token ids in the
# sample output (presumably [CLS] and [SEP] — TODO confirm), so a sum of 203
# matches sequences whose remaining ids are all 0 (padding).
empty_samples = [
    sample
    for sample in tokenized_dataset["test"]
    if sum(sample["input_ids"]) == 203
]
for sample in empty_samples:
    print(sample)

total_empty = len(empty_samples)
print(total_empty)

0


In [91]:
# Split sizes after tokenization (empty messages were dropped on the way).
print(f"Train dataset length: {len(tokenized_dataset['train'])}")
print(f"Test dataset length: {len(tokenized_dataset['test'])}")

# Inspect one test sample: the raw encoding first, then its ids mapped back to
# token strings.
inspected = tokenized_dataset["test"][5]
print(inspected)

tokens = inspected["input_ids"]
original_text = tokenizer.convert_ids_to_tokens(tokens)
print(original_text)

Train dataset length: 4068
Test dataset length: 1013
{'input_ids': [101, 50190, 6582, 1469, 3124, 34722, 81666, 17942, 9798, 166, 122, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [92]:
block_size = 128


def group_texts(examples):
    """
    Concatenate every column of a batch and re-split it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of per-example lists
            (e.g. ``input_ids``). All columns are expected to have the same
            total length; only the first column's length is measured.

    Returns:
        dict mapping each column name to a list of chunks of ``block_size``
        items. When the batch holds at least ``block_size`` items, the trailing
        remainder is dropped; a shorter batch comes back as one short chunk.
        An empty mapping yields an empty dict.
    """
    from itertools import chain

    columns = list(examples.keys())
    if not columns:
        # Nothing to group; indexing the first column below would fail.
        return {}

    # chain flattens in linear time, whereas sum(list_of_lists, []) re-copies
    # the growing accumulator on every step (quadratic in the batch size).
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in columns}
    total_length = len(concatenated_examples[columns[0]])

    # Drop the small remainder so every chunk is full. Padding the tail instead
    # would be an alternative if the model supported it.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size

    # Split by chunks of block_size.
    return {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

In [93]:
# Regroup the tokenized sequences into block_size-long chunks using 4 workers.
# NOTE(review): preprocess_function already pads/truncates every sequence to 128
# tokens and block_size is 128, so each chunk appears to coincide with one padded
# input — confirm whether grouping is meant to pack unpadded text instead.
lm_dataset = tokenized_dataset.map(
    group_texts,
    batched=True,
    num_proc=4,
)

In [94]:
from transformers import DataCollatorForLanguageModeling

# Aliasing pad_token to eos_token is a GPT-style idiom. BERT-style tokenizers
# normally define [PAD] but no EOS token, so that assignment would overwrite the
# pad token with None (assumption about this checkpoint — TODO confirm).
# Register [PAD] only if the tokenizer genuinely has no padding token.
if tokenizer.pad_token is None:
    # NOTE(review): if this adds a new vocabulary entry, the model's embeddings
    # would need resizing before training.
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Collator for the masked-LM objective with a 15% masking probability.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [95]:
from transformers import AutoModelForMaskedLM

# Pretrained RuBERT with a masked-language-modeling head; same checkpoint name
# as the tokenizer.
model = AutoModelForMaskedLM.from_pretrained(
    "DeepPavlov/rubert-base-cased",
)

In [96]:
from transformers import TrainingArguments, Trainer
import torch

# Fine-tuning configuration: 3 epochs, evaluation at the end of every epoch,
# checkpoints written under ./RuBertMLM.
training_args = TrainingArguments(
    output_dir="RuBertMLM",
    eval_strategy="epoch",
    learning_rate=5e-5,
    num_train_epochs=3,
    weight_decay=0.001,
    push_to_hub=False,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    # NOTE(review): bf16 presumably needs hardware with bfloat16 support — confirm
    # before running on a different machine.
    bf16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    # `tokenizer=` is deprecated on Trainer in favour of `processing_class=`;
    # the tokenizer is still saved alongside each checkpoint.
    processing_class=tokenizer,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,4.3233,
2,3.9108,3.692928
3,3.3985,


TrainOutput(global_step=2034, training_loss=3.8079587923273106, metrics={'train_runtime': 323.0677, 'train_samples_per_second': 37.775, 'train_steps_per_second': 6.296, 'total_flos': 803872285977600.0, 'train_loss': 3.8079587923273106, 'epoch': 3.0})

In [97]:
import math

# Perplexity is the exponential of the evaluation loss reported by the trainer.
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 40.16


In [99]:
from transformers import pipeline

text = "Это [MASK] спам?"

# Load the final checkpoint (step 2034) from the Trainer's output_dir using a
# path relative to the working directory, rather than an absolute, user-specific
# path that only exists on one machine.
# NOTE(review): assumes the notebook runs from the directory that contains
# "RuBertMLM" — confirm.
checkpoint_dir = "RuBertMLM/checkpoint-2034"

mask_filler = pipeline("fill-mask", model=checkpoint_dir)
mask_filler(text, top_k=10)

Device set to use cuda:0


[{'score': 0.44067850708961487,
  'token': 1758,
  'token_str': 'за',
  'sequence': 'Это за спам?'},
 {'score': 0.12265335023403168,
  'token': 3629,
  'token_str': 'же',
  'sequence': 'Это же спам?'},
 {'score': 0.07217394560575485,
  'token': 11089,
  'token_str': 'ли',
  'sequence': 'Это ли спам?'},
 {'score': 0.03132351115345955,
  'token': 845,
  'token_str': 'в',
  'sequence': 'Это в спам?'},
 {'score': 0.026461344212293625,
  'token': 2739,
  'token_str': 'как',
  'sequence': 'Это как спам?'},
 {'score': 0.024310747161507607,
  'token': 1699,
  'token_str': 'не',
  'sequence': 'Это не спам?'},
 {'score': 0.018983563408255577,
  'token': 3474,
  'token_str': 'или',
  'sequence': 'Это или спам?'},
 {'score': 0.012777060270309448,
  'token': 1469,
  'token_str': 'на',
  'sequence': 'Это на спам?'},
 {'score': 0.012134999968111515,
  'token': 5806,
  'token_str': 'через',
  'sequence': 'Это через спам?'},
 {'score': 0.011625571176409721,
  'token': 2067,
  'token_str': 'был',
  'seq