In [44]:
import pandas as pd

In [None]:
path = "../train_data/" # Ваш путь до директории с данными /path/to/data/
data = pd.read_csv(path + 'train_events.csv')
video = pd.read_csv(path + 'video_info_v2.csv')
targets = pd.read_csv(path + 'train_targets.csv')

In [None]:
def pars_user_history_as_sequence(data, id):
    user_data = data[data['viewer_uid']==id]
    user_data = user_data.sort_values(by="event_timestamp")
    fields = ["region", "ua_device_type", "ua_client_type", "ua_os", "ua_client_name", "quart_watchtime", "category", "author_id", "title",]

    for fieald in fields:
        user_data[fieald] = user_data[fieald].apply(lambda x: x.replace(" ", "-"))
    
    user_row  = "[CLS] " + " [SEP] ".join(["_".join(row[fields]) for _, row in user_data[fields].iterrows()]) + " [CLS]"
        
    return user_row
    
def get_quart(watchtime):
    if 0 <= watchtime < 25:
        return "quart1"
    elif 25 <= watchtime < 50:
        return "quart2"
    elif 50 <= watchtime < 75:
        return "quart3"
    elif 75 <= watchtime <= 100:
        return "quart4"

all_data = data.merge(video, how="left", on="rutube_video_id")

all_data["rel_watchtime"] = all_data["total_watchtime"] / all_data["total_watchtime"]
all_data["quart_watchtime"] = all_data["total_watchtime"].apply(get_quart)
all_data["author_id"] = all_data["author_id"].apply(lambda x: str(x))
all_data = all_data.fillna("unk")

all_users = all_data['viewer_uid'].to_list()

In [None]:
users_history = {id: pars_user_history_as_sequence(all_data, id) for id in all_users}

In [None]:
train_data = pd.DataFrame({"viewer_uid": users_history.keys(), "history": users_history.values()})

In [None]:
train_data = train_data.merge(targets[['age', 'age_class', 'sex', 'viewer_uid']], how="left", on="viewer_uid")
train_data[['age', 'age_class', 'sex']] = train_data[['age', 'age_class', 'sex']].apply(lambda x: x.to_list()[0])

In [46]:
train_data = pd.read_csv(path + "custom_bert_train.csv")
custom_tokens = train_data['history'].apply(lambda x: x.replace("_", " ")).to_list()

In [47]:
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, normalizers
from tokenizers.processors import TemplateProcessing

# 1. Определение базовой модели токенизатора
tokenizer = Tokenizer(models.BPE())

# 2. Установка нормализаторов и пре-токенизаторов
tokenizer.normalizer = normalizers.Sequence([normalizers.Lowercase(), normalizers.Replace("_", " "),
                                            normalizers.Replace("-", " ")])
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# 3. Определение тренера с кастомными токенами
trainer = trainers.BpeTrainer(special_tokens=["[CLS]", "[SEP]", "[PAD]"], unk_token="[UNK]")

# 5. Тренировка токенизатора на кастомных токенах
tokenizer.train_from_iterator(custom_tokens, trainer=trainer)

# 6. Установка шаблона обработки
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

# 7. Сохранение токенизатора
tokenizer.save(output_path + "custom_tokenizer.json")


Ignored unknown kwargs option unk_token





In [48]:
from transformers import PreTrainedTokenizerFast

# Создание fast-токенизатора на основе ранее созданного
fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file=output_path+"/custom_tokenizer.json", unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",)

# Сохранение токенизатора в формате transformers
fast_tokenizer.save_pretrained(output_path + "custom_hf_tokenizer")




('/kaggle/working/custom_hf_tokenizer/tokenizer_config.json',
 '/kaggle/working/custom_hf_tokenizer/special_tokens_map.json',
 '/kaggle/working/custom_hf_tokenizer/tokenizer.json')

In [49]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import PreTrainedTokenizerFast
import pandas as pd

# Класс должен быть определён на уровне модуля (глобально)
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        history = str(self.data.iloc[index]['history'])
        age_class = self.data.iloc[index]['age_class']
        sex = self.data.iloc[index]['sex']

        # Токенизация
        encoding = self.tokenizer.encode_plus(
            history,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'age_class': torch.tensor(age_class, dtype=torch.long),
            'sex': torch.tensor(sex, dtype=torch.long)
        }

encode_sex = {"male": 1, "female": 0}
decode_sex = {1: "male", 0: "female"}

# Основной код
tokenizer = PreTrainedTokenizerFast.from_pretrained(output_path + "custom_hf_tokenizer")
df = pd.read_csv(path + 'custom_bert_train.csv')
train_data = df.sample(frac=0.8, random_state=42)
train_data['sex'] = train_data['sex'].apply(lambda x: encode_sex[x])

val_data = df.drop(train_data.index)
print(val_data.shape)
val_data['sex'] = val_data['sex'].apply(lambda x: encode_sex[x])
train_dataset = CustomDataset(train_data, tokenizer, max_len=128)
val_dataset = CustomDataset(val_data, tokenizer, max_len=128)

# Используем num_workers > 0 только если CustomDataset глобально определён
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=4)

(36002, 6)


In [50]:
from transformers import BertModel

class BertMultiHeadClassifier(torch.nn.Module):
    def __init__(self, model_name, num_age_categories=4, num_gender_categories=2):
        super(BertMultiHeadClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = torch.nn.Dropout(0.2)
        # Классификатор для возрастных категорий
        self.age_classifier = torch.nn.Linear(self.bert.config.hidden_size, num_age_categories)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]  # Используем CLS-токен для классификации
        pooled_output = self.dropout(pooled_output)
        age_logits = self.age_classifier(pooled_output)
        return age_logits


In [51]:
import torch
from transformers import AdamW
from torch.nn import CrossEntropyLoss
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Создание модели
model = BertMultiHeadClassifier("bert-base-uncased", num_age_categories=4, num_gender_categories=2)
model = model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-4, eps=1e-8, weight_decay=0.01)

EPOCHS = 15
total_steps = len(train_dataloader) * EPOCHS

age_class_weights = torch.tensor([6.42, 0.75, 0.68, 0.95],
                                 dtype=torch.float32).to(device)  

age_loss_fn = CrossEntropyLoss(weight=age_class_weights).to(device)
gender_loss_fn = CrossEntropyLoss().to(device)



In [52]:
from tqdm import tqdm

# Функция обучения модели
def train_epoch(model, data_loader, age_loss_fn, gender_loss_fn, optimizer, device, epoch):
    model.train()
    total_loss = 0

    loop = tqdm(enumerate(data_loader), total=len(data_loader), desc=f"Epoch {epoch}", leave=False)

    for _, batch in loop:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        age_labels = batch['age_class'].to(device)

        age_logits = model(input_ids, attention_mask=attention_mask)

        age_loss = age_loss_fn(age_logits, age_labels)
        loss = age_loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())

    return total_loss / len(data_loader)

# Функция валидации модели
def eval_model(model, data_loader, age_loss_fn, gender_loss_fn, device):
    model.eval()
    total_loss = 0
    correct_age = 0
    correct_gender = 0
    total_samples = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            age_labels = batch['age_class'].to(device)

            age_logits = model(input_ids, attention_mask=attention_mask)

            age_loss = age_loss_fn(age_logits, age_labels)
            loss = age_loss

            total_loss += loss.item()

            _, age_preds = torch.max(age_logits, dim=1)
            correct_age += (age_preds == age_labels).sum().item()
            total_samples += age_labels.size(0)

    accuracy_age = correct_age / total_samples

    return total_loss / len(data_loader), accuracy_age


In [53]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_loss = train_epoch(model, train_dataloader, age_loss_fn, gender_loss_fn, optimizer, device, epoch)
    print(f'Training loss: {train_loss}')

    val_loss, val_acc_age = eval_model(model, val_dataloader, age_loss_fn, gender_loss_fn, device)
    print(f'Validation loss: {val_loss}')
    print(f'Validation Age Accuracy: {val_acc_age}')
    print(f'Validation Gender Accuracy: {val_acc_gender}')
    
    model.bert.save_pretrained(output_path+f"epoch_{epoch}/bert_fine_{epoch}")
    torch.save(model.age_classifier.state_dict(), output_path+f'epoch_{epoch}/trained_age_{epoch}.pt')
    
    print("Save model")
    


Epoch 1/15
----------


Epoch 0:  65%|██████▌   | 5879/9001 [33:26<17:42,  2.94it/s, loss=1.22] 

In [None]:
model.bert.save_pretrained(output_path+"bert_fine")
torch.save(model.age_classifier.state_dict(), output_path+'trained_age.pt')