In [None]:
import torch
from transformers import AutoTokenizer, AutoModel


# Checkpoint identifier shared by the tokenizer here and the encoder later on.
model_path = 'distilbert/distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Pick the first CUDA device when one is visible, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
device

'cuda:0'

In [None]:
import torch.nn as nn
import numpy as np
import torch.nn.functional as F



class BertMultiRegressor(nn.Module):
    """
    Multi-output regressor built on top of BERT embeddings.

    The encoder embedding of the first token is concatenated with a per-sample
    audio feature vector and fed through one linear layer that produces
    ``output_size`` regression values.

    Args:
        bert_model_name: Name or path handed to ``AutoModel.from_pretrained``.
        output_size: Number of regression values predicted per sample.
        audio_dim: Length of the audio feature vector appended to the text
            embedding. Defaults to 7, the width that used to be hard-coded.
    """

    def __init__(self, bert_model_name, output_size, audio_dim=7):
        super().__init__()
        self.bert = AutoModel.from_pretrained(bert_model_name)
        self.regressor = nn.Linear(self.bert.config.hidden_size + audio_dim, output_size)

    def forward(self,
        input_ids,
        attention_mask=None,
        audio_traits=None,
        labels=None,):
        """
        Run the encoder, append the audio features and regress the targets.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Optional attention mask passed to the encoder.
            audio_traits: Audio feature tensor, one row per sample. Required
                even though it has a ``None`` default in the signature.
            labels: Optional regression targets with the same shape as the
                predictions. When given, a mean absolute error loss is computed.

        Returns:
            dict with ``"loss"`` (``None`` when ``labels`` is ``None``) and
            ``"logits"`` (the float64 predictions).

        Raises:
            ValueError: If ``audio_traits`` is ``None``.
        """
        if audio_traits is None:
            # Previously this surfaced as an opaque TypeError inside the concat.
            raise ValueError(
                "audio_traits is required: the regressor head expects the text "
                "embedding concatenated with the audio feature vector"
            )

        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
        )

        # Embedding of the first token of every sequence.
        first_token_embedding = encoder_outputs.last_hidden_state[:, 0, :]

        # Align dtype/device so the concatenation never depends on implicit
        # type promotion (e.g. float64 audio features against float32 weights).
        audio_traits = audio_traits.to(
            dtype=first_token_embedding.dtype,
            device=first_token_embedding.device,
        )

        # Join the text embedding and the audio vector along the feature axis.
        features = torch.cat((first_token_embedding, audio_traits), dim=1)

        # Predictions are returned as float64, as before.
        # NOTE(review): presumably chosen to match labels built with np.array
        # in preprocess_function - confirm before changing.
        predictions = self.regressor(features).to(torch.float64)

        loss = None
        if labels is not None:
            # One vectorised L1 loss over the whole batch: for a rectangular
            # batch this equals the mean of the per-sample mean absolute errors.
            loss = F.l1_loss(predictions, labels.to(predictions.dtype))

        return {
            "loss": loss,
            "logits": predictions
        }


# Build the regressor on the checkpoint named by model_path, with six outputs per sample.
model = BertMultiRegressor(model_path, 6)

In [None]:
# Put the module into training mode; the call returns the module, so its repr is shown as the cell output.
model.train()

BertMultiRegressor(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Lin

In [None]:
import pandas as pd

# Формирование датасета для обучения

def process_dataset(data, data_trans, data_emo, data_cap, data_audio):
    """
    Assemble one wide table keyed by ``video_path`` from the raw inputs.

    ``data`` is in long form (``video_path``, ``emotion``, ``annotation``) and
    is widened so that every distinct ``emotion`` value becomes its own column.
    The result is then outer-merged, in order, with the transcription table,
    the caption table, the ``Emotion`` column of the emotion table and the
    audio table. The last three have ``Filename`` renamed to ``video_path``
    before merging.

    Returns:
        The merged DataFrame. Because every merge is an outer join, rows
        missing from any input are kept and filled with NaN.
    """
    join_key = 'video_path'

    # Long -> wide: one annotation column per distinct value of 'emotion'.
    wide = pd.DataFrame()
    for trait in data['emotion'].unique():
        trait_rows = data.loc[data['emotion'] == trait, [join_key, 'annotation']]
        trait_rows = trait_rows.rename(columns={'annotation': trait})
        wide = trait_rows if wide.empty else wide.merge(trait_rows, on=join_key, how='outer')

    # Auxiliary tables use 'Filename' as their key column name.
    captions = data_cap.rename(columns={'Filename': join_key})
    emotions = data_emo.rename(columns={'Filename': join_key})[[join_key, 'Emotion']]
    audio = data_audio.rename(columns={'Filename': join_key})

    merged = wide
    for extra in (data_trans, captions, emotions, audio):
        merged = merged.merge(extra, on=join_key, how='outer')
    return merged

In [None]:
# Merge the five validation CSVs into one table and preview its first two rows.
# NOTE(review): test_texts is not referenced again in the visible cells, and the
# same files are read again later to build test_dataset.
test_texts = process_dataset(pd.read_csv('annotation_validation.csv'), pd.read_csv('transcription_validation.csv'),
                              pd.read_csv('emotions_validation.csv'), pd.read_csv('video_captions_validation.csv'),
                              pd.read_csv('audio_features_validation.csv'))
test_texts.head(2)

Unnamed: 0,video_path,extraversion,neuroticism,agreeableness,conscientiousness,interview,openness,transcription,Description,Emotion,AudioFeatures
0,-6otZ7M-Mro.003.mp4,0.71028,0.552083,0.681319,0.728155,0.654206,0.666667,... About you. I have a third nipple. It doesn...,a man with blonde hair and a plaid shirt,neutral,"[121.996249412252, 5191.308075129179, 18.4303,..."
1,-6otZ7M-Mro.005.mp4,0.523364,0.635417,0.626374,0.728155,0.64486,0.6,"Beauty gurus, whatever you want to call them. ...",a man with blonde hair and a plaid shirt,angry,"[115.36711503771465, 2358.436542021055, 23.853..."


In [None]:
# Merge the five training CSVs into one table; the row count is shown as the cell output.
train_dataset = process_dataset(pd.read_csv('annotation_training.csv'), pd.read_csv('transcription_training.csv'),
                                pd.read_csv('emotions_train.csv'), pd.read_csv('video_captions.csv'),
                                pd.read_csv('audio_features_train.csv'))
len(train_dataset)

6000

In [None]:
# Merge the five validation CSVs into the table that is later passed to the Trainer as eval_dataset.
test_dataset = process_dataset(pd.read_csv('annotation_validation.csv'), pd.read_csv('transcription_validation.csv'),
                              pd.read_csv('emotions_validation.csv'), pd.read_csv('video_captions_validation.csv'),
                              pd.read_csv('audio_features_validation.csv'))

In [None]:
# Discard rows with any missing value and renumber the index from 0.
train_dataset = train_dataset.dropna().reset_index(drop=True)
len(train_dataset)

5766

In [None]:
# Discard rows with any missing value and renumber the index from 0.
test_dataset = test_dataset.dropna().reset_index(drop=True)
len(test_dataset)

1923

In [None]:
# Preview the first two rows of the training table after the missing-value filter.
train_dataset.head(2)

Unnamed: 0,video_path,extraversion,neuroticism,agreeableness,conscientiousness,interview,openness,transcription,Description,Emotion,AudioFeatures
0,--Ymqszjv54.001.mp4,0.551402,0.5,0.527473,0.650485,0.588785,0.744444,I like Tabasco sauce. I like Louisiana Hot Sau...,a man holding a bowl of vegetables in his hand,neutral,"[122.1400763651526, 680.116572306553, 49.11913..."
1,--Ymqszjv54.003.mp4,0.392523,0.427083,0.516484,0.475728,0.392523,0.466667,Lot more things there. Then the menus are a lo...,a man in an orange hoodie is sitting in front ...,neutral,"[121.88748153289707, 1014.9409413938398, 53.44..."


In [None]:
import numpy as np
from transformers import DataCollatorWithPadding
from ast import literal_eval


def preprocess_function(example):
    """
    Convert one dataset row into the inputs consumed by the model.

    The caption, emotion label and transcription are folded into a single
    prompt, which is tokenized with the module-level ``tokenizer`` and padded
    or truncated to 512 tokens. The six trait scores are attached as ``labels``
    and the parsed ``AudioFeatures`` literal as ``audio_traits``.

    Args:
        example: Row providing ``Description``, ``Emotion``, ``transcription``,
            ``AudioFeatures`` and the six trait columns.

    Returns:
        The tokenizer output extended with ``labels`` and ``audio_traits``.
    """
    target_columns = ['extraversion', 'neuroticism', 'agreeableness',
                      'conscientiousness', 'interview', 'openness']

    # All textual signals go into one input string.
    prompt = f"There is {example['Description']}. This person feels {example['Emotion']}. This person says: {example['transcription']}"
    encoded = tokenizer(prompt, truncation=True, padding='max_length', max_length=512)

    target_values = example[target_columns].tolist()
    encoded['labels'] = np.array(target_values)

    # AudioFeatures is stored as the text of a Python literal; parse it first.
    audio_vector = literal_eval(example['AudioFeatures'])
    encoded['audio_traits'] = torch.FloatTensor(audio_vector)
    return encoded

# Run preprocess_function on every row and rebind both names to the results.
# NOTE(review): the original DataFrames are overwritten here, so re-running this
# cell without rebuilding them will presumably fail - confirm.
train_dataset = train_dataset.apply(preprocess_function, axis=1)
test_dataset = test_dataset.apply(preprocess_function, axis=1)


In [None]:
import numpy as np
import math
from torch import nn
from transformers import TrainingArguments, Trainer

# Запуск обучения

def compute_metrics(p):
    """
    Summarise regression quality for one evaluation pass.

    Args:
        p: Object exposing ``label_ids`` and ``predictions`` as arrays that can
            be subtracted element-wise.

    Returns:
        dict with ``MultiRMSE`` (RMSE taken along axis 0, then averaged),
        ``MultiMAE`` (mean absolute error over all elements) and ``Accuracy``
        (``1 - MultiMAE``).
    """
    residuals = p.label_ids - p.predictions

    # Root of the mean squared error along axis 0, then the mean of those roots.
    rmse_along_axis0 = np.sqrt(np.mean(residuals ** 2, axis=0))
    multi_rmse = np.mean(rmse_along_axis0)

    multi_mae = np.mean(np.abs(residuals))

    metrics = {}
    metrics['MultiRMSE'] = multi_rmse
    metrics['MultiMAE'] = multi_mae
    metrics['Accuracy'] = 1 - multi_mae
    return metrics


# Training configuration: outputs go to "disbert_audio"; evaluation and
# checkpoint saving both happen once per epoch, and the best checkpoint is
# loaded back when training ends. No metric_for_best_model is given, so the
# library default decides which checkpoint counts as best.
training_args = TrainingArguments(
  output_dir="disbert_audio",
  learning_rate=2e-5,
  per_device_train_batch_size=3,
  per_device_eval_batch_size=3,
  num_train_epochs=5,
  weight_decay=0.01,
  evaluation_strategy="epoch",
  save_strategy="epoch",
  load_best_model_at_end=True,
)

# No data_collator is passed, so the Trainer falls back to its default collation.
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=train_dataset,
   eval_dataset=test_dataset,
   compute_metrics=compute_metrics,
)

# Start fine-tuning; the returned TrainOutput is shown as the cell output.
trainer.train()




Epoch,Training Loss,Validation Loss,Multirmse,Multimae,Accuracy
1,0.0887,0.147107,0.304377,0.147107,0.852893
2,0.0809,0.120802,0.164951,0.120802,0.879198
3,0.074,0.118761,0.187752,0.118761,0.881239
4,0.0611,0.118132,0.177369,0.118132,0.881868
5,0.0523,0.112364,0.150269,0.112364,0.887636


TrainOutput(global_step=9610, training_loss=0.07286596011896136, metrics={'train_runtime': 576.3636, 'train_samples_per_second': 50.021, 'train_steps_per_second': 16.674, 'total_flos': 0.0, 'train_loss': 0.07286596011896136, 'epoch': 5.0})

In [None]:
# Evaluate on the eval_dataset given to the Trainer and display the resulting metrics dict.
trainer.evaluate()

{'eval_loss': 0.11236380481704547,
 'eval_MultiRMSE': 0.15026899889002251,
 'eval_MultiMAE': 0.11236380481704548,
 'eval_Accuracy': 0.8876361951829546,
 'eval_runtime': 9.5509,
 'eval_samples_per_second': 201.343,
 'eval_steps_per_second': 67.114,
 'epoch': 5.0}

In [None]:
# Save the model: only the linear regression head's parameters are written to
# this file; the encoder weights are not included in it.

torch.save(model.regressor.state_dict(), 'reg_state_dict.pth')

In [None]:
# Write the fine-tuned encoder and the tokenizer files into the same directory.
# NOTE(review): the recorded cell output lists paths under 'models/hope_bert/',
# which does not match save_path - the output looks stale; re-run to confirm.
save_path = 'models/regressor/'
model.bert.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('models/hope_bert/tokenizer_config.json',
 'models/hope_bert/special_tokens_map.json',
 'models/hope_bert/vocab.txt',
 'models/hope_bert/added_tokens.json',
 'models/hope_bert/tokenizer.json')