In [13]:
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer, BertModel, BertTokenizer
import torch
import pandas as pd
from torch.nn import Linear, Softmax
from sklearn.model_selection import train_test_split
import numpy as np

def convert_time_to_seconds(time_str):
    h, m, s = map(float, time_str.split(':'))
    return h * 3600 + m * 60 + s

def pad_audio(audio_segment, target_length):    
    # Вычисление длины, которую нужно добавить
    padding_length = target_length - len(audio_segment)
    if padding_length > 0:
        # Добавление нулей в конец аудиосегмента
        audio_segment = np.pad(audio_segment, (0, padding_length))
    return audio_segment


# Загрузка моделей и токенизаторов
audio_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
audio_tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")

text_model = BertModel.from_pretrained('bert-base-uncased')
text_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Создание пустого DataFrame для записи результатов

results = pd.DataFrame(columns=["Speaker", "Text", "Time Segment", "Embeddings","Emotions"])
results_text = pd.DataFrame(columns=["Speaker", "Text", "Time Segment", "Emotions","Embeddings"])
results_audio = pd.DataFrame(columns=["Speaker", "Text", "Time Segment", "Emotions","Embeddings"])

i = 0

# Создание пустых DataFrame для записи результатов
train_results = pd.DataFrame(columns=["Speaker", "Text", "Time Segment", "Embeddings", "Emotions"])
test_results = pd.DataFrame(columns=["Speaker", "Text", "Time Segment", "Embeddings", "Emotions"])

train_results_text = pd.DataFrame(columns=["Speaker", "Text", "Time Segment", "Emotions","Embeddings"])
test_results_text = pd.DataFrame(columns=["Speaker", "Text", "Time Segment", "Emotions","Embeddings"])

train_results_audio = pd.DataFrame(columns=["Speaker", "Text", "Time Segment", "Emotions","Embeddings"])
test_results_audio = pd.DataFrame(columns=["Speaker", "Text", "Time Segment", "Emotions","Embeddings"])

target_length = 16000

for i in range(1, 93):  # Для каждого из 17 файлов

    audio, sr = librosa.load(f"E:/Практика/Разметка Сухацкий - Щеглов/wav/{i}.wav", sr=16000)

    data = pd.read_csv(f'E:/Практика/Разметка Сухацкий - Щеглов/csv/{i}.csv', header=None)

    data.iloc[:, 0] = data.iloc[:, 0].replace({"spekear 1": "speaker 1", "spekear 2": "speaker 2", "spekear": "speaker 1", "speaker1": "speaker 1", "speaker2": "speaker 2"})

    # Фильтрация строк, которые содержат "speaker1" или "speaker2"
    filtered_data = data[data.iloc[:, 0].isin(["speaker 1", "speaker 2"])]

    # Удаление строк, где текст равен "0"
    filtered_data = filtered_data[filtered_data.iloc[:, 8] != "0"]

    filtered_data = filtered_data.reset_index(drop=True)

    emotions_data = data[data.iloc[:, 0].isin(["Эмоция"])]

    i = 0  # Reset i to 0 for each new file

    for row in filtered_data.itertuples(index=False):

        start_time = convert_time_to_seconds(row[2])  # начальное время в секундах

        end_time = convert_time_to_seconds(row[4])  # конечное время в секундах

        start_index = int(start_time * sr)  # начальный индекс в сэмплах
        end_index = int(end_time * sr)  # конечный индекс в сэмплах

        audio_segment = audio[start_index:end_index]

        audio_segment = pad_audio(audio_segment, target_length)

        audio_input_values = audio_tokenizer(audio_segment, return_tensors="pt").input_values

        audio_outputs = audio_model(audio_input_values)

        # Извлечение встраиваний
        audio_embeddings = audio_outputs.logits

        text = row[8]

        # Предобработка и токенизация текста
        text_inputs = text_tokenizer(text, return_tensors="pt")
        text_embeddings = text_model(**text_inputs).last_hidden_state

        # Получение размера встраиваний
        audio_embedding_size = audio_embeddings.size(-1)
        text_embedding_size = text_embeddings.size(-1)

        # Преобразование встраиваний в одинаковый размер
        audio_attention = Linear(audio_embedding_size, 1)
        text_attention = Linear(text_embedding_size, 1)
        softmax = Softmax(dim=1)

        audio_attention_weights = softmax(audio_attention(audio_embeddings))
        audio_embeddings = (audio_embeddings * audio_attention_weights).sum(dim=1)

        text_attention_weights = softmax(text_attention(text_embeddings))
        text_embeddings = (text_embeddings * text_attention_weights).sum(dim=1)

        #   Объединение встраиваний
        combined_embeddings = torch.cat((audio_embeddings, text_embeddings), dim=-1)

        embeddings_transform = Linear(combined_embeddings.size(-1), 768)  

        combined_embeddings = embeddings_transform(combined_embeddings)

        combined_embeddings_numpy = combined_embeddings.detach().numpy()

        text_embeddings_numpy = text_embeddings.detach().numpy()

        audio_embeddings_numpy = audio_embeddings.detach().numpy()


    # Запись результатов в DataFrame
        results = results.append({"Speaker": row[0], "Text": text, "Time Segment": (start_time, end_time), "Embeddings": combined_embeddings_numpy, "Emotions":  emotions_data.iloc[i, -1]}, ignore_index=True)
        results_text = results_text.append({"Speaker": row[0], "Text": text, "Time Segment": (start_time, end_time), "Emotions":  emotions_data.iloc[i, -1], "Embeddings": text_embeddings_numpy}, ignore_index=True)
        results_audio = results_audio.append({"Speaker": row[0], "Text": text, "Time Segment": (start_time, end_time), "Emotions":  emotions_data.iloc[i, -1], "Embeddings": audio_embeddings_numpy}, ignore_index=True)
        i = i + 1

train_data, test_data = train_test_split(results, test_size=0.2, random_state=42)
# Добавление данных в общие DataFrame
train_results = train_results.append(train_data, ignore_index=True)
test_results = test_results.append(test_data, ignore_index=True)

train_data_text, test_data_text = train_test_split(results_text, test_size=0.2, random_state=42)

train_results_text = train_results_text.append(train_data_text, ignore_index=True)
test_results_text = test_results_text.append(test_data_text, ignore_index=True)

train_data_audio, test_data_audio = train_test_split(results_audio, test_size=0.2, random_state=42)

train_results_audio = train_results_audio.append(train_data_audio, ignore_index=True)
test_results_audio = test_results_audio.append(test_data_audio, ignore_index=True)


Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

In [14]:
data

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,default,,00:00:00.000,0.0,00:00:01.600,1.6,00:00:01.600,1.6,понятно
1,default,,00:00:01.600,1.6,00:00:02.890,2.89,00:00:01.290,1.29,с 4 пути
2,default,,00:00:02.900,2.9,00:00:05.300,5.3,00:00:02.400,2.4,станция заречная разрешаете отправиться
3,default,,00:00:05.300,5.3,00:00:07.740,7.74,00:00:02.440,2.44,при жёлтом показании маршрутного светофора
4,default,,00:00:07.750,7.75,00:00:10.360,10.36,00:00:02.610,2.61,чн2 машинист поезда
5,default,,00:00:10.370,10.37,00:00:11.940,11.94,00:00:01.570,1.57,№1002
6,default,,00:00:11.940,11.94,00:00:13.774,13.774,00:00:01.834,1.834,хлебников
7,Эмоция,,00:00:00.000,0.0,00:00:01.600,1.6,00:00:01.600,1.6,neutral
8,Эмоция,,00:00:01.600,1.6,00:00:02.890,2.89,00:00:01.290,1.29,neutral
9,Эмоция,,00:00:02.900,2.9,00:00:05.300,5.3,00:00:02.400,2.4,neutral


In [15]:
results

Unnamed: 0,Speaker,Text,Time Segment,Embeddings,Emotions
0,speaker 1,машинист поезда номер 11,"(0.0, 2.19)","[[-2.093182, -0.084427565, -0.0048157796, -0.8...",neutral
1,speaker 1,на приближении к станции Тамбов,"(2.19, 4.14)","[[-1.0642551, 0.2636854, 0.31097046, -0.036249...",neutral
2,speaker 2,слушаю вас машинист поезда номер,"(5.13, 6.75)","[[0.5230138, -1.7581105, -0.7612661, -1.035871...",0
3,speaker 2,11 Артемов,"(6.75, 8.857)","[[-1.2073413, 0.62755525, -0.74986386, -1.1010...",neutral
4,speaker 1,по маршруту следования,"(0.0, 1.64)","[[0.5409283, 0.45159, 1.0782692, -1.3063444, 1...",happy
...,...,...,...,...,...
376,speaker 2,понятно переключаюсь на канал номер 2 станцион...,"(19.3, 22.4)","[[0.06510512, 0.42952305, -0.6499426, 0.091657...",0
377,speaker 2,под руководством состовителя петрова машинист ...,"(22.74, 25.0)","[[0.18058798, -0.32275182, -0.04420802, 0.9674...",neutral
378,speaker 2,дежурный по станции обнинск состовитель петров,"(27.4, 29.76)","[[1.0905488, -0.69343084, 0.08977455, -0.25394...",0
379,speaker 2,проверка связи на канале 2,"(29.76, 33.16)","[[0.010929722, 0.5535577, 0.29261592, -0.50885...",happy


In [16]:
results_text 

Unnamed: 0,Speaker,Text,Time Segment,Emotions,Embeddings
0,speaker 1,машинист поезда номер 11,"(0.0, 2.19)",neutral,"[[-0.1956833, -0.27624738, -0.20654655, -0.337..."
1,speaker 1,на приближении к станции Тамбов,"(2.19, 4.14)",neutral,"[[-0.22191104, -0.12967058, -0.060135793, -0.3..."
2,speaker 2,слушаю вас машинист поезда номер,"(5.13, 6.75)",0,"[[-0.17900729, -0.1986678, -0.05033465, -0.329..."
3,speaker 2,11 Артемов,"(6.75, 8.857)",neutral,"[[-0.10914676, -0.13654394, -0.12824324, -0.33..."
4,speaker 1,по маршруту следования,"(0.0, 1.64)",happy,"[[-0.12296123, -0.44541165, -0.105610386, -0.2..."
...,...,...,...,...,...
376,speaker 2,понятно переключаюсь на канал номер 2 станцион...,"(19.3, 22.4)",0,"[[-0.17588872, -0.08140291, 0.1442534, -0.3135..."
377,speaker 2,под руководством состовителя петрова машинист ...,"(22.74, 25.0)",neutral,"[[-0.2583757, -0.05231013, 0.040076576, -0.334..."
378,speaker 2,дежурный по станции обнинск состовитель петров,"(27.4, 29.76)",0,"[[-0.3046777, -0.21704413, 0.21304555, -0.2851..."
379,speaker 2,проверка связи на канале 2,"(29.76, 33.16)",happy,"[[0.22940046, -0.282548, -0.16429695, -0.30635..."


In [17]:
results_audio

Unnamed: 0,Speaker,Text,Time Segment,Emotions,Embeddings
0,speaker 1,машинист поезда номер 11,"(0.0, 2.19)",neutral,"[[5.4614654, -19.606144, -19.473978, -19.51362..."
1,speaker 1,на приближении к станции Тамбов,"(2.19, 4.14)",neutral,"[[5.689903, -18.06169, -17.981596, -18.027658,..."
2,speaker 2,слушаю вас машинист поезда номер,"(5.13, 6.75)",0,"[[5.9489303, -19.777016, -19.671215, -19.6003,..."
3,speaker 2,11 Артемов,"(6.75, 8.857)",neutral,"[[11.96084, -27.260258, -26.941566, -27.0545, ..."
4,speaker 1,по маршруту следования,"(0.0, 1.64)",happy,"[[5.741558, -19.600391, -19.503517, -19.543608..."
...,...,...,...,...,...
376,speaker 2,понятно переключаюсь на канал номер 2 станцион...,"(19.3, 22.4)",0,"[[3.7140734, -16.127014, -15.981678, -16.07527..."
377,speaker 2,под руководством состовителя петрова машинист ...,"(22.74, 25.0)",neutral,"[[3.6749468, -16.284145, -16.127737, -16.21118..."
378,speaker 2,дежурный по станции обнинск состовитель петров,"(27.4, 29.76)",0,"[[4.215435, -17.036978, -16.85318, -16.93004, ..."
379,speaker 2,проверка связи на канале 2,"(29.76, 33.16)",happy,"[[4.572869, -17.784475, -17.618923, -17.663439..."


In [18]:
filtered_data

Unnamed: 0,0,1,2,3,4,5,6,7,8


In [24]:
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.nn import Linear

class EmotionDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        self.linear_layer = Linear(32, 768)  # Предполагая, что ваши встраивания имеют размерность 32

    def __getitem__(self, idx):
        item = {'inputs_embeds': torch.tensor(self.encodings[idx])}
        item['labels'] = torch.tensor(self.labels[idx]).long()
        return item

    def __len__(self):
        return len(self.labels)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Предположим, что у вас есть метки эмоций для каждого сегмента текста
emotion_labels = train_results.iloc[:, -1].unique()  # Замените на ваши метки эмоций

# Преобразование меток эмоций в числовые значения
le = LabelEncoder()

train_labels = le.fit_transform(train_results['Emotions'])

# Создание модели RoBERTa для классификации
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(le.classes_))

# Подготовка данных для обучения и тестирования
datasets = {'combined': (train_results, test_results), 'text': (train_results_text, test_results_text), 'audio': (train_results_audio, test_results_audio)}
for data_type, (train_data, test_data) in datasets.items():
    train_encodings = train_data['Embeddings'].tolist()
    train_labels = le.fit_transform(train_data['Emotions'])
    train_dataset = EmotionDataset(train_encodings, train_labels)

    test_encodings = test_data['Embeddings'].tolist()
    test_labels = le.transform(test_data['Emotions'])
    test_dataset = EmotionDataset(test_encodings, test_labels)

    # Обучение модели
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=10,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        weight_decay=0.01,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )

    print(f"Training model on {data_type} data...")
    trainer.train()

    # Оценка модели
    print(f"Evaluating model on {data_type} data...")
    eval_results = trainer.evaluate()

    # Вывод результатов оценки
    for key, value in eval_results.items():
        print(f"{key}: {value}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  0%|          | 0/190 [19:20<?, ?it/s]


Training model on combined data...


Could not estimate the number of tokens of the input, floating-point operations will not be computed
                                       
100%|██████████| 190/190 [01:55<00:00,  1.65it/s]


{'train_runtime': 115.1563, 'train_samples_per_second': 26.399, 'train_steps_per_second': 1.65, 'train_loss': 1.4538866545024671, 'epoch': 10.0}
Evaluating model on combined data...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
100%|██████████| 2/2 [00:00<00:00, 26.54it/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


eval_loss: 1.2251523733139038
eval_accuracy: 0.6103896103896104
eval_f1: 0.4627147046501885
eval_precision: 0.3725754764715804
eval_recall: 0.6103896103896104
eval_runtime: 0.1547
eval_samples_per_second: 497.827
eval_steps_per_second: 12.931
epoch: 10.0
Training model on text data...


                                       
100%|██████████| 190/190 [01:51<00:00,  1.70it/s]


{'train_runtime': 111.7177, 'train_samples_per_second': 27.211, 'train_steps_per_second': 1.701, 'train_loss': 1.193438720703125, 'epoch': 10.0}
Evaluating model on text data...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
100%|██████████| 2/2 [00:00<00:00, 31.63it/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


eval_loss: 1.2306236028671265
eval_accuracy: 0.6103896103896104
eval_f1: 0.4627147046501885
eval_precision: 0.3725754764715804
eval_recall: 0.6103896103896104
eval_runtime: 0.1319
eval_samples_per_second: 583.666
eval_steps_per_second: 15.16
epoch: 10.0
Training model on audio data...




RuntimeError: The size of tensor a (32) must match the size of tensor b (768) at non-singleton dimension 2

100%|██████████| 30/30 [00:02<00:00, 12.34it/s]

{'train_runtime': 2.4311, 'train_samples_per_second': 197.441, 'train_steps_per_second': 12.34, 'train_loss': 1.475445302327474, 'epoch': 3.0}





TrainOutput(global_step=30, training_loss=1.475445302327474, metrics={'train_runtime': 2.4311, 'train_samples_per_second': 197.441, 'train_steps_per_second': 12.34, 'train_loss': 1.475445302327474, 'epoch': 3.0})