<a href="https://colab.research.google.com/github/Tatyanka25/Course-paper/blob/main/GigaAM_Emo_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt-get install sox libsndfile1 ffmpeg
!pip install matplotlib>=3.3.2

BRANCH = 'r1.21.0'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]

In [None]:
from typing import List, Union

# hydra-core and omegaconf are imported just below, so install them first.
!pip install hydra-core
!pip install omegaconf

from omegaconf import OmegaConf
import torch
import torchaudio
import soundfile as sf
from omegaconf import DictConfig, ListConfig
import hydra
from google.colab import drive
# Mount Google Drive; later cells read audio and label files from under /content/gdrive/.
drive.mount('/content/gdrive/')
# List the mount point as a quick visual check of the mount.
!ls /content/gdrive/

In [4]:
class SpecScaler(torch.nn.Module):
    """Element-wise log of the input after clamping it to ``[1e-9, 1e9]``.

    The lower bound keeps ``log`` finite for zero or negative entries.
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``log(clamp(x, 1e-9, 1e9))`` as a new tensor.

        Args:
            x: Input tensor; it is left unmodified.

        Returns:
            Tensor of the same shape as ``x``.
        """
        # Out-of-place clamp: the in-place ``clamp_`` overwrote the caller's
        # tensor as a hidden side effect of calling the module.
        return torch.log(x.clamp(1e-9, 1e9))


class GigaAMEmo(torch.nn.Module):
    """Emotion classifier assembled from a Hydra/OmegaConf configuration.

    The config must expose ``id2name`` plus three instantiable nodes:
    ``feature_extractor``, ``encoder`` and ``classification_head``.
    """

    def __init__(self, conf: Union[DictConfig, ListConfig]):
        """Instantiate the sub-modules described by ``conf``."""
        super().__init__()
        self.id2name = conf.id2name
        self.feature_extractor = hydra.utils.instantiate(conf.feature_extractor)
        self.conformer = hydra.utils.instantiate(conf.encoder)
        self.linear_head = hydra.utils.instantiate(conf.classification_head)

    def forward(self, features, features_length=None):
        """Encode features, average-pool over the last axis and classify.

        Args:
            features: Feature tensor; a 2-D input gets a leading batch axis.
            features_length: Optional per-item lengths passed to the encoder.
                When omitted, every item is assumed to span the full last axis.

        Returns:
            The classification head's output for the pooled encoding (logits).
        """
        if features.dim() == 2:
            features = features.unsqueeze(0)
        # Compare against None explicitly: truth-testing a tensor with more
        # than one element raises, so batched lengths could never be passed.
        if features_length is None:
            features_length = (
                torch.ones(features.shape[0], device=features.device)
                * features.shape[-1]
            )
        encoded, _ = self.conformer(audio_signal=features, length=features_length)
        # A kernel as wide as the last axis collapses it to one mean value.
        encoded_pooled = torch.nn.functional.avg_pool1d(
            encoded, kernel_size=encoded.shape[-1]
        ).squeeze(-1)

        logits = self.linear_head(encoded_pooled)
        return logits

    def get_probs(self, audio_path: str) -> List[List[float]]:
        """Read an audio file and return class probabilities as nested lists.

        Args:
            audio_path: Path to an audio file readable by ``soundfile``.

        Returns:
            Softmax probabilities; callers in this notebook index ``[0]`` to
            get the list for the single file.
        """
        # NOTE(review): the file's sample rate is discarded here. If the
        # feature extractor assumes a fixed rate, files recorded at another
        # rate should be resampled first - confirm against the model config.
        audio_signal, _ = sf.read(audio_path, dtype="float32")
        if audio_signal.ndim == 2:
            # soundfile returns (frames, channels) for multi-channel files;
            # down-mix to mono (presumably the extractor expects a 1-D waveform).
            audio_signal = audio_signal.mean(axis=1)
        device = next(self.parameters()).device
        features = self.feature_extractor(torch.tensor(audio_signal).float().to(device))
        logits = self.forward(features)
        # Explicit class axis; relying on the implicit dim is deprecated.
        probs = torch.nn.functional.softmax(logits, dim=-1).detach().tolist()
        return probs

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

# Loading weights, config and example wav for CTC-model
!wget https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/emo_model_weights.ckpt
!wget https://n-ws-q0bez.s3pd12.sbercloud.ru/b-ws-q0bez-jpv/GigaAM/emo_model_config.yaml

## **Multiple emotions with probabilities**

In [None]:
from omegaconf import OmegaConf

# Model artefacts in the working directory and the clip to classify.
model_config = 'emo_model_config.yaml'
model_weights = 'emo_model_weights.ckpt'
audio_path = '/content/gdrive/My Drive/Emotion_models/My_audio/angry.wav'
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the network from its config, load the checkpoint on CPU, then move
# the model to the chosen device and switch it to inference mode.
conf = OmegaConf.load(model_config)
model = GigaAMEmo(conf)
ckpt = torch.load(model_weights, map_location="cpu")
model.load_state_dict(ckpt, strict=False)
model = model.to(device)
model.eval()

with torch.no_grad():
    probs = model.get_probs(audio_path)[0]

# One "name: probability" entry per class, in class-id order.
print(", ".join(f"{model.id2name[i]}: {p:.3f}" for i, p in enumerate(probs)))

## **The most likely emotion**

In [None]:
from omegaconf import OmegaConf

# Model artefacts in the working directory and the clip to classify.
model_config = 'emo_model_config.yaml'
model_weights = 'emo_model_weights.ckpt'
audio_path = '/content/gdrive/My Drive/Emotion_models/My_audio/angry.wav'
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the network, load the checkpoint on CPU, move to the device, eval mode.
conf = OmegaConf.load(model_config)
model = GigaAMEmo(conf)
ckpt = torch.load(model_weights, map_location="cpu")
model.load_state_dict(ckpt, strict=False)
model = model.to(device)
model.eval()

with torch.no_grad():
    probs = model.get_probs(audio_path)[0]

# (class id, probability) pair with the highest probability.
emotion = max(enumerate(probs), key=lambda pair: pair[1])
print(f"Predicted emotion: {model.id2name[emotion[0]]}")

## **RAVDESS**

In [None]:
#RAVDESS
import os, glob
import numpy as np
from sklearn.metrics import accuracy_score

# Third dash-separated field of the file name -> emotion name.
emotion_labels = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'positive',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised',
}

# Only files carrying one of these emotions are evaluated.
focused_emotion_labels = ['positive', 'sad', 'angry', 'neutral']

model_config = 'emo_model_config.yaml'
model_weights = 'emo_model_weights.ckpt'
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the network, load the checkpoint on CPU, move to the device, eval mode.
conf = OmegaConf.load(model_config)
model = GigaAMEmo(conf)
ckpt = torch.load(model_weights, map_location="cpu")
model.load_state_dict(ckpt, strict=False)
model = model.to(device)
model.eval()

true_labels = []
predictions = []

for file in glob.glob("/content/gdrive/My Drive/Основы программирования/Emotions/Actor_*//*.wav"):
    audio_path = os.path.basename(file)
    emotion = emotion_labels[audio_path.split("-")[2]]
    if emotion in focused_emotion_labels:
        true_labels.append(emotion)
        with torch.no_grad():
            probs = model.get_probs(file)[0]
        # (class id, probability) pair with the highest probability.
        emotion_1 = max(enumerate(probs), key=lambda pair: pair[1])
        predictions.append(model.id2name[emotion_1[0]])

accuracy = accuracy_score(true_labels, predictions)
print("Accuracy of the Recognizer is: {:.1f}%".format(accuracy * 100))

## **RAVDESS: recognition accuracy for each emotion**

In [None]:
import os, glob
import numpy as np
from sklearn.metrics import classification_report

# Third dash-separated field of the file name -> emotion name.
emotion_labels = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'positive',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised',
}

# Only files carrying one of these emotions are evaluated.
focused_emotion_labels = ['positive', 'sad', 'angry', 'neutral']

model_config = 'emo_model_config.yaml'
model_weights = 'emo_model_weights.ckpt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the network, load the checkpoint on CPU, move to the device, eval mode.
conf = OmegaConf.load(model_config)
model = GigaAMEmo(conf)
ckpt = torch.load(model_weights, map_location="cpu")
model.load_state_dict(ckpt, strict=False)
model = model.to(device)
model.eval()

true_labels = []
predictions = []

for file in glob.glob("/content/gdrive/My Drive/Основы программирования/Emotions/Actor_*//*.wav"):
    audio_path = os.path.basename(file)
    emotion = emotion_labels[audio_path.split("-")[2]]
    if emotion not in focused_emotion_labels:
        continue
    true_labels.append(emotion)
    with torch.no_grad():
        probs = model.get_probs(file)[0]
    # (class id, probability) pair with the highest probability.
    emotion_1 = max(enumerate(probs), key=lambda item: item[1])
    predictions.append(model.id2name[emotion_1[0]])

# Pass `labels` together with `target_names`: without it sklearn orders the
# classes alphabetically and pairs them positionally with target_names, so
# every row of the report was printed under the wrong emotion name.
report = classification_report(
    y_true=true_labels,
    y_pred=predictions,
    labels=focused_emotion_labels,
    target_names=focused_emotion_labels,
)
print(report)

## **SAVEE**

In [None]:
#SAVEE
import os, glob
from sklearn.metrics import accuracy_score
import re

# Leading letter run of the file name -> emotion name.
emotion_labels = {
    'n': 'neutral',
    'c': 'calm',
    'h': 'positive',
    'sa': 'sad',
    'a': 'angry',
    'f': 'fearful',
    'd': 'disgust',
    'su': 'surprised',
}

# Only files carrying one of these emotions are evaluated.
focused_emotion_labels = ['positive', 'sad', 'angry', 'neutral']

model_config = 'emo_model_config.yaml'
model_weights = 'emo_model_weights.ckpt'
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the network, load the checkpoint on CPU, move to the device, eval mode.
conf = OmegaConf.load(model_config)
model = GigaAMEmo(conf)
ckpt = torch.load(model_weights, map_location="cpu")
model.load_state_dict(ckpt, strict=False)
model = model.to(device)
model.eval()

true_labels = []
predictions = []

for file in glob.glob("/content/gdrive/My Drive/Основы программирования/savee/*//*.wav"):
    audio_path = os.path.basename(file)
    # First run of ASCII letters in the file name is the emotion code.
    emotion = emotion_labels[re.search(r"([a-zA-Z]+)", audio_path).group(1)]
    if emotion in focused_emotion_labels:
        true_labels.append(emotion)
        with torch.no_grad():
            probs = model.get_probs(file)[0]
        # (class id, probability) pair with the highest probability.
        emotion_1 = max(enumerate(probs), key=lambda pair: pair[1])
        predictions.append(model.id2name[emotion_1[0]])

accuracy = accuracy_score(true_labels, predictions)
print("Accuracy of the Recognizer is: {:.1f}%".format(accuracy * 100))

## **SAVEE: recognition accuracy for each emotion**

In [None]:
import os, glob
from sklearn.metrics import classification_report
import re

# Leading letter run of the file name -> emotion name.
emotion_labels = {
    'n': 'neutral',
    'c': 'calm',
    'h': 'positive',
    'sa': 'sad',
    'a': 'angry',
    'f': 'fearful',
    'd': 'disgust',
    'su': 'surprised',
}

# Only files carrying one of these emotions are evaluated.
focused_emotion_labels = ['positive', 'sad', 'angry', 'neutral']

model_config = 'emo_model_config.yaml'
model_weights = 'emo_model_weights.ckpt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the network, load the checkpoint on CPU, move to the device, eval mode.
conf = OmegaConf.load(model_config)
model = GigaAMEmo(conf)
ckpt = torch.load(model_weights, map_location="cpu")
model.load_state_dict(ckpt, strict=False)
model = model.to(device)
model.eval()

true_labels = []
predictions = []

for file in glob.glob("/content/gdrive/My Drive/Основы программирования/savee/*//*.wav"):
    audio_path = os.path.basename(file)
    # First run of ASCII letters in the file name is the emotion code.
    emotion = emotion_labels[re.search(r"([a-zA-Z]+)", audio_path).group(1)]
    if emotion not in focused_emotion_labels:
        continue
    true_labels.append(emotion)
    with torch.no_grad():
        probs = model.get_probs(file)[0]
    # (class id, probability) pair with the highest probability.
    emotion_1 = max(enumerate(probs), key=lambda item: item[1])
    predictions.append(model.id2name[emotion_1[0]])

# Pass `labels` together with `target_names`: without it sklearn orders the
# classes alphabetically and pairs them positionally with target_names, so
# every row of the report was printed under the wrong emotion name.
report = classification_report(
    y_true=true_labels,
    y_pred=predictions,
    labels=focused_emotion_labels,
    target_names=focused_emotion_labels,
)
print(report)

## **DUSHA**

In [None]:
#DUSHA
import pandas as pd
import tensorflow as tf

# Load the tab-separated annotation table and show which values the
# 'speaker_emo' column takes.
df = pd.read_csv(
    '/content/gdrive/My Drive/Основы программирования/raw_crowd_test.tsv',
    delimiter='\t',
)
unique_emotions = pd.unique(df['speaker_emo'])
print(unique_emotions)

In [None]:
import os, glob
import pandas as pd  # imported here so the cell does not depend on an earlier cell for `pd`
from sklearn.metrics import accuracy_score

model_config = 'emo_model_config.yaml'
model_weights = 'emo_model_weights.ckpt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the network, load the checkpoint on CPU, move to the device, eval mode.
conf = OmegaConf.load(model_config)
model = GigaAMEmo(conf)
ckpt = torch.load(model_weights, map_location="cpu")
model.load_state_dict(ckpt, strict=False)
model = model.to(device)
model.eval()

true_labels = []
predictions = []

# Read the annotation table.
info_df = pd.read_csv('/content/gdrive/My Drive/Основы программирования/raw_crowd_test.tsv', sep='\t')

# hash_id -> speaker_emo, built once. The first row per hash_id wins, which is
# what the previous per-file `.values[0]` lookup returned; the dict replaces
# three full-table scans for every audio file.
emotion_by_hash = (
    info_df.drop_duplicates(subset='hash_id')
    .set_index('hash_id')['speaker_emo']
    .to_dict()
)

# Iterate over the audio files.
for file in glob.glob("/content/gdrive/My Drive/Основы программирования/wavs_test/*.wav"):
    audio_path = os.path.basename(file)
    audio_file_name = audio_path.split('.')[0]
    # Skip files that have no row in the annotation table.
    if audio_file_name not in emotion_by_hash:
        continue
    emotion = emotion_by_hash[audio_file_name]
    # Skip files whose 'speaker_emo' value is missing.
    if pd.isnull(emotion):
        continue
    true_labels.append(emotion)
    with torch.no_grad():
        probs = model.get_probs(file)[0]
    # (class id, probability) pair with the highest probability.
    emotion_1 = max(enumerate(probs), key=lambda item: item[1])
    predictions.append(model.id2name[emotion_1[0]])

accuracy = accuracy_score(y_true=true_labels, y_pred=predictions)
print("Accuracy of the Recognizer is: {:.1f}%".format(accuracy*100))

## **DUSHA: recognition accuracy for each emotion**

In [None]:
import os, glob
import pandas as pd  # imported here so the cell does not depend on an earlier cell for `pd`
from sklearn.metrics import classification_report

model_config = 'emo_model_config.yaml'
model_weights = 'emo_model_weights.ckpt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the network, load the checkpoint on CPU, move to the device, eval mode.
conf = OmegaConf.load(model_config)
model = GigaAMEmo(conf)
ckpt = torch.load(model_weights, map_location="cpu")
model.load_state_dict(ckpt, strict=False)
model = model.to(device)
model.eval()

true_labels = []
predictions = []

# Classes shown in the report, in display order.
focused_emotion_labels = ['positive', 'sad', 'angry', 'neutral']

# Read the annotation table.
info_df = pd.read_csv('/content/gdrive/My Drive/Основы программирования/raw_crowd_test.tsv', sep='\t')

# hash_id -> speaker_emo, built once. The first row per hash_id wins, which is
# what the previous per-file `.values[0]` lookup returned; the dict replaces
# three full-table scans for every audio file.
emotion_by_hash = (
    info_df.drop_duplicates(subset='hash_id')
    .set_index('hash_id')['speaker_emo']
    .to_dict()
)

# Iterate over the audio files.
for file in glob.glob("/content/gdrive/My Drive/Основы программирования/wavs_test/*.wav"):
    audio_path = os.path.basename(file)
    audio_file_name = audio_path.split('.')[0]
    # Skip files that have no row in the annotation table.
    if audio_file_name not in emotion_by_hash:
        continue
    emotion = emotion_by_hash[audio_file_name]
    # Skip files whose 'speaker_emo' value is missing.
    if pd.isnull(emotion):
        continue
    true_labels.append(emotion)
    with torch.no_grad():
        probs = model.get_probs(file)[0]
    # (class id, probability) pair with the highest probability.
    emotion_1 = max(enumerate(probs), key=lambda item: item[1])
    predictions.append(model.id2name[emotion_1[0]])

# Pass `labels` together with `target_names`: without it sklearn orders the
# classes alphabetically and pairs them positionally with target_names, so
# every row of the report was printed under the wrong emotion name.
report = classification_report(
    y_true=true_labels,
    y_pred=predictions,
    labels=focused_emotion_labels,
    target_names=focused_emotion_labels,
)
print(report)

## **RESD**

In [None]:
#RESD
import os, glob
import pandas as pd  # imported here so the cell does not depend on an earlier cell for `pd`
from sklearn.metrics import accuracy_score
import statistics


# Value of the table's 'emotion' column -> emotion name used for scoring.
emotion_labels = {
    'sadness': 'sad',
    'neutral': 'neutral',
    'happiness': 'positive',
    'anger': 'angry',
    'fear': 'fearful',
    'disgust': 'disgust',
    'enthusiasm': 'enthusiasm',
}

# Only files carrying one of these emotions are evaluated.
focused_emotion_labels = ['positive', 'sad', 'angry', 'neutral']

model_config = 'emo_model_config.yaml'
model_weights = 'emo_model_weights.ckpt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the network, load the checkpoint on CPU, move to the device, eval mode.
conf = OmegaConf.load(model_config)
model = GigaAMEmo(conf)
ckpt = torch.load(model_weights, map_location="cpu")
model.load_state_dict(ckpt, strict=False)
model = model.to(device)
model.eval()

true_labels = []
predictions = []

# Read the annotation table.
info_resd_df = pd.read_csv('/content/gdrive/My Drive/Основы программирования/resd_test.csv', sep=',')

# name -> emotion, built once. The first row per name wins, which is what the
# previous per-file `.values[0]` lookup returned; the dict replaces three
# full-table scans for every audio file.
emotion_by_name = (
    info_resd_df.drop_duplicates(subset='name')
    .set_index('name')['emotion']
    .to_dict()
)

# Iterate over the audio files.
for file in glob.glob("/content/gdrive/My Drive/Основы программирования/resd_test/*//*.wav"):
    audio_path = os.path.basename(file)
    audio_file_name = audio_path.split('.')[0]
    # Skip files that have no row in the annotation table.
    if audio_file_name not in emotion_by_name:
        continue
    raw_emotion = emotion_by_name[audio_file_name]
    # Skip files whose 'emotion' value is missing.
    if pd.isnull(raw_emotion):
        continue
    emotion = emotion_labels[raw_emotion]
    if emotion not in focused_emotion_labels:
        continue
    true_labels.append(emotion)
    with torch.no_grad():
        probs = model.get_probs(file)[0]
    # (class id, probability) pair with the highest probability.
    emotion_1 = max(enumerate(probs), key=lambda item: item[1])
    predictions.append(model.id2name[emotion_1[0]])

accuracy = accuracy_score(y_true=true_labels, y_pred=predictions)
print("Accuracy of the Recognizer is: {:.1f}%".format(accuracy*100))

## **RESD: recognition accuracy for each emotion**

In [None]:
import os, glob
import pandas as pd  # imported here so the cell does not depend on an earlier cell for `pd`
from sklearn.metrics import classification_report
import statistics


# Value of the table's 'emotion' column -> emotion name used for scoring.
emotion_labels = {
    'sadness': 'sad',
    'neutral': 'neutral',
    'happiness': 'positive',
    'anger': 'angry',
    'fear': 'fearful',
    'disgust': 'disgust',
    'enthusiasm': 'enthusiasm',
}

# Only files carrying one of these emotions are evaluated.
focused_emotion_labels = ['positive', 'sad', 'angry', 'neutral']

model_config = 'emo_model_config.yaml'
model_weights = 'emo_model_weights.ckpt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the network, load the checkpoint on CPU, move to the device, eval mode.
conf = OmegaConf.load(model_config)
model = GigaAMEmo(conf)
ckpt = torch.load(model_weights, map_location="cpu")
model.load_state_dict(ckpt, strict=False)
model = model.to(device)
model.eval()

true_labels = []
predictions = []

# Read the annotation table.
info_resd_df = pd.read_csv('/content/gdrive/My Drive/Основы программирования/resd_test.csv', sep=',')

# name -> emotion, built once. The first row per name wins, which is what the
# previous per-file `.values[0]` lookup returned; the dict replaces three
# full-table scans for every audio file.
emotion_by_name = (
    info_resd_df.drop_duplicates(subset='name')
    .set_index('name')['emotion']
    .to_dict()
)

# Iterate over the audio files.
for file in glob.glob("/content/gdrive/My Drive/Основы программирования/resd_test/*//*.wav"):
    audio_path = os.path.basename(file)
    audio_file_name = audio_path.split('.')[0]
    # Skip files that have no row in the annotation table.
    if audio_file_name not in emotion_by_name:
        continue
    raw_emotion = emotion_by_name[audio_file_name]
    # Skip files whose 'emotion' value is missing.
    if pd.isnull(raw_emotion):
        continue
    emotion = emotion_labels[raw_emotion]
    if emotion not in focused_emotion_labels:
        continue
    true_labels.append(emotion)
    with torch.no_grad():
        probs = model.get_probs(file)[0]
    # (class id, probability) pair with the highest probability.
    emotion_1 = max(enumerate(probs), key=lambda item: item[1])
    predictions.append(model.id2name[emotion_1[0]])

# Pass `labels` together with `target_names`: without it sklearn orders the
# classes alphabetically and pairs them positionally with target_names, so
# every row of the report was printed under the wrong emotion name.
report = classification_report(
    y_true=true_labels,
    y_pred=predictions,
    labels=focused_emotion_labels,
    target_names=focused_emotion_labels,
)
print(report)