# Extracting Audio Embeddings From Pre-Trained Models (PTMs)

### Using A Wrapper by [Girish](https://github.com/CodeVault-girish/SFM-models.git)

In [None]:
!git clone https://github.com/CodeVault-girish/SFM-models.git

Cloning into 'SFM-models'...
remote: Enumerating objects: 164, done.[K
remote: Counting objects: 100% (164/164), done.[K
remote: Compressing objects: 100% (117/117), done.[K
remote: Total 164 (delta 86), reused 118 (delta 43), pack-reused 0 (from 0)[K
Receiving objects: 100% (164/164), 28.74 KiB | 3.59 MiB/s, done.
Resolving deltas: 100% (86/86), done.


In [None]:
import sys

# Make the cloned SFM-models checkout importable.
# NOTE(review): assumes the clone cell ran with /kaggle/working as the current directory.
SFM_MODELS_DIR = "/kaggle/working/SFM-models"

# Guard the append so re-running this cell does not pile up duplicate entries.
if SFM_MODELS_DIR not in sys.path:
    sys.path.append(SFM_MODELS_DIR)


In [None]:
# Pull in the two entry points of the SFM-models wrapper used by this notebook.
from sfm_extractor.extractor import (
    extract_from,
    model_list,
)

# Show the available models; the numbers in this listing are the ids passed
# as the first argument of extract_from() further down.
model_list()

Available models:
1. Trillsson
2. YAMNet
3. Facebook MMS-1B
4. SpeechBrain x-vector
5. Facebook HuBERT-base-ls960
6. Microsoft WavLM-base
7. Facebook Wav2Vec2-XLS-R-1B
8. Facebook Wav2Vec2-base
9. OpenAI Whisper-base
10. Microsoft UniSpeech-SAT-base-100h-Libri-ft
11. speechbrain/spkrec-ecapa-voxceleb


### Creating Placeholder Output Files for the Embeddings

In [6]:
# !ls
# !touch audio_context_WavLM_base_embeddings.csv
# !touch audio_utterance_WavLM_base_embeddings.csv
# !touch audio_utterance_Wav2Vec2_base_embeddings.csv
# !touch audio_context_Wav2Vec2_base_embeddings.csv
!touch audio_context_mms_embeddings.csv
!touch audio_utterance_mms_embeddings.csv
!touch audio_context_hubert_embeddings.csv
!touch audio_utterance_hubert_embeddings.csv

# Extracting Various Embeddings From PTMs

In [7]:
# Embed the context clips with model "3"
# (listed as "Facebook MMS-1B" in the model_list() output; presumably the same numbering).
extract_from(
    "3",
    "/kaggle/input/btp-dataset/audio_context/audio_context",
    output_file="/kaggle/working/audio_context_mms_embeddings.csv",
    device="cuda",
)

100%|██████████| 1202/1202 [11:52<00:00,  1.69it/s]


Saved all features to /kaggle/working/audio_context_mms_embeddings.csv


In [8]:
# Embed the utterance clips with model "3"
# (listed as "Facebook MMS-1B" in the model_list() output; presumably the same numbering).
extract_from(
    "3",
    "/kaggle/input/btp-dataset/audio_utterance/audio_utterance",
    output_file="/kaggle/working/audio_utterance_mms_embeddings.csv",
    device="cuda",
)

100%|██████████| 1202/1202 [04:26<00:00,  4.50it/s]


Saved all features to /kaggle/working/audio_utterance_mms_embeddings.csv


In [9]:
# Embed the utterance clips with model "5"
# (listed as "Facebook HuBERT-base-ls960" in the model_list() output; presumably the same numbering).
extract_from(
    "5",
    "/kaggle/input/btp-dataset/audio_utterance/audio_utterance",
    output_file="/kaggle/working/audio_utterance_hubert_embeddings.csv",
    device="cuda",
)

preprocessor_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Processing audio files: 100%|██████████| 1202/1202 [00:54<00:00, 21.86it/s]


Saved all features to /kaggle/working/audio_utterance_hubert_embeddings.csv


In [10]:
# Embed the context clips with model "5"
# (listed as "Facebook HuBERT-base-ls960" in the model_list() output; presumably the same numbering).
extract_from(
    "5",
    "/kaggle/input/btp-dataset/audio_context/audio_context",
    output_file="/kaggle/working/audio_context_hubert_embeddings.csv",
    device="cuda",
)

Processing audio files: 100%|██████████| 1202/1202 [02:20<00:00,  8.55it/s]


Saved all features to /kaggle/working/audio_context_hubert_embeddings.csv


In [20]:
# Embed the utterance clips with model "8"
# (listed as "Facebook Wav2Vec2-base" in the model_list() output; presumably the same numbering).
extract_from(
    "8",
    "/kaggle/input/btp-dataset/audio_utterance/audio_utterance",
    output_file="/kaggle/working/audio_utterance_Wav2Vec2_base_embeddings.csv",
    device="cuda",
)

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

Processing audio files: 100%|██████████| 1202/1202 [00:58<00:00, 20.52it/s]


Saved all features to /kaggle/working/audio_utterance_Wav2Vec2_base_embeddings.csv


In [21]:
# Embed the context clips with model "8"
# (listed as "Facebook Wav2Vec2-base" in the model_list() output; presumably the same numbering).
extract_from(
    "8",
    "/kaggle/input/btp-dataset/audio_context/audio_context",
    output_file="/kaggle/working/audio_context_Wav2Vec2_base_embeddings.csv",
    device="cuda",
)

Processing audio files: 100%|██████████| 1202/1202 [02:31<00:00,  7.95it/s]


Saved all features to /kaggle/working/audio_context_Wav2Vec2_base_embeddings.csv


# Merging Audio Embeddings (Audio Context + Audio Utterance -> Audio Features)

In [None]:
"""
MERGING THE AUDIO EMBEDDINGS OF CONTEXT AND UTTERNACE WITH LABELS AND OTHER FEATURES
MERGING WavLM Embeddings
"""

import pandas as pd

# Load the CSV files
csv1 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_context_WavLM_base_embeddings.csv")
csv2 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_utterance_WavLM_base_embeddings.csv")
map_df = pd.read_csv("/kaggle/input/btp-audioembeddings/context_to_utterance_map.csv")

# Remove the 'audio_context/' and 'audio_utterance/' prefixes from map.csv
map_df["audio_context"] = map_df["audio_context"].str.replace("audio_context/", "", regex=False)
map_df["audio_utterance"] = map_df["audio_utterance"].str.replace("audio_utterance/", "", regex=False)

# Extract features (excluding the first column which is file_name)
features_csv1 = csv1.iloc[:, 1:].copy()  # Features from csv1
features_csv2 = csv2.iloc[:, 1:].copy()  # Features from csv2

# Rename columns to distinguish between csv1 and csv2 features
features_csv1.columns = [f"audio_c_feature_{col}" for col in features_csv1.columns]
features_csv2.columns = [f"audio_u_feature_{col}" for col in features_csv2.columns]

# Add file_name back to features for merging
features_csv1.insert(0, "filename", csv1.iloc[:, 0])
features_csv2.insert(0, "filename", csv2.iloc[:, 0])

# Merge csv1 with map.csv using audio_context (which is file_name in csv1)
merged_df = map_df.merge(features_csv1, left_on="audio_context", right_on="filename", how="inner")

# Merge csv2 with the updated dataframe using audio_utterance (which is file_name in csv2)
merged_df = merged_df.merge(features_csv2, left_on="audio_utterance", right_on="filename", how="inner", suffixes=("_csv1", "_csv2"))

# Drop redundant filename columns from csv1 and csv2
merged_df.drop(columns=["filename_csv1", "filename_csv2"], inplace=True)

# Rename columns to keep them organized
#merged_df.rename(columns={"audio_context": "file_csv1", "audio_utterance": "file_csv2"}, inplace=True)

# Save the final dataset
merged_df.to_csv("audio_features_WavLM_base.csv", index=False)

print("Merged dataset saved as final_dataset.csv")


In [None]:
"""
MERGING THE AUDIO EMBEDDINGS OF CONTEXT AND UTTERNACE WITH LABELS AND OTHER FEATURES
MERGING Wav2Vec2 Embeddings
"""

import pandas as pd

# Load the CSV files
csv1 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_context_Wav2Vec2_base_embeddings.csv")
csv2 = pd.read_csv("/kaggle/input/btp-audioembeddings/audio_utterance_Wav2Vec2_base_embeddings.csv")
map_df = pd.read_csv("/kaggle/input/btp-audioembeddings/context_to_utterance_map.csv")

# Remove the 'audio_context/' and 'audio_utterance/' prefixes from map.csv
map_df["audio_context"] = map_df["audio_context"].str.replace("audio_context/", "", regex=False)
map_df["audio_utterance"] = map_df["audio_utterance"].str.replace("audio_utterance/", "", regex=False)

# Extract features (excluding the first column which is file_name)
features_csv1 = csv1.iloc[:, 1:].copy()  # Features from csv1
features_csv2 = csv2.iloc[:, 1:].copy()  # Features from csv2

# Rename columns to distinguish between csv1 and csv2 features
features_csv1.columns = [f"audio_c_feature_{col}" for col in features_csv1.columns]
features_csv2.columns = [f"audio_u_feature_{col}" for col in features_csv2.columns]

# Add file_name back to features for merging
features_csv1.insert(0, "filename", csv1.iloc[:, 0])
features_csv2.insert(0, "filename", csv2.iloc[:, 0])

# Merge csv1 with map.csv using audio_context (which is file_name in csv1)
merged_df = map_df.merge(features_csv1, left_on="audio_context", right_on="filename", how="inner")

# Merge csv2 with the updated dataframe using audio_utterance (which is file_name in csv2)
merged_df = merged_df.merge(features_csv2, left_on="audio_utterance", right_on="filename", how="inner", suffixes=("_csv1", "_csv2"))

# Drop redundant filename columns from csv1 and csv2
merged_df.drop(columns=["filename_csv1", "filename_csv2"], inplace=True)

# Rename columns to keep them organized
#merged_df.rename(columns={"audio_context": "file_csv1", "audio_utterance": "file_csv2"}, inplace=True)

# Save the final dataset
merged_df.to_csv("audio_features_Wav2Vec2_base.csv", index=False)

print("Merged dataset saved as final_dataset.csv")


In [11]:
"""
MERGING THE AUDIO EMBEDDINGS OF CONTEXT AND UTTERNACE WITH LABELS AND OTHER FEATURES
MERGING HUBBERT Embeddings
"""

import pandas as pd

# Load the CSV files
csv1 = pd.read_csv("/kaggle/working/audio_context_hubert_embeddings.csv")
csv2 = pd.read_csv("/kaggle/working/audio_utterance_hubert_embeddings.csv")
map_df = pd.read_csv("/kaggle/input/btp-audioembeddings/context_to_utterance_map.csv")

# Remove the 'audio_context/' and 'audio_utterance/' prefixes from map.csv
map_df["audio_context"] = map_df["audio_context"].str.replace("audio_context/", "", regex=False)
map_df["audio_utterance"] = map_df["audio_utterance"].str.replace("audio_utterance/", "", regex=False)

# Extract features (excluding the first column which is file_name)
features_csv1 = csv1.iloc[:, 1:].copy()  # Features from csv1
features_csv2 = csv2.iloc[:, 1:].copy()  # Features from csv2

# Rename columns to distinguish between csv1 and csv2 features
features_csv1.columns = [f"audio_c_feature_{col}" for col in features_csv1.columns]
features_csv2.columns = [f"audio_u_feature_{col}" for col in features_csv2.columns]

# Add file_name back to features for merging
features_csv1.insert(0, "filename", csv1.iloc[:, 0])
features_csv2.insert(0, "filename", csv2.iloc[:, 0])

# Merge csv1 with map.csv using audio_context (which is file_name in csv1)
merged_df = map_df.merge(features_csv1, left_on="audio_context", right_on="filename", how="inner")

# Merge csv2 with the updated dataframe using audio_utterance (which is file_name in csv2)
merged_df = merged_df.merge(features_csv2, left_on="audio_utterance", right_on="filename", how="inner", suffixes=("_csv1", "_csv2"))

# Drop redundant filename columns from csv1 and csv2
merged_df.drop(columns=["filename_csv1", "filename_csv2"], inplace=True)

# Rename columns to keep them organized
#merged_df.rename(columns={"audio_context": "file_csv1", "audio_utterance": "file_csv2"}, inplace=True)

# Save the final dataset
merged_df.to_csv("audio_features_hubert.csv", index=False)

print("Merged dataset saved as final_dataset.csv")


Merged dataset saved as final_dataset.csv


In [12]:
"""
MERGING THE AUDIO EMBEDDINGS OF CONTEXT AND UTTERNACE WITH LABELS AND OTHER FEATURES
MERGING HUBBERT Embeddings
"""

import pandas as pd

# Load the CSV files
csv1 = pd.read_csv("/kaggle/working/audio_context_mms_embeddings.csv")
csv2 = pd.read_csv("/kaggle/working/audio_utterance_mms_embeddings.csv")
map_df = pd.read_csv("/kaggle/input/btp-audioembeddings/context_to_utterance_map.csv")

# Remove the 'audio_context/' and 'audio_utterance/' prefixes from map.csv
map_df["audio_context"] = map_df["audio_context"].str.replace("audio_context/", "", regex=False)
map_df["audio_utterance"] = map_df["audio_utterance"].str.replace("audio_utterance/", "", regex=False)

# Extract features (excluding the first column which is file_name)
features_csv1 = csv1.iloc[:, 1:].copy()  # Features from csv1
features_csv2 = csv2.iloc[:, 1:].copy()  # Features from csv2

# Rename columns to distinguish between csv1 and csv2 features
features_csv1.columns = [f"audio_c_feature_{col}" for col in features_csv1.columns]
features_csv2.columns = [f"audio_u_feature_{col}" for col in features_csv2.columns]

# Add file_name back to features for merging
features_csv1.insert(0, "filename", csv1.iloc[:, 0])
features_csv2.insert(0, "filename", csv2.iloc[:, 0])

# Merge csv1 with map.csv using audio_context (which is file_name in csv1)
merged_df = map_df.merge(features_csv1, left_on="audio_context", right_on="filename", how="inner")

# Merge csv2 with the updated dataframe using audio_utterance (which is file_name in csv2)
merged_df = merged_df.merge(features_csv2, left_on="audio_utterance", right_on="filename", how="inner", suffixes=("_csv1", "_csv2"))

# Drop redundant filename columns from csv1 and csv2
merged_df.drop(columns=["filename_csv1", "filename_csv2"], inplace=True)

# Rename columns to keep them organized
#merged_df.rename(columns={"audio_context": "file_csv1", "audio_utterance": "file_csv2"}, inplace=True)

# Save the final dataset
merged_df.to_csv("audio_features_mms.csv", index=False)

print("Merged dataset saved as final_dataset.csv")


Merged dataset saved as final_dataset.csv
