### HuBERT

In [None]:
import numpy as np
import torch
import torchaudio
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForCTC
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from tqdm import tqdm
import pandas as pd
import os

In [None]:
# Training-split annotation table; its VID_NAME column drives the extraction order.
TRAIN_ANNOTATIONS_CSV = 'train-data-annotation-v1.csv'
HUBERT_CHECKPOINT = "facebook/hubert-base-ls960"

df = pd.read_csv(TRAIN_ANNOTATIONS_CSV)

# Input normaliser and network, both loaded from the same checkpoint name.
# NOTE(review): a HuBERT checkpoint is loaded through the Wav2Vec2ForCTC class
# here. The CTC head is presumably not part of that checkpoint, so the logits
# may come from freshly initialised weights -- confirm whether HubertModel
# with last_hidden_state was intended.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(HUBERT_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(HUBERT_CHECKPOINT)

# Fixed 48 kHz -> 16 kHz converter shared by the cells below.
resampler = torchaudio.transforms.Resample(orig_freq=48000, new_freq=16000)

def extract_features(file_path):
    """Run one audio file through the loaded model and return its frame-level logits.

    Args:
        file_path: path to an audio file readable by ``torchaudio.load``.

    Returns:
        A 2-D numpy array (frames, logit dimension) taken from
        ``outputs.logits``, or ``None`` when ``file_path`` does not exist
        (a message is printed in that case).

    NOTE(review): ``model`` is a ``Wav2Vec2ForCTC`` built from the
    ``facebook/hubert-base-ls960`` checkpoint; the CTC head is presumably not
    part of that checkpoint, so these logits may not be meaningful -- confirm.
    """
    target_sample_rate = 16000

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    speech, sample_rate = torchaudio.load(file_path)

    # Resample from the rate the file actually has instead of assuming 48 kHz;
    # audio that is already at the target rate is left untouched.
    if sample_rate != target_sample_rate:
        to_target = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_sample_rate
        )
        speech = to_target(speech)

    # Average all channels down to mono (covers stereo and wider layouts).
    if speech.shape[0] > 1:
        speech = speech.mean(dim=0, keepdim=True)

    # Drop only the channel axis so a very short clip keeps its sample axis.
    speech = speech.squeeze(0).numpy()

    inputs = feature_extractor(
        speech, return_tensors="pt", padding=True, sampling_rate=target_sample_rate
    )
    with torch.no_grad():
        outputs = model(**inputs)

    # Drop only the batch axis: a one-frame output must stay 2-D for the
    # padding/truncation step that indexes [frames, dim].
    return outputs.logits.squeeze(0).cpu().numpy()

def adjust_features(features, target_len):
    """Pad or truncate every feature matrix to ``target_len`` frames and stack them.

    Args:
        features: sequence whose items are 2-D arrays (frames, dim) or ``None``
            for inputs that could not be extracted.
        target_len: number of frames each output item must have.

    Returns:
        ``np.array`` of the adjusted items. Short items are zero-padded at the
        end, long items are cut, and ``None`` items become all-zero matrices.
    """
    # Size the placeholders like the real features so the final stack is
    # rectangular; fall back to 32 only when no real feature is available.
    feature_dim = next((feat.shape[1] for feat in features if feat is not None), 32)

    adjusted_features = []
    for feat in features:
        if feat is None:
            adjusted_features.append(np.zeros((target_len, feature_dim)))
            continue

        missing = target_len - feat.shape[0]
        if missing > 0:
            # Zero-pad at the end of the frame axis only.
            adjusted_features.append(np.pad(feat, ((0, missing), (0, 0)), mode='constant'))
        else:
            adjusted_features.append(feat[:target_len, :])
    return np.array(adjusted_features)

def extract_features_for_role(vid_names, role):
    """Extract features for one role across all videos and equalise their lengths.

    Args:
        vid_names: iterable of video identifiers; each one is expanded to
            ``audio-files-split-2/{vid}/{vid}_{role}.wav``.
        role: file-name suffix selecting the track (this notebook passes
            'talker' and 'listener').

    Returns:
        The array built by ``adjust_features``: one entry per video, in input
        order, each padded or truncated to the median frame count of the files
        that were found.

    Raises:
        ValueError: if no file could be processed, because no median length
            can be computed in that case.
    """
    features = [
        extract_features(f"audio-files-split-2/{vid}/{vid}_{role}.wav")
        for vid in tqdm(vid_names, desc=f"Extracting {role} features")
    ]

    # Only successfully extracted items contribute to the median.
    lengths = [feat.shape[0] for feat in features if feat is not None]
    if not lengths:
        # Without this guard the median of an empty list is NaN and the int()
        # conversion fails with an unrelated-looking message.
        raise ValueError(
            f"No {role} features could be extracted; check the audio file paths."
        )
    median_len = int(np.median(lengths))

    print(f"Median length for {role}: {median_len}")

    return adjust_features(features, median_len)

# Extract features for the training split, following the row order of the
# 'VID_NAME' column in the CSV file.
vid_names = df['VID_NAME'].tolist()
talker_features = extract_features_for_role(vid_names, 'talker')
listener_features = extract_features_for_role(vid_names, 'listener')

print("Talker features shape:", talker_features.shape)
print("Listener features shape:", listener_features.shape)
# Nothing is written to disk in this cell, so report extraction rather than saving.
print("Features extracted successfully.")

In [None]:
# Validation split: same pipeline, driven by the validation annotation table.
VAL_ANNOTATIONS_CSV = 'val-data-annotation-v1.csv'
df = pd.read_csv(VAL_ANNOTATIONS_CSV)

# Keep the CSV row order so features line up with the annotations.
vid_names = df['VID_NAME'].to_list()

val_talker_features = extract_features_for_role(vid_names, role='talker')
val_listener_features = extract_features_for_role(vid_names, role='listener')

In [None]:
# Test split: same pipeline, driven by the test annotation table.
TEST_ANNOTATIONS_CSV = 'test-data-annotation-v1.csv'
df = pd.read_csv(TEST_ANNOTATIONS_CSV)

# Keep the CSV row order so features line up with the annotations.
vid_names = df['VID_NAME'].to_list()

test_talker_features = extract_features_for_role(vid_names, role='talker')
test_listener_features = extract_features_for_role(vid_names, role='listener')

In [None]:
# Each split was padded/truncated to its own median length, so axis 1 can
# differ between splits; cut all three listener arrays to the shortest one.
min_timesteps = min(
    split.shape[1]
    for split in (listener_features, val_listener_features, test_listener_features)
)

train_features_l = listener_features[:, :min_timesteps]
val_features_l = val_listener_features[:, :min_timesteps]
test_features_l = test_listener_features[:, :min_timesteps]

### Wav2Vec 2.0

In [None]:
# Training-split annotation table; its VID_NAME column drives the extraction order.
TRAIN_ANNOTATIONS_CSV = 'train-data-annotation-v1.csv'
WAV2VEC2_CHECKPOINT = "facebook/wav2vec2-base-960h"

df = pd.read_csv(TRAIN_ANNOTATIONS_CSV)

# Processor and base model for Wav2Vec 2.0, both from the same checkpoint name.
# These rebind `model`, replacing the HuBERT object for all cells that follow.
processor = Wav2Vec2Processor.from_pretrained(WAV2VEC2_CHECKPOINT)
model = Wav2Vec2Model.from_pretrained(WAV2VEC2_CHECKPOINT)

# Fixed 48 kHz -> 16 kHz converter shared by the cells below.
resampler = torchaudio.transforms.Resample(orig_freq=48000, new_freq=16000)

def extract_features(file_path):
    """Run one audio file through the loaded model and return its last hidden state.

    Args:
        file_path: path to an audio file readable by ``torchaudio.load``.

    Returns:
        A 2-D numpy array (frames, hidden dimension) taken from
        ``outputs.last_hidden_state``, or ``None`` when ``file_path`` does not
        exist (a message is printed in that case).
    """
    target_sample_rate = 16000

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    speech, sr = torchaudio.load(file_path)

    # Resample from the rate the file actually has instead of assuming 48 kHz;
    # audio that is already at the target rate is left untouched.
    if sr != target_sample_rate:
        to_target = torchaudio.transforms.Resample(
            orig_freq=sr, new_freq=target_sample_rate
        )
        speech = to_target(speech)

    # Average all channels down to mono (covers stereo and wider layouts).
    if speech.shape[0] > 1:
        speech = speech.mean(dim=0, keepdim=True)

    # Drop only the channel axis so a very short clip keeps its sample axis.
    speech = speech.squeeze(0).numpy()

    inputs = processor(
        speech, return_tensors="pt", padding=True, sampling_rate=target_sample_rate
    )

    with torch.no_grad():
        outputs = model(**inputs)

    # Use the last hidden state. Drop only the batch axis: a one-frame output
    # must stay 2-D for the padding/truncation step that indexes [frames, dim].
    return outputs.last_hidden_state.squeeze(0).cpu().numpy()

def adjust_features(features, target_len):
    """Pad or truncate every feature matrix to ``target_len`` frames and stack them.

    Args:
        features: sequence whose items are 2-D arrays (frames, dim) or ``None``
            for inputs that could not be extracted.
        target_len: number of frames each output item must have.

    Returns:
        ``np.array`` of the adjusted items. Short items are zero-padded at the
        end, long items are cut, and ``None`` items become all-zero matrices.
    """
    # Size the placeholders like the real features so the final stack is
    # rectangular; fall back to 768 only when no real feature is available.
    feature_dim = next((feat.shape[1] for feat in features if feat is not None), 768)

    adjusted_features = []
    for feat in features:
        if feat is None:
            adjusted_features.append(np.zeros((target_len, feature_dim)))
            continue

        missing = target_len - feat.shape[0]
        if missing > 0:
            # Zero-pad at the end of the frame axis only.
            adjusted_features.append(np.pad(feat, ((0, missing), (0, 0)), mode='constant'))
        else:
            adjusted_features.append(feat[:target_len, :])
    return np.array(adjusted_features)

def extract_features_for_role(vid_names, role):
    """Extract features for one role across all videos and equalise their lengths.

    Args:
        vid_names: iterable of video identifiers; each one is expanded to
            ``audio-files-split-2/{vid}/{vid}_{role}.wav``.
        role: file-name suffix selecting the track (this notebook passes
            'talker' and 'listener').

    Returns:
        The array built by ``adjust_features``: one entry per video, in input
        order, each padded or truncated to the median frame count of the files
        that were found.

    Raises:
        ValueError: if no file could be processed, because no median length
            can be computed in that case.
    """
    features = [
        extract_features(f"audio-files-split-2/{vid}/{vid}_{role}.wav")
        for vid in tqdm(vid_names, desc=f"Extracting {role} features")
    ]

    # Only successfully extracted items contribute to the median.
    lengths = [feat.shape[0] for feat in features if feat is not None]
    if not lengths:
        # Without this guard the median of an empty list is NaN and the int()
        # conversion fails with an unrelated-looking message.
        raise ValueError(
            f"No {role} features could be extracted; check the audio file paths."
        )
    median_len = int(np.median(lengths))

    print(f"Median length for {role}: {median_len}")

    return adjust_features(features, median_len)

# Training split: walk the 'VID_NAME' column in CSV row order so that the
# feature arrays stay aligned with the annotations.
vid_names = df['VID_NAME'].to_list()

talker_features = extract_features_for_role(vid_names, role='talker')
listener_features = extract_features_for_role(vid_names, role='listener')

# Quick sanity report of the resulting array shapes.
print("Talker features shape:", talker_features.shape)
print("Listener features shape:", listener_features.shape)
print("Features extracted successfully.")

In [None]:
# Validation split: same pipeline, driven by the validation annotation table.
VAL_ANNOTATIONS_CSV = 'val-data-annotation-v1.csv'
df = pd.read_csv(VAL_ANNOTATIONS_CSV)

# Keep the CSV row order so features line up with the annotations.
vid_names = df['VID_NAME'].to_list()

val_talker_features = extract_features_for_role(vid_names, role='talker')
val_listener_features = extract_features_for_role(vid_names, role='listener')

In [None]:
# Test split: same pipeline, driven by the test annotation table.
TEST_ANNOTATIONS_CSV = 'test-data-annotation-v1.csv'
df = pd.read_csv(TEST_ANNOTATIONS_CSV)

# Keep the CSV row order so features line up with the annotations.
vid_names = df['VID_NAME'].to_list()

test_talker_features = extract_features_for_role(vid_names, role='talker')
test_listener_features = extract_features_for_role(vid_names, role='listener')

In [None]:
# Each split was padded/truncated to its own median length, so axis 1 can
# differ between splits; cut all three listener arrays to the shortest one.
min_timesteps = min(
    split.shape[1]
    for split in (listener_features, val_listener_features, test_listener_features)
)

train_features_l = listener_features[:, :min_timesteps]
val_features_l = val_listener_features[:, :min_timesteps]
test_features_l = test_listener_features[:, :min_timesteps]