In [1]:
import os
import torch
import torchaudio
import pandas as pd
from transformers import Wav2Vec2FeatureExtractor
import torch.nn as nn
from transformers import HubertModel

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Scale the VAD segment timing columns by 1000 (presumably seconds -> milliseconds;
# the classification code further down divides `start`/`end` by 1000 before
# multiplying by the sample rate).
#
# WARNING: this cell reads and overwrites the SAME file, so it is not idempotent:
# running it a second time multiplies the already-scaled values by 1000 again.
# Run it exactly once per freshly generated VAD CSV.
vad_segments_path = '/home/teaching/Desktop/priyam/labels/vad_all_segments_all_audios.csv'
df = pd.read_csv(vad_segments_path)
for timing_col in ('start', 'end', 'length'):
    df[timing_col] = df[timing_col] * 1000

# index=False: without it pandas writes the row index as an extra unnamed column,
# which then shows up as "Unnamed: 0" every time the file is read back.
df.to_csv(vad_segments_path, index=False)

In [None]:


# -------------------------------
# Model Definition
# -------------------------------
class HubertClassifierWithFreeze(nn.Module):
    """HuBERT encoder followed by a feed-forward classification head.

    The encoder is loaded with ``HubertModel.from_pretrained``; its final hidden
    states are averaged over dim 1 and passed through a stack of
    Linear -> ReLU -> Dropout layers ending in a ``num_labels``-wide Linear.

    Note: despite the class name, nothing in this class freezes any parameters.

    Args:
        hubert_model_name: Identifier or path passed to ``HubertModel.from_pretrained``.
        num_labels: Width of the final Linear layer (number of output logits).
    """

    def __init__(self, hubert_model_name="facebook/hubert-large-ls960-ft", num_labels=2):
        super().__init__()
        self.base_model = HubertModel.from_pretrained(hubert_model_name)
        self.config = self.base_model.config

        # (output width, dropout probability) for each hidden layer, in order.
        hidden_spec = ((1024, 0.3), (512, 0.3), (256, 0.2), (128, 0.2), (64, 0.1))

        # Modules are appended in the order Linear, ReLU, Dropout so that the
        # positional indices inside nn.Sequential (and therefore the
        # state_dict keys, e.g. "classifier.3.weight") stay the same.
        head_layers = []
        in_features = self.config.hidden_size
        for out_features, drop_p in hidden_spec:
            head_layers += [nn.Linear(in_features, out_features), nn.ReLU(), nn.Dropout(drop_p)]
            in_features = out_features
        head_layers.append(nn.Linear(in_features, num_labels))

        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_values, attention_mask=None):
        """Return classification logits for ``input_values``.

        The encoder's ``last_hidden_state`` is mean-pooled over dim 1 (all
        positions, regardless of ``attention_mask``) before the head is applied.
        """
        encoded = self.base_model(input_values=input_values, attention_mask=attention_mask)
        utterance_embedding = encoded.last_hidden_state.mean(dim=1)
        return self.classifier(utterance_embedding)

# -------------------------------
# Load model and processor
# -------------------------------
# Backbone identifier and fine-tuned checkpoint location.
hubert_name = "facebook/hubert-large-ls960-ft"
model_path = "/home/teaching/Desktop/hubert model/phase2_fine_tuned_model.pt"

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): the feature extractor comes from a wav2vec2 checkpoint while the
# encoder is HuBERT — presumably this matches what was used during fine-tuning; confirm.
processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")

model = HubertClassifierWithFreeze(hubert_model_name=hubert_name)
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted
# source (consider weights_only=True if the installed torch version supports it).
# The loaded state dict is intentionally not bound to a name so it can be freed
# as soon as the weights have been copied into the model.
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()  # inference mode: disables dropout in the classification head

# -------------------------------
# Classify a single segment
# -------------------------------
def classify_segment(waveform, sr):
    """Predict the language label of one audio segment.

    Relies on the module-level ``processor``, ``model`` and ``device``.

    Args:
        waveform: Audio tensor whose dim 0 is averaged away to obtain a mono
            signal (i.e. laid out as channels x samples, as returned by
            ``torchaudio.load``). Must be a CPU tensor, since it is converted
            with ``.numpy()``.
        sr: Sample rate of ``waveform`` in Hz; anything other than 16000 is
            resampled to 16000 first.

    Returns:
        ``"Mandarin"`` if the arg-max class index is 0, otherwise ``"English"``.
    """
    if sr != 16000:
        waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
    waveform = waveform.mean(dim=0)  # convert to mono
    segment = waveform.numpy()

    inputs = processor(segment, sampling_rate=16000, return_tensors="pt", padding=False)
    input_values = inputs["input_values"].to(device)

    # A single, unpadded segment is passed in, so every sample is real audio and
    # the mask must be all ones. Deriving the mask from `input_values != 0`
    # would wrongly mask any sample whose value happens to be exactly 0 (and
    # produce an all-zero mask for a completely silent segment), which corrupts
    # the valid-length computation inside the encoder.
    attention_mask = torch.ones_like(input_values, dtype=torch.long)

    with torch.no_grad():
        logits = model(input_values=input_values, attention_mask=attention_mask)
        pred = torch.argmax(logits, dim=-1).item()

    # Release per-segment tensors before the next call to keep GPU memory flat.
    del inputs, input_values, attention_mask, logits
    torch.cuda.empty_cache()

    return "Mandarin" if pred == 0 else "English"


# -------------------------------
# Process all segments from VAD CSV
# -------------------------------
def classify_vad_segments(vad_csv, audio_dir, output_csv, target_audio_name=None,
                          min_duration_ms=20):
    """Classify every segment listed in a VAD CSV and write the results to a CSV.

    The input CSV must provide the columns ``audio_name``, ``utt_id``,
    ``start``, ``end``, ``length`` and ``language_tag``. ``start`` and ``end``
    are interpreted as milliseconds (they are divided by 1000 and multiplied
    by the sample rate to obtain sample indices).

    For each row:
      * if ``language_tag`` is not ``"language"`` the output tag is ``"NON_SPEECH"``;
      * if the sliced segment is empty or shorter than ``min_duration_ms`` the
        output tag is ``"TooShort"``;
      * otherwise the tag is whatever ``classify_segment`` returns;
      * any exception while loading/slicing/classifying is printed and the
        output tag is ``"Error"`` (best-effort: one bad row does not abort the run).

    Args:
        vad_csv: Path of the input CSV.
        audio_dir: Directory that ``audio_name`` values are joined onto.
        output_csv: Path of the CSV to write (without the index column).
        target_audio_name: If truthy, only rows with this ``audio_name`` are processed.
        min_duration_ms: Minimum segment duration, in milliseconds, that is
            sent to the classifier. Defaults to 20.
    """
    torch.cuda.empty_cache()
    df = pd.read_csv(vad_csv)

    if target_audio_name:
        df = df[df["audio_name"] == target_audio_name]

    output_columns = ["audio_name", "utt_id", "start", "end", "length", "language_tag"]
    results = []

    # Most recently decoded file. Consecutive rows usually belong to the same
    # recording, so this avoids re-decoding the whole file for every segment
    # while holding at most one waveform in memory.
    cached_path = None
    cached_audio = None

    for _, row in df.iterrows():
        audio_name = row["audio_name"]
        audio_path = os.path.join(audio_dir, audio_name)
        start_ms = row["start"]
        end_ms = row["end"]
        utt_id = row["utt_id"]

        if row["language_tag"] == "language":
            try:
                if audio_path != cached_path:
                    # If loading raises, the cache pair is left untouched, so
                    # the next row of the same file retries the load.
                    cached_audio = torchaudio.load(audio_path)
                    cached_path = audio_path
                waveform, sr = cached_audio

                start_sample = int((start_ms / 1000) * sr)
                end_sample = int((end_ms / 1000) * sr)
                segment = waveform[:, start_sample:end_sample]

                min_samples = int((min_duration_ms / 1000) * sr)

                if segment.numel() == 0 or segment.shape[1] < min_samples:
                    predicted = "TooShort"
                    print("Too short...\n")
                else:
                    predicted = classify_segment(segment, sr)

            except Exception as e:
                print(f"Error processing segment {utt_id} in {audio_name}: {e}")
                predicted = "Error"
                print("Error")
        else:
            predicted = "NON_SPEECH"

        results.append({
            "audio_name": audio_name,
            "utt_id": utt_id,
            "start": start_ms,
            "end": end_ms,
            "length": row["length"],
            "language_tag": predicted
        })

    # Passing the columns explicitly keeps the header present even when no row
    # was selected; otherwise an empty, header-less file would be written.
    output_df = pd.DataFrame(results, columns=output_columns)
    output_df.to_csv(output_csv, index=False)
    print(f"\n✅ Saved classified segments to: {output_csv}")

# -------------------------------
# Execute
# -------------------------------
if __name__ == "__main__":
    vad_csv = "/home/teaching/Desktop/priyam/labels/vad_all_segments_all_audios.csv"
    audio_dir = "/home/teaching/Desktop/priyam/_audio"
    output_csv = "/home/teaching/Desktop/priyam/labels/classified_segments_all.csv"

    # Classify the segments of every audio file listed in the VAD CSV.
    # (No explicit torch.cuda.empty_cache() is needed here:
    # classify_vad_segments clears the CUDA cache itself on entry.)
    classify_vad_segments(vad_csv, audio_dir, output_csv)

    # To restrict the run to a single recording, pass its file name as it
    # appears in the CSV's `audio_name` column instead:
    # classify_vad_segments(vad_csv, audio_dir, output_csv, target_audio_name="<audio file name>")
