<a href="https://colab.research.google.com/github/Parthieshwar/Research-Paper/blob/main/Research_Paper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade librosa datasets 

In [11]:
import os
import torch
import numpy as np
import librosa
from datasets import Dataset, Audio
from transformers import (
    Wav2Vec2ForSequenceClassification,
    Wav2Vec2FeatureExtractor,
    TrainingArguments,
    Trainer,
)
from sklearn.preprocessing import LabelEncoder
from dataclasses import dataclass
from typing import List, Dict, Union
import json

# ✅ 1. Set Paths
data_dir = "/content/emotion_data"

# ✅ 2. Emotion labels from RAVDESS
# Keys are the two-digit emotion codes found in the third dash-separated
# field of each file name; values are the human-readable labels.
emotion_map = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised",
}

# ✅ 3. Parse filenames and prepare data
def parse_filename(filename):
    """Return the emotion label encoded in a dash-separated file name.

    The emotion code is taken from the third dash-separated field, e.g.
    ``03-01-05-01-02-01-12.wav`` -> ``"05"`` -> ``"angry"``.

    Args:
        filename: File name to parse.

    Returns:
        The matching label from ``emotion_map``, or ``"unknown"`` when the
        name has fewer than three dash-separated fields or its code is not
        present in ``emotion_map``.
    """
    parts = filename.split("-")
    # A name that does not follow the naming scheme is reported as "unknown"
    # rather than raising IndexError, so callers can simply skip the file.
    if len(parts) < 3:
        return "unknown"
    return emotion_map.get(parts[2], "unknown")

# Collect one {"path", "label"} record for every usable .wav file in data_dir.
data = []
for file in os.listdir(data_dir):
    # Anything that is not a .wav recording is ignored.
    if not file.endswith(".wav"):
        continue
    emotion = parse_filename(file)
    # Files whose emotion code is not recognised are left out of the dataset.
    if emotion == "unknown":
        continue
    data.append(dict(path=os.path.join(data_dir, file), label=emotion))

# ✅ 4. Convert to HF Dataset
# The "path" column is cast to an Audio feature configured for 16 kHz.
dataset = Dataset.from_list(data)
dataset = dataset.cast_column("path", Audio(sampling_rate=16000))

# ✅ 5. Encode labels
# Fit the encoder on every label string, then replace each example's string
# label with its integer class index.
label_encoder = LabelEncoder()
label_encoder.fit([record["label"] for record in data])
dataset = dataset.map(
    lambda example: {"label": label_encoder.transform([example["label"]])[0]}
)

# ✅ 6. Load feature extractor & model
# The classification head is sized to the number of distinct labels seen above.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53"
)
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    problem_type="single_label_classification",
    num_labels=len(label_encoder.classes_),
)

# ✅ 7. Preprocessing with signal features
def preprocess(example):
    """Build model inputs and summary signal features for one example.

    Args:
        example: Dataset row. ``example["path"]["array"]`` is used as the
            waveform and ``example["label"]`` is passed through unchanged.

    Returns:
        A dict with ``input_values`` (first sequence produced by the
        module-level ``feature_extractor``), ``label``, ``mfcc`` (list of the
        13 per-coefficient means), ``pitch`` and ``zcr`` (both floats).

    The three signal features are also printed for every example.
    """
    waveform = example["path"]["array"]

    # 📊 Signal features: each one is reduced to a per-clip average.
    mfcc_matrix = librosa.feature.mfcc(y=waveform, sr=16000, n_mfcc=13)
    mfccs = mfcc_matrix.mean(axis=1)
    mfcc_list = mfccs.tolist()

    pitch, _ = librosa.piptrack(y=waveform, sr=16000)
    positive = pitch > 0
    if np.any(positive):
        pitch_mean = pitch[positive].mean()
    else:
        # No positive pitch value was found anywhere in the clip.
        pitch_mean = 0

    zcr = librosa.feature.zero_crossing_rate(y=waveform).mean()

    # 🧠 Wav2Vec2 inputs
    inputs = feature_extractor(waveform, sampling_rate=16000)

    # Debug output, emitted once per example.
    print(f"📌 MFCC: {mfcc_list}")
    print(f"📌 Pitch Mean: {pitch_mean}")
    print(f"📌 Zero-Crossing Rate: {zcr}")

    features = {
        "input_values": inputs["input_values"][0],
        "label": example["label"],
        "mfcc": mfcc_list,
        "pitch": float(pitch_mean),
        "zcr": float(zcr),
    }
    return features

# Apply preprocess to every example; the "path" audio column is dropped from
# the result, leaving the columns returned by preprocess.
processed_dataset = dataset.map(preprocess, remove_columns=["path"])

# ✅ 8. Custom Data Collator
@dataclass
class CustomDataCollator:
    """Collate preprocessed examples into a padded training batch.

    Calling the instance with a list of examples returns whatever the feature
    extractor produces for their ``input_values`` (as PyTorch tensors), with a
    ``labels`` tensor of dtype ``torch.long`` added.
    """

    # Extractor that pads the waveforms and converts them to tensors.
    feature_extractor: Wav2Vec2FeatureExtractor
    # Forwarded unchanged as the extractor's ``padding`` argument.
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[float], int]]]) -> Dict[str, torch.Tensor]:
        # Split the examples into parallel waveform / target lists.
        waveforms = []
        targets = []
        for example in features:
            waveforms.append(example["input_values"])
        for example in features:
            targets.append(example["label"])

        label_tensor = torch.tensor(targets, dtype=torch.long)
        batch = self.feature_extractor(
            waveforms,
            sampling_rate=16000,
            padding=self.padding,
            return_tensors="pt",
        )
        batch["labels"] = label_tensor
        return batch

data_collator = CustomDataCollator(feature_extractor=feature_extractor)

# ✅ 9. Training Arguments
training_args = TrainingArguments(
    output_dir="./emotion_model",
    # Optimisation settings.
    learning_rate=1e-4,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    # Evaluation is switched off.
    evaluation_strategy="no",
    # Checkpointing and logging settings.
    save_steps=10,
    save_total_limit=2,
    logging_steps=5,
    report_to="none",
)

# ✅ 10. Trainer
# Only a training set is supplied; batches are built by data_collator.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=processed_dataset,
)

# ✅ 11. Train
trainer.train()

# ✅ 12. Save model and label mapping
# The model, the feature extractor and the index -> label table are all
# written to the same directory.
model.save_pretrained("./emotion_model")
feature_extractor.save_pretrained("./emotion_model")
with open("./emotion_model/label_mapping.json", "w") as f:
    # Integer class indices are the keys; the json module writes them out as
    # strings.
    json.dump(dict(enumerate(label_encoder.classes_)), f)

print("🎉 Fine-tuning complete with audio signal analysis!")

Map:   0%|          | 0/44 [00:00<?, ? examples/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/44 [00:00<?, ? examples/s]

📌 MFCC: [-600.670039184779, 52.31916149838492, -13.012061561511645, 13.054221518105123, -4.7676066476214345, -10.750125711838525, -12.126566967750199, -16.72067552177685, -6.563641331906493, 4.0977645209217055, -9.92974834592948, -2.218678647608094, -13.577413324012074]
📌 Pitch Mean: 1914.551450547888
📌 Zero-Crossing Rate: 0.22427615603885134
📌 MFCC: [-337.33333267875577, 39.46608688101908, -29.07419478006065, 5.593970011650886, -8.05254983055527, -18.398241471470513, -12.943607454563418, -17.876371811715423, -13.975507961128104, 6.411325015418941, -16.13892963434612, 0.09580797407808896, -11.352062426450106]
📌 Pitch Mean: 1887.5006046111741
📌 Zero-Crossing Rate: 0.2244865557898773
📌 MFCC: [-529.249221279816, 44.047849798075966, -15.068906703777609, 7.601328612905086, -4.84251357610256, -11.074563784634092, -12.684420652687185, -12.605661839634836, -18.073461762063136, -0.6913828629663267, -9.78744517192578, -3.9370821613939557, -8.956718259146479]
📌 Pitch Mean: 1901.9204140805164
📌 Ze



Step,Training Loss
5,1.7918
10,1.8017
15,1.8114
20,1.7848


🎉 Fine-tuning complete with audio signal analysis!


In [12]:
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
import torch, torchaudio, json

# ✅ Choose device
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# ✅ Load model and feature extractor
# Both are read back from the directory written by the training cell.
model = Wav2Vec2ForSequenceClassification.from_pretrained("./emotion_model")
model = model.to(device)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("./emotion_model")

# ✅ Load label map
# The JSON object's keys are strings, so lookups must use str(class_index).
with open("./emotion_model/label_mapping.json") as f:
    id2label = json.load(f)

# ✅ Prediction function
def predict_emotion(path, max_seconds=10):
    """Predict the emotion label for a single audio file.

    The file is loaded with torchaudio, averaged across channels to mono,
    resampled to 16 kHz when its sample rate differs, truncated to
    ``max_seconds`` and classified with the module-level ``model``.

    Args:
        path: Path to an audio file readable by ``torchaudio.load``.
        max_seconds: Maximum clip length in seconds; longer audio is cut off
            at that point. Defaults to 10, the previously hard-coded limit.
            Pass ``None`` to disable truncation.

    Returns:
        The label stored in ``id2label`` for the highest-scoring class.
    """
    target_sr = 16000

    # Load and resample audio
    speech, sr = torchaudio.load(path)
    speech = speech.mean(dim=0)  # convert to mono

    if sr != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
        speech = resampler(speech)

    # Limit the clip length (optional)
    if max_seconds is not None:
        speech = speech[: int(target_sr * max_seconds)]

    # Extract features
    inputs = feature_extractor(
        speech.numpy(),
        sampling_rate=target_sr,
        return_tensors="pt",
        padding=True,
    )
    # Forward everything the extractor produced, not just input_values: when
    # the extractor is configured to return an attention_mask, the model then
    # receives it at inference time as well.
    inputs = inputs.to(device)

    # Predict
    with torch.no_grad():
        logits = model(**inputs).logits
    pred_id = torch.argmax(logits, dim=-1).item()
    # id2label was loaded from JSON, so its keys are strings.
    return id2label[str(pred_id)]

# ✅ Run prediction
# Classify one recording and print the predicted emotion label.
print(predict_emotion("/content/WhatsApp Ptt 2025-04-12 at 3.55.29 AM.wav"))


calm
