<a href="https://colab.research.google.com/github/Parthieshwar/Research-Paper/blob/main/Research_Paper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installation



In [2]:
!pip install git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-onwi3y5c
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-onwi3y5c
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [3]:
!pip install --upgrade language_tool_python librosa



## Import libraries

In [4]:
# Import neccessary library
import whisper
import torch
import language_tool_python
import numpy as np
import matplotlib.pyplot as plt
import warnings
import librosa
import librosa.display
import speechbrain as sb
import torchaudio
import torch
from speechbrain.pretrained import EncoderClassifier

  from speechbrain.pretrained import EncoderClassifier


In [5]:
# Changing the device to gpu
#device = "cuda" if torch.cuda.is_available() else "cpu"
#device

In [6]:
# To ignore warnings
warnings.simplefilter("ignore", category=FutureWarning)

## Extract Features Using Librosa


In [7]:
file_path = '/content/Output.wav'
y, sr = librosa.load(file_path, sr=22050)

# 1.Extract Pitch
pitch = librosa.yin(y, fmin=50, fmax=300)

# 2.Extract Energy(Stress Indicator)
energy = librosa.feature.rms(y=y)

# 3.Capture Tone & Timbre
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)

# 4.Measures frequency of signal changes
zcr = librosa.feature.zero_crossing_rate(y)

In [None]:
plt.figure(figsize=(12, 7))

# Pitch Plot
plt.subplot(3, 1, 1)
plt.plot(pitch, label="Pitch (F0)")
plt.legend()
plt.title("Pitch Over Time")

# Energy (Stress) Plot
plt.subplot(3, 1, 2)
plt.plot(energy[0], label="Stress (Energy Levels)", color="red")
plt.legend()
plt.title("Stress Over Time")

# Zero-Crossing Rate Plot
plt.subplot(3, 1, 3)
plt.plot(zcr[0], label="Zero-Crossing Rate", color="green")
plt.legend()
plt.title("Zero-Crossing Rate")

plt.tight_layout()
plt.show()

# MFCCs with Colorbar
plt.figure(figsize=(12, 5))
librosa.display.specshow(mfccs, x_axis="time", sr=sr, cmap="magma")
plt.colorbar(label="MFCC Coefficients")
plt.title("Mel-Frequency Cepstral Coefficients (MFCCs)")
plt.xlabel("Time")
plt.ylabel("MFCC Index")
plt.show()


In [None]:
# To Calculate the average
avg_pitch=np.nanmean(pitch)
avg_energy=np.mean(energy)
avg_mfccs=np.mean(mfccs, axis=1)
avg_zcr=np.mean(zcr)

audio_features = np.concatenate(([avg_pitch], [avg_energy], avg_mfccs, [avg_zcr]))

print("Average Pitch:", avg_pitch)
print("Average Energy:", avg_energy)
print("Average MFCCs:", avg_mfccs)
print("Average Zero Crossing Rate:", avg_zcr)
print("Audio Features:", audio_features)

In [None]:
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification

MODEL_NAME = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME)

signal, sr = torchaudio.load(file_path)

if sr != 16000:
    signal = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(signal)

if signal.shape[0] > 1:
    signal = torch.mean(signal, dim=0, keepdim=True)

input_values = processor(signal.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_values

with torch.no_grad():
    logits = model(input_values).logits

predicted_class = torch.argmax(logits, dim=-1).item()

emotion_labels = ["neutral", "happy", "sad", "angry", "contempt", "disgust", "fear", "surprise"]

print(f"Predicted Emotion: {emotion_labels[predicted_class]}")


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Convert audio into text
model = whisper.load_model("turbo")
audio = model.transcribe("/content/Output.wav")
print(audio["text"])