In [None]:
# Mount Google Drive at /content/drive (Colab only). force_remount=True asks for a
# fresh mount even when one is already present.
# NOTE(review): none of the cells visible here read from /content/drive — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
pip install transformers



In [None]:
|pip install pydub




In [None]:
pip install SpeechRecognition



In [None]:
# Recording analysed by the transcription + text-classification cells below.
# The file must already exist at this absolute path in the runtime.
audio_file="/content/dilshad.wav"

In [None]:
import speech_recognition as sr

def transcribe_audio(audio_file):
    """Return the text recognised in ``audio_file`` using the Google Web Speech API.

    The whole file is read through ``sr.AudioFile`` and sent to
    ``Recognizer.recognize_google``. If the service cannot understand the
    audio, or the request itself fails, a message is printed and an empty
    string is returned instead of raising.
    """
    engine = sr.Recognizer()

    # Pull the complete recording into memory before contacting the service.
    with sr.AudioFile(audio_file) as wav:
        clip = engine.record(wav)

    try:
        text = engine.recognize_google(clip)
    except sr.UnknownValueError:
        # The service answered but produced no usable transcription.
        print("Google Web Speech API could not understand the audio.")
        text = ""
    except sr.RequestError as err:
        # The request itself failed; the exception text is included in the message.
        failure = "Could not request results from Google Web Speech API; {0}".format(err)
        print(failure)
        text = ""
    return text

In [None]:
# Importing necessary libraries
from transformers import pipeline
import sys
# Model identifier handed to the pipeline (presumably a Hugging Face Hub repo id — confirm).
model_name = "sanskar/DepressionAnalysis"
# Text-classification pipeline created once when this cell runs; predict_depression reads it as a global.
classifier = pipeline("text-classification", model=model_name)


def predict_depression(audio_file):
    """
    Transcribe an audio file and classify the transcript with ``classifier``.

    Args:
        audio_file: Path of the recording, passed straight to ``transcribe_audio``.

    Returns:
        tuple: ``(label, score)`` taken from the first result returned by the
        module-level ``classifier`` pipeline.

    Raises:
        ValueError: If the transcription is empty or whitespace-only.
            ``transcribe_audio`` returns ``""`` when recognition fails, and
            classifying an empty string would report a confident label for
            audio that was never actually understood.
    """
    transcript = transcribe_audio(audio_file)

    # Refuse to classify "nothing": a failed transcription must not turn into a prediction.
    if not transcript or not transcript.strip():
        raise ValueError(
            f"No speech could be transcribed from {audio_file!r}; "
            "cannot classify an empty transcript."
        )

    # The pipeline returns a list of result dicts; only the first one is used.
    top_result = classifier(transcript)[0]
    return top_result['label'], top_result['score']







In [None]:



# Transcribe `audio_file` (defined in an earlier cell) and classify the resulting text.
label, score = predict_depression(audio_file)

# Show the predicted label together with the score the classifier assigned to it.
print(f"\nPrediction: {label}\nConfidence Score: {score}")


Prediction: Not Depressed
Confidence Score: 0.9422476887702942


In [None]:
import pandas as pd
import numpy as np
import joblib
import librosa
from tensorflow.keras.models import load_model
from sklearn.preprocessing import StandardScaler
import keras
# Files read by emotion_model_using_wav_audio. The three model/scaler paths are
# relative, so they must be present in the current working directory.
# JSON file read and passed to model_from_json to build the model.
model_path_json = 'depression_model_using_wav_audio.json'
# Weights file loaded into that model via load_weights.
model_path = 'depression_model_using_wav_audio.h5'
# Recording passed to emotion_model_using_wav_audio at the end of this cell.
path_of_wav_audio_file='/content/dilshad.wav'
# Scaler object loaded with joblib.load and applied to the extracted features.
path_of_scaler_filename='scaler_filename_dep.pkl'

def emotion_model_using_wav_audio(path):
    """Score an audio recording with the saved Keras audio model.

    A 2.5 s excerpt of the file (starting 0.6 s in) is loaded and turned into
    three feature rows: the clean signal, a noise-added copy, and a
    time-stretched + pitch-shifted copy. The rows are scaled with the persisted
    scaler, given a trailing axis, and passed to the model rebuilt from its JSON
    description and weights file.

    Reads the module-level ``path_of_scaler_filename``, ``model_path_json`` and
    ``model_path``.

    Args:
        path: Path of the audio file to analyse.

    Returns:
        The array produced by ``model.predict`` for the three feature rows.
        It is also printed, as before.

    Raises:
        ValueError: If the excerpt window contains no samples (for example a
            recording shorter than the 0.6 s offset).
    """
    # Function-scope import: keras is only required when the function is called.
    from keras.models import model_from_json

    def noise(data):
        """Add Gaussian noise whose amplitude is a random fraction (<= 3.5 %) of the maximum sample."""
        noise_amp = 0.035 * np.random.uniform() * np.amax(data)
        return data + noise_amp * np.random.normal(size=data.shape[0])

    def stretch(data, rate=0.8):
        """Time-stretch the signal by ``rate`` using librosa."""
        return librosa.effects.time_stretch(y=data, rate=rate)

    def pitch(data, sampling_rate, n_steps=0.7):
        """Pitch-shift the signal by ``n_steps`` using librosa."""
        return librosa.effects.pitch_shift(y=data, sr=sampling_rate, n_steps=n_steps)

    def extract_features(data, sample_rate):
        """Concatenate the time-averaged ZCR, chroma, MFCC, RMS and mel features into one 1-D vector."""
        result = np.array([])

        # Zero-crossing rate
        zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
        result = np.hstack((result, zcr))

        # Chroma computed from the magnitude STFT
        stft = np.abs(librosa.stft(data))
        chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, chroma_stft))

        # MFCC
        mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mfcc))

        # Root-mean-square energy
        rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
        result = np.hstack((result, rms))

        # Mel spectrogram
        mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mel))

        return result

    def get_features(path):
        """Return a 3-row feature matrix: clean, noisy, and stretched + pitched."""
        # Skip the first 0.6 s and keep 2.5 s of audio.
        data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)
        if data.size == 0:
            raise ValueError(
                f"No audio samples could be read from {path!r} in the "
                "0.6 s - 3.1 s window; the recording may be too short."
            )

        # NOTE(review): the noise row uses unseeded np.random, so repeated calls on
        # the same file can give different scores — confirm that augmenting at
        # inference time is intended.
        rows = [extract_features(data, sample_rate)]
        rows.append(extract_features(noise(data), sample_rate))
        stretched_and_pitched = pitch(stretch(data), sample_rate)
        rows.append(extract_features(stretched_and_pitched, sample_rate))
        return np.vstack(rows)

    features = pd.DataFrame(get_features(path))

    # SECURITY NOTE: joblib.load unpickles the file, which can execute arbitrary
    # code — only load scaler files from a trusted source.
    scaler = joblib.load(path_of_scaler_filename)
    x_test = scaler.transform(features)
    # Add a trailing axis to the scaled features before prediction.
    x_test = np.expand_dims(x_test, axis=2)

    with open(model_path_json, "r") as json_file:
        model = model_from_json(json_file.read())
    model.load_weights(model_path)

    pred_test = model.predict(x_test)
    print(pred_test)

    # NOTE(review): the original left a 0.29 decision threshold commented out here.
    # `if pred_test >= 0.29` is ambiguous for a multi-element array, so the raw
    # scores are returned — TODO decide how the three rows should be reduced to a
    # single 0/1 decision.
    return pred_test





# Run the audio-feature model on the configured recording; the function prints the raw model output.
emotion_model_using_wav_audio(path_of_wav_audio_file)




