In [None]:
%%capture install_log
!pip install crepe gradio transformers speechbrain

In [None]:
import crepe
import spacy
import librosa
import gradio as gr
import pandas as pd
from transformers import pipeline
from transformers import WhisperFeatureExtractor
from transformers import WhisperTokenizer
from transformers import WhisperProcessor
from transformers import WhisperForConditionalGeneration
from speechbrain.pretrained import SepformerSeparation as separator
import torchaudio

In [None]:
# Checkpoint identifiers: the processor (feature extractor + tokenizer) is
# taken from the base Whisper checkpoint, the weights from the fine-tuned one.
model_name = "openai/whisper-medium"
finetuned_model_name = "gngpostalsrvc/whisper_ami_finetuned"

# Speech-to-text components.
processor = WhisperProcessor.from_pretrained(
    model_name,
    language="english",
    task="transcribe",
)
model = WhisperForConditionalGeneration.from_pretrained(finetuned_model_name)

# Speech-enhancement (denoising) model; its files are cached under `savedir`.
denoise_model = separator.from_hparams(
    source="speechbrain/sepformer-wham-enhancement",
    savedir="pretrained_models/sepformer-wham-enhancement",
)

Downloading:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/830 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.91k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/113M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.3k [00:00<?, ?B/s]

In [None]:
# Text-analysis pipelines that are applied to the transcription:
#   emo -> emotion label for the whole text
#   pos -> per-token part-of-speech tags (used to spot interjections)
#
# Disabled alternative speech-recognition backend, kept for reference:
# asr = pipeline('automatic-speech-recognition', model='facebook/wav2vec2-large-960h-lv60-self')
emo = pipeline(task="sentiment-analysis", model="arpanghoshal/EmoRoBERTa")
pos = pipeline(
    task="token-classification",
    model="vblagoje/bert-english-uncased-finetuned-pos",
)

Downloading:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at arpanghoshal/EmoRoBERTa.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
def _mean_and_std(values):
  """Return (mean, sample standard deviation) of all entries in a NumPy array."""
  series = pd.Series(values.reshape(-1))
  # Series.std uses ddof=1, the same estimator DataFrame.describe() reports.
  return series.mean(), series.std()


def transcribe_and_describe(audio_path):
  """Transcribe a recording and summarise how it was spoken.

  Args:
    audio_path: path of the audio file to analyse.

  Returns:
    An 11-tuple matching the Gradio outputs, in order:
    ``(8000, denoised_audio)``, the transcription, and -- each formatted as a
    string with two decimals -- the filler-word ratio (interjection tokens /
    all tagged tokens), words per minute, mean pitch, pitch standard
    deviation, mean RMS level, RMS standard deviation, mean spectral flatness
    and spectral-flatness standard deviation; finally the predicted emotion
    label.
  """
  # Whisper input is loaded at 16 kHz; the enhanced audio is treated as 8 kHz,
  # which is the rate this function reports alongside it in its return value.
  whisper_sr = 16000
  denoised_sr = 8000

  speech, _ = librosa.load(audio_path, sr=whisper_sr)

  # Pass the sampling rate explicitly so the feature extractor can validate it
  # instead of silently assuming one.
  inputs = processor(speech, sampling_rate=whisper_sr, return_tensors="pt")
  generated_ids = model.generate(inputs=inputs.input_features)
  text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
  # text = asr(audio)['text']

  # Share of tokens tagged as interjections; an empty transcription yields no
  # tokens, so guard the division.
  tagged_text = pos(text)
  filler_count = sum(1 for entry in tagged_text if entry['entity'] == 'INTJ')
  filler_word_pr = filler_count / len(tagged_text) if tagged_text else 0.0

  # Denoise, keep the first estimated source and flatten it to a 1-D array.
  # detach()/cpu() make the conversion work regardless of device or autograd state.
  est_sources = denoise_model.separate_file(path=audio_path)
  audio = est_sources[:, :, 0].reshape((-1,)).detach().cpu().numpy()

  # Keyword arguments: recent librosa releases no longer accept the signal
  # positionally.
  mean_spectral_flatness, spectral_flatness_std = _mean_and_std(
      librosa.feature.spectral_flatness(y=audio))
  mean_volume, volume_std = _mean_and_std(librosa.feature.rms(y=audio))

  # The denoised signal must be analysed at its own sampling rate; using the
  # 16 kHz rate here would distort both the pitch track and the duration.
  _, frequency, _, _ = crepe.predict(audio, denoised_sr)
  mean_pitch, pitch_std = _mean_and_std(frequency)

  # split() without arguments ignores leading/repeated whitespace, so empty
  # strings are not counted as words.
  duration_minutes = librosa.get_duration(y=audio, sr=denoised_sr) / 60
  word_count = len(text.split())
  words_per_minute = word_count / duration_minutes if duration_minutes else 0.0

  emotion = emo(text)[0]['label']

  return ((8000, audio),text, f"{filler_word_pr:.2f}", f"{words_per_minute:.2f}", f"{mean_pitch:.2f}", f"{pitch_std:.2f}", f"{mean_volume:.2f}", f"{volume_std:.2f}", f"{mean_spectral_flatness:.2f}", f"{spectral_flatness_std:.2f}",  emotion)

In [None]:
# Labels of the text outputs, in the order transcribe_and_describe returns them
# (after the leading audio output).
# NOTE(review): the value shown under "Filler Word Percent" is computed as a
# ratio of token counts, not multiplied by 100 -- confirm the intended display.
text_output_labels = [
    "Transcription",
    "Filler Word Percent",
    "Rate of Speech (WPM)",
    "Mean Pitch (Hz)",
    "Pitch Variation (Hz)",
    "Mean Volume (W)",
    "Volume Variation (W)",
    "Mean Spectral Flatness (dB)",
    "Spectral Flatness Variation (dB)",
    "Emotion",
]

# Microphone recording in, cleaned audio plus one text box per metric out.
demo = gr.Interface(
    fn=transcribe_and_describe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=[gr.Audio(label="Cleaned output")]
    + [gr.Text(label=label) for label in text_output_labels],
)

# debug=True keeps the cell running so errors and logs stay visible.
demo.launch(debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

It is strongly recommended to pass the `sampling_rate` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


Resampling the audio from 48000 Hz to 8000 Hz




Keyboard interruption in main thread... closing server.


