In [1]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, pipeline, AutoProcessor
from datasets import load_dataset

In [2]:
# Probe for CUDA once and derive both settings from the same answer:
# first GPU + half precision when available, otherwise CPU + full precision.
has_cuda = torch.cuda.is_available()
if has_cuda:
    device, torch_dtype = 'cuda:0', torch.float16
else:
    device, torch_dtype = 'cpu', torch.float32

In [3]:
# Checkpoint identifier used for both the model and the processor loaded below.
model_id = "openai/whisper-tiny"

In [4]:
# Keyword options forwarded to from_pretrained; the dtype follows the
# device/dtype choice made in the configuration cell.
model_load_options = {
    "torch_dtype": torch_dtype,
    "low_cpu_mem_usage": True,
    "use_safetensors": True,
}

# Load the speech seq2seq model for the `model_id` checkpoint.
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, **model_load_options)

In [5]:
# Move the model onto the selected device. Because this call is the cell's
# last expression, its return value (the module's repr) is shown as output.
model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 384, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(384, 384, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 384)
      (layers): ModuleList(
        (0-3): 4 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=384, out_features=384, bias=False)
            (v_proj): Linear(in_features=384, out_features=384, bias=True)
            (q_proj): Linear(in_features=384, out_features=384, bias=True)
            (out_proj): Linear(in_features=384, out_features=384, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (fc2): Linear(in_features=1536, out_features=384, bias=True)
          (fin

In [6]:
# Load the processor for the same checkpoint as the model; its `tokenizer`
# and `feature_extractor` attributes are handed to the ASR pipeline below.
processor = AutoProcessor.from_pretrained(model_id)

In [7]:
# Components taken from the already-loaded model and processor.
asr_components = {
    "model": model,
    "tokenizer": processor.tokenizer,
    "feature_extractor": processor.feature_extractor,
}

# Generation / chunking / batching settings plus the runtime placement,
# which reuses the device and dtype chosen in the configuration cell.
asr_settings = {
    "max_new_tokens": 128,
    "chunk_length_s": 30,
    "batch_size": 16,
    "return_timestamps": True,
    "torch_dtype": torch_dtype,
    "device": device,
}

# Speech-to-text pipeline used for transcription later in the notebook.
pipe = pipeline("automatic-speech-recognition", **asr_components, **asr_settings)

In [8]:
# Load the "clean" configuration, validation split, of the long-form
# LibriSpeech dataset.
#
# The sentiment classifier that used to be built in this cell as well is
# constructed, with the same checkpoint and options, in the cell right before
# it is first used; building it here too only loaded the model twice, so this
# cell now does a single job.
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")



In [15]:
# Audio entry of the first dataset example (not passed to the pipeline below).
first_example = dataset[0]
sample = first_example["audio"]

# Transcribe the local recording with the ASR pipeline.
audio_path = "sad_truth.mp3"
result = pipe(audio_path)

# Show the transcript as the cell output.
result["text"]

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


" I love you Lizzy. Do you love me? No. Enough. Get down. Now. What's the point? Lizzy doesn't love me. I hope he loves me. I'm gonna jump! Stop, Liam, put the tone. Don't do this. Oh. Hi, Ores. What do you mean? I just... You don't want to jump. No! I think I do. You don't. Look. Sometimes the people we like don't like us back. It's painful, but there's nothing we can do about it. You don't understand. I do. I do understand. I know what it's like when someone doesn't nothing we can do about it. You don't understand. I do. I do understand. I know what it's like when someone doesn't feel the same way about you. Someone you can't stop thinking about. It hurts. But you can't make people like you. I don't like her. I love her. I know. But love isn't about grand gestures or the moon and the stars. It's just dumb luck. And sometimes you meet someone who feels the same way. And then sometimes you're unlucky. But one day, you're going to meet someone who appreciates you for who you are. I mean

In [16]:
# Build the sentiment classifier. The task is not passed explicitly, so the
# pipeline factory is left to work it out from the checkpoint;
# return_all_scores=True requests a score for every label rather than only
# the top one.
sentiment_pipeline_options = {
    "model": "lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    "return_all_scores": True,
}
distilled_student_sentiment_classifier = pipeline(**sentiment_pipeline_options)



In [17]:
# Score the transcript with the sentiment classifier.
# truncation=True is forwarded to the tokenizer so a transcript longer than
# the classifier's maximum input length is cut to fit instead of making the
# call fail; a transcript that already fits is scored exactly as before.
sent = distilled_student_sentiment_classifier(result['text'], truncation=True)

# The classifier returns one list of {'label', 'score'} dicts per input text;
# a single string was passed, so keep the first (only) list.
sent = sent[0]
sent

[{'label': 'positive', 'score': 0.29348886013031006},
 {'label': 'neutral', 'score': 0.2647108733654022},
 {'label': 'negative', 'score': 0.44180023670196533}]

In [18]:
# First entry of the per-label score list (the 'positive' entry in the output
# shown for this cell). NOTE: despite its name, `df` is a plain dict here,
# not a DataFrame.
df = sent[0]
df

{'label': 'positive', 'score': 0.29348886013031006}

In [19]:
# Split the per-label results into two parallel lists. Every entry the
# classifier returned is used, so this does not depend on there being
# exactly three labels.
ratings = [entry['score'] for entry in sent]
label = [entry['label'] for entry in sent]

# Highest score and its position; list.index returns the first match, so a
# tie resolves to the earliest label in classifier order.
maxi = max(ratings)
rightLabel = ratings.index(maxi)

# Report the transcript, the winning label, and its score as a percentage.
print(f"{result['text']}\n The Given text has a {label[rightLabel]} Marking with a overall score of {maxi*100:.2f}%")

 I love you Lizzy. Do you love me? No. Enough. Get down. Now. What's the point? Lizzy doesn't love me. I hope he loves me. I'm gonna jump! Stop, Liam, put the tone. Don't do this. Oh. Hi, Ores. What do you mean? I just... You don't want to jump. No! I think I do. You don't. Look. Sometimes the people we like don't like us back. It's painful, but there's nothing we can do about it. You don't understand. I do. I do understand. I know what it's like when someone doesn't nothing we can do about it. You don't understand. I do. I do understand. I know what it's like when someone doesn't feel the same way about you. Someone you can't stop thinking about. It hurts. But you can't make people like you. I don't like her. I love her. I know. But love isn't about grand gestures or the moon and the stars. It's just dumb luck. And sometimes you meet someone who feels the same way. And then sometimes you're unlucky. But one day, you're going to meet someone who appreciates you for who you are. I mean,