In [1]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, pipeline, AutoProcessor
from datasets import load_dataset

In [2]:
# Use the first CUDA GPU with half-precision weights when CUDA is available;
# otherwise stay on the CPU with full-precision weights.
device, torch_dtype = (
    ('cuda:0', torch.float16) if torch.cuda.is_available() else ('cpu', torch.float32)
)

In [3]:
# Checkpoint identifier handed to `from_pretrained` for both the model and the processor.
model_id = "openai/whisper-tiny"

In [4]:
# Load the pretrained speech-to-text model for `model_id`, using the dtype
# selected for the current device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)

In [5]:
# Move the model onto the selected device. Because this call is the cell's
# last expression, the notebook displays the returned module's repr.
model.to(device)

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 384, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(384, 384, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 384)
      (layers): ModuleList(
        (0-3): 4 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=384, out_features=384, bias=False)
            (v_proj): Linear(in_features=384, out_features=384, bias=True)
            (q_proj): Linear(in_features=384, out_features=384, bias=True)
            (out_proj): Linear(in_features=384, out_features=384, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (fc2): Linear(in_features=1536, out_features=384, bias=True)
          (fin

In [6]:
# Load the processor for the same checkpoint as the model; its tokenizer and
# feature extractor are what the speech-recognition pipeline is built with.
processor = AutoProcessor.from_pretrained(model_id)

In [7]:
# Build the speech-recognition pipeline around the already-loaded model and
# the processor's tokenizer / feature extractor.
pipe = pipeline(
    "automatic-speech-recognition",
    # Components
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    # Placement and precision (same values the model was loaded with)
    device=device,
    torch_dtype=torch_dtype,
    # Long-form decoding settings
    chunk_length_s=30,
    batch_size=16,
    max_new_tokens=128,
    return_timestamps=True,
)

In [8]:
# Validation split of the long-form LibriSpeech "clean" configuration. In the
# visible cells it is only indexed once to fetch an example audio record.
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")

# Sentiment classifier that reports a score for every label.
# `top_k=None` replaces the deprecated `return_all_scores=True`: it yields the
# same "all labels" result without the deprecation warning.
distilled_student_sentiment_classifier = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    top_k=None,
)



In [26]:
# Audio entry of the first dataset record.
# NOTE(review): `sample` is not passed to the pipeline in this cell; confirm
# whether it is still needed.
sample = dataset[0]["audio"]

# Transcribe the local recording and display the recognised text.
result = pipe("Madara.mp3")
result["text"]

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


" Wake up to reality. Nothing ever goes as planned in this accursed world. The longer you live the more you'll realize that the only things that truly exist in this reality are merely pain, suffering and futility. Listen, everywhere you look in this world, wherever there is light, there will always be shadows to be found as well. As long as there is a concept of victors, the fact pushed will also exist. The selfish intent of wanting to preserve peace, initiate wars, and hatred is borne at order to protect love. There are nexus, causal relationships that cannot be separated. I want to sever the fate of this world. A world of only victors, a world of only peace, a world of only love. I will create such a world. I will create such a world. I am the ghost of the U.G.R. For truly this reality, is our hell."

In [27]:
# (Re)create the sentiment classifier; `top_k=None` requests the score of
# every label rather than only the top one.
distilled_student_sentiment_classifier = pipeline(
    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student", top_k=None
)

In [28]:
# Classify the transcript. The result for a single string comes wrapped in an
# outer list, so take its first element: the list of per-label
# {'label': ..., 'score': ...} dicts.
sent = distilled_student_sentiment_classifier(result["text"])[0]
sent

[{'label': 'negative', 'score': 0.8088902235031128},
 {'label': 'positive', 'score': 0.12696124613285065},
 {'label': 'neutral', 'score': 0.06414858251810074}]

In [29]:
# First entry of the per-label results. Despite the name, this is a plain
# {'label': ..., 'score': ...} dict, not a DataFrame.
df = sent[0]
df

{'label': 'negative', 'score': 0.8088902235031128}

In [30]:
# Split the classifier output into parallel score / label lists. Iterating
# over `sent` itself, instead of a fixed range(0, 3), keeps this correct for
# any number of labels the classifier returns.
ratings = [entry['score'] for entry in sent]
label = [entry['label'] for entry in sent]

# Highest score and the position of its label in the parallel lists.
maxi = max(ratings)
rightLabel = ratings.index(maxi)

# Report the transcript together with the winning label and its score as a percentage.
print(f"{result['text']}\n The Given text has a {label[rightLabel]} Marking with a overall score of {maxi*100:.2f}%")

 Wake up to reality. Nothing ever goes as planned in this accursed world. The longer you live the more you'll realize that the only things that truly exist in this reality are merely pain, suffering and futility. Listen, everywhere you look in this world, wherever there is light, there will always be shadows to be found as well. As long as there is a concept of victors, the fact pushed will also exist. The selfish intent of wanting to preserve peace, initiate wars, and hatred is borne at order to protect love. There are nexus, causal relationships that cannot be separated. I want to sever the fate of this world. A world of only victors, a world of only peace, a world of only love. I will create such a world. I will create such a world. I am the ghost of the U.G.R. For truly this reality, is our hell.
 The Given text has a negative Marking with a overall score of 80.89%
