In [35]:
import warnings
import numpy as np
import torchaudio
import torch
import matplotlib.pyplot as plt
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2CTCTokenizer
torchaudio.set_audio_backend("librosa")
warnings.filterwarnings("ignore", category=FutureWarning)

  torchaudio.set_audio_backend("librosa")


In [36]:
def process_audio(file_path, processor):
    waveform, sample_rate = torchaudio.load(file_path)
    if sample_rate != processor.feature_extractor.sampling_rate:
        waveform = torchaudio.transforms.Resample(
            orig_freq= sample_rate, 
            new_freq= processor.feature_extractor.sampling_rate
        )(waveform)
 
    return waveform

In [37]:
base_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
base_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

large_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
large_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
def general_predict_and_probability(file_path, processor, model):
    waveform = process_audio(file_path, processor)
    inputs = processor(
        waveform,
        return_tensors= 'pt',
        sampling_rate= processor.feature_extractor.sampling_rate
    )
    
    inputs = inputs.input_values.squeeze(1)
    
    with torch.no_grad():
        logits = model(input_values= inputs).logits 
        
    predicted_ids = torch.argmax(logits, dim=-1)
    predicted_word = processor.decode(predicted_ids[0])
    softmax_probs = torch.nn.functional.softmax(logits, dim=-1)
    max_probs, _ = torch.max(softmax_probs, dim=-1) 
    
#     print(logits.shape)
#     print(logits)
#     print(softmax_probs)
#     print(softmax_probs.shape)
#     top_probs, top_indices = torch.topk(softmax_probs, 2, dim=-1)
#     print(top_probs.shape)
#     print(processor.decode(top_indices[:, :, 1].squeeze()))

    return predicted_word, max_probs

In [42]:
chip_path = "/home/cogsci-lasrlab/Desktop/KT2 exptal audio/K2a0_chip.wav"

base_word, base_prob = general_predict_and_probability(
    file_path= chip_path,
    processor= base_processor,
    model= base_model
)

base_word, base_prob

('CHIP',
 tensor([[0.9985, 0.9986, 0.9982, 0.9973, 0.9950, 0.9926, 0.9946, 0.8415, 0.9932,
          0.7393, 0.9329, 0.9832, 0.9813, 0.9873, 0.9923, 0.9911, 0.4034, 0.6599,
          0.9232, 0.9854, 0.9957, 0.6874, 0.9595, 0.8922, 0.8735, 0.9207, 0.9622,
          0.6285, 0.6359, 0.7261, 0.9979, 0.9979, 0.9991, 0.9993, 0.9988, 0.9991,
          0.9992, 0.9992, 0.9990, 0.9992, 0.9992, 0.9993, 0.9989, 0.9993, 0.9989,
          0.9994, 0.9989, 0.9989, 0.9995, 0.9994, 0.9989, 0.9991]]))

In [43]:
large_word, large_prob = general_predict_and_probability(
    file_path= chip_path,
    processor= large_processor,
    model= large_model
)

large_word, large_prob

('CHAP',
 tensor([[0.9999, 0.9999, 1.0000, 0.9999, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
          1.0000, 1.0000, 0.9780, 0.9928, 0.9997, 0.9994, 0.9985, 0.8239, 0.9994,
          0.9991, 0.9867, 0.9324, 0.9775, 0.9979, 0.9987, 0.9993, 0.9997, 0.9997,
          0.9995, 0.9984, 0.9145, 1.0000, 0.9987, 0.9990, 0.9992, 0.9986, 0.9874,
          0.9999, 0.9986, 0.9986, 0.9995, 0.8929, 1.0000, 1.0000, 1.0000, 1.0000,
          1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000]]))

In [44]:
fan_path = "/home/cogsci-lasrlab/Desktop/KT2 exptal audio/K2a0_fan.wav"

base_word, base_prob = general_predict_and_probability(
    file_path= fan_path,
    processor= base_processor,
    model= base_model
)

base_word, base_prob

('FAON',
 tensor([[0.9997, 0.9997, 0.9997, 0.9997, 0.4753, 0.8813, 0.9283, 0.9959, 0.9973,
          0.9575, 0.7329, 0.7121, 0.8088, 0.9664, 0.9863, 0.9896, 0.9836, 0.9599,
          0.8962, 0.9889, 0.9947, 0.9948, 0.9944, 0.9947, 0.9966, 0.6045, 0.9848,
          0.9932, 0.9608, 0.9471, 0.8795, 0.9249, 0.9784, 0.9900, 0.9979, 0.9460,
          0.9999, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 0.9999, 1.0000, 0.9999,
          0.9999, 1.0000, 0.9999, 0.9999, 0.9999, 0.9999]]))

In [45]:
large_word, large_prob = general_predict_and_probability(
    file_path= fan_path,
    processor= large_processor,
    model= large_model
)

large_word, large_prob

('FELL ON',
 tensor([[0.9999, 0.9999, 0.9998, 0.9997, 0.9985, 0.9413, 0.6250, 0.7739, 0.9917,
          0.6135, 0.9964, 0.9888, 0.7475, 0.9287, 0.5199, 0.8009, 0.9006, 0.6589,
          0.8981, 0.9890, 0.9909, 0.9995, 0.9550, 0.9958, 0.9905, 0.9990, 0.9994,
          0.9893, 0.9916, 0.9668, 0.9905, 0.9960, 0.9789, 0.9969, 0.9714, 0.9427,
          0.9621, 0.9917, 0.9996, 0.9628, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
          1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000]]))

In [46]:
run_path = "/home/cogsci-lasrlab/Desktop/KT2 exptal audio/K2a0_run.wav"

base_word, base_prob = general_predict_and_probability(
    file_path= run_path,
    processor= base_processor,
    model= base_model
)

base_word, base_prob

('BRAN',
 tensor([[0.4337, 0.9933, 0.9902, 0.8525, 0.9865, 0.9855, 0.9875, 0.9901, 0.9928,
          0.9890, 0.8744, 0.9209, 0.9610, 0.9522, 0.9898, 0.6860, 0.9006, 0.9794,
          0.9841, 0.9308, 0.9963, 0.9994, 0.9996, 0.9985, 0.9985, 0.9985]]))

In [47]:
large_word, large_prob = general_predict_and_probability(
    file_path= run_path,
    processor= large_processor,
    model= large_model
)

large_word, large_prob

('RAN',
 tensor([[0.9990, 0.9969, 0.9818, 0.9485, 0.8027, 0.9983, 0.9996, 0.7243, 0.9942,
          0.9910, 0.9751, 0.9549, 0.9612, 0.8069, 0.9590, 0.9780, 0.9704, 0.9787,
          0.9752, 0.9729, 0.9682, 0.5553, 0.9523, 0.9838, 0.8307, 0.9990]]))

In [55]:
def predict_predict_and_probability(file_path, processor, model):
    waveform = process_audio(file_path, processor)
    inputs = processor(
        waveform,
        return_tensors= 'pt',
        sampling_rate= processor.feature_extractor.sampling_rate
    )
    
    inputs = inputs.input_values.squeeze(1)
    
    with torch.no_grad():
        logits = model(input_values= inputs).logits 
        
    predicted_ids = torch.argmax(logits, dim=-1)
    predicted_word = processor.decode(predicted_ids[0])
    softmax_probs = torch.nn.functional.softmax(logits, dim=-1)


    return predicted_word

In [56]:
chip_vocab_path = '/home/cogsci-lasrlab/Documents/FANN/vocab/4_chip.json'
chip_custom_tokenizer = Wav2Vec2CTCTokenizer(chip_vocab_path)

base_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
base_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h", tokenizer = chip_custom_tokenizer)

large_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
large_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h", tokenizer = chip_custom_tokenizer)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [57]:
base_processor.tokenizer.vocab_size

4

In [58]:
chip = predict_word_and_probability(
    file_path= "/home/cogsci-lasrlab/Desktop/KT2 exptal audio/K2a0_chip.wav", 
    processor= base_processor, 
    model= base_model
)

chip

torch.Size([1, 52, 32])
tensor([[[  8.5268, -21.4147, -21.1832,  ...,  -4.8406,  -4.9408,  -6.1256],
         [  8.6224, -21.6634, -21.4232,  ...,  -4.5749,  -5.1386,  -5.7544],
         [  8.6859, -21.8377, -21.6726,  ...,  -4.1219,  -5.3921,  -5.3512],
         ...,
         [  9.3211, -22.3689, -22.0702,  ...,  -5.2648,  -6.6861,  -4.9645],
         [  8.6201, -19.5929, -19.3630,  ...,  -3.6756,  -4.7550,  -3.4002],
         [  9.1385, -22.5031, -22.2302,  ...,  -5.4515,  -6.6649,  -5.0894]]])


'chip<unk>chip<unk>chip<unk>chip<unk>chip<s>chip'

In [41]:
chip = predict_word_and_probability(
    file_path= "/home/cogsci-lasrlab/Desktop/KT2 exptal audio/K2a0_chip.wav", 
    processor= large_processor, 
    model= large_model
)

chip

torch.Size([1, 52, 32])
tensor([[[ 11.9968, -28.1995, -28.2689,  ...,  -4.3413,  -3.7289,  -6.7600],
         [ 11.9771, -28.1132, -28.1832,  ...,  -4.3190,  -3.9699,  -6.6605],
         [ 12.5852, -29.3019, -29.3424,  ...,  -4.4923,  -4.3289,  -6.6859],
         ...,
         [ 14.9882, -33.8904, -33.7044,  ...,  -5.8607,  -5.2790,  -7.8795],
         [ 14.8117, -33.4340, -33.2653,  ...,  -5.6307,  -5.1198,  -7.7124],
         [ 14.9083, -33.6068, -33.4321,  ...,  -5.6873,  -5.2208,  -7.7465]]])


'chip<unk>chipchip<unk>chip<s>chip'