In [None]:
!pip install transformers sentence-transformers torch
!pip install datasets


Collecting sentence-transformers
  Downloading sentence_transformers-3.0.0-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.7/224.7 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cach

In [1]:
import torch
from transformers import AutoTokenizer, AutoModel

def embed_sentences(sentences, tokenizer, model):
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
    encoder_inputs = inputs["input_ids"]
    decoder_inputs = torch.tensor([[tokenizer.cls_token_id]] * len(sentences))  # or other appropriate decoder start token

    # Get the outputs from the model
    with torch.no_grad():
        outputs = model(input_ids=encoder_inputs, decoder_input_ids=decoder_inputs)

    embeddings = outputs.encoder_last_hidden_state[:,0,:]
        # print(outputs.encoder_last_hidden_state.shape)
    return embeddings.tolist()

    # with torch.no_grad():
    #     outputs = model(**inputs)
    # embeddings = outputs.last_hidden_state[:, 0, :]
    # return embeddings

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import torchaudio
import librosa

def load_audio(audio_path):
    #waveform, sample_rate = torchaudio.load(audio_path)
    audio, sr = librosa.load(audio_path, sr=16000)

    return audio, sr

def process_audio(waveform, sample_rate, processor):
    inputs = processor(waveform.squeeze(), sampling_rate=sample_rate, return_tensors="pt", padding="longest")
    return inputs

def batch_process_audio(audio_paths, processor, model):
    embeddings_list = []
    for audio_path in audio_paths:
        waveform, sample_rate = load_audio(audio_path)
        inputs = process_audio(waveform, sample_rate, processor)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        embeddings_list.append(embeddings)

    # Concatenate the embeddings along the batch dimension
    batch_embeddings = torch.cat(embeddings_list, dim=0)

    return batch_embeddings

# Example usage
audio_paths = ["librespeech_1.wav", "librespeech_2.wav"]
audio_embeddings = batch_process_audio(audio_paths, asr_processor, asr_model)
print(audio_embeddings.shape)


torch.Size([2, 768])


In [None]:
# List of sentences
sentences = [
    "He first appeared on the banks of the Jordan in the form of perfect manhood. But it was a form only, and not a substance, a human figure created by the hand of omnipotence to imitate the faculties and actions of a man, and to impose a perpetual illusion on the senses of his friends and enemies. ",
    "For a time the death of Mary obscured her life for me. But now her living presence is more in my mind again. ",
]

def get_text_embeddings(setences, tokenizer, model):
  # Tokenize the sentences
  inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)

  # Forward pass, get hidden states
  with torch.no_grad():
      outputs = model(**inputs)

  # Get the hidden states from the output tuple
  hidden_states = outputs.last_hidden_state

  # Extract the embeddings for the [CLS] token (first token) of each sentence
  text_embeddings = hidden_states[:, 0, :]

  return text_embeddings


In [None]:
from datasets import load_dataset
from transformers import Wav2Vec2Processor, Wav2Vec2Model, BertTokenizer, BertModel
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

# Load the minds_14 dataset
minds_14 = load_dataset("PolyAI/minds14", "en-US", trust_remote_code=True)

# Load the Wav2Vec2 model and processor
asr_model_name = "facebook/wav2vec2-base-960h"
asr_processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
asr_model = Wav2Vec2Model.from_pretrained(asr_model_name)

# Load the BERT model and tokenizer
text_model_name = "bert-base-uncased"
text_tokenizer = BertTokenizer.from_pretrained(text_model_name)
text_model = BertModel.from_pretrained(text_model_name)

Downloading builder script:   0%|          | 0.00/5.90k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.29k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/471M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Create a simple neural network for transformation

import torch.nn.functional as F

def custom_loss(output, target):
    mse_loss = F.mse_loss(output, target)

    # Calculate cosine similarity
    cos_sim = F.cosine_similarity(output, target, dim=1)
    cos_loss = 1 - cos_sim.mean()

    # Combine MSE and cosine similarity losses
    combined_loss = mse_loss + cos_loss

    return combined_loss

class TransformNet(nn.Module):
    def __init__(self, input_size, output_size):
        super(TransformNet, self).__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.fc2 = nn.Linear(512, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define transformation dimensions
input_size = text_model.config.hidden_size
output_size = asr_model.config.hidden_size

# Initialize the neural network
net = TransformNet(input_size, output_size).to(device)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

# Prepare data for training
data = []
for example in minds_14["train"]:
    text_embeddings = get_text_embeddings([example['transcription']], text_tokenizer, text_model)
    speech_embeddings = batch_process_audio([example['path']], asr_processor, asr_model)
    data.append((text_embeddings, speech_embeddings))

data_loader = DataLoader(data, batch_size=32, shuffle=True)

# Train the neural network
num_epochs = 5
for epoch in range(num_epochs):
    running_loss = 0.0
    for text_embeddings, speech_embeddings in data_loader:
        optimizer.zero_grad()
        text_embeddings, speech_embeddings = text_embeddings.to(device), speech_embeddings.to(device)
        outputs = net(text_embeddings)
        loss = custom_loss(outputs, speech_embeddings)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {running_loss}")

print("Training complete.")


NameError: name 'output' is not defined

In [None]:
get_text_embeddings(setences, tokenizer, model):

In [None]:
sentences = [
    "He first appeared on the banks of the Jordan in the form of perfect manhood. But it was a form only, and not a substance, a human figure created by the hand of omnipotence to imitate the faculties and actions of a man, and to impose a perpetual illusion on the senses of his friends and enemies. ",
    "I am here ",
]

In [None]:
text_embeddings = get_text_embeddings(sentences, text_tokenizer, text_model)

In [None]:
audio_paths = ["librespeech_1.wav", "librespeech_2.wav"]
audio_embeddings = batch_process_audio(audio_paths, asr_processor, asr_model)
audio_embeddings = audio_embeddings.to(device)
print(audio_embeddings.shape)

torch.Size([2, 768])


In [None]:
text_embeddings_2 = net.forward(text_embeddings.to(device))
text_embeddings_2

tensor([[-0.0224,  0.0024, -0.0763,  ..., -0.0746,  0.0236, -0.0238],
        [-0.0211, -0.0150, -0.0797,  ...,  0.0012,  0.0506, -0.0077]],
       device='cuda:0', grad_fn=<AddmmBackward0>)

In [None]:
import torch
import torch.nn.functional as F

# Assuming audio_embeddings and text_embeddings are tensors of shape [Batch-size, 768]
# Iterate over each pair of embeddings

for i in range(len(audio_embeddings)):
    for j in range(len(text_embeddings_2)):
        audio_embedding = audio_embeddings[i].unsqueeze(0)  # Add batch dimension
        text_embedding = text_embeddings_2[j].unsqueeze(0)    # Add batch dimension
        cos_sim = F.cosine_similarity(audio_embedding, text_embedding)
        print(f"Cosine Similarity between audio sample {i+1} and text sample {j+1}: {cos_sim.item()}")

Cosine Similarity between audio sample 1 and text sample 1: 0.943386971950531
Cosine Similarity between audio sample 1 and text sample 2: 0.92436683177948
Cosine Similarity between audio sample 2 and text sample 1: 0.9683031439781189
Cosine Similarity between audio sample 2 and text sample 2: 0.9502497911453247
