In [None]:
import transformers
print(transformers.__version__)

In [None]:
import librosa
import torch
import IPython.display as display
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import numpy as np

In [None]:
#load pre-trained model and tokenizer
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

In [None]:
audio, sampling_rate = librosa.load("Medical Speech, Transcription, and Intent/recordings/train/1249120_44142156_69073946.wav",sr=16000)

In [None]:
audio,sampling_rate

In [None]:
# audio
display.Audio("Medical Speech, Transcription, and Intent/recordings/train/1249120_44142156_69073946.wav", autoplay=True)

In [None]:
input_values = tokenizer(audio, return_tensors = 'pt').input_values
input_values

In [None]:
# store logits (non-normalized predictions)
logits = model(input_values).logits
logits

In [None]:
# store predicted id's
# pass the logit values to softmax to get the predicted values
predicted_ids = torch.argmax(logits, dim =-1)

In [None]:
# pass the prediction to the tokenzer decode to get the transcription
transcriptions = tokenizer.decode(predicted_ids[0])

In [None]:
transcriptions

In [2]:
import librosa
import torch
import IPython.display as display
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import numpy as np
from os import listdir
import pandas as pd


In [2]:
#load pre-trained model and tokenizer
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2

In [3]:
def speechToText(filename):
    audio, sampling_rate = librosa.load(filename,sr=16000)
    input_values = tokenizer(audio, return_tensors = 'pt').input_values
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim =-1)
    transcriptions = tokenizer.decode(predicted_ids[0])
    return transcriptions

In [None]:
# train_files = listdir("Medical Speech, Transcription, and Intent/recordings/test")
test_files = listdir("Medical Speech, Transcription, and Intent/recordings/train")
# validation_files = listdir("Medical Speech, Transcription, and Intent/recordings/validate")

df_train = pd.DataFrame()
suffix_train = "Medical Speech, Transcription, and Intent/recordings/train/"
transcriptions = []
for row in test_files:
    transcriptions.append(speechToText(str(suffix_train + row)))

df_train["filename"] = test_files
df_train["transcriptions"] = transcriptions
# df_test = pd.DataFrame()
# df_train["filename"] = train_files

# df_train = pd.DataFrame()
# df_train["filename"] = train_files

In [5]:
df_train = pd.read_csv("sample_transcriptions.csv")

In [3]:
from transformers import ElectraForMaskedLM, ElectraTokenizer

# Load pre-trained ELECTRA model and tokenizer
model_name = 'google/electra-base-discriminator'  # Use ELECTRA base model
model = ElectraForMaskedLM.from_pretrained(model_name)
tokenizer = ElectraTokenizer.from_pretrained(model_name)

# Function to correct sentences using ELECTRA
def correct_sentences(sentences):
    corrected_sentences = []
    for sentence in sentences:
        # Replace [MASK] with the appropriate masking token recognized by ELECTRA
        sentence = sentence.replace("[MASK]", tokenizer.mask_token)
        
        # Tokenize the sentence
        tokenized_sentence = tokenizer.tokenize(sentence)
        
        # Convert tokenized sentence to IDs
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_sentence)
        
        # Convert token IDs to tensor
        tokens_tensor = torch.tensor([indexed_tokens])
        
        # Predict token probabilities
        with torch.no_grad():
            outputs = model(tokens_tensor)
            predictions = outputs[0]
        
        # Get the predicted token IDs for each token position
        predicted_token_ids = torch.argmax(predictions, dim=2)[0]
        
        # Convert token IDs back to tokens
        predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_token_ids)
        
        # Filter out special tokens and join the corrected words into a sentence
        corrected_sentence = ' '.join(token for token in predicted_tokens if token not in tokenizer.all_special_tokens)
        
        # Append the corrected sentence to the list
        corrected_sentences.append(corrected_sentence)
    
    return corrected_sentences

# Example sentences
sentences = ["I have a pen, I have an apple, [MASK] apple pen.", "He is go to the school."]
# Correct the sentences
corrected_sentences = correct_sentences(sentences)

# Print the original and corrected sentences
for original, corrected in zip(sentences, corrected_sentences):
    print("Original:", original)
    print("Corrected:", corrected)
    print()


Some weights of ElectraForMaskedLM were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['generator_lm_head.bias', 'generator_predictions.LayerNorm.bias', 'generator_predictions.LayerNorm.weight', 'generator_predictions.dense.bias', 'generator_predictions.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Original: I have a pen, I have an apple, [MASK] apple pen.
Corrected: coughed ##fra bodily ##fra ##ifice bravo serial antoine elevators liberia ##points ##erus ##fra ##fra

Original: He is go to the school.
Corrected: ##olo elevators ##fra ##ivo flats ##fra ##fra



## Mapping the audios to their respective prompts

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import IPython.display as display
import librosa
from glob import glob
from tqdm import tqdm

In [2]:
def create_melspectrogram(filename,name):
    plt.interactive(False)
    clip, sample_rate = librosa.load(filename, sr=None)
    fig = plt.figure(figsize=[0.72,0.72])
    ax = fig.add_subplot(111)
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)
    ax.set_frame_on(False)
    S = librosa.feature.melspectrogram(y=clip, sr=sample_rate)
    librosa.display.specshow(librosa.power_to_db(S, ref=np.max))
    filename = 'Medical Speech, Transcription, and Intent/spectrograms/' + name + '.jpg'
    plt.savefig(filename, dpi=400, bbox_inches='tight',pad_inches=0)
    plt.close()    
    fig.clf()
    plt.close(fig)
    plt.close('all')
    del filename,name,clip,sample_rate,fig,ax,S

In [3]:

Data_dir_train=np.array(glob("Medical Speech, Transcription, and Intent/recordings/test/*"))
Data_dir_test=np.array(glob("Medical Speech, Transcription, and Intent/recordings/train/*"))
Data_dir_val=np.array(glob("Medical Speech, Transcription, and Intent/recordings/validate/*"))

for file in tqdm(Data_dir_train):
    filename,name = file,file.split('/')[-1].split('.')[0]
    create_melspectrogram(filename,name)
for file in tqdm(Data_dir_test):
    filename,name = file,file.split('/')[-1].split('.')[0]
    create_melspectrogram(filename,name)
for file in tqdm(Data_dir_val):
    filename,name = file,file.split('/')[-1].split('.')[0]
    create_melspectrogram(filename,name)

  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **

In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from keras.applications import ResNet50
from keras.models import Sequential
from keras.layers import Dense, GlobalAveragePooling2D

In [2]:
df_overview = pd.read_csv("Medical Speech, Transcription, and Intent/overview-of-recordings.csv")
df_overview = df_overview[["file_name","prompt"]]
df_overview["file_name"] = "Medical Speech, Transcription, and Intent/spectrograms/" + df_overview["file_name"].str.split(".").str[0] + ".jpg"


In [3]:
datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

In [4]:
generator = datagen.flow_from_dataframe(
    df_overview,
    x_col='file_name',
    y_col='prompt',
    target_size=(150, 150),  # Adjust to match your model's input size
    batch_size=32,
    class_mode='categorical',  # Change to 'binary' if you have binary classes
    shuffle=True
)


Found 6661 validated image filenames belonging to 25 classes.


In [None]:
# ResNet50 Model
base_model = ResNet50(weights='imagenet', include_top=False)
model = Sequential()
model.add(base_model)
model.add(GlobalAveragePooling2D())
model.add(Dense(25, activation='softmax'))

# Compile the Model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the Model
model.fit(generator, epochs=10, steps_per_epoch=len(df_overview)//32)

In [None]:
# Unfreeze layers for fine-tuning
for layer in base_model.layers:
    layer.trainable = True

# Compile the Model for fine-tuning
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Fine-tune the Model
model.fit(generator, epochs=10, steps_per_epoch=len(df_overview)//32)


In [None]:
# Evaluate the Model
evaluation = model.evaluate(generator)

# Print Model Performance Overview
print("Model Performance Overview:")
print("Loss:", evaluation[0])
print("Accuracy:", evaluation[1])


In [8]:
df_overview = pd.read_csv("Medical Speech, Transcription, and Intent/overview-of-recordings.csv")
df_overview = df_overview[["file_name","prompt"]]
df_overview["file_name"] = df_overview["file_name"].str.split(".").str[0] + ".jpg"


In [10]:
import os
import shutil
import pandas as pd

# Assuming your dataframe is named df and has columns 'filename' and 'class_name'
# Example:
# df = pd.DataFrame({'filename': ['file1.jpg', 'file2.jpg', 'file3.jpg'],
#                    'class_name': ['class1', 'class2', 'class1']})

# Path to the folder containing the files
source_folder = 'Medical Speech, Transcription, and Intent/spectrograms'

# Path to the folder where you want to organize files
output_folder = 'Medical Speech, Transcription, and Intent/spectrogram_subs'

# Create output folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Get unique class names
classes = df_overview['prompt'].unique()

# Loop through each class
for class_name in classes:
    # Create a folder for the class
    class_folder = os.path.join(output_folder, class_name)
    os.makedirs(class_folder, exist_ok=True)
    
# Loop through each row in the dataframe
for index, row in df_overview.iterrows():
    filename = row['file_name']
    class_name = row['prompt']
    
    # Copy the file to the respective class folder
    source_path = os.path.join(source_folder, filename)
    destination_path = os.path.join(output_folder, class_name, filename)
    shutil.copyfile(source_path, destination_path)

print("Files organized successfully!")


Files organized successfully!
