In [9]:
import asyncio
from temporalio import workflow, activity
from temporalio.client import Client
from temporalio.worker import Worker
client = await Client.connect("localhost:7233")


RuntimeError: Failed client connect: `get_system_info` call error after connection: Status { code: Unknown, message: "transport error", source: Some(tonic::transport::Error(Transport, hyper::Error(Io, Custom { kind: BrokenPipe, error: "stream closed because of a broken pipe" }))) }

In [27]:
import glob
import os
import json
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
import torch
import librosa

class_map = pd.read_csv('yamnet_class_map.csv').set_index('display_name').to_dict()['mid']

# Initialize the model and processor
model = Wav2Vec2ForSequenceClassification.from_pretrained('facebook/wav2vec2-large-960h', num_labels=len(class_map))
processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-large-960h')

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Moved model to GPU if available")

class AudioDataset(Dataset):
    def __init__(self, audio_directory, ontology_file, class_map):
        with open(ontology_file, 'r') as f:
            self.ontology_data = json.load(f)

        self.class_map = class_map
        self.audio_directory = audio_directory
        self.audio_files = glob.glob(os.path.join(self.audio_directory, '**', '*.wav'), recursive=True)
        
        # Populate the dataset by calling prepare_data
        self.data = self.prepare_data()

    def prepare_data(self):
        data = []
        for category in self.ontology_data:
            if "positive_examples" in category:
                category_name = category["name"]
                label = self.class_map.get(category_name, -1)
                print(type(label),label)
                for audio_file in self.audio_files:
                    if category_name.lower() in audio_file.lower():
                        audio_file = audio_file.replace("\\", "/")
                        data.append({"audio": audio_file, "label": label})
        return data
    
    def load_audio(self, file_path):
        """Load and preprocess audio using Wav2Vec2Processor."""
        try:
            if not os.path.isfile(file_path):
                raise FileNotFoundError(f"WAV file not found: {file_path}")

            # Load audio using librosa and resample to 16kHz
            audio_data, sr = librosa.load(file_path, sr=16000)
            print(f"Successfully loaded audio: {file_path}, shape: {audio_data.shape}, dtype: {audio_data.dtype}")

            # Preprocess using Wav2Vec2Processor
            inputs = processor(audio_data, sampling_rate=sr, return_tensors="pt", padding=True)
            processed_audio = inputs.input_values.squeeze(0)  # Remove batch dimension

            # Ensure processed_audio is a tensor and not a string
            if isinstance(processed_audio, torch.Tensor):
                return processed_audio
            else:
                print(f"Unexpected processed audio data type: {type(processed_audio)}")
                return None

        except Exception as e:
            print(f"Error loading audio file {file_path}: {e}")
            return None


    def __getitem__(self, idx):
        """Get one item (audio, label) for the dataset."""
        sample = self.data[idx]
        audio_data = self.load_audio(sample["audio"])  # Load audio as tensor
        label = sample["label"]

        # Ensure audio_data is valid (tensor)
        if audio_data is None or not isinstance(audio_data, torch.Tensor):
            print(f"Error loading audio at index {idx}, returning dummy data.")
            return torch.zeros(1), torch.tensor(label)

        # Ensure it's a tensor before returning
        return audio_data.clone().detach(), torch.tensor(label)


    def __len__(self):
        return len(self.data)

def collate_fn(batch):
    """Custom collate_fn to handle batch loading of audio data."""
    audio_data = [item[0] for item in batch]  # Extract audio tensors
    labels = [item[1] for item in batch]  # Extract labels

    # Pad audio data (if necessary) to make the batch uniform
    audio_data_padded = torch.nn.utils.rnn.pad_sequence(audio_data, batch_first=True, padding_value=0.0)

    # Stack labels
    labels = torch.tensor(labels)

    return audio_data_padded, labels



# Initialize the dataset and dataloaders
audio_directory = r"audiosets/ontology"
ontology_file = 'ontology.json'
class_map = pd.read_csv('yamnet_class_map.csv').set_index('display_name').to_dict()['index']

# Initialize dataset and prepare data
dataset = AudioDataset(audio_directory, ontology_file, class_map)

# Now split the dataset into train and test sets (80% train, 20% test)
train_data, test_data = train_test_split(dataset.data, test_size=0.2, random_state=42)

# Initialize train and test datasets using the split data
train_dataset = AudioDataset(audio_directory, ontology_file, class_map)
test_dataset = AudioDataset(audio_directory, ontology_file, class_map)

# Assign the split data to the datasets
train_dataset.data = train_data
test_dataset.data = test_data

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = (predictions == labels).mean()
    return {'accuracy': accuracy}

# Training setup
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=3,
    save_steps=10,
    disable_tqdm=False,
    report_to="tensorboard",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Moved model to GPU if available
<class 'int'> -1
<class 'int'> -1
<class 'int'> 0
<class 'int'> -1
<class 'int'> -1
<class 'int'> 1
<class 'int'> 2
<class 'int'> 3
<class 'int'> 4
<class 'int'> 5
<class 'int'> 6
<class 'int'> 7
<class 'int'> 8
<class 'int'> 9
<class 'int'> -1
<class 'int'> 10
<class 'int'> 11
<class 'int'> 12
<class 'int'> 13
<class 'int'> 14
<class 'int'> 15
<class 'int'> 16
<class 'int'> 17
<class 'int'> 18
<class 'int'> 19
<class 'int'> 20
<class 'int'> 21
<class 'int'> 22
<class 'int'> 23
<class 'int'> 24
<class 'int'> 25
<class 'int'> 26
<class 'int'> 27
<class 'int'> 28
<class 'int'> -1
<class 'int'> -1
<class 'int'> 29
<class 'int'> 30
<class 'int'> 31
<class 'int'> 32
<class 'int'> 33
<class 'int'> 34
<class 'int'> -1
<class 'int'> 35
<class 'int'> -1
<class 'int'> -1
<class 'int'> 36
<class 'int'> 37
<class 'int'> 38
<class 'int'> 39
<class 'int'> 40
<class 'int'> 41
<class 'int'> 42
<class 'int'> 43
<class 'int'> 44
<class 'int'> 45
<class 'int'> -1
<class 'i



Successfully loaded audio: audiosets/ontology/Snort_0.wav, shape: (1857411,), dtype: float32
Successfully loaded audio: audiosets/ontology/Electronic organ_6.wav, shape: (160125,), dtype: float32
Successfully loaded audio: audiosets/ontology/Blender_6.wav, shape: (159754,), dtype: float32
Successfully loaded audio: audiosets/ontology/Tubular bells_2.wav, shape: (160125,), dtype: float32
Successfully loaded audio: audiosets/ontology/Hubbub, speech noise, speech babble_0.wav, shape: (160125,), dtype: float32
Successfully loaded audio: audiosets/ontology/Ice cream truck, ice cream van_5.wav, shape: (435235,), dtype: float32
Successfully loaded audio: audiosets/ontology/Chopping (food)_2.wav, shape: (160125,), dtype: float32
Successfully loaded audio: audiosets/ontology/Duck_6.wav, shape: (3829621,), dtype: float32
Successfully loaded audio: audiosets/ontology/Drum machine_5.wav, shape: (160125,), dtype: float32
Successfully loaded audio: audiosets/ontology/Cap gun_0.wav, shape: (150466,),

TypeError: vars() argument must have __dict__ attribute

In [1]:
import os
import json
import numpy as np
import ffmpeg
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from multiprocessing import Pool
from tqdm import tqdm  # Ensure tqdm is imported
import torch

model = Wav2Vec2ForCTC.from_pretrained('wav2vec2_model')

# Load the processor
processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-large-960h')

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


class AudioDataset(Dataset):
    def __init__(self, audio_directory, ontology_file, class_map):
        # Load the ontology data (audio categories and their URLs)
        with open(ontology_file, 'r') as f:
            self.ontology_data = json.load(f)

        self.class_map = class_map
        self.audio_directory = audio_directory

        self.data = self.prepare_data()

    def prepare_data(self):
        data = []

        # Get all .wav files in the audio directory (recursively if needed)
        audio_files = glob.glob(os.path.join(self.audio_directory, '**', '*.wav'), recursive=True)

        # Collect all the audio files
        for category in self.ontology_data:
            if "positive_examples" in category:
                # Get the category name
                category_name = category["name"]

                # Get the label (id) for this category
                label = self.class_map.get(category_name, -1)  # Default to -1 if not found
                if label == -1:
                    print(f"Warning: Category {category_name} not found in class_map")

                # Match files to category
                for audio_file in audio_files:
                    # Check if the filename contains the category name (assuming filenames follow a naming pattern)
                    if category_name.lower() in audio_file.lower():
                        # Load the audio file
                        audio_data = self.load_audio(audio_file)
                        data.append({"audio": audio_data, "label": label})

        return data

    def load_audio(self, file_path):
        """Load audio from a file (WAV format expected)."""
        try:
            if not os.path.isfile(file_path):
                raise FileNotFoundError(f"WAV file not found: {file_path}")

            # Load audio using numpy
            audio_data = np.fromfile(file_path, dtype=np.int16)  # Load the raw bytes as int16
            return audio_data
        except Exception as e:
            print(f"Error loading audio file {file_path}: {e}")
            return None

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]


# Initialize the dataset (replace with the correct paths)
audio_directory = r"audiosets/ontology"  # Directory where audio files are stored
ontology_file = 'ontology.json'  # Path to the ontology file
class_map = pd.read_csv('yamnet_class_map.csv')  # Class map CSV file

# Initialize the AudioDataset
dataset = AudioDataset(audio_directory, ontology_file, class_map)

# Split dataset into train and test sets (80% train, 20% test)
train_data, test_data = train_test_split(dataset.data, test_size=0.2, random_state=42)

# Create DataLoaders for training and testing
train_dataset = torch.utils.data.TensorDataset(torch.tensor([item["audio"] for item in train_data]), torch.tensor([item["label"] for item in train_data]))
test_dataset = torch.utils.data.TensorDataset(torch.tensor([item["audio"] for item in test_data]), torch.tensor([item["label"] for item in test_data]))

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)

# Set up your training arguments with tqdm's progress bar enabled
training_args = TrainingArguments(
    output_dir="./results",              # output directory where models will be saved
    evaluation_strategy="epoch",         # evaluate after each epoch
    save_strategy="epoch",               # save model after each epoch
    learning_rate=2e-5,                  # learning rate
    per_device_train_batch_size=16,      # batch size for training
    per_device_eval_batch_size=16,       # batch size for evaluation
    num_train_epochs=3,                  # number of training epochs
    logging_dir="./logs",                # directory for storing logs
    logging_steps=10,                    # log every 10 steps
    save_total_limit=3,                  # Limit the number of saved checkpoints
    save_steps=10,                       # Save more frequently if needed
    disable_tqdm=False,                  # Enable tqdm progress bar
    report_to="tensorboard",             # You can also use TensorBoard for more detailed visualization
    load_best_model_at_end=True,         # Load best model after training
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the model to be fine-tuned
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # the training dataset
    eval_dataset=test_dataset,           # evaluation dataset
    data_collator=None,                  # You can use a data collator if needed
    compute_metrics=None                 # Define metrics if needed
)
print("training arrambika ready ah da punda")
# Start fine-tuning and saving model each epoch
trainer.train()







Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at wav2vec2_model and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'glob' is not defined

In [None]:
import os
import json
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch

# Initialize the model and processor
model = Wav2Vec2ForCTC.from_pretrained('wav2vec2_model')
processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-large-960h')

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Moved model to GPU if available")


class AudioDataset(Dataset):
    def __init__(self, audio_directory, ontology_file, class_map):
        # Load the ontology data (audio categories and their URLs)
        with open(ontology_file, 'r') as f:
            self.ontology_data = json.load(f)

        self.class_map = class_map
        self.audio_directory = audio_directory

        self.data = self.prepare_data()

    def prepare_data(self):
        data = []

        # Collect all the audio files
        for category in self.ontology_data:
            if "positive_examples" in category:
                for idx, _ in enumerate(category["positive_examples"]):
                    file_name = f"{category['name']}_{idx}.wav"  # Assuming you already have .wav files
                    file_path = os.path.join(self.audio_directory, file_name)
                    file_path = file_path.replace("\\", "/")  # Ensure consistent paths

                    if os.path.isfile(file_path):
                        # Load the audio file
                        audio_data = self.load_audio(file_path)

                        # Assign the label based on the class map
                        label = self.class_map.get(category['name'], -1)  # Default to -1 if not found
                        data.append({"audio": audio_data, "label": label})

        return data

    def load_audio(self, file_path):
        """Load audio from a file (WAV format expected)."""
        try:
            if not os.path.isfile(file_path):
                raise FileNotFoundError(f"WAV file not found: {file_path}")

            # Load audio using numpy
            audio_data = np.fromfile(file_path, dtype=np.int16)  # Load the raw bytes as int16
            return audio_data
        except Exception as e:
            print(f"Error loading audio file {file_path}: {e}")
            return None

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Initialize the dataset (replace with the correct paths)
print("Initialize the dataset")

audio_directory = r"audiosets/ontology"  # Directory where audio files are stored
ontology_file = 'ontology.json'  # Path to the ontology file
class_map = pd.read_csv('yamnet_class_map.csv').set_index('display_name').to_dict()['mid']  # Assuming 'class_name' and 'class_id' columns

# Initialize the AudioDataset
print("Initialize the AudioDataset")
dataset = AudioDataset(audio_directory, ontology_file, class_map)

# Split dataset into train and test sets (80% train, 20% test)
print("train test split")
train_data, test_data = train_test_split(dataset.data, test_size=0.2, random_state=42)

# Create DataLoaders for training and testing
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor([item["audio"] for item in train_data]),
    torch.tensor([item["label"] for item in train_data])
)
test_dataset = torch.utils.data.TensorDataset(
    torch.tensor([item["audio"] for item in test_data]),
    torch.tensor([item["label"] for item in test_data])
)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)

def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = (predictions == labels).mean()
    return {'accuracy': accuracy}

# Set up your training arguments
training_args = TrainingArguments(
    output_dir="./results",              # output directory where models will be saved
    evaluation_strategy="epoch",         # evaluate after each epoch
    save_strategy="epoch",               # save model after each epoch
    learning_rate=2e-5,                  # learning rate
    per_device_train_batch_size=16,      # batch size for training
    per_device_eval_batch_size=16,       # batch size for evaluation
    num_train_epochs=3,                  # number of training epochs
    logging_dir="./logs",                # directory for storing logs
    logging_steps=10,                    # log every 10 steps
    save_total_limit=3,                  # Limit the number of saved checkpoints
    save_steps=10,                       # Save more frequently if needed
    disable_tqdm=False,                  # Enable tqdm progress bar
    report_to="tensorboard",             # You can also use TensorBoard for more detailed visualization
    load_best_model_at_end=True,         # Load best model after training
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the model to be fine-tuned
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # the training dataset
    eval_dataset=test_dataset,           # evaluation dataset
    data_collator=None,                  # You can use a data collator if needed
    compute_metrics=compute_metrics                 # Define metrics if needed
)

# Start fine-tuning and saving model each epoch
trainer.train()


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at wav2vec2_model and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Moved model to GPU if available
Initialize the dataset
Initialize the AudioDataset


In [None]:
import pandas as pd
from transformers import Wav2Vec2ForCTC
import torch


In [6]:
from transformers import Wav2Vec2Processor
import pandas as pd


# Load the processor from Hugging Face
processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-large-960h')

# Save the processor locally
processor.save_pretrained('./local_wav2vec2_processor')


AttributeError: partially initialized module 'pandas' has no attribute '_pandas_parser_CAPI' (most likely due to a circular import)