In [1]:
import os
import librosa
import numpy as np
import soundfile as sf

def split_wav_file(input_path, output_path, duration=20, max_samples=3):
    """Split every ``.wav`` file under ``input_path`` into fixed-length clips.

    The tree below ``input_path`` is walked recursively and each folder that
    contains a ``.wav`` file is mirrored below ``output_path``. Clips are
    written as ``<stem>_<index>.wav`` using the sample rate returned by
    ``librosa.load`` for the source file.

    Only whole clips are produced: audio left over after the last full
    ``duration``-second window is dropped, and a file shorter than
    ``duration`` yields no clips at all (it is still reported as processed).

    Args:
        input_path: Root directory searched for ``.wav`` files.
        output_path: Root directory that receives the clips; created if missing.
        duration: Clip length in seconds. Must be greater than zero.
        max_samples: Maximum number of clips written per source file.

    Raises:
        ValueError: If ``duration`` is not positive.
    """
    # Fail early: a zero duration would otherwise divide by zero mid-walk and
    # a negative one would silently produce nothing.
    if duration <= 0:
        raise ValueError(f"duration must be positive, got {duration!r}")

    os.makedirs(output_path, exist_ok=True)

    for root, dirs, files in os.walk(input_path):
        for file in files:
            if not file.endswith('.wav'):
                continue

            input_file_path = os.path.join(root, file)
            # Mirror the source folder layout underneath output_path.
            output_folder_path = os.path.join(output_path, os.path.relpath(root, input_path))
            os.makedirs(output_folder_path, exist_ok=True)

            # sr=None asks librosa not to resample the audio.
            y, sr = librosa.load(input_file_path, sr=None)
            total_duration = librosa.get_duration(y=y, sr=sr)

            stem = os.path.splitext(file)[0]
            # Number of complete windows, capped at max_samples.
            clip_count = min(int(total_duration // duration), max_samples)
            for i in range(clip_count):
                start = i * duration
                end = start + duration
                split_y = y[int(start * sr):int(end * sr)]
                output_file_path = os.path.join(output_folder_path, f"{stem}_{i}.wav")
                sf.write(output_file_path, split_y, sr)
            print(f"Processed {file}")


In [2]:
# Source recordings and the working directories that receive the clips.
# NOTE(review): the "_10s" suffix does not match split_wav_file's default
# duration of 20 seconds - confirm the intended clip length.
train_input_path = "/kaggle/input/slpproject/SLP/train"
train_output_path = "/kaggle/working/train_10s"
test_input_path = "/kaggle/input/slpproject/SLP/test"
test_output_path = "/kaggle/working/test_10s"

# Split the training set first, then the test set.
for source_root, clip_root in (
    (train_input_path, train_output_path),
    (test_input_path, test_output_path),
):
    split_wav_file(source_root, clip_root)

Processed hebron_test022.wav
Processed hebron_train042.wav
Processed hebron_test025.wav
Processed hebron_test021.wav
Processed hebron_train043.wav
Processed hebron_test023.wav
Processed hebron_train045.wav
Processed hebron_train044.wav
Processed hebron_train041.wav
Processed hebron_test024.wav
Processed nablus_train045.wav
Processed nablus_train044.wav
Processed nablus_test025.wav
Processed nablus_test024.wav
Processed nablus_train042.wav
Processed nablus_train043.wav
Processed nablus_test023.wav
Processed nablus_train041.wav
Processed nablus_test022.wav
Processed nablus_test021.wav
Processed ramallah-reef_train042.wav
Processed ramallah-reef_test022.wav
Processed ramallah-reef_test021.wav
Processed ramallah-reef_train044.wav
Processed ramallah-reef_train045.wav
Processed ramallah-reef_test023.wav
Processed ramallah-reef_train041.wav
Processed ramallah-reef_train043.wav
Processed ramallah-reef_test024.wav
Processed ramallah-reef_test025.wav
Processed jerusalem_test024.wav
Processed jer

In [3]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification, TrainingArguments, Trainer
import torchaudio

# Define a custom dataset
class AccentDataset(Dataset):
    """Audio classification dataset laid out as one sub-folder per class.

    Every sub-directory of ``directory`` is treated as a class and every
    ``.wav`` file directly inside it as one example. Class indices are
    assigned from the alphabetically sorted sub-directory names, so two
    datasets built from folders with the same class names (e.g. train and
    test) always share the same label mapping regardless of filesystem
    listing order. Non-directory entries in ``directory`` are ignored and do
    not consume a label index.

    Args:
        directory: Root folder containing one sub-folder per class.
        processor: Callable feature processor; it is invoked with the
            waveform, ``sampling_rate``, ``return_tensors="pt"``,
            ``padding="max_length"`` and ``max_length`` and must return a
            mapping with an ``'input_values'`` entry.
        max_length: Number of samples every example is fitted to.
        sampling_rate: Sample rate passed to the processor; audio stored at a
            different rate is resampled to it. Defaults to 16000.

    Attributes:
        class_names: Sorted class folder names; the index is the label.
        file_paths: Path of every example.
        labels: Integer label of every example, parallel to ``file_paths``.
    """

    def __init__(self, directory, processor, max_length, sampling_rate=16000):
        self.file_paths = []
        self.labels = []
        self.processor = processor
        self.max_length = max_length
        self.sampling_rate = sampling_rate

        # Filter to directories *before* enumerating and sort the names so the
        # label of a class never depends on os.listdir order or stray files.
        self.class_names = sorted(
            entry for entry in os.listdir(directory)
            if os.path.isdir(os.path.join(directory, entry))
        )

        for label, subdir in enumerate(self.class_names):
            subdir_path = os.path.join(directory, subdir)
            for file_name in sorted(os.listdir(subdir_path)):
                if file_name.endswith('.wav'):
                    self.file_paths.append(os.path.join(subdir_path, file_name))
                    self.labels.append(label)

    def __len__(self):
        """Return the number of examples found at construction time."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Load one example.

        Returns:
            dict with ``'input_values'`` (processor output for the fitted
            waveform, leading batch axis removed) and ``'labels'`` (scalar
            ``torch.long`` tensor).
        """
        file_path = self.file_paths[idx]
        label = self.labels[idx]

        # torchaudio.load returns a (channels, frames) tensor.
        waveform, sr = torchaudio.load(file_path)

        # Down-mix to mono. For single-channel audio this is the same data
        # the previous squeeze() produced, but it stays 1-D for stereo files.
        waveform = waveform.mean(dim=0)

        # The processor is told the audio is at self.sampling_rate, so make
        # that true instead of mislabelling audio stored at another rate.
        if sr != self.sampling_rate:
            waveform = torchaudio.functional.resample(waveform, sr, self.sampling_rate)

        # Fit to exactly max_length samples: np.resize repeats the signal
        # cyclically when it is too short and truncates when it is too long.
        # An empty recording becomes zeros rather than looping forever.
        waveform = np.resize(waveform.numpy(), self.max_length)

        inputs = self.processor(
            waveform,
            sampling_rate=self.sampling_rate,
            return_tensors="pt",
            padding="max_length",
            max_length=self.max_length,
        )
        return {
            'input_values': inputs['input_values'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long),
        }

# Directories holding the pre-split clips (one sub-folder per class)
train_dir = "/kaggle/working/train_10s"
test_dir = "/kaggle/working/test_10s"

# Every example is fitted to this many samples: 20 seconds at 16 kHz
max_length = 16000 * 20

# Size of the classification head
num_labels = 4

# Feature processor and classifier both start from the same pretrained checkpoint
checkpoint_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(checkpoint_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(checkpoint_name, num_labels=num_labels)

# One dataset per split, built with identical settings
train_dataset, test_dataset = (
    AccentDataset(split_dir, processor, max_length) for split_dir in (train_dir, test_dir)
)

# Plain PyTorch loaders; only the training loader shuffles
loader_options = dict(batch_size=4, num_workers=4)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, **loader_options)

# Hyper-parameters for the Hugging Face Trainer
training_args = TrainingArguments(
    # Output and logging
    output_dir="./results",
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    # NOTE(review): newer transformers releases rename evaluation_strategy to
    # eval_strategy - confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=100,
    # Small per-device batches, with gradients accumulated over 4 steps
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    # Mixed precision training
    fp16=True,
)

# Define a simple compute_metrics function
def compute_metrics(p):
    """Compute accuracy from an evaluation result.

    Args:
        p: Object exposing ``predictions`` (2-D scores, one row per example)
            and ``label_ids`` (the reference class index per example).

    Returns:
        dict with a single ``"accuracy"`` entry: the fraction of rows whose
        highest-scoring column equals the reference label.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    is_correct = predicted_classes == p.label_ids
    return {"accuracy": is_correct.mean()}

# Trainer: trains on the training split and evaluates on the test split
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Sanity check on the first training batch only: every label must be a
# valid index into the classification head.
for batch in train_loader:
    batch_labels = batch['labels']
    print("Batch labels:", batch_labels)
    assert batch_labels.min() >= 0 and batch_labels.max() < num_labels, "Labels are out of range"
    break

# Fine-tune the model; kept as the last expression so the cell displays the result
trainer.train()


2024-06-09 17:40:32.142907: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-09 17:40:32.143019: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-09 17:40:32.264882: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.pid = os.fork()
  self.pid = os.fork()


Batch labels: tensor([3, 0, 3, 0])


Epoch,Training Loss,Validation Loss,Accuracy
0,No log,1.385926,0.245283
2,1.385200,1.400446,0.283019
4,1.351700,1.400354,0.283019
6,1.304200,1.35265,0.339623
8,1.223000,1.339346,0.320755
10,1.223000,1.31461,0.358491
12,1.164300,1.276321,0.358491
14,1.128600,1.292158,0.339623
16,1.081000,1.195709,0.415094
18,1.024800,1.194244,0.415094


TrainOutput(global_step=400, training_loss=0.7171217727661133, metrics={'train_runtime': 1850.7765, 'train_samples_per_second': 3.89, 'train_steps_per_second': 0.216, 'total_flos': 1.162071293952e+18, 'train_loss': 0.7171217727661133, 'epoch': 88.88888888888889})

In [5]:
import shutil

def clear_output_folders(output_paths):
    """Recursively delete each directory in ``output_paths``.

    A line is printed per path: ``Cleared <path>`` after a successful
    removal, or ``<path> not found`` when the removal raises
    ``FileNotFoundError``. Any other error propagates to the caller.

    Args:
        output_paths: Iterable of directory paths to remove.
    """
    for folder in output_paths:
        try:
            shutil.rmtree(folder)
        except FileNotFoundError:
            print(f"{folder} not found")
        else:
            print(f"Cleared {folder}")

# Working directories that hold the split clips; remove both
output_paths = ["/kaggle/working/train_10s", "/kaggle/working/test_10s"]

clear_output_folders(output_paths)

/kaggle/working/train_10s not found
/kaggle/working/test_10s not found


In [6]:
import os
import zipfile

def zip_folder(folder_path, output_path):
    """Write every file under ``folder_path`` into a deflate-compressed zip.

    Entries are stored with paths relative to ``folder_path`` and added in
    sorted order so repeated runs list files in the same order. Empty
    directories are not stored. If the archive itself is located inside
    ``folder_path`` it is skipped rather than being added to itself.

    Args:
        folder_path: Directory whose contents are archived.
        output_path: Path of the zip file to create (overwritten if present).

    Raises:
        FileNotFoundError: If ``folder_path`` is not an existing directory.
            Raised before the archive is created, so no empty zip is left
            behind.
    """
    if not os.path.isdir(folder_path):
        raise FileNotFoundError(
            f"Folder to zip does not exist or is not a directory: {folder_path}"
        )

    # Resolved once so the archive can be recognised during the walk.
    archive_realpath = os.path.realpath(output_path)

    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            # In-place sort makes os.walk descend in a deterministic order.
            dirs.sort()
            for file in sorted(files):
                file_path = os.path.join(root, file)
                # Never add the half-written archive to itself.
                if os.path.realpath(file_path) == archive_realpath:
                    continue
                zipf.write(file_path, os.path.relpath(file_path, folder_path))

# Archive the results directory into a zip file in the current working directory
folder_to_zip, zip_file_name = '/kaggle/working/results', 'result_73.zip'
zip_folder(folder_path=folder_to_zip, output_path=zip_file_name)

# Report which folder went into which archive
print(f"Folder {folder_to_zip} zipped successfully into {zip_file_name}")


Folder /kaggle/working/results zipped successfully into result_73.zip
