In [232]:
import csv
import os
from pathlib import Path


import torch
from torch import Tensor
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from typing import Dict, List, Tuple, Union


from torch.utils.data import Dataset,DataLoader
import torchvision.datasets as datasets
import torchvision.transforms as transforms

import torchaudio
import torchaudio.transforms as T


import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

#mps_device = torch.device("mps")

### Analyse de notre donnée

In [233]:
df = pd.read_csv('./data/cv-corpus-7.0-singleword/fr/train.tsv', sep='\t')
df.head()

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent,locale,segment
0,89fa8146b2c07e3e2fe2bbf3852ed8b22a625e383df7eb...,common_voice_fr_21982258.mp3,Firefox,2,0,teens,male,france,fr,Benchmark
1,89fa8146b2c07e3e2fe2bbf3852ed8b22a625e383df7eb...,common_voice_fr_21982260.mp3,cinq,3,0,teens,male,france,fr,Benchmark
2,89fa8146b2c07e3e2fe2bbf3852ed8b22a625e383df7eb...,common_voice_fr_21982262.mp3,trois,3,0,teens,male,france,fr,Benchmark
3,89fa8146b2c07e3e2fe2bbf3852ed8b22a625e383df7eb...,common_voice_fr_21982265.mp3,deux,3,2,teens,male,france,fr,Benchmark
4,89fa8146b2c07e3e2fe2bbf3852ed8b22a625e383df7eb...,common_voice_fr_21982266.mp3,sept,2,0,teens,male,france,fr,Benchmark


Donnée balanced ?

In [234]:
df.value_counts('sentence')

sentence
cinq       622
deux       621
neuf       621
non        621
zéro       621
oui        620
quatre     620
sept       620
trois      618
un         617
six        614
Firefox    611
Hey        611
huit       611
Name: count, dtype: int64

In [235]:
labels = list(df.value_counts('sentence').index)
print(labels)

['cinq', 'deux', 'neuf', 'non', 'zéro', 'oui', 'quatre', 'sept', 'trois', 'un', 'six', 'Firefox', 'Hey', 'huit']


In [236]:
print(len(labels))

14


### Trop de donnée
Nous ne voulons que les lignes qui contiennent les valeurs: oui, non, un, deux, trois, quatre

In [237]:
df = df[df['sentence'].isin(['un', 'deux', 'trois', 'quatre','oui','non'])]
labels = list(df.value_counts('sentence').index)
print(labels)
print(len(labels))

['deux', 'non', 'oui', 'quatre', 'trois', 'un']
6


In [238]:
# Fonction qui  va chercher les fichiers audio dans clips depuis les infos du tsv et les convertir en tensor
def load_commonvoice_item(
    line: List[str], header: List[str], path: str, folder_audio: str, ext_audio: str
) -> Tuple[Tensor, int, Dict[str, str]]:
    # Each line as the following data:
    # client_id, path, sentence, up_votes, down_votes, age, gender, accent

    if header[1] != "path":
        raise ValueError(f"expect `header[1]` to be 'path', but got {header[1]}")
    fileid = line[1]
    filename = os.path.join(path, folder_audio, fileid)
    if not filename.endswith(ext_audio):
        filename += ext_audio
    waveform, sample_rate = torchaudio.load(filename)

    dic = dict(zip(header, line))

    return waveform, sample_rate, dic


# Classe qui va permettre de charger les données de CommonVoice
class COMMONVOICE(Dataset):
    """*CommonVoice* :cite:`ardila2020common` dataset.

    Args:
        root (str or Path): Path to the directory where the dataset is located.
             (Where the ``tsv`` file is present.)
        tsv (str, optional):
            The name of the tsv file used to construct the metadata, such as
            ``"train.tsv"``, ``"test.tsv"``, ``"dev.tsv"``, ``"invalidated.tsv"``,
            ``"validated.tsv"`` and ``"other.tsv"``. (default: ``"train.tsv"``)
    """

    _ext_txt = ".txt"
    _ext_audio = ".mp3"
    _folder_audio = "clips"

    def __init__(self, root: Union[str, Path], tsv: str = "train.tsv", transform=None) -> None:

        # Get string representation of 'root' in case Path object is passed
        self._path = os.fspath(root)
        self._tsv = os.path.join(self._path, tsv)

        with open(self._tsv, "r") as tsv_:
            walker = csv.reader(tsv_, delimiter="\t")
            self._header = next(walker)
            self._walker = list(walker)

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            Tuple of the following items;

            Tensor:
                Waveform
            int:
                Sample rate
            Dict[str, str]:
                Dictionary containing the following items from the corresponding TSV file;

                * ``"client_id"``
                * ``"path"``
                * ``"sentence"``
                * ``"up_votes"``
                * ``"down_votes"``
                * ``"age"``
                * ``"gender"``
                * ``"accent"``
        """
        line = self._walker[n]
        waveform, sample_rate, dic = load_commonvoice_item(line, self._header, self._path, self._folder_audio, self._ext_audio)
        spectrogram = transform(waveform)
        length = spectrogram.shape[-1]  # Time dimension length
        label = dic["sentence"]
        print(label)
        return spectrogram, length, label


    def __len__(self) -> int:
        return len(self._walker)

In [239]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torchaudio
from torchaudio.transforms import MelSpectrogram, FrequencyMasking, TimeMasking
from torch.nn.utils.rnn import pack_sequence, pad_sequence


audio_transforms = nn.Sequential(
    T.MelSpectrogram(sample_rate=48000)
)

commonvoice = COMMONVOICE(root="./data/cv-corpus-7.0-singleword/fr", tsv="train.tsv",transform=audio_transforms)

def custom_collate_fn(batch):
    # Sort the batch in the descending order of sequence length for packing
    sorted_batch = sorted(batch, key=lambda x: x[1], reverse=True)
    sequences = [x[0] for x in sorted_batch]
    lengths = torch.tensor([x[1] for x in sorted_batch])
    labels = [x[2] for x in sorted_batch]  # Collect labels

    # Depending on your model, you may pack sequences or pad them
    packed_sequences = pack_sequence(sequences, enforce_sorted=True)
    # If your labels are tensor-like and you're using something like CrossEntropyLoss, 
    # you might not need to pad or pack labels. Otherwise, adapt as necessary.
    
    # If labels need to be a tensor (e.g., for classification with CrossEntropyLoss),
    # ensure labels are converted to a tensor. This assumes labels are numeric.
    labels = torch.tensor(labels, dtype=torch.long)

    return packed_sequences, lengths, labels


# # Assuming `commonvoice` is your dataset instanc
commonvoice_loader = DataLoader(commonvoice, batch_size=64, shuffle=True,drop_last=True, collate_fn=custom_collate_fn)



In [241]:
class MyModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(MyModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
    
    def forward(self, packed_sequences):
        # LSTM can directly consume packed sequences
        packed_output, (ht, ct) = self.lstm(packed_sequences)
        # Do something with the output...
        return ht

# Example usage
model = MyModel(input_size=128, hidden_size=256, num_layers=2)  # Adjust sizes as needed
for packed_sequences in commonvoice_loader:
    output = model(packed_sequences)
    


sept
non
quatre
six
neuf
Hey
neuf
oui
Firefox
sept
Hey
trois
oui
oui
un
Firefox
un
cinq
zéro
quatre
neuf
trois
oui
oui
deux
non
huit
huit
un
Hey
cinq
Hey
six
quatre
oui
trois
non
sept
deux
non
oui
six
Firefox
cinq
cinq
zéro
huit
cinq
neuf
non
deux
sept
un
quatre
Firefox
trois
oui
neuf
trois
trois
trois
huit
deux
trois


RuntimeError: The size of tensor a (1106) must match the size of tensor b (1014) at non-singleton dimension 2