In [None]:
from google.colab import drive

# Attach Google Drive to the Colab VM; every path used below lives under this
# mount point.
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
import os
# Move into the dataset root. The path is absolute (the drive is mounted at
# /content/drive) so that re-running this cell is idempotent: a relative
# chdir only works while the working directory is still /content.
os.chdir("/content/drive/MyDrive/data/AMHARIC")
os.listdir()

['README.md', 'data', 'kaldi-script', 'lang', 'lm']

In [None]:
import librosa   #for audio processing
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile #for audio processing
#for pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
import warnings
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
# Raw training transcripts: headerless and tab-separated, so each line arrives
# in column 0 for create_meta_data to parse.
meta_data = pd.read_csv(
    "data/train/trsTrain.txt",
    sep="\t",
    header=None,
)
def create_meta_data(df:pd.DataFrame, column1:str, column2:str,
                     audio_dir:str="data/train/wav/"):
    """Split raw transcript lines into a transcript column and an audio-path column.

    Each value in column ``0`` of ``df`` is expected to look like
    ``<s> transcript </s> (utterance_id)``.

    Args:
        df: Frame whose column ``0`` holds the raw lines. It is modified IN
            PLACE (column ``0`` is renamed and overwritten, ``column2`` is
            added) and the same object is returned.
        column1: Name given to the transcript column (replaces column ``0``).
        column2: Name of the new column holding the wav file path.
        audio_dir: Prefix put in front of ``<utterance_id>.wav``. Defaults to
            the training split; pass e.g. ``"data/test/wav/"`` for other splits.

    Returns:
        The same ``df`` object, with ``column1`` and ``column2`` populated.

    Raises:
        IndexError: If a line lacks the ``<s>`` or ``</s>`` marker.
    """
    df.rename(columns={0: column1}, inplace=True)

    def _parse(line):
        # Text after the first "</s>" is the parenthesised utterance id;
        # text between the first "<s>" and that "</s>" is the transcript.
        pieces = line.split("</s>")
        utt_id = pieces[1].replace("(", "").replace(")", "").strip()
        transcript = pieces[0].split("<s>")[1].strip()
        return transcript, audio_dir + utt_id + ".wav"

    # Parse every line once, then assign both columns from the result.
    parsed = [_parse(line) for line in df[column1]]
    df[column2] = [audio for _, audio in parsed]
    df[column1] = [transcript for transcript, _ in parsed]
    return df

# Wrap the parser in a one-step pipeline. create_meta_data edits the frame it
# is given, so `meta_data` itself carries the parsed columns afterwards.
metadata_step = FunctionTransformer(
    create_meta_data,
    kw_args={"column1": 'Transcript', "column2": 'audio'},
)
pipe = Pipeline(steps=[("metadata", metadata_step)])
meta_pipe = pipe.fit_transform(meta_data)
meta_data

Unnamed: 0,Transcript,audio
0,ያንደኛ ደረጃ ትምህርታቸው ን ጐንደር ተ ም ረዋል,data/train/wav/tr_1_tr01001.wav
1,የተ ለቀቁት ምርኮኞች በ አካባቢያቸው ሰላማዊ ኑሮ እንዲ ኖሩ የ ትራንስፖ...,data/train/wav/tr_2_tr01002.wav
2,በ አዲስ አበባው ስታዲየም በ ተካሄዱ ት ሁለት ግጥሚያ ዎች በ መጀመሪያ ...,data/train/wav/tr_3_tr01003.wav
3,ወሬው ን ወሬ ያደረጉ ምስጢረ ኞች ናቸው,data/train/wav/tr_4_tr01004.wav
4,ኢትዮጵያዊ ቷ በ ብሄራዊ ባህላዊ አለባበስ ከ አለም አንደኝነት ን ተቀዳጀ ች,data/train/wav/tr_5_tr01005.wav
...,...,...
10870,እንስራ ው ተ ሸነቆረ,data/train/wav/tr_10871_tr09145.wav
10871,ቤዛ ጐረ መሰ መሰለኝ ትእዛዝ አል ቀበል ም አለ,data/train/wav/tr_10872_tr09146.wav
10872,በለጠ ች የ በየነ የ በኩር ልጅ ነች,data/train/wav/tr_10873_tr09147.wav
10873,እንዲያ መሬት አይ ን ካ ኝ ይል የነበረ ሰው በ ድንገት ቆረቆዘ አይደል,data/train/wav/tr_10874_tr09148.wav


In [None]:
import json
import pathlib
import sys
import os
import glob
import pandas as pd

class DataLoader:
    """Helpers for loading audio files and their transcriptions from disk.

    Transcription files are expected to hold one utterance per line in the
    form ``<s> transcript </s> (utterance_id)``.
    """

    def __init__(self, data_dir, sample_rate=44100, max_duration=10.0,
                 max_samples=None, max_files=None, verbose=True):
        """Store the loader configuration.

        Args:
            data_dir (str): Directory containing the audio files. A trailing
                path separator is optional.
            sample_rate (int): Sampling rate handed to ``librosa.load``.
            max_duration (float): Kept on the instance; not read by any
                method of this class.
            max_samples: Kept on the instance; not read by any method here.
            max_files: Kept on the instance; not read by any method here.
            verbose (bool): Kept on the instance; not read by any method here.
        """
        self.data_dir = data_dir
        self.sample_rate = sample_rate
        self.max_duration = max_duration
        # Previously accepted but silently dropped; keep them so callers can
        # read back what they passed in.
        self.max_samples = max_samples
        self.max_files = max_files
        self.verbose = verbose

    def load_audios(self, mono: bool, no_of_audios: int = 100) -> tuple:
        """Load audio files from ``data_dir``.

        Args:
            mono (bool): whether to load the audio as mono or not
            no_of_audios (int): Maximum number of files to load

        Returns:
            tuple: ``(audio_data, max_duration)`` where ``audio_data`` maps the
            file name up to its first ``.`` to the sampled audio, and
            ``max_duration`` is the longest clip length in seconds (``0`` when
            nothing was loaded).

        Raises:
            Whatever ``os.listdir`` or ``librosa.load`` raise; errors are no
            longer converted into ``sys.exit(1)``, which hid the real cause.
        """
        audio_data = {}
        max_duration = 0
        # Slicing loads exactly `no_of_audios` files (the old `i > n` check
        # let one extra file through).
        for file in os.listdir(self.data_dir)[:max(no_of_audios, 0)]:
            sampled_audio, sample_rate = librosa.load(
                os.path.join(self.data_dir, file),
                sr=self.sample_rate, mono=mono)
            # The last axis is the sample axis for both mono (n,) and
            # multi-channel (channels, n) arrays; len() would return the
            # channel count in the latter case.
            n_samples = sampled_audio.shape[-1]
            max_duration = max(n_samples / sample_rate, max_duration)
            audio_data[file.split('.')[0]] = sampled_audio

        return audio_data, max_duration

    def get_wav_files(self) -> list:
        """List the ``.wav`` files directly inside ``data_dir``.

        Returns:
            list: Paths of the matching wav files (empty if there are none)
        """
        # os.path.join works with or without a trailing separator on data_dir;
        # plain string concatenation produced "<dir>*.wav" without one.
        return glob.glob(os.path.join(self.data_dir, '*.wav'))

    def load_transcription(self, file_path: str, dest_path: str, save=False,
                           audio_dir: str = 'wav/') -> tuple:
        """Parse a transcription file and look up each utterance's duration.

        Args:
            file_path (str): UTF-8 transcription file, one
                ``<s> text </s> (utterance_id)`` entry per line. Blank lines
                are skipped.
            dest_path (str): File that receives one JSON object per utterance
                (keys ``text``, ``key``, ``duration``) when ``save`` is true.
                Records are APPENDED, so repeated calls add duplicates.
            save (bool): Whether to write the JSON-lines file.
            audio_dir (str): Directory prefix for ``<utterance_id>.wav``. The
                resulting path is used as given, so a relative prefix is
                resolved against the current working directory.

        Returns:
            tuple: ``(audio_path, text, duration)`` - three parallel lists.

        Raises:
            IndexError: If a non-blank line has no ``</s>`` marker.
        """
        audio_path = []
        text = []
        duration = []
        records = []

        with open(file_path, encoding='utf-8') as fp:
            for line in fp:
                if not line.strip():
                    continue
                pieces = line.split("</s>")
                # Utterance id: the text after "</s>" with the parentheses,
                # newline and spaces removed.
                name = pieces[1]
                for junk in ('(', ')', '\n', ' '):
                    name = name.replace(junk, '')
                # Strip padding around the text, matching create_meta_data.
                trans = pieces[0].replace("<s>", "").strip()

                path = os.path.join(audio_dir, name + '.wav')
                # Read the duration once per file (it was computed twice).
                # NOTE(review): newer librosa releases call this keyword
                # `path`; `filename` is kept to match the installed version.
                seconds = librosa.get_duration(filename=path)

                audio_path.append(path)
                text.append(trans)
                duration.append(seconds)
                records.append(
                    {'text': trans, 'key': path, 'duration': seconds})

        if save and records:
            # Open the destination once rather than once per line.
            with open(dest_path, 'a', encoding='utf-8') as out:
                for record in records:
                    out.write(json.dumps(record, ensure_ascii=False))
                    out.write("\n")
        return audio_path, text, duration

    def generate_meta_data(self, path, dest_path):
        """Write a CSV with ``key``, ``text`` and ``duration`` columns.

        Args:
            path (str): Transcription file passed to ``load_transcription``.
            dest_path (str): Destination of the CSV (written without index).
        """
        audio_path, text, duration = self.load_transcription(
            path, dest_path)
        data = pd.DataFrame(
            {'key': audio_path, 'text': text, 'duration': duration})
        data.to_csv(dest_path, index=False)

    def read_csv(self, csv_file) -> pd.DataFrame:
        """Read a CSV file into a pandas DataFrame.

        Args:
            csv_file (str): Path of the CSV file

        Returns:
            pd.DataFrame: Data extracted from the CSV file
        """
        return pd.read_csv(csv_file)


In [None]:
# Directory holding the test-split wav files. The trailing slash keeps any
# path formed by appending a file name or glob pattern to data_dir valid.
# NOTE(review): the path is relative, so it depends on the working directory
# at the time this cell runs - confirm it resolves to the intended folder.
data = DataLoader('../data/AMHARIC/data/test/wav/')

In [None]:
# Parse the test transcripts and append one JSON record per utterance to
# valid_corpus.json. Both file names are relative to the working directory.
# The results are bound to names so the cell shows a count instead of echoing
# three full lists.
audio_paths, transcripts, durations = data.load_transcription(
    'trsTest.txt', dest_path='valid_corpus.json', save=True)
len(audio_paths)

In [None]:
# Sanity check: show the current working directory and what it contains.
!pwd
!ls

/content/drive/MyDrive/data/AMHARIC/data/test
spk2utt  text  trsTest.txt  utt2spk  valid_corpus.json	wav  wav.scp


In [None]:
cd test


/content/drive/MyDrive/data/AMHARIC/data/test


In [None]:
# Debugging aid: %tb prints the traceback of the most recent exception.
# The recorded output shows a SystemExit. TODO: remove this cell once the
# underlying failure is resolved.
%tb

SystemExit: ignored