In [None]:
import os
import re
import shutil
import subprocess
import pandas as pd
import numpy as np
import librosa
import soundfile as sf
from multiprocessing import Pool, cpu_count
from concurrent.futures import ThreadPoolExecutor
from joblib import Parallel, delayed

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Audio Preprocessing

In [None]:
input_dir = "/content/drive/MyDrive/NLP_DS/processed_audio"
output_dir = "/content/drive/MyDrive/NLP_DS/final_audio"


os.makedirs(output_dir, exist_ok=True)


TARGET_SR = 22050
TARGET_FORMAT = "PCM_16"


audio_files = [f for f in os.listdir(input_dir) if f.endswith(".wav")]


def normalize_audio(audio):
    """ Normalize audio to -1 to 1 range (min-max scaling). """
    return audio / np.max(np.abs(audio))


def process_audio(file_name):
    try:
        input_path = os.path.join(input_dir, file_name)
        output_path = os.path.join(output_dir, file_name)


        audio, sr = librosa.load(input_path, sr=None, mono=False)

        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio)

        if sr != TARGET_SR:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=TARGET_SR)

        # Normalize audio
        audio = normalize_audio(audio)

        # Save as 16-bit WAV
        sf.write(output_path, audio, TARGET_SR, subtype=TARGET_FORMAT)

        return f"Processed: {file_name}"

    except Exception as e:
        return f"Error processing {file_name}: {str(e)}"

if __name__ == "__main__":
    num_workers = min(cpu_count(), len(audio_files))
    print(f"Processing {len(audio_files)} audio files using {num_workers} cores...")

    with Pool(num_workers) as pool:
        results = pool.map(process_audio, audio_files)

    for res in results:
        print(res)

    print("Audio preprocessing complete! All files are ready for Tacotron 2.")


🔄 Processing 4538 audio files using 2 cores...
Processed: tweet_598.wav
Processed: tweet 3210.wav
Processed: tweet_982.wav
Processed: tweet 3388.wav
Processed: tweet_2397.wav
Processed: tweet_6191.wav
Processed: tweet_2256.wav
Processed: tweet_297.wav
Processed: tweet 3202.wav
Processed: tweet_5892.wav
Processed: tweet 2920.wav
Processed: tweet_6706.wav
Processed: tweet_886.wav
Processed: tweet_2385.wav
Processed: tweet 3396.wav
Processed: tweet_2511.wav
Processed: tweet_6159.wav
Processed: tweet_2349.wav
Processed: tweet_5989.wav
Processed: tweet_2191.wav
Processed: tweet_5727.wav
Processed: tweet_1614.wav
Processed: tweet 2763.wav
Processed: tweet_1830.wav
Processed: tweet 3018.wav
Processed: tweet 2706.wav
Processed: tweet_262.wav
Processed: tweet_6468.wav
Processed: tweet_2202.wav
Processed: tweet_6013.wav
Processed: tweet_2135.wav
Processed: tweet_1936.wav
Processed: tweet 3147.wav
Processed: tweet 3464.wav
Processed: tweet_1305.wav
Processed: tweet 2742.wav
Processed: tweet_2280.

## Merged audio and transcript

In [None]:
audio_folder = "/content/drive/MyDrive/NLP_DS/final_audio"
transcript_folder = "/content/drive/MyDrive/NLP_DS/tweet_transcript"
output_csv = "/content/drive/MyDrive/NLP_DS/merge.csv"


def extract_number(filename):
    match = re.search(r'\d+', filename)
    return match.group() if match else None

audio_files = os.listdir(audio_folder)
transcript_files = os.listdir(transcript_folder)

audio_map = {}
for audio in audio_files:
    num = extract_number(audio)
    if num:
        if num in audio_map:
            audio_map[num].append(audio)
        else:
            audio_map[num] = [audio]

transcript_map = {extract_number(transcript): transcript for transcript in transcript_files if extract_number(transcript)}

mapped_data = []
for num, audios in audio_map.items():
    if num in transcript_map:
        transcript_file = transcript_map[num]
        for i, audio in enumerate(audios):
            if i > 0:
                new_transcript = f"{num}_copy{i}.txt"
                shutil.copy(os.path.join(transcript_folder, transcript_file), os.path.join(transcript_folder, new_transcript))
                mapped_data.append([num, audio, new_transcript])
            else:
                mapped_data.append([num, audio, transcript_file])
    else:
        mapped_data.append([num, audios[0], None])

df = pd.DataFrame(mapped_data, columns=["Unique_ID", "Audio_File", "Transcript_File"])

df.to_csv(output_csv, index=False)
print("Audio and transcript mapping completed and saved to CSV!")


Audio and transcript mapping completed and saved to CSV!


In [None]:
df=pd.read_csv("/content/drive/MyDrive/NLP_DS/merge.csv")

In [None]:
df["Audio_Path"] = "/content/drive/MyDrive/NLP_DS/final_audio/" + df["Audio_File"]
df["Transcript_Path"] = "/content/drive/MyDrive/NLP_DS/tweet_transcript/" + df["Transcript_File"]


In [None]:
df.head()

Unnamed: 0,Unique_ID,Audio_File,Transcript_File,Audio_Path,Transcript_Path
0,5945,tweet_5945.wav,tweet5945.txt,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...
1,598,tweet_598.wav,tweet598.txt,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...
2,161,tweet_161.wav,tweet161.txt,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...
3,3210,tweet 3210.wav,tweet3210.txt,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...
4,982,tweet_982.wav,tweet982.txt,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...


In [None]:
df.shape

(4538, 5)

In [None]:
df=df.drop(columns=["Audio_File","Transcript_File"])

## Preprocessing Transcript

In [None]:
def read_transcript(path):
    if os.path.exists(path):
        try:
            with open(path, "r", encoding="utf-8") as file:
                return file.read().strip()
        except Exception as e:
            return f"Error: {str(e)}"
    return "File Not Found"

num_cores = os.cpu_count()
df["transcript"] = Parallel(n_jobs=num_cores)(
    delayed(read_transcript)(path) for path in df["Transcript_Path"]
)

print(df.head())


   Unique_ID                                         Audio_Path  \
0       5945  /content/drive/MyDrive/NLP_DS/final_audio/twee...   
1        598  /content/drive/MyDrive/NLP_DS/final_audio/twee...   
2        161  /content/drive/MyDrive/NLP_DS/final_audio/twee...   
3       3210  /content/drive/MyDrive/NLP_DS/final_audio/twee...   
4        982  /content/drive/MyDrive/NLP_DS/final_audio/twee...   

                                     Transcript_Path  \
0  /content/drive/MyDrive/NLP_DS/tweet_transcript...   
1  /content/drive/MyDrive/NLP_DS/tweet_transcript...   
2  /content/drive/MyDrive/NLP_DS/tweet_transcript...   
3  /content/drive/MyDrive/NLP_DS/tweet_transcript...   
4  /content/drive/MyDrive/NLP_DS/tweet_transcript...   

                                          transcript  
0  <s> chakula chetu   a book on indigenous recip...  
1  <s> polisi amuua mpenziwe  afisa wa polisi amu...  
2  <s> jamaa ni mtafutaji halisi  weeeeeee kumbe ...  
3  <s> vodka na bei yake imekubali  ama 

In [None]:
df.head()

Unnamed: 0,Unique_ID,Audio_Path,Transcript_Path,transcript
0,5945,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...,<s> chakula chetu a book on indigenous recip...
1,598,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...,<s> polisi amuua mpenziwe afisa wa polisi amu...
2,161,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...,<s> jamaa ni mtafutaji halisi weeeeeee kumbe ...
3,3210,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...,<s> vodka na bei yake imekubali ama aje wadau...
4,982,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...,<s> mauti ya familia mama na wanawe watatu wa...


In [None]:
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"<s>", "", text)  # Remove <s> markers
    text = re.sub(r"\(tweet_\d+\)", "", text)  # Remove tweet IDs
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation except spaces
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text
df["transcript"] = df["transcript"].apply(clean_text)



In [None]:
df.head()

Unnamed: 0,Unique_ID,Audio_Path,Transcript_Path,transcript
0,5945,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...,chakula chetu a book on indigenous recipes fro...
1,598,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...,polisi amuua mpenziwe afisa wa polisi amuua mp...
2,161,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...,jamaa ni mtafutaji halisi weeeeeee kumbe dolph...
3,3210,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...,vodka na bei yake imekubali ama aje wadau s
4,982,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...,mauti ya familia mama na wanawe watatu wafarik...


In [None]:
df.to_csv("/content/drive/MyDrive/NLP_DS/tts_dataset.csv")

In [None]:
df.head()

Unnamed: 0,Unique_ID,Audio_Path,Transcript_Path,transcript
0,5945,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...,chakula chetu a book on indigenous recipes fro...
1,598,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...,polisi amuua mpenziwe afisa wa polisi amuua mp...
2,161,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...,jamaa ni mtafutaji halisi weeeeeee kumbe dolph...
3,3210,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...,vodka na bei yake imekubali ama aje wadau s
4,982,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...,mauti ya familia mama na wanawe watatu wafarik...


# Feature engineering

## Convert transcript to phonemes

In [None]:
phoneme_file = "/content/drive/MyDrive/NLP_DS/kencorpus.phone"
with open(phoneme_file, "r") as f:
    phoneme_list = [line.strip() for line in f.readlines()]

swahili_phoneme_map = {
    'a': 'AA', 'b': 'B', 'ch': 'CH', 'd': 'D', 'dh': 'DH', 'e': 'EH', 'ee': 'EE',
    'f': 'F', 'g': 'G', 'gh': 'GH', 'h': 'HH', 'i': 'IH', 'ii': 'II', 'j': 'JH',
    'k': 'K', 'kh': 'KH', 'l': 'L', 'm': 'M', 'mb': 'MB', 'n': 'N', 'nd': 'ND',
    'nj': 'NJ', 'ng': 'NG', "ng'": "NG'", 'ny': 'NY', 'nz': 'NZ', 'o': 'OH',
    'oo': 'OO', 'p': 'P', 'r': 'R', 's': 'S', 'sh': 'SH', 't': 'T', 'th': 'TH',
    'u': 'UH', 'uu': 'UU', 'v': 'V', 'w': 'W', 'y': 'Y', 'z': 'Z'
}

def swahili_to_phoneme(word):
    phonemes = []
    i = 0
    while i < len(word):
        if i + 1 < len(word) and word[i:i+2] in swahili_phoneme_map:
            phonemes.append(swahili_phoneme_map[word[i:i+2]])
            i += 2
        elif word[i] in swahili_phoneme_map:
            phonemes.append(swahili_phoneme_map[word[i]])
            i += 1
        else:
            phonemes.append(word[i])  # Keep unknown letters as they are
            i += 1
    return ' '.join(phonemes)

file_path = "/content/drive/MyDrive/NLP_DS/tts_dataset.csv"
df = pd.read_csv(file_path)
if 'transcript' not in df.columns:
    raise ValueError("The dataset does not contain a 'transcript' column. Please check the file.")

df['phonemes'] = df['transcript'].astype(str).apply(swahili_to_phoneme)

df.to_csv(file_path, index=False)

## convert phonemes to phonemes_id

In [None]:
phoneme_file = "/content/drive/MyDrive/NLP_DS/kencorpus.phone"
with open(phoneme_file, "r") as f:
    phoneme_list = [line.strip() for line in f if line.strip()]

In [None]:
phoneme_to_id = {phoneme: idx for idx, phoneme in enumerate(phoneme_list)}

In [None]:
def convert_to_ids(phoneme_str):
    return [phoneme_to_id[ph] for ph in phoneme_str.split() if ph in phoneme_to_id]
df['phoneme_ids'] = df['phonemes'].apply(convert_to_ids)

## Final preprocessed dataframe

In [None]:
df=pd.read_csv("/content/drive/MyDrive/NLP_DS/tts_dataset.csv")

In [None]:
df=df.drop(columns=["mel_spec"])

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,Unique_ID,Audio_Path,Transcript_Path,transcript,phonemes,phoneme_ids
0,0,5945,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...,chakula chetu a book on indigenous recipes fro...,CH AA K UH L AA CH EH T UH AA B OO K O...,"[3, 0, 15, 37, 17, 0, 3, 7, 34, 37, 0, 2, 27, ..."
1,1,598,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...,polisi amuua mpenziwe afisa wa polisi amuua mp...,P OH L IH S IH AA M UU AA M P EH NZ IH W E...,"[29, 28, 17, 13, 31, 13, 0, 18, 36, 0, 18, 29,..."
2,2,161,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...,jamaa ni mtafutaji halisi weeeeeee kumbe dolph...,JH AA M AA AA N IH M T AA F UH T AA JH IH ...,"[14, 0, 18, 0, 0, 20, 13, 18, 34, 0, 8, 37, 34..."
3,3,3210,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...,vodka na bei yake imekubali ama aje wadau s,V OH D K AA N AA B EH IH Y AA K EH IH ...,"[38, 28, 4, 15, 0, 20, 0, 2, 7, 13, 40, 0, 15,..."
4,4,982,/content/drive/MyDrive/NLP_DS/final_audio/twee...,/content/drive/MyDrive/NLP_DS/tweet_transcript...,mauti ya familia mama na wanawe watatu wafarik...,M AA UH T IH Y AA F AA M IH L IH AA M AA...,"[18, 0, 37, 34, 13, 40, 0, 8, 0, 18, 13, 17, 1..."


## Converting Audio to Mel Spectorgram and saved as npy

In [None]:
def extract_and_save(audio_path, output_dir, sr=22050, n_mels=80, hop_length=256, win_length=1024):

    audio, sr = librosa.load(audio_path, sr=sr)

    mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels,hop_length=hop_length, win_length=win_length)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    base_name = os.path.basename(audio_path)
    file_name = os.path.splitext(base_name)[0] + "_mel.npy"
    file_path = os.path.join(output_dir, file_name)

    np.save(file_path, mel_spec_db)
    print(f'Done for {audio_path}')
    return file_path

output_dir = "/content/drive/MyDrive/NLP_DS/mel_spec"
os.makedirs(output_dir, exist_ok=True)

df['mel_spec_path'] = Parallel(n_jobs=-1)(
    delayed(extract_and_save)(path, output_dir) for path in df['Audio_Path']
)

df.to_csv('/content/drive/MyDrive/NLP_DS/tts_dataset.csv', index=False)
print("DataFrame saved to CSV with mel spectrogram paths.")


DataFrame saved to CSV with mel spectrogram paths.


In [None]:
df.to_pickle("full_data.pkl")
from IPython.display import FileLink
FileLink("full_data.pkl")
