# EDA

## Importing Necessary Libraries for EDA

In [None]:
import numpy as np
import pandas as pd

import librosa
import librosa.display

import IPython.display as ipd # for displaying the play audio buttom

import warnings
warnings.filterwarnings('ignore')

## Reading the .csv files

In [None]:
# Load the train / validation metadata tables and the sample submission
# from the '../input/dlsprint' directory.
train_df = pd.read_csv('../input/dlsprint/train.csv')
val_df = pd.read_csv('../input/dlsprint/validation.csv')
sample_submissin_df = pd.read_csv('../input/dlsprint/sample_submission.csv')

## Take a look at the .csv files

In [None]:
train_df.head(n = 3)

In [None]:
train_df.info()

In [None]:
train_df.isna().sum()

In [None]:
train_df['down_votes'].unique()

In [None]:
train_df['up_votes'].unique()

##### The higher the upvotes, the more accurate the text labels

Samples with more downvotes than upvotes are considered incorrect.

## Let's check who labeled them

In [None]:
train_df['locale'].unique()

So all the audio files are labeled by native Bengalis.

##### The columns 'age', 'gender', 'accents' and 'locale' are removed because they do not carry important information for this project.

In [None]:
# Speaker-metadata columns that are not needed downstream.
speaker_cols = ['age', 'gender', 'accents', 'locale']

# The training table keeps client_id / up_votes / down_votes for now;
# the validation table loses them here as well.
train_df = train_df.drop(columns=speaker_cols)
val_df = val_df.drop(columns=speaker_cols + ['client_id', 'up_votes', 'down_votes'])
# val_df.to_csv('new_validation.csv')

In [None]:
val_df

It is assumed that a label is correct if its upvotes exceed its downvotes.

# preprocessing

In [None]:
# Load a single training clip. librosa.load() takes the file path and
# returns the samples as a numpy array plus the sample rate of that array.
AUDIO_PATH = '../input/dlsprint/train_files/common_voice_bn_30614352.mp3'
audio, sr = librosa.load(AUDIO_PATH)

print('Shape of the audio: ', audio.shape)
print('Sample rate of the audio: ', sr)

# Render an inline audio player for the clip.
ipd.display(ipd.Audio(data=audio, rate=sr))

# Show the transcript stored for the same file name.
clip_rows = train_df[train_df['path'] == 'common_voice_bn_30614352.mp3']
print('Audio Label :', clip_rows.sentence)

## All the audio files are in .mp3 format; we need to convert them into .wav format for faster processing.

In [None]:
import os
import skimage.io

import time

from tqdm.notebook import tqdm
tqdm.pandas()

import shutil

from pydub import AudioSegment

from joblib import Parallel, delayed

import io
import soundfile as sf

In [None]:
# Source directories holding the original audio files of each split.
TRAIN_PATH = "../input/dlsprint/train_files"
TEST_PATH = '../input/dlsprint/test_files'
VALIDATION_PATH = '../input/dlsprint/validation_files'

In [None]:
# data filtering
def filter_votes(x):
    """Return the up-vote count of a row that passes the vote filter.

    Args:
        x: Mapping-like row exposing "up_votes" and "down_votes".

    Returns:
        The row's "up_votes" value when up-votes strictly exceed down-votes
        and are non-zero; otherwise None.
    """
    ups, downs = x["up_votes"], x["down_votes"]
    rejected = ups - downs <= 0 or ups == 0
    return None if rejected else ups

In [None]:
print("Total Data before filtering:", len(train_df))

# Apply filter_votes row by row (with a progress bar); rows for which it
# returns None end up with a missing up_votes value and are dropped.
train_df["up_votes"] = train_df.progress_apply(filter_votes, axis=1)
train_df.dropna(subset=['up_votes'], inplace=True)

print("Total Data after filtering:", len(train_df))

# File names of the clips that survived the filter.
train_audio_files = train_df["path"].tolist()

In [None]:
# The vote columns have served their purpose in the filtering step above.
train_df = train_df.drop(columns=['client_id', 'up_votes', 'down_votes'])

In [None]:
pd.DataFrame(train_df.sentence)

We have to remove the special characters such as ,.?!;:।

In [None]:
# Strip every non-alphanumeric character from the column *names*.
# regex=True is passed explicitly: the default of str.replace is no longer
# regex matching in recent pandas, in which case the pattern would be taken
# as a literal string and match nothing.
# NOTE(review): this touches column names only, not the text in the
# 'sentence' column -- confirm whether the sentences themselves were meant
# to be cleaned here.
train_df.columns = train_df.columns.str.replace(r'[^a-zA-Z0-9]', '', regex=True)

In [None]:
train_df.sentence

In [None]:
# (Cell intentionally left without code.) The combined vocabulary needs
# vocab_train and vocab_test, which are only computed in later cells, so
# building vocab_list here raised a NameError on a top-to-bottom run.
# vocab_list is built further down, once both inputs exist.

In [None]:
def extract_all_chars(batch):
    """Collect the distinct characters used in a batch of sentences.

    Args:
        batch: Mapping-like object whose "sentence" entry is an iterable of
            transcripts (for example a DataFrame or a dict of lists).

    Returns:
        A dict with two single-element lists:
        "vocab" holds the sorted list of distinct characters, and
        "all_text" holds every sentence joined by single spaces.
    """
    # Skip non-string entries (e.g. missing values), which str.join rejects.
    sentences = (text for text in batch["sentence"] if isinstance(text, str))
    all_text = " ".join(sentences)
    # Sorted so the result does not depend on set iteration order.
    vocab = sorted(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}

In [None]:
# Characters used by the training transcripts.
vocab_train = extract_all_chars(train_df)

In [None]:
# Characters used by the validation transcripts (note: built from val_df,
# despite the "test" in the variable name).
vocab_test = extract_all_chars(val_df)

In [None]:
# Union of the characters seen in both splits. Sorted so that the position
# of each character -- later used as its token id -- is the same on every
# run instead of depending on set iteration order.
vocab_list = sorted(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))

In [None]:
vocab_list

In [None]:
# Punctuation / symbol characters to exclude from the vocabulary.
to_remove = ['!','/','॥','।','.',';','‘','…','‚',':','-','"','—']

In [None]:
# Keep only the characters that are not listed in to_remove. Filtering is
# used instead of list.remove() so a symbol that never occurs in the data
# does not raise ValueError.
vocab_list = [char for char in vocab_list if char not in to_remove]

In [None]:
vocab_list

In [None]:
# Map each character to its position in vocab_list.
vocab_dict = {char: idx for idx, char in enumerate(vocab_list)}
vocab_dict

In [None]:
# The tokenizer below is created with word_delimiter_token="|", so the id of
# the space character has to be stored under "|"; otherwise the delimiter is
# missing from the vocabulary. Skipped if "|" is already present, to avoid
# leaving a gap in the id range.
if " " in vocab_dict and "|" not in vocab_dict:
    vocab_dict["|"] = vocab_dict.pop(" ")

# Append the unknown and padding tokens with the next free ids.
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

In [None]:
 import json

 # Serialise the character -> id mapping and write it to 'vocab.json'.
 with open('vocab.json', 'w') as fh:
     fh.write(json.dumps(vocab_dict))

In a final step, we use the json file to instantiate an object of the Wav2Vec2CTCTokenizer class.

In [None]:
 from transformers import Wav2Vec2CTCTokenizer

 # Load the vocabulary from 'vocab.json', the file written by the json.dump
 # cell of this notebook (the previous path pointed at a file that this
 # notebook never creates).
 tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [None]:
# Save the filtered training table for later stages.
# NOTE(review): to_csv also writes the DataFrame index as an extra first
# column unless index=False is passed -- confirm that this is wanted.
train_df.to_csv('new_train.csv')

## Creating a list of test files


In [None]:
train_df

In [None]:
val_df

In [None]:
# Names of the regular files directly inside TEST_PATH
# (anything that is not a file, e.g. a sub-directory, is skipped).
test_audio_files = [
    entry
    for entry in os.listdir(TEST_PATH)
    if os.path.isfile(os.path.join(TEST_PATH, entry))
]
len(test_audio_files)

## Creating a list of validation files

In [None]:
# Names of the regular files directly inside VALIDATION_PATH
# (anything that is not a file, e.g. a sub-directory, is skipped).
val_audio_files = [
    entry
    for entry in os.listdir(VALIDATION_PATH)
    if os.path.isfile(os.path.join(VALIDATION_PATH, entry))
]
len(val_audio_files)

In [None]:
"""
new_dir = ['new_train','new_validation','new_test']
for folder in new_dir:
    if not os.path.exists(folder):
        os.makedirs(folder)

        """

In [None]:
# Output directories that receive the converted .wav files of each split.
New_Train_Dir = './new_train'
NEW_Validation_Dir = './new_validation'
New_Test_Dir = './new_test'

## Converting the train .mp3 files into .wav and saving it inside a new directory

In [None]:
def save_fn(filename):
    """Convert one training .mp3 into a 16 kHz .wav inside ``New_Train_Dir``.

    Args:
        filename: Name (not full path) of an audio file inside ``TRAIN_PATH``.

    Returns:
        None. A missing source file is skipped silently. A file that fails
        to decode or export is reported on stdout (path and error) and
        skipped, so one bad file does not abort a batch run.
    """
    path = f"{TRAIN_PATH}/{filename}"
    save_path = f"{New_Train_Dir}"
    # exist_ok already covers the "directory is there" case, including when
    # several worker processes try to create it at the same time.
    os.makedirs(save_path, exist_ok=True)

    if not os.path.exists(path):
        return

    # splitext removes whatever extension is present instead of assuming
    # that it is exactly four characters long.
    stem = os.path.splitext(filename)[0]
    try:
        sound = AudioSegment.from_mp3(path)
        sound = sound.set_frame_rate(16000)
        sound.export(f"{save_path}/{stem}.wav", format="wav")
    except Exception as err:
        # Deliberately not a bare except: KeyboardInterrupt and SystemExit
        # must still be able to stop the run.
        print(path, err)

In [None]:
import time

# Convert every filtered training clip using 8 worker processes and report
# the wall-clock time of the whole run.
start = time.time()

train_jobs = (delayed(save_fn)(filename) for filename in tqdm(train_audio_files))
Parallel(n_jobs=8, backend="multiprocessing")(train_jobs)

end = time.time()
print("total time to process: {x} seconds".format(x=end-start))

## Converting the validation .mp3 files into .wav and saving it inside a new directory

In [None]:
def val_save_fn(filename):
    """Convert one validation .mp3 into a 16 kHz .wav inside ``NEW_Validation_Dir``.

    Args:
        filename: Name (not full path) of an audio file inside
            ``VALIDATION_PATH``.

    Returns:
        None. A missing source file is skipped silently. A file that fails
        to decode or export is reported on stdout (path and error) and
        skipped, so one bad file does not abort a batch run.
    """
    path = f"{VALIDATION_PATH}/{filename}"
    save_path = f"{NEW_Validation_Dir}"
    # exist_ok already covers the "directory is there" case, including when
    # several worker processes try to create it at the same time.
    os.makedirs(save_path, exist_ok=True)

    if not os.path.exists(path):
        return

    # splitext removes whatever extension is present instead of assuming
    # that it is exactly four characters long.
    stem = os.path.splitext(filename)[0]
    try:
        sound = AudioSegment.from_mp3(path)
        sound = sound.set_frame_rate(16000)
        sound.export(f"{save_path}/{stem}.wav", format="wav")
    except Exception as err:
        # Deliberately not a bare except: KeyboardInterrupt and SystemExit
        # must still be able to stop the run.
        print(path, err)

In [None]:
# Convert every validation clip using 8 worker processes and report the
# wall-clock time of the whole run.
start = time.time()

val_jobs = (delayed(val_save_fn)(filename) for filename in tqdm(val_audio_files))
Parallel(n_jobs=8, backend="multiprocessing")(val_jobs)

end = time.time()
print("total time to process: {x} seconds".format(x=end-start))

## Converting the test .mp3 files into .wav and saving it inside a new directory

In [None]:
def test_save_fn(filename):
    """Convert one test .mp3 into a 16 kHz .wav inside ``New_Test_Dir``.

    Args:
        filename: Name (not full path) of an audio file inside ``TEST_PATH``.

    Returns:
        None. A missing source file is skipped silently. A file that fails
        to decode or export is reported on stdout (path and error) and
        skipped, so one bad file does not abort a batch run.
    """
    path = f"{TEST_PATH}/{filename}"
    save_path = f"{New_Test_Dir}"
    # exist_ok already covers the "directory is there" case, including when
    # several worker processes try to create it at the same time.
    os.makedirs(save_path, exist_ok=True)

    if not os.path.exists(path):
        return

    # splitext removes whatever extension is present instead of assuming
    # that it is exactly four characters long.
    stem = os.path.splitext(filename)[0]
    try:
        sound = AudioSegment.from_mp3(path)
        sound = sound.set_frame_rate(16000)
        sound.export(f"{save_path}/{stem}.wav", format="wav")
    except Exception as err:
        # Deliberately not a bare except: KeyboardInterrupt and SystemExit
        # must still be able to stop the run.
        print(path, err)

In [None]:
# Convert every test clip using 8 worker processes and report the
# wall-clock time of the whole run.
start = time.time()

test_jobs = (delayed(test_save_fn)(filename) for filename in tqdm(test_audio_files))
Parallel(n_jobs=8, backend="multiprocessing")(test_jobs)

end = time.time()
print("total time to process: {x} seconds".format(x=end-start))