# Data Wrangling

## Transform all to into common format
## csv with columns path, sentence

In [84]:
import pandas as pd

df = pd.read_csv("data/commonvoice/train.csv")

In [85]:
df.head()

Unnamed: 0.1,Unnamed: 0,path,sentence
0,0,/home/sampo/.cache/huggingface/datasets/downlo...,Mitä nyt tekisimme?
1,1,/home/sampo/.cache/huggingface/datasets/downlo...,Äänestämme tämän vuoksi toisin kuin maataloude...
2,2,/home/sampo/.cache/huggingface/datasets/downlo...,"Rupeatko remmiin, vai et?"
3,3,/home/sampo/.cache/huggingface/datasets/downlo...,Äänestin näin ollen mietinnön puolesta.
4,4,/home/sampo/.cache/huggingface/datasets/downlo...,"Kiitos, että tulitte ja opetitte meille viisau..."


In [69]:

def add_prefix(sent):
    return "data/eduskunnanpuheet/"+sent

df.path = df.path.apply(add_prefix)

In [19]:
df.to_csv("train.csv",encoding="utf-8")

In [27]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,path,sentence
0,data/eduskunnanpuheet/session_2_SEG_01.wav,edustaja timo soini ja kolmekymmentäkuusi muut...
1,data/eduskunnanpuheet/session_2_SEG_02.wav,arvoisa herra puhemies eurokriisi alkoi kreika...
2,data/eduskunnanpuheet/session_2_SEG_03.wav,brysselissä tilastovääristelystä tiedettiin jo...
3,data/eduskunnanpuheet/session_2_SEG_04.wav,kreikan lainottaminen herätti keskustelua euro...
4,data/eduskunnanpuheet/session_2_SEG_05.wav,pankkien pelastaminen kävi kalliiksi niinpä he...


In [32]:
a = "I."

a == 'II.'

False

In [58]:
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\'\...\…\–\é\\\I.\\\II.\\\III.\\\IV.\\\V.\\\VI.\\\VII.\\\VIII.\\\IX.\\\X.]'
import re

def regex_fix(sent):
    sent = re.sub(chars_to_ignore_regex, '', sent).lower() + " "
    if sent[0] == ' ':
        sent = sent[1:]
    return sent

def remove_special_characters(sent):
    sent = re.sub(chars_to_ignore_regex, '', sent).lower() + " "
    return sent
regex_fix(df.iloc[0]['sentence'])

'siihen aikaan kun isä lampun osti sanoi hän äidille näinikään äiti hoi kuulehan '

In [86]:
df.to_csv("data/commonvoice/train.csv", encoding="utf-8", index=False, columns=['path', 'sentence'])

In [77]:
pd.read_csv("data/commonvoice/train.csv")

Unnamed: 0,path,sentence
0,/home/sampo/.cache/huggingface/datasets/downlo...,Mitä nyt tekisimme?
1,/home/sampo/.cache/huggingface/datasets/downlo...,Äänestämme tämän vuoksi toisin kuin maataloude...
2,/home/sampo/.cache/huggingface/datasets/downlo...,"Rupeatko remmiin, vai et?"
3,/home/sampo/.cache/huggingface/datasets/downlo...,Äänestin näin ollen mietinnön puolesta.
4,/home/sampo/.cache/huggingface/datasets/downlo...,"Kiitos, että tulitte ja opetitte meille viisau..."
...,...,...
1019,/home/sampo/.cache/huggingface/datasets/downlo...,"Ah, oma nimikkohytti."
1020,/home/sampo/.cache/huggingface/datasets/downlo...,Pääaukiolla oli käynnissä markkinat.
1021,/home/sampo/.cache/huggingface/datasets/downlo...,John peitti kädellä silmiään.
1022,/home/sampo/.cache/huggingface/datasets/downlo...,Annoin miehen päästä karkuun.


In [11]:
import os
import pandas as pd
#remember to change
base = "data/speechcollector"
subdirs = ["recordings","youtube"]

def get_filenames(dir):
    print(f"return files from dir: {dir}")
    return [file for file in os.listdir(dir) if os.path.isfile(os.path.join(dir, file))]
    
def audiopath_from_textfile(filename, subdir):
    name = filename.split('.')[0]
    return os.path.join(subdir, "audio/",name+".mp3")

#filenames = get_filenames("data/speechcollector/recordings/transcripts")


def csvfromfiles(dir):
    """
    input subdir (recordings or youtube)
    and return pandas dataframe with columns path, transcript
    """
    
    data = []
    transcript_dir = os.path.join(dir, "transcripts")
    filenames = get_filenames(transcript_dir)
    for file in filenames:
        
        row = [0, 0]
        filepath = os.path.join(transcript_dir, file)
        with open(filepath, 'r') as f:
            text = f.read()
        audiopath = audiopath_from_textfile(file, dir)
        row[0] = audiopath
        row[1] = text
        data.append(row)
    return pd.DataFrame(data, columns = ['path', 'sentence']) 
        
def create_csv(base, subdirs):
    dfs = []
    for subdir in subdirs:
        dir = os.path.join(base, subdir)
        dfs.append(csvfromfiles(dir))
    df = pd.concat(dfs)
    
    csv_path = os.path.join(base,"train.csv")
    print(f"csv file saved as {csv_path}")
    df.to_csv(csv_path, encoding="utf-8", index=False, columns=['path', 'sentence'])
    return df
    
audiopath_from_textfile(filenames[0], os.path.join(base, "recordings"))

#df = csvfromfiles(os.path.join(base, "recordings"))
df = create_csv(base, subdirs)
print(df.shape)
df.head()

return files from dir: data/speechcollector/recordings/transcripts
return files from dir: data/speechcollector/youtube/transcripts
(609, 2)


Unnamed: 0,path,sentence
0,data/speechcollector/youtube/audio/65b0abe4-bc...,kuten elämä itsessään myös mielenterveytemme m...
1,data/speechcollector/youtube/audio/ee7d7ae3-8e...,kiitos tota taustasta sen verran
2,data/speechcollector/youtube/audio/0acc691f-45...,joka kuoressa on säihkyvä helmi niin hieno ett...
3,data/speechcollector/youtube/audio/66f8140a-90...,kirja esimerksiks jääny kotiin tai läksyt on t...
4,data/speechcollector/youtube/audio/3b56b95b-e6...,oliks teil himassa silleen musiikkiperhe vai t...


In [35]:
import torchaudio
#id, normalized_text
base = "data/fi"
files = ["asr_dev.tsv", "asr_test.tsv", "asr_train.tsv"]
df = pd.read_csv(os.path.join(base, files[0]), sep='\t')
#print(df.head())
path = os.path.join(base,df.id[0][:4], df.id[0])+".ogg"
#print(path)
torchaudio.load(path)

dfs = []
for file in files:
    data = []
    df_path = os.path.join(base,file)
    df = pd.read_csv(df_path, sep='\t')
    for index, row in df.iterrows():
        r = [0,0]
        audiopath = os.path.join(base,row['id'][:4], row['id'])+".ogg"
        text = row['normalized_text']
        r[0] = audiopath
        r[1] = text
        data.append(r)
    dfs.append(pd.DataFrame(data, columns = ['path', 'sentence'])) 
    
csv_df = pd.concat(dfs)
csv_df.to_csv(os.path.join(base,"train.csv"), encoding="utf-8", index=False, columns=['path', 'sentence'])
    

In [37]:
path = os.path.join(base,"train.csv")
dd = pd.read_csv(path)
dd.shape

(9074, 2)