# Import libraries

In [22]:
# Mount Google Drive (Colab only); the CSV paths used in this notebook live under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# data handling (third-party)
# NOTE(review): numpy is not referenced in the cells visible here - confirm it is still needed
import numpy as np
import pandas as pd

# for audio
# NOTE(review): librosa and Audio are not referenced in the cells visible here - confirm they are still needed
import librosa
from IPython.display import Audio

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Read the transcription csv

In [23]:
# Load the reference (ground-truth) transcriptions and preview the first rows.
transcription_path = '/content/drive/MyDrive/266/Data/Clean_Data/Arctic/transcription.csv'
transcription = pd.read_csv(transcription_path)

transcription.head()

Unnamed: 0,filename,transcription
0,a0001,"Author of the danger trail, Philip Steels, etc."
1,a0002,"Not at this particular case, Tom, apologized W..."
2,a0003,For the twentieth time that evening the two me...
3,a0004,"Lord, but I'm glad to see you again, Phil."
4,a0005,Will we ever forget it.


In [24]:
# Preview the last rows of the reference transcriptions.
transcription.tail()

Unnamed: 0,filename,transcription
1127,b0535,He read his fragments aloud.
1128,b0536,Typhoid -- did I tell you.
1129,b0537,But she had become an automaton.
1130,b0538,"At the best, they were necessary accessories."
1131,b0539,"You were making them talk shop, Ruth charged him."


# Clean bdl csv

In [25]:
# Load the automatic transcriptions for speaker bdl and preview the first rows.
bdl_auto_path = '/content/drive/MyDrive/266/Data/Clean_Data/Arctic/auto_transcription/bdl.csv'
bdl = pd.read_csv(bdl_auto_path)

bdl.head()

Unnamed: 0,filename,auto_transcription
0,arctic_a0004.wav,LORD BUT I'M GLAD TO SEE YOU AGAIN PHIL
1,arctic_a0050.wav,IN SPITE OF THEIR ABSURDITY THE WORDS AFFECTED...
2,arctic_a0102.wav,HE WILL FOLLOW US SOON
3,arctic_a0072.wav,BUT WHO WAS ILEN'S DOUBLE
4,arctic_a0124.wav,IT WAS JEAN SINGING SOFTLY OVER BEYOND THE ROCKS


In [26]:
# Every row gets the same constant emotion value.
bdl = bdl.assign(emotion='neutral')

bdl.head()

Unnamed: 0,filename,auto_transcription,emotion
0,arctic_a0004.wav,LORD BUT I'M GLAD TO SEE YOU AGAIN PHIL,neutral
1,arctic_a0050.wav,IN SPITE OF THEIR ABSURDITY THE WORDS AFFECTED...,neutral
2,arctic_a0102.wav,HE WILL FOLLOW US SOON,neutral
3,arctic_a0072.wav,BUT WHO WAS ILEN'S DOUBLE,neutral
4,arctic_a0124.wav,IT WAS JEAN SINGING SOFTLY OVER BEYOND THE ROCKS,neutral


### Add actor and gender attributes to the bdl dataframe

In [27]:
# Constant speaker metadata: every row in this frame gets the same actor and gender.
bdl = bdl.assign(actor='bdl', gender='male')

bdl.head()

Unnamed: 0,filename,auto_transcription,emotion,actor,gender
0,arctic_a0004.wav,LORD BUT I'M GLAD TO SEE YOU AGAIN PHIL,neutral,bdl,male
1,arctic_a0050.wav,IN SPITE OF THEIR ABSURDITY THE WORDS AFFECTED...,neutral,bdl,male
2,arctic_a0102.wav,HE WILL FOLLOW US SOON,neutral,bdl,male
3,arctic_a0072.wav,BUT WHO WAS ILEN'S DOUBLE,neutral,bdl,male
4,arctic_a0124.wav,IT WAS JEAN SINGING SOFTLY OVER BEYOND THE ROCKS,neutral,bdl,male


## Get the cleaned filenames

In [28]:
# Reduce each filename to its utterance id: keep the text before the first '.',
# then the piece after the last '_' (e.g. 'arctic_a0004.wav' -> 'a0004').
clean_filenames = [
  wav_name.split('.')[0].split('_')[-1]
  for wav_name in bdl['filename']
]

assert len(clean_filenames) == len(bdl)

clean_filenames[:5]

['a0004', 'a0050', 'a0102', 'a0072', 'a0124']

In [29]:
# Store the utterance ids next to the original filenames.
bdl = bdl.assign(clean_filename=clean_filenames)

bdl.head()

Unnamed: 0,filename,auto_transcription,emotion,actor,gender,clean_filename
0,arctic_a0004.wav,LORD BUT I'M GLAD TO SEE YOU AGAIN PHIL,neutral,bdl,male,a0004
1,arctic_a0050.wav,IN SPITE OF THEIR ABSURDITY THE WORDS AFFECTED...,neutral,bdl,male,a0050
2,arctic_a0102.wav,HE WILL FOLLOW US SOON,neutral,bdl,male,a0102
3,arctic_a0072.wav,BUT WHO WAS ILEN'S DOUBLE,neutral,bdl,male,a0072
4,arctic_a0124.wav,IT WAS JEAN SINGING SOFTLY OVER BEYOND THE ROCKS,neutral,bdl,male,a0124


## Get the true transcriptions/labels based on cleaned filenames

In [30]:
# Look up the reference transcription for every cleaned filename.
#
# The lookup table is built once instead of re-filtering `transcription` for
# each row of `bdl`. A filename that has no reference transcription, or more
# than one, raises a ValueError that names the offending filenames.
known_filenames = transcription['filename']
wanted_filenames = bdl['clean_filename']

# Filenames in bdl that have no reference transcription at all.
missing = wanted_filenames[~wanted_filenames.isin(known_filenames)].unique().tolist()
if missing:
  raise ValueError(f'No reference transcription found for: {missing}')

# Filenames in bdl whose reference transcription is ambiguous (listed more than once).
repeated = known_filenames[known_filenames.duplicated()]
ambiguous = wanted_filenames[wanted_filenames.isin(repeated)].unique().tolist()
if ambiguous:
  raise ValueError(f'More than one reference transcription found for: {ambiguous}')

label_by_filename = dict(zip(known_filenames, transcription['transcription']))
labels = [label_by_filename[name] for name in wanted_filenames]

assert len(labels) == len(bdl)

labels[:5]

["Lord, but I'm glad to see you again, Phil.",
 'In spite of their absurdity the words affected Philip curiously.',
 'He will follow us soon.',
 "But who was Eileen's double.",
 'It was Jeanne singing softly over beyond the rocks.']

In [31]:
# Attach the reference transcriptions as the `label` column.
bdl = bdl.assign(label=labels)

bdl.head(10)

Unnamed: 0,filename,auto_transcription,emotion,actor,gender,clean_filename,label
0,arctic_a0004.wav,LORD BUT I'M GLAD TO SEE YOU AGAIN PHIL,neutral,bdl,male,a0004,"Lord, but I'm glad to see you again, Phil."
1,arctic_a0050.wav,IN SPITE OF THEIR ABSURDITY THE WORDS AFFECTED...,neutral,bdl,male,a0050,In spite of their absurdity the words affected...
2,arctic_a0102.wav,HE WILL FOLLOW US SOON,neutral,bdl,male,a0102,He will follow us soon.
3,arctic_a0072.wav,BUT WHO WAS ILEN'S DOUBLE,neutral,bdl,male,a0072,But who was Eileen's double.
4,arctic_a0124.wav,IT WAS JEAN SINGING SOFTLY OVER BEYOND THE ROCKS,neutral,bdl,male,a0124,It was Jeanne singing softly over beyond the r...
5,arctic_a0145.wav,I HAVE TO BE CAREFUL OF THEM AS THEY TEAR VERY...,neutral,bdl,male,a0145,"I have to be careful of them, as they tear ver..."
6,arctic_a0131.wav,PROVIDENCE HAD DELIVERED HIM THROUGH THE MAILS...,neutral,bdl,male,a0131,Providence had delivered him through the maels...
7,arctic_a0108.wav,HE WADED INTO THE EDGE OF THE WATER AND BEGAN ...,neutral,bdl,male,a0108,He waded into the edge of the water and began ...
8,arctic_a0106.wav,THE EMOTION WHICH SHE HAD SUPPRESSED BURST FOR...,neutral,bdl,male,a0106,The emotion which she had suppressed burst for...
9,arctic_a0148.wav,NOW THESE THINGS HAD BEEN STRUCK DEAD WITHIN HIM,neutral,bdl,male,a0148,Now these things had been struck dead within him.


## Confirm that the first 5 characters of the auto transcription and the label match, and that the last 5 characters match; manually investigate any row where either check fails

### check the first 5 characters

In [32]:
# Mark a row 'Y' when the auto transcription and the label start with the same
# five characters (compared case-insensitively), otherwise 'N'.
front_assertions = [
  'Y' if str(auto_text)[:5].lower() == str(true_text)[:5].lower() else 'N'
  for auto_text, true_text in zip(bdl['auto_transcription'], bdl['label'])
]

assert len(front_assertions) == len(bdl)

front_assertions[:5]

['N', 'Y', 'Y', 'Y', 'Y']

In [33]:
# Keep the leading-characters check result alongside each row.
bdl = bdl.assign(front_assertions=front_assertions)

bdl.head()

Unnamed: 0,filename,auto_transcription,emotion,actor,gender,clean_filename,label,front_assertions
0,arctic_a0004.wav,LORD BUT I'M GLAD TO SEE YOU AGAIN PHIL,neutral,bdl,male,a0004,"Lord, but I'm glad to see you again, Phil.",N
1,arctic_a0050.wav,IN SPITE OF THEIR ABSURDITY THE WORDS AFFECTED...,neutral,bdl,male,a0050,In spite of their absurdity the words affected...,Y
2,arctic_a0102.wav,HE WILL FOLLOW US SOON,neutral,bdl,male,a0102,He will follow us soon.,Y
3,arctic_a0072.wav,BUT WHO WAS ILEN'S DOUBLE,neutral,bdl,male,a0072,But who was Eileen's double.,Y
4,arctic_a0124.wav,IT WAS JEAN SINGING SOFTLY OVER BEYOND THE ROCKS,neutral,bdl,male,a0124,It was Jeanne singing softly over beyond the r...,Y


In [34]:
# Count how many rows have each value of the leading-characters check.
bdl['front_assertions'].value_counts()

Y    1023
N     109
Name: front_assertions, dtype: int64

### check the last 5 characters

In [35]:
# Mark a row 'Y' when the auto transcription and the label end with the same
# five characters (compared case-insensitively), otherwise 'N'. All periods are
# removed from the label before taking its last five characters.
back_assertions = [
  'Y' if str(auto_text)[-5:].lower() == str(true_text).replace('.','')[-5:].lower() else 'N'
  for auto_text, true_text in zip(bdl['auto_transcription'], bdl['label'])
]

assert len(back_assertions) == len(bdl)

back_assertions[:5]

['Y', 'Y', 'Y', 'Y', 'Y']

In [36]:
# Keep the trailing-characters check result alongside each row.
bdl = bdl.assign(back_assertions=back_assertions)

bdl.head()

Unnamed: 0,filename,auto_transcription,emotion,actor,gender,clean_filename,label,front_assertions,back_assertions
0,arctic_a0004.wav,LORD BUT I'M GLAD TO SEE YOU AGAIN PHIL,neutral,bdl,male,a0004,"Lord, but I'm glad to see you again, Phil.",N,Y
1,arctic_a0050.wav,IN SPITE OF THEIR ABSURDITY THE WORDS AFFECTED...,neutral,bdl,male,a0050,In spite of their absurdity the words affected...,Y,Y
2,arctic_a0102.wav,HE WILL FOLLOW US SOON,neutral,bdl,male,a0102,He will follow us soon.,Y,Y
3,arctic_a0072.wav,BUT WHO WAS ILEN'S DOUBLE,neutral,bdl,male,a0072,But who was Eileen's double.,Y,Y
4,arctic_a0124.wav,IT WAS JEAN SINGING SOFTLY OVER BEYOND THE ROCKS,neutral,bdl,male,a0124,It was Jeanne singing softly over beyond the r...,Y,Y


In [37]:
# Count how many rows have each value of the trailing-characters check.
bdl['back_assertions'].value_counts()

Y    1064
N      68
Name: back_assertions, dtype: int64

### Manually check instances where either the front or the back assertion failed

In [38]:
# Show every row that failed at least one of the two checks.
front_failed = bdl['front_assertions'].eq('N')
back_failed = bdl['back_assertions'].eq('N')
bdl.loc[front_failed | back_failed]

Unnamed: 0,filename,auto_transcription,emotion,actor,gender,clean_filename,label,front_assertions,back_assertions
0,arctic_a0004.wav,LORD BUT I'M GLAD TO SEE YOU AGAIN PHIL,neutral,bdl,male,a0004,"Lord, but I'm glad to see you again, Phil.",N,Y
6,arctic_a0131.wav,PROVIDENCE HAD DELIVERED HIM THROUGH THE MAILS...,neutral,bdl,male,a0131,Providence had delivered him through the maels...,Y,N
11,arctic_a0063.wav,YES IT WAS A MAN WHO ASKED A STRANGER,neutral,bdl,male,a0063,"Yes, it was a man who asked, a stranger.",N,Y
31,arctic_a0234.wav,WHY THE AVERAGE REVIEW IS MORE NAUSEATING THAN...,neutral,bdl,male,a0234,"Why, the average review is more nauseating tha...",N,Y
32,arctic_a0245.wav,OUT OF HIS EIGHTEEN HUNDRED HE LAID ASIDE SIXT...,neutral,bdl,male,a0245,"Out of his eighteen hundred, he laid aside six...",Y,N
...,...,...,...,...,...,...,...,...,...
1103,arctic_a0138.wav,IN THE PICTURE HE SAW EACH MOMENT A GREATER RE...,neutral,bdl,male,a0138,In the picture he saw each moment a greater re...,Y,N
1111,arctic_a0153.wav,MCDUGALL TAPPED HIS FOREHEAD SUSPICIOUSLY WITH...,neutral,bdl,male,a0153,MacDougall tapped his forehead suspiciously wi...,N,Y
1118,arctic_a0140.wav,EXCEPT A FATHER'S BLESSING AND WITH IT THIS,neutral,bdl,male,a0140,"Accept a father's blessing, and with it, this.",N,Y
1121,arctic_a0006.wav,GOD BLESS HIM I HOPE I'LL GO ON SEEING THEM FO...,neutral,bdl,male,a0006,"God bless 'em, I hope I'll go on seeing them f...",Y,N


Manually checked the instances where either the front or the back assertion failed and confirmed that none of them are mismatched.

# Export the cleaned bdl csv to be merged to the main csv

In [39]:
# Remove the two helper check columns before exporting.
# Reassigning with errors='ignore' (instead of an in-place drop) lets this cell
# be re-run without raising KeyError once the columns are already gone.
bdl = bdl.drop(columns=['front_assertions', 'back_assertions'], errors='ignore')

bdl.head()

Unnamed: 0,filename,auto_transcription,emotion,actor,gender,clean_filename,label
0,arctic_a0004.wav,LORD BUT I'M GLAD TO SEE YOU AGAIN PHIL,neutral,bdl,male,a0004,"Lord, but I'm glad to see you again, Phil."
1,arctic_a0050.wav,IN SPITE OF THEIR ABSURDITY THE WORDS AFFECTED...,neutral,bdl,male,a0050,In spite of their absurdity the words affected...
2,arctic_a0102.wav,HE WILL FOLLOW US SOON,neutral,bdl,male,a0102,He will follow us soon.
3,arctic_a0072.wav,BUT WHO WAS ILEN'S DOUBLE,neutral,bdl,male,a0072,But who was Eileen's double.
4,arctic_a0124.wav,IT WAS JEAN SINGING SOFTLY OVER BEYOND THE ROCKS,neutral,bdl,male,a0124,It was Jeanne singing softly over beyond the r...


In [40]:
# Write the cleaned frame to Drive; index=False leaves the DataFrame row index out of the file.
# NOTE(review): the frame still contains the 'Unnamed: 0' and 'clean_filename' columns at this
# point - confirm the downstream merge expects them.
bdl.to_csv('/content/drive/MyDrive/266/Data/Clean_Data/Arctic/bdl.csv', index=False)