# Import libraries

In [None]:
# drive access
# Mount Google Drive at /content/drive; the CSV paths read and written by this
# notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

# data handling (third-party, not standard library)
import numpy as np
import pandas as pd

# for audio
import librosa
from IPython.display import Audio

Mounted at /content/drive


# Read the transcription csv

In [None]:
# Load the transcription table (one row per utterance id in `filename`,
# with its text in `transcription`) from the mounted Drive.
transcription = pd.read_csv('/content/drive/MyDrive/266/Data/Clean_Data/Arctic/transcription.csv')

# Preview the first rows.
transcription.head()

Unnamed: 0,filename,transcription
0,a0001,"Author of the danger trail, Philip Steels, etc."
1,a0002,"Not at this particular case, Tom, apologized W..."
2,a0003,For the twentieth time that evening the two me...
3,a0004,"Lord, but I'm glad to see you again, Phil."
4,a0005,Will we ever forget it.


In [None]:
# Preview the last rows of the transcription table.
transcription.tail()

Unnamed: 0,filename,transcription
1127,b0535,He read his fragments aloud.
1128,b0536,Typhoid -- did I tell you.
1129,b0537,But she had become an automaton.
1130,b0538,"At the best, they were necessary accessories."
1131,b0539,"You were making them talk shop, Ruth charged him."


# Clean slt csv

In [None]:
# Load the automatic transcriptions for the slt recordings
# (wav `filename` plus `auto_transcription` text).
slt = pd.read_csv('/content/drive/MyDrive/266/Data/Clean_Data/Arctic/auto_transcription/slt.csv')

# Preview the first rows.
slt.head()

Unnamed: 0,filename,auto_transcription
0,arctic_b0218.wav,THE ISSUE WAS NOT IN DOUBT
1,arctic_a0532.wav,MAY DROUGHT DESTROY YOUR CROPS
2,arctic_b0203.wav,A MONTH IN AUSTRALIA WOULD FINISH ME
3,arctic_a0562.wav,WHAT THE FLAMING
4,arctic_a0308.wav,HIS INFERNAL CHATTERING WORRIES ME EVEN NOW AS...


In [None]:
# Tag every row with the constant emotion label 'neutral'.
slt = slt.assign(emotion='neutral')

slt.head()

Unnamed: 0,filename,auto_transcription,emotion
0,arctic_b0218.wav,THE ISSUE WAS NOT IN DOUBT,neutral
1,arctic_a0532.wav,MAY DROUGHT DESTROY YOUR CROPS,neutral
2,arctic_b0203.wav,A MONTH IN AUSTRALIA WOULD FINISH ME,neutral
3,arctic_a0562.wav,WHAT THE FLAMING,neutral
4,arctic_a0308.wav,HIS INFERNAL CHATTERING WORRIES ME EVEN NOW AS...,neutral


### Add actor and gender attributes to the total csv

In [None]:
# Attach the constant speaker metadata (actor id and gender) to every row.
slt = slt.assign(actor='slt', gender='female')

slt.head()

Unnamed: 0,filename,auto_transcription,emotion,actor,gender
0,arctic_b0218.wav,THE ISSUE WAS NOT IN DOUBT,neutral,slt,female
1,arctic_a0532.wav,MAY DROUGHT DESTROY YOUR CROPS,neutral,slt,female
2,arctic_b0203.wav,A MONTH IN AUSTRALIA WOULD FINISH ME,neutral,slt,female
3,arctic_a0562.wav,WHAT THE FLAMING,neutral,slt,female
4,arctic_a0308.wav,HIS INFERNAL CHATTERING WORRIES ME EVEN NOW AS...,neutral,slt,female


## Get the cleaned filenames

In [None]:
# Reduce each wav filename to its bare utterance id: keep the text before the
# first '.', then keep the part after the last '_'
# (e.g. 'arctic_b0218.wav' -> 'b0218').
clean_filenames = [
    wav_name.partition('.')[0].rpartition('_')[2]
    for wav_name in slt['filename']
]

assert len(clean_filenames) == len(slt)

clean_filenames[:5]

['b0218', 'a0532', 'b0203', 'a0562', 'a0308']

In [None]:
# Store the utterance ids next to the original filenames.
slt = slt.assign(clean_filename=clean_filenames)

slt.head()

Unnamed: 0,filename,auto_transcription,emotion,actor,gender,clean_filename
0,arctic_b0218.wav,THE ISSUE WAS NOT IN DOUBT,neutral,slt,female,b0218
1,arctic_a0532.wav,MAY DROUGHT DESTROY YOUR CROPS,neutral,slt,female,a0532
2,arctic_b0203.wav,A MONTH IN AUSTRALIA WOULD FINISH ME,neutral,slt,female,b0203
3,arctic_a0562.wav,WHAT THE FLAMING,neutral,slt,female,a0562
4,arctic_a0308.wav,HIS INFERNAL CHATTERING WORRIES ME EVEN NOW AS...,neutral,slt,female,a0308


## Get the true transcriptions/labels based on cleaned filenames

In [None]:
# Look up the reference transcription for every clean filename.
# A filename -> transcription Series is built once and applied with `map`,
# instead of re-filtering the whole `transcription` frame for each row.
wanted = transcription['filename'].isin(slt['clean_filename'])
label_lookup = transcription.loc[wanted].set_index('filename')['transcription']

# Every utterance needs exactly one reference transcription. Fail with a
# message that names the offending ids rather than an opaque `.item()` error.
duplicated_ids = label_lookup.index[label_lookup.index.duplicated()].unique().tolist()
assert not duplicated_ids, f'multiple transcriptions for: {duplicated_ids[:10]}'

missing_ids = slt.loc[~slt['clean_filename'].isin(label_lookup.index), 'clean_filename'].unique().tolist()
assert not missing_ids, f'no transcription found for: {missing_ids[:10]}'

# Keep `labels` a plain list, in the same row order as `slt`.
labels = slt['clean_filename'].map(label_lookup).tolist()

assert len(labels) == len(slt)

labels[:5]

['The issue was not in doubt.',
 'May drought destroy your crops.',
 'A month in Australia would finish me.',
 'What the flaming.',
 'His infernal chattering worries me even now as I think of it.']

In [None]:
# Store the reference transcriptions alongside the automatic ones.
slt = slt.assign(label=labels)

slt.head(10)

Unnamed: 0,filename,auto_transcription,emotion,actor,gender,clean_filename,label
0,arctic_b0218.wav,THE ISSUE WAS NOT IN DOUBT,neutral,slt,female,b0218,The issue was not in doubt.
1,arctic_a0532.wav,MAY DROUGHT DESTROY YOUR CROPS,neutral,slt,female,a0532,May drought destroy your crops.
2,arctic_b0203.wav,A MONTH IN AUSTRALIA WOULD FINISH ME,neutral,slt,female,b0203,A month in Australia would finish me.
3,arctic_a0562.wav,WHAT THE FLAMING,neutral,slt,female,a0562,What the flaming.
4,arctic_a0308.wav,HIS INFERNAL CHATTERING WORRIES ME EVEN NOW AS...,neutral,slt,female,a0308,His infernal chattering worries me even now as...
5,arctic_b0273.wav,THEN IT WAS THAT A STRANGE THING HAPPENED,neutral,slt,female,b0273,Then it was that a strange thing happened.
6,arctic_a0128.wav,THIS ONE HOPE WAS DESTROYED AS QUICKLY AS IT W...,neutral,slt,female,a0128,This one hope was destroyed as quickly as it w...
7,arctic_a0292.wav,HERE IN THE MID MORNING THE FIRST CASUALTY OCC...,neutral,slt,female,a0292,"Here, in the midmorning, the first casualty oc..."
8,arctic_a0285.wav,BUT WHAT THEY WANT WITH YOUR TOOTH BRUSH IS MO...,neutral,slt,female,a0285,But what they want with your toothbrush is mor...
9,arctic_a0485.wav,EACH INSULT ADDED TO THE VALUE OF THE CLAIM,neutral,slt,female,a0485,Each insult added to the value of the claim.


## Confirm that the first 5 characters of the auto transcription match the label, and that the last 5 characters match as well; rows that fail either check are investigated manually

### check the first 5 characters

In [None]:
# Flag each row 'Y' when the first 5 characters of the auto transcription and
# of the label agree (case-insensitive), otherwise 'N'.
front_assertions = [
    'Y' if str(auto_text)[:5].lower() == str(label_text)[:5].lower() else 'N'
    for auto_text, label_text in zip(slt['auto_transcription'], slt['label'])
]

assert len(front_assertions) == len(slt)

front_assertions[:5]

['Y', 'Y', 'Y', 'Y', 'Y']

In [None]:
# Record the result of the leading-characters check on each row.
slt = slt.assign(front_assertions=front_assertions)

slt.head()

Unnamed: 0,filename,auto_transcription,emotion,actor,gender,clean_filename,label,front_assertions
0,arctic_b0218.wav,THE ISSUE WAS NOT IN DOUBT,neutral,slt,female,b0218,The issue was not in doubt.,Y
1,arctic_a0532.wav,MAY DROUGHT DESTROY YOUR CROPS,neutral,slt,female,a0532,May drought destroy your crops.,Y
2,arctic_b0203.wav,A MONTH IN AUSTRALIA WOULD FINISH ME,neutral,slt,female,b0203,A month in Australia would finish me.,Y
3,arctic_a0562.wav,WHAT THE FLAMING,neutral,slt,female,a0562,What the flaming.,Y
4,arctic_a0308.wav,HIS INFERNAL CHATTERING WORRIES ME EVEN NOW AS...,neutral,slt,female,a0308,His infernal chattering worries me even now as...,Y


In [None]:
# Count how many rows passed ('Y') / failed ('N') the leading-characters check.
slt['front_assertions'].value_counts()

Y    1044
N      88
Name: front_assertions, dtype: int64

### check the last 5 characters

In [None]:
# Flag each row 'Y' when the last 5 characters of the auto transcription and
# of the label agree (case-insensitive), otherwise 'N'. All periods are
# removed from the label before its last 5 characters are taken.
back_assertions = [
    'Y' if str(auto_text)[-5:].lower() == str(label_text).replace('.','')[-5:].lower() else 'N'
    for auto_text, label_text in zip(slt['auto_transcription'], slt['label'])
]

assert len(back_assertions) == len(slt)

back_assertions[:5]

['Y', 'Y', 'Y', 'Y', 'Y']

In [None]:
# Record the result of the trailing-characters check on each row.
slt = slt.assign(back_assertions=back_assertions)

slt.head()

Unnamed: 0,filename,auto_transcription,emotion,actor,gender,clean_filename,label,front_assertions,back_assertions
0,arctic_b0218.wav,THE ISSUE WAS NOT IN DOUBT,neutral,slt,female,b0218,The issue was not in doubt.,Y,Y
1,arctic_a0532.wav,MAY DROUGHT DESTROY YOUR CROPS,neutral,slt,female,a0532,May drought destroy your crops.,Y,Y
2,arctic_b0203.wav,A MONTH IN AUSTRALIA WOULD FINISH ME,neutral,slt,female,b0203,A month in Australia would finish me.,Y,Y
3,arctic_a0562.wav,WHAT THE FLAMING,neutral,slt,female,a0562,What the flaming.,Y,Y
4,arctic_a0308.wav,HIS INFERNAL CHATTERING WORRIES ME EVEN NOW AS...,neutral,slt,female,a0308,His infernal chattering worries me even now as...,Y,Y


In [None]:
# Count how many rows passed ('Y') / failed ('N') the trailing-characters check.
slt['back_assertions'].value_counts()

Y    1072
N      60
Name: back_assertions, dtype: int64

### manually check instances where either the front or the back assertion failed

In [None]:
# Show every row that failed at least one of the two checks, for manual review.
front_failed = slt['front_assertions'].eq('N')
back_failed = slt['back_assertions'].eq('N')
slt[front_failed | back_failed]

Unnamed: 0,filename,auto_transcription,emotion,actor,gender,clean_filename,label,front_assertions,back_assertions
7,arctic_a0292.wav,HERE IN THE MID MORNING THE FIRST CASUALTY OCC...,neutral,slt,female,a0292,"Here, in the midmorning, the first casualty oc...",N,Y
12,arctic_a0502.wav,ANYTHING UNUSUAL OR ABNORMAL WAS SUFFICIENT TO...,neutral,slt,female,a0502,Anything unusual or abnormal was sufficient to...,Y,N
13,arctic_a0223.wav,MY I'M ALMOST HOMESICK FOR IT ALREADY,neutral,slt,female,a0223,"My, I'm almost homesick for it already.",N,Y
16,arctic_a0506.wav,THE KLAWDIN WAS LEAVING NEXT MORNING FOR HANNA...,neutral,slt,female,a0506,The Claudine was leaving next morning for Hono...,N,N
18,arctic_a0215.wav,THEY DIE OUT OF SPIGHT,neutral,slt,female,a0215,They die out of spite.,Y,N
...,...,...,...,...,...,...,...,...,...
1069,arctic_a0566.wav,DENON'S HANDS WERE RELEASED LONG ENOUGH FOR HI...,neutral,slt,female,a0566,Dennin's hands were released long enough for h...,N,Y
1084,arctic_a0119.wav,JANE WAS TURNING THE BOW SHOREWARD,neutral,slt,female,a0119,Jeanne was turning the bow shoreward.,N,Y
1096,arctic_b0236.wav,ULANG WAS TWO HUNDRED AND FIFTY MILES FROM THE...,neutral,slt,female,b0236,Oolong was two hundred and fifty miles from th...,N,Y
1119,arctic_a0387.wav,BOB GROWING DISGUSTED TURNED BACK SUDDENLY AND...,neutral,slt,female,a0387,"Bob, growing disgusted, turned back suddenly a...",N,Y


Manually checked the instances where either the front or the back assertion failed and confirmed that there are no mismatched instances.

# Export the cleaned slt csv to be merged to the main csv

In [None]:
# Drop the temporary check columns before export.
# Rebinding `slt` (instead of `inplace=True`) together with errors='ignore'
# keeps this cell safe to re-run: a second run finds the columns already gone
# and does nothing instead of raising KeyError.
slt = slt.drop(columns=['front_assertions', 'back_assertions'], errors='ignore')

slt.head()

Unnamed: 0,filename,auto_transcription,emotion,actor,gender,clean_filename,label
0,arctic_b0218.wav,THE ISSUE WAS NOT IN DOUBT,neutral,slt,female,b0218,The issue was not in doubt.
1,arctic_a0532.wav,MAY DROUGHT DESTROY YOUR CROPS,neutral,slt,female,a0532,May drought destroy your crops.
2,arctic_b0203.wav,A MONTH IN AUSTRALIA WOULD FINISH ME,neutral,slt,female,b0203,A month in Australia would finish me.
3,arctic_a0562.wav,WHAT THE FLAMING,neutral,slt,female,a0562,What the flaming.
4,arctic_a0308.wav,HIS INFERNAL CHATTERING WORRIES ME EVEN NOW AS...,neutral,slt,female,a0308,His infernal chattering worries me even now as...


In [None]:
# Write the cleaned slt table to Drive; index=False omits the row index
# from the exported file.
slt.to_csv('/content/drive/MyDrive/266/Data/Clean_Data/Arctic/slt.csv', index=False)