In [1]:
#ETL Libraries
import pandas as pd
import numpy as np

#Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

#To Ignore Warnings
import warnings
warnings.filterwarnings('ignore')

#Statistical Packages
from scipy import stats
from scipy.stats import norm, skew

#Tar File Extraction
import tarfile

#Importing os package
import os
import shutil
from shutil import copyfile


#Audio Python Files
from os import path
from pydub import AudioSegment

#Importing Zip file
from zipfile import ZipFile

#Audio
import librosa

In [None]:
# Output folders: `destination` collects the copied clips, `final_destination` receives the 16 kHz files
destination = "Detoxy-B/Audio/"
final_destination = "Detoxy-B/Final_Audio/"

In [2]:
# Load the DeToxy transcript table; its 'Dataset' and 'FileName' columns drive the selections below
transcript = pd.read_csv("Detoxy-B/Transcripts/DeToxy.csv")

## MELD

In [17]:
# Folder holding the MELD .wav files
source = "Data/MELD/MELD wav/"

In [19]:
# Distinct MELD file names referenced by the transcript
selected_files = list(
    transcript.loc[transcript['Dataset'] == 'MELD', 'FileName'].unique()
)

In [20]:
# Copy each selected MELD clip into the shared audio folder
for wav_stem in selected_files:
    shutil.copy2(source + wav_stem + ".wav", destination)

## Common Voice

In [21]:
# Root folder of the Common Voice download
source = "Data/Common Voice/"

In [22]:
# Open the Common Voice archive; int_dest is where the needed clips will be extracted
int_dest = source + "Audio Files - Required/"
tar = tarfile.open(source + "en.tar.gz")

In [23]:
# Distinct Common Voice file names referenced by the transcript
selected_files = list(
    transcript.loc[transcript['Dataset'] == 'Common Voice', 'FileName'].unique()
)

In [24]:
#Extracting required files
# Archive member names of the wanted clips. A set keeps the membership test constant-time;
# with a list, every archive member triggered a scan over all wanted names.
file_names = {"cv-corpus-6.1-2020-12-11/en/clips/" + suit + ".mp3" for suit in selected_files}
try:
    tar.extractall(members=[x for x in tar.getmembers() if x.name in file_names], path=int_dest)
finally:
    # The archive handle opened in the earlier cell is not used again, so release it here.
    tar.close()

In [25]:
#Setting new source path
# The clips were extracted under int_dest with their archive-relative names, so read them
# back from there. The previous literal omitted the 'Data/' prefix that int_dest carries.
source = int_dest + 'cv-corpus-6.1-2020-12-11/en/clips/'

In [None]:
from os import path
from pydub import AudioSegment

In [28]:
# convert mp3 to wav
wav_dir = int_dest + "Wav Files/"
# Make sure the output folder exists before exporting into it.
os.makedirs(wav_dir, exist_ok=True)

# Only hand .mp3 entries to the decoder (mirrors the .flac filter used for VCTK);
# any other directory entry would make from_mp3 fail mid-run.
files = [name for name in os.listdir(source) if name.endswith('.mp3')]

for mp3_name in files:
    sound = AudioSegment.from_mp3(source + mp3_name)
    # Swap the 'mp3' suffix for 'wav', keeping the dot.
    sound.export(wav_dir + mp3_name[:-3] + "wav", format="wav")

In [29]:
# Copy the converted Common Voice wavs into the shared audio folder
files = os.listdir(int_dest + "Wav Files/")

for wav_name in files:
    shutil.copy2(int_dest + "Wav Files/" + wav_name, destination)

## LJ Speech

In [33]:
# Folder holding the LJ Speech wavs
source = "Data/LJSpeech/LJSpeech-1.1/wavs/"

In [34]:
# Distinct LJ Speech file names referenced by the transcript
selected_files = list(
    transcript.loc[transcript['Dataset'] == 'LJ Speech', 'FileName'].unique()
)

In [35]:
# Copy each selected LJ Speech clip into the shared audio folder
for wav_stem in selected_files:
    shutil.copy2(source + wav_stem + ".wav", destination)

## VCTK

In [40]:
# Root folder of the VCTK download
source = "Data/VCTK/"

In [41]:
# Load the VCTK transcript table (its 'OriginalFileName' column is used to locate the audio)
vctk = pd.read_csv("Data/Detoxy-B/Transcripts/VCTK.csv")

In [47]:
#Creating Combined (path + filename)
# Split the backslash-separated original path into its components.
files = vctk['OriginalFileName'].str.split('\\', expand=True)
# regex=False: '.txt' must match a literal dot; as a regex, '.' would match any character.
files[2] = files[2].str.replace('.txt', '_mic1.flac', regex=False)
# Replace the first component with the top-level audio folder used inside the archive.
files[0] = 'wav48_silence_trimmed'
files['Combined'] = files[0] + "/" + files[1] + "/" + files[2]

In [49]:
# Distinct archive member paths for the VCTK clips
selected_files = list(files['Combined'].unique())

In [52]:
#Extracting the required Audio Files
int_dest = 'Data/VCTK/VCTK Original/'

# Open the archive once for all members; reopening it per file re-read the zip directory
# every iteration. The `with` block closes the archive, so no explicit close() is needed.
with ZipFile(source + 'VCTK-Corpus-0.92.zip', 'r') as zipObj:
    for member_name in selected_files:
        zipObj.extract(member_name, int_dest)

In [55]:
# Flatten the extracted VCTK tree: copy every file found under `source` into one folder

source = 'Data/VCTK/VCTK Original/wav48_silence_trimmed/'
int_destination = 'Data/VCTK/VCTK Original/Taken Out/'

for folder, _subfolders, entries in os.walk(source):
    for entry in entries:
        shutil.copy2(os.path.join(folder, entry), int_destination)

### Converting from .flac to .wav

In [56]:
from pydub import AudioSegment

# Read from the flattened folder written by the previous step. The earlier literal lacked
# the 'Data/' prefix and therefore pointed at a different directory than the one just filled.
source = 'Data/VCTK/VCTK Original/Taken Out/'
int_destination = 'Data/VCTK/VCTK Wavs/'

In [59]:
# Convert every .flac file in `source` to a .wav with the same stem
flac_files = [entry for entry in os.listdir(source) if entry.endswith('.flac')]

for flac_name in flac_files:
    song = AudioSegment.from_file(source + flac_name)
    # flac_name[:-4] keeps the trailing dot of the stem, so appending "wav" yields "<stem>.wav"
    wav_path = int_destination + flac_name[:-4] + "wav"
    song.export(wav_path, format="wav")

### Moving to the final destination

In [60]:
# Folder holding the converted VCTK wavs
source = "Data/VCTK/VCTK Wavs/"

In [61]:
# Everything currently in the VCTK wav folder
files = os.listdir(source)

In [63]:
# Copy the VCTK wavs into the shared audio folder
for wav_name in files:
    shutil.copy2(source + wav_name, destination)

## IEMOCAP

In [82]:
# IEMOCAP release root and the folder that receives the selected wavs
source = "iemocap/Audio Files/IEMOCAP_full_release/"
int_dest = "iemocap/Audio Files/Required Wav Files/"

In [84]:
# Attach the 'Session' column from the full IEMOCAP table to the DeToxy subset (left join on FileName)
iemocap_b = pd.read_csv("Detoxy-B/Transcripts/IEMOCAP.csv")
iemocap_full = pd.read_csv("Datasets/iemocap.csv")
iemocap_full = iemocap_full[['FileName', 'Session']]
iemocap = iemocap_b.merge(iemocap_full, on=['FileName'], how='left')

In [85]:
#Creating combined path (path + filename)

file = iemocap[['FileName', 'Session']].copy()
# NOTE(review): after the left merge, an unmatched row would leave 'Session' as NaN/float and
# this would render e.g. 'Session1.0' -- confirm every FileName has a match.
file['Session'] = "Session" + file['Session'].map(str)

# The containing folder is the file name minus its last '_'-separated token.
# Computed column-wise: the previous per-row `file['path'].iloc[i] = ...` was a chained
# assignment, which pandas does not guarantee to write back into `file`.
file['path'] = file['FileName'].map(lambda name: '_'.join(name.split('_')[:-1]))

file['Combined path'] = file['Session'] + "/sentences/wav/" + file['path'] + "/" + file['FileName'] + '.wav'

In [86]:
# Distinct relative paths of the IEMOCAP wavs to fetch
selected_files = list(file['Combined path'].unique())

In [87]:
# Copy the selected IEMOCAP wavs into the intermediate folder
# NOTE(review): no cell in this section copies them on to `destination` -- confirm whether that step is missing.
for relative_path in selected_files:
    shutil.copy2(source + relative_path, int_dest)

## MSP-IMPROV

In [99]:
# MSP-Improv audio root and the folder that receives the selected files
source = "MSP-Improv/All Audio Files/"
int_dest = "MSP-Improv/Required Audio Files/"

In [101]:
# MSP-Improv rows of the transcript. Later cells add and overwrite columns on this frame,
# so take an explicit copy instead of mutating a filtered view of `transcript`.
msp_improv = transcript[transcript['Dataset'] == 'MSP-Improv'].copy()
msp_improv = msp_improv.reset_index(drop=True)

In [104]:
#Creating path (path + filename)
# Split the file name on '-' into positional columns 0..n.
temp = msp_improv['FileName'].str.split('-', expand=True)
# Session folder is 'session' + the last character of the fourth token.
# Built column-wise: the previous loop wrote strings row by row through chained `.iloc`
# assignment into a NaN (float) column, which pandas does not guarantee to persist.
temp['Session'] = 'session' + temp[3].str[-1:]

temp['Path'] = temp['Session'] + "/" + temp[2] + "/" + temp[4]
msp_improv['path'] = temp['Path']

In [106]:
# Swap the transcript extension for the audio one. regex=False makes '.txt' a literal
# match; as a regex the leading '.' would match any character.
msp_improv['FileName'] = msp_improv['FileName'].str.replace('.txt', '.wav', regex=False)
# Full relative path = folder path built earlier + file name.
msp_improv['path'] = msp_improv['path'] + "/" + msp_improv['FileName']

In [107]:
# Distinct relative paths of the MSP-Improv files to fetch
selected_files = list(msp_improv['path'].unique())

In [109]:
# Copy the selected MSP-Improv files into the intermediate folder
for relative_path in selected_files:
    shutil.copy2(source + relative_path, int_dest)

In [110]:
# Copy everything in the intermediate MSP-Improv folder into the shared audio folder
files = os.listdir(int_dest)

for entry in files:
    shutil.copy2(int_dest + entry, destination)

## Switchboard

In [111]:
# Folder holding the Switchboard clips, plus a listing of its contents
source = "Data/Switchboard/Audio Files/switch_board_needed/"
files = os.listdir(source)

In [112]:
# Switchboard rows of the transcript, re-indexed from zero
switchboard = transcript[transcript['Dataset'] == 'Switchboard'].reset_index(drop=True)

In [113]:
# Distinct Switchboard file names referenced by the transcript
selected_files = list(switchboard['FileName'].unique())

In [114]:
# Copy each selected Switchboard clip into the shared audio folder
for wav_stem in selected_files:
    shutil.copy2(source + wav_stem + ".wav", destination)

## CMU MOSEI

In [116]:
# CMU-MOSEI root and the folder that receives the extracted audio
source = "Data/mosei/"
int_dest = "Data/mosei/Required Audio Files/"

In [117]:
# Load the CMU-MOSEI transcript table
mosei_raw = pd.read_csv("Data/Detoxy-B/Transcripts/CMU-MOSEI.csv")

In [118]:
# Distinct original (unsegmented) MOSEI file names
selected_files = list(mosei_raw['OriginalFileName'].unique())

In [120]:
#Extracting Audio Files
# Open the archive once for all members; reopening it per file re-read the zip directory
# every iteration. The `with` block closes the archive, so no explicit close() is needed.
with ZipFile(source + 'CMU_MOSEI.zip', 'r') as zipObj:
    for wav_stem in selected_files:
        zipObj.extract('Raw/Audio/Full/WAV_16000/' + wav_stem + ".wav", int_dest)

In [121]:
# Read the extracted full-length MOSEI wavs; write the cut segments to their own folder
source = "Data/mosei/Required Audio Files/Raw/Audio/Full/WAV_16000/"
int_dest = "Data/mosei/Required Audio Files/Segmented/"

In [123]:
# Cut one segment per transcript row out of its original recording
for row_idx in range(mosei_raw.shape[0]):

    old_audio = AudioSegment.from_wav(source + mosei_raw['OriginalFileName'].iloc[row_idx] + ".wav")

    # 'Starting'/'Ending' are multiplied by 1000 to obtain the slice bounds
    start_ms = mosei_raw['Starting'].iloc[row_idx] * 1000
    end_ms = mosei_raw['Ending'].iloc[row_idx] * 1000
    new_audio = old_audio[start_ms:end_ms]

    segment_path = int_dest + mosei_raw['FileName'].iloc[row_idx] + ".wav"
    new_audio.export(segment_path, format="wav", bitrate='256k')

In [124]:
# Copy every MOSEI segment into the shared audio folder
files = os.listdir(int_dest)

for entry in files:
    shutil.copy2(int_dest + entry, destination)

## CMU - MOSI

In [125]:
# Folder holding the segmented CMU-MOSI wavs
source = "Data/mosi/Raw/Audio/WAV_16000/Segmented/"

In [126]:
# Distinct CMU-MOSI file names referenced by the transcript
selected_files = list(
    transcript.loc[transcript['Dataset'] == 'CMU-MOSI', 'FileName'].unique()
)

In [127]:
# Copy each selected CMU-MOSI clip into the shared audio folder
for wav_stem in selected_files:
    shutil.copy2(source + wav_stem + ".wav", destination)

## Social IQ

In [128]:
# Social-IQ root and the folder that receives the extracted audio
source = "Data/socialiq/"
int_dest = "Data/socialiq/Required Audio Files/"

In [132]:
# Load the Social-IQ transcript table
social_iq_raw = pd.read_csv("Data/Detoxy-B/Transcripts/Social-IQ.csv")

In [None]:
#Creating New File Names
# regex=False makes both patterns literal; as a regex the '.' in '.txt' would match any character.
social_iq_raw['FileName'] = social_iq_raw['FileName'].str.replace('.txt', '.wav', regex=False)
social_iq_raw['FileName'] = social_iq_raw['FileName'].str.replace('-trimmed', '_trimmed-out', regex=False)
# 1-based row number, later appended to the segment file names.
social_iq_raw['File Id'] = social_iq_raw.index + 1

In [135]:
# Distinct Social-IQ wav names to pull from the archive
selected_files = list(social_iq_raw['FileName'].unique())

In [None]:
#Extracting Audio Files
# Open the archive once for all members; reopening it per file re-read the zip directory
# every iteration. The `with` block closes the archive, so no explicit close() is needed.
with ZipFile(source + 'Social-IQ.zip', 'r') as zipObj:
    for wav_name in selected_files:
        zipObj.extract('raw/acoustic/wav/' + wav_name, int_dest)

In [None]:
# Read the extracted Social-IQ wavs; write the cut segments to their own folder
source = "Data/socialiq/Required Audio Files/raw/acoustic/wav/"
int_dest = "Data/socialiq/Required Audio Files/Segmented/"

In [None]:
# Cut one segment per transcript row out of its source wav
for row_idx in range(social_iq_raw.shape[0]):

    wav_name = social_iq_raw['FileName'].iloc[row_idx]
    old_audio = AudioSegment.from_wav(source + wav_name)

    # Only the second ':'-separated field of 'start'/'end' is used, shifted by 0.2 and scaled by 1000.
    # NOTE(review): presumably that field is seconds and the first field is always zero -- confirm.
    start_field = social_iq_raw['start'].iloc[row_idx].split(":")[1]
    end_field = social_iq_raw['end'].iloc[row_idx].split(":")[1]
    start_ms = max(0, (float(start_field) + 0.2) * 1000)
    end_ms = (float(end_field) + 0.2) * 1000
    new_audio = old_audio[start_ms:end_ms]

    # Segment name = source name without its 4-character extension + the row's File Id
    segment_name = wav_name[:-4] + str(social_iq_raw['File Id'].iloc[row_idx]) + ".wav"
    new_audio.export(int_dest + segment_name, format="wav", bitrate='256k')

In [137]:
# Copy every Social-IQ segment into the shared audio folder
source = "Data/socialiq/Required Audio Files/Segmented/"
selected_files = os.listdir(source)

for entry in selected_files:
    shutil.copy2(source + entry, destination)

In [139]:
# Drop the audio extension again. regex=False makes '.wav' a literal match; as a regex the
# '.' would match any character and could strip e.g. 'Xwav' from inside a name.
social_iq_raw['FileName'] = social_iq_raw['FileName'].str.replace('.wav', '', regex=False)

In [None]:
# Write the modified table back over the same CSV it was loaded from
social_iq_raw.to_csv("Data/Detoxy-B/Transcripts/Social-IQ.csv", index=False)

## MSP - Podcast

In [140]:
# Folder holding the MSP-Podcast wavs
source = "Data/MSP-Podcast/Audio Files/Wav 16khz files/"

In [141]:
# MSP-Podcast rows of the transcript, re-indexed from zero
msp_podcast = transcript[transcript['Dataset'] == 'MSP-Podcast'].reset_index(drop=True)

In [142]:
# Distinct MSP-Podcast file names referenced by the transcript
selected_files = list(msp_podcast['FileName'].unique())

In [145]:
# These clips are copied straight into final_destination rather than the intermediate audio folder
for wav_stem in selected_files:
    shutil.copy2(source + wav_stem + '.wav', final_destination)

## Converting to 16 kHz

In [148]:
from tqdm import tqdm
import soundfile as sf

In [149]:
# Every clip gathered in the shared audio folder
files = os.listdir(destination)

In [150]:
# Resample every gathered clip to 16 kHz and write it as 16-bit PCM into final_destination.
# Files that cannot be processed are reported with their error and collected in failed_files.
failed_files = []
for file in tqdm(files):
    filename = destination + file
    try:
        # librosa.load already resamples to the requested rate, so the former extra
        # 16000 -> 16000 resample call was a no-op and has been dropped.
        y_16, _ = librosa.load(filename, sr=16000)
        sf.write(final_destination + file, y_16, 16000, 'PCM_16')
    except Exception as exc:
        # Catch Exception rather than everything, so Ctrl-C still interrupts the long
        # loop, and show the cause instead of only the path.
        failed_files.append(filename)
        print(filename, "->", repr(exc))
print("------------------End------------------------")

100%|████████████████████████████████████████████████████████████████████████████| 15711/15711 [45:07<00:00,  5.80it/s]

------------------End------------------------





## Splitting audio into train, dev and test sets

In [153]:
# Folder the split copies are read from
source = "Data/Detoxy-B/Final_Audio/"

In [158]:
# Load the DeToxy transcript table; its 'split' column selects the partition below
final_transcripts = pd.read_csv("Data/Detoxy-B/Transcripts/DeToxy.csv")

In [161]:
# Copy the rows with split == 0 into the train folder

destination_train = "Data/Detoxy-B/Input/train/"

train = final_transcripts[final_transcripts['split'] == 0].reset_index(drop=True)

selected_files = list(train['FileName'].unique())

for wav_stem in selected_files:
    shutil.copy2(source + wav_stem + '.wav', destination_train)

In [159]:
# Copy the rows with split == 2 into the test folder

destination_test = "Data/Detoxy-B/Input/test/"

test = final_transcripts[final_transcripts['split'] == 2].reset_index(drop=True)

selected_files = list(test['FileName'].unique())

for wav_stem in selected_files:
    shutil.copy2(source + wav_stem + '.wav', destination_test)

In [160]:
# Copy the rows with split == 1 into the dev folder

destination_dev = "Data/Detoxy-B/Input/dev/"

dev = final_transcripts[final_transcripts['split'] == 1].reset_index(drop=True)

selected_files = list(dev['FileName'].unique())

for wav_stem in selected_files:
    shutil.copy2(source + wav_stem + '.wav', destination_dev)

## Creating Trigger Term Dataset

In [42]:
# Folder holding the per-corpus transcript CSVs
source = "Data/Detoxy-B/Transcripts/"

In [43]:
# List the transcript CSVs, leaving out the combined DeToxy.csv so its rows are not counted twice
files = os.listdir(source)
files.remove("DeToxy.csv")

In [44]:
# Start from an empty frame that already carries the expected columns
# NOTE(review): this list has 'label2a' and 'split', while the append step filters on 'Label' -- confirm the intended names.
combined_columns = ['Dataset', 'FileName', 'text', 'label2a', 'Trigger', 'Sentiment', 'Toxic Filter', 'split']
combined_df = pd.DataFrame()
combined_df[combined_columns] = np.nan

In [45]:
#Appending the transcripts
# Read every per-corpus transcript, keep only the shared columns, and concatenate once.
# DataFrame.append was removed in pandas 2.0 and re-copied the growing frame on each iteration.
transcript_frames = [
    pd.read_csv(source + csv_name).filter(['Dataset','FileName','text','Label','Trigger','Sentiment','Toxic Filter'])
    for csv_name in files
]
# Keep the pre-initialised frame first so its column order is preserved.
combined_df = pd.concat([combined_df] + transcript_frames)

combined_df = combined_df.reset_index(drop=True)

In [46]:
#Cleaning the data
# Assign the result back instead of calling fillna(inplace=True) on a column selection,
# which is a chained in-place update that pandas does not guarantee to propagate.
combined_df['Trigger'] = combined_df['Trigger'].fillna(0)
# Strip known extensions. regex=False makes the leading '.' literal; as a regex it would
# match any character and could remove e.g. 'Xwav' from inside a name.
for extension in ('.wav', '.mp3', '.txt'):
    combined_df['FileName'] = combined_df['FileName'].str.replace(extension, '', regex=False)

In [47]:
#Subsetting only trigger term dataset
# Take an explicit copy: the next line overwrites a column, which should not be done on a
# filtered view of combined_df.
trigger = combined_df[combined_df['Trigger'] == 1].copy()
trigger = trigger.reset_index(drop=True)
trigger['Trigger'] = trigger['Trigger'].astype(int)

In [38]:
# Where the trigger-term clips are read from and copied to
# NOTE(review): other cells read final audio from 'Data/Detoxy-B/Final_Audio/' -- confirm this source path.
source = "Final Audio/"
destination = "Data/Detoxy-B/DeToxy/Input/trigger/"

In [39]:
# Distinct file names of the trigger-term rows
selected_files = list(trigger['FileName'].unique())

In [None]:
# Copy each trigger-term clip into the trigger input folder
for wav_stem in selected_files:
    shutil.copy2(source + wav_stem + '.wav', destination)