In [None]:
#Importing Python Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import warnings
warnings.filterwarnings('ignore')

# Transcript Prep

In [None]:
#Source and Destination Paths
# Both values are concatenated directly with file names below, so each must end with '/'.
source='D:/Goals/Research/Final-Toxic Speech/GitHub/Datasets/'
destination='D:/Goals/Research/Final-Toxic Speech/Data/Detoxy-B/Transcripts/'

# DataFrame.sample() falls back to NumPy's global random state when it is given
# no random_state, so seeding it here makes a Restart & Run All draw the same
# non-toxic subsets every time.
SEED=1
np.random.seed(SEED)

In [None]:
# Extracting Toxic Data

def get_toxic(data):
    """Return the rows of `data` whose 'Label' equals 1, re-indexed from 0.

    Prints a banner line and the shape of the selected subset. `data` itself
    is not modified.
    """
    print("--- (With Toxic Filter = Yes) ---")
    is_toxic=data['Label'].eq(1)
    toxic_rows=data.loc[is_toxic].reset_index(drop=True)
    print("Toxic Shape: ",toxic_rows.shape)
    return toxic_rows

In [None]:
# Extracting Non-Toxic Data

def get_nontoxic(data,shape,random_state=None):
    """Randomly draw `shape` non-toxic rows that passed the toxic filter.

    A row qualifies when 'Toxic Filter' == 'Yes' and 'Label' == 0.

    Parameters
    ----------
    data : DataFrame with 'Toxic Filter' and 'Label' columns.
    shape : int, number of rows to draw.
    random_state : optional seed forwarded to DataFrame.sample. The default
        (None) keeps the original unseeded behaviour.

    Returns
    -------
    DataFrame with `shape` rows and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        Raised by DataFrame.sample when fewer than `shape` rows qualify.
    """
    candidates=data[(data['Toxic Filter']=='Yes') & (data['Label']==0)]
    nontoxic=candidates.sample(n=shape,random_state=random_state).reset_index(drop=True)
    # Report the shape of the subset that was just drawn (the previous code
    # printed the shape of an unrelated global named `toxic`).
    print("Non Toxic Shape (with Toxic Filter - Yes):",nontoxic.shape)
    return nontoxic

In [None]:
# Extracting Non Toxic Data (Sentiment Balanced)

def get_nontoxic_balanced(data,shape,random_state=None):
    """Draw an equal number of positive, negative and neutral rows.

    Only rows with 'Toxic Filter' == 'No' are considered. For each of the
    three sentiments, (shape*2)//3 rows are sampled, so the result holds
    3 * ((shape*2)//3) rows, ordered positive, negative, neutral.

    Parameters
    ----------
    data : DataFrame with 'Toxic Filter' and 'Sentiment' columns.
    shape : int, reference size (callers pass the toxic subset size).
    random_state : optional seed forwarded to every DataFrame.sample call.
        The default (None) keeps the original unseeded behaviour.

    Returns
    -------
    DataFrame with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        Raised by DataFrame.sample when a sentiment has too few rows.
    """
    sentiment_number=(shape*2)//3

    # NOTE(review): unlike get_nontoxic, there is no Label == 0 condition here;
    # confirm that rows with 'Toxic Filter' == 'No' can never carry Label 1.
    unfiltered=data[data['Toxic Filter']=='No']

    # Sampling order (positive, negative, neutral) is kept so the random
    # stream is consumed exactly as before.
    positive=unfiltered[unfiltered['Sentiment']=='positive'].sample(n=sentiment_number,random_state=random_state)
    negative=unfiltered[unfiltered['Sentiment']=='negative'].sample(n=sentiment_number,random_state=random_state)
    neutral=unfiltered[unfiltered['Sentiment']=='neutral'].sample(n=sentiment_number,random_state=random_state)

    # DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame.
    non_toxic_balanced=pd.concat([positive,negative,neutral],ignore_index=True)

    print("\n --- Non Toxic Balanced (With Toxic Filter = No) ---")
    print("Number of Positive: ",positive.shape[0])
    print("Number of Negative: ",negative.shape[0])
    print("Number of Neutral: ",neutral.shape[0])
    print("Total Non Toxic Balanced Shape:",non_toxic_balanced.shape)

    return non_toxic_balanced

In [None]:
#Splitting Data into Train/Dev and Test Datasets

def get_split(data):
    """Split one dataset into stratified train / dev / test partitions.

    Parameters
    ----------
    data : DataFrame with a 'Label' column; every other column is carried
        through unchanged.

    Returns
    -------
    DataFrame containing all rows of `data`, ordered train, test, dev, with a
    fresh 0..n-1 index. 'Label' is replaced by two trailing columns:
    'label2a' (the original label) and 'split' (0 = train, 1 = dev, 2 = test).

    Raises
    ------
    KeyError
        If `data` has no 'Label' column.
    ValueError
        Raised by train_test_split when stratification is not possible.
    """
    features=data.drop(['Label'],axis=1)
    labels=data['Label']

    # First cut: hold out 15% as the test partition, stratified on the label.
    X_rest, X_test, y_rest, y_test = train_test_split(features, labels, test_size=0.15, random_state=1, stratify=labels)

    # Second cut: carve dev out of the remainder. The fraction is
    # test_rows / remaining_rows (rounded to 2 decimals), so dev ends up
    # roughly the same size as test.
    dev_fraction=round(X_test.shape[0]/X_rest.shape[0],2)
    X_train, X_dev, y_train, y_dev = train_test_split(X_rest, y_rest, test_size=dev_fraction, random_state=1, stratify=y_rest)

    def _labelled(part,part_labels,split_id):
        # Re-attach the label as 'label2a' and tag the partition id as 'split'.
        part=part.copy()
        part['label2a']=part_labels
        part['split']=split_id
        return part

    # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same
    # train -> test -> dev row order.
    final_dataset=pd.concat(
        [_labelled(X_train,y_train,0),_labelled(X_test,y_test,2),_labelled(X_dev,y_dev,1)],
        ignore_index=True,
    )

    return final_dataset

### MELD

In [None]:
# Load the MELD annotations and tag every row with its source corpus.
meld=pd.read_csv(source+"MELD.csv")
meld['Dataset']='MELD'

# Recode label value 2 as 0; every other label value is left unchanged.
meld['Label']=np.where((meld['Label']==2) ,0,meld['Label'])

In [None]:
meld=meld[['FileName','Speaker','Dataset','text','Label','Sentiment','Toxic Filter']]

In [None]:
#Toxic Subset
toxic=get_toxic(meld)

#Toxic Filtered Non-Toxic Subset
nontoxic=get_nontoxic(meld,toxic.shape[0])

#Sentiment Balanced Non Toxic Filtered
nontoxic_balanced=get_nontoxic_balanced(meld,toxic.shape[0])

In [None]:
#Final Combined
# DataFrame.append was removed in pandas 2.0. pd.concat stacks the three
# subsets in the same order (toxic, filtered non-toxic, sentiment-balanced
# non-toxic) and ignore_index=True already yields a clean 0..n-1 index.
final_meld=pd.concat([toxic,nontoxic,nontoxic_balanced],ignore_index=True)

In [None]:
final_meld.shape

In [None]:
final_meld['Sentiment'].value_counts()

In [None]:
final_meld['Speaker'].nunique()

In [None]:
final_meld.drop(['Speaker'],axis=1,inplace=True)

In [None]:
final_meld.to_csv(destination+"Meld.csv",index=False)

### IEMOCAP

In [None]:
# Load the IEMOCAP annotations.
# NOTE: the next cell reads the same file again, so this read is redundant.
iemocap=pd.read_csv(source+"iemocap.csv")

In [None]:
# Load IEMOCAP and keep only the columns needed below.
iemocap=pd.read_csv(source+"iemocap.csv")
iemocap=iemocap[['FileName','text','Label','Trigger','Sentiment Label','Sentiment Value','Toxic Filter']]

# Rows whose 'Sentiment Value' lies in [0.4, 0.6] are labelled "neutral";
# all other rows keep their 'Sentiment Label'. The result is lower-cased.
iemocap['Sentiment']=np.where(((iemocap['Sentiment Value']>=0.4) & (iemocap['Sentiment Value']<=0.6)),"neutral",iemocap['Sentiment Label'])
iemocap['Sentiment']=iemocap['Sentiment'].str.lower()
iemocap['Dataset']='IEMOCAP'

# Final column layout; the raw sentiment label/value columns are dropped.
iemocap=iemocap[['FileName','Dataset','text','Label','Trigger','Sentiment','Toxic Filter']]

In [None]:
#Toxic Subset
toxic=get_toxic(iemocap)

#Toxic Filtered Non-Toxic Subset
nontoxic=get_nontoxic(iemocap,toxic.shape[0])

#Sentiment Balanced Non Toxic Filtered
nontoxic_balanced=get_nontoxic_balanced(iemocap,toxic.shape[0])

In [None]:
#Final Combined
# DataFrame.append was removed in pandas 2.0. pd.concat stacks the three
# subsets in the same order and ignore_index=True yields a clean 0..n-1 index.
final_iemocap=pd.concat([toxic,nontoxic,nontoxic_balanced],ignore_index=True)
final_iemocap.shape

In [None]:
final_iemocap.to_csv(destination+"IEMOCAP.csv",index=False)

In [None]:
final_iemocap['Sentiment'].value_counts()

## LJSpeech

In [None]:
# Load the LJSpeech annotations.
ljspeech=pd.read_csv(source+"LJSpeech.csv")

# Use the normalized transcription as the common 'text' column.
ljspeech.rename(columns={'Normalized Transcription':'text'},inplace=True)

# Rows whose 'Sentiment Value' lies in [0.4, 0.6] are labelled "neutral";
# all other rows keep their 'Sentiment Label'. The result is lower-cased.
ljspeech['Sentiment']=np.where(((ljspeech['Sentiment Value']>=0.4) & (ljspeech['Sentiment Value']<=0.6)),"neutral",ljspeech['Sentiment Label'])
ljspeech['Sentiment']=ljspeech['Sentiment'].str.lower()
ljspeech['Dataset']='LJ Speech'

# Final column layout.
ljspeech=ljspeech[['FileName','Dataset','text','Label','Sentiment','Toxic Filter']]

In [None]:
#Toxic Subset
toxic=get_toxic(ljspeech)

#Toxic Filtered Non-Toxic Subset
# Unlike most other datasets in this notebook, get_nontoxic is not used here:
# every row with 'Toxic Filter' == 'Yes' and Label 0 is kept, with no sampling.
nontoxic=ljspeech[(ljspeech['Toxic Filter']=='Yes') & (ljspeech['Label']==0)]
nontoxic.reset_index(drop=True,inplace=True)

#Sentiment Balanced Non Toxic Filtered
# Sized relative to the number of toxic rows.
nontoxic_balanced=get_nontoxic_balanced(ljspeech,toxic.shape[0])

In [None]:
#Final Combined
# DataFrame.append was removed in pandas 2.0. pd.concat stacks the three
# subsets in the same order and ignore_index=True yields a clean 0..n-1 index.
final_ljspeech=pd.concat([toxic,nontoxic,nontoxic_balanced],ignore_index=True)
final_ljspeech.shape

In [None]:
final_ljspeech.to_csv(destination+"LJSpeech.csv",index=False)

In [None]:
final_ljspeech['Sentiment'].value_counts()

## MOSEI

In [None]:
mosei=pd.read_csv(source+'mosei.csv')

In [None]:
# Rows whose 'Sentiment Value' lies in [0.4, 0.6] are labelled "neutral";
# all other rows keep their 'Sentiment Label'. The result is lower-cased.
mosei['Sentiment']=np.where(((mosei['Sentiment Value']>=0.4) & (mosei['Sentiment Value']<=0.6)),"neutral",mosei['Sentiment Label'])
mosei['Sentiment']=mosei['Sentiment'].str.lower()

mosei['Dataset']='CMU-MOSEI'

# Recode label value 2 as 0, then keep only the columns used below
# (segment id and start/end columns are retained for this dataset).
mosei['Label']=np.where((mosei['Label']==2) ,0,mosei['Label'])
mosei=mosei[['FileName','Dataset','FileNameId','Starting','Ending','text','Label','Sentiment','Toxic Filter']]

In [None]:
# Drop rows whose FileName is the literal string '#NAME?'
# (presumably a spreadsheet export artefact — TODO confirm), then re-index.
mosei=mosei[mosei['FileName']!='#NAME?']
mosei.reset_index(drop=True,inplace=True)

In [None]:
# Keep the raw name as 'OriginalFileName' and build a per-segment FileName of
# the form <OriginalFileName>_<FileNameId>.
# The rename is guarded so that re-running this cell does not rename the
# derived 'FileName' column as well, which would leave two
# 'OriginalFileName' columns.
if 'OriginalFileName' not in mosei.columns:
    mosei.rename(columns={'FileName':'OriginalFileName'},inplace=True)
mosei['FileName']=mosei['OriginalFileName']+"_"+mosei['FileNameId'].map(str)

In [None]:
mosei['Label'].value_counts()

In [None]:
mosei['Sentiment'].value_counts()

In [None]:
#Toxic Subset
toxic=get_toxic(mosei)

#Toxic Filtered Non-Toxic Subset
nontoxic=get_nontoxic(mosei,toxic.shape[0])

#Sentiment Balanced Non Toxic Filtered
nontoxic_balanced=get_nontoxic_balanced(mosei,toxic.shape[0])

In [None]:
#Final Combined
# DataFrame.append was removed in pandas 2.0. pd.concat stacks the three
# subsets in the same order and ignore_index=True yields a clean 0..n-1 index.
final_mosei=pd.concat([toxic,nontoxic,nontoxic_balanced],ignore_index=True)
final_mosei.shape

In [None]:
final_mosei.to_csv(destination+"CMU-MOSEI.csv",index=False)

In [None]:
final_mosei['Sentiment'].value_counts()

## MOSI

In [None]:
# Load CMU-MOSI and align the file-name column with the other datasets.
mosi=pd.read_csv(source+'mosi.csv')
mosi.rename(columns={'file':'FileName'},inplace=True)

# Rows whose 'Sentiment Value' lies in [0.4, 0.6] are labelled "neutral";
# all other rows keep their 'Sentiment Label'. The result is lower-cased.
mosi['Sentiment']=np.where(((mosi['Sentiment Value']>=0.4) & (mosi['Sentiment Value']<=0.6)),"neutral",mosi['Sentiment Label'])
mosi['Sentiment']=mosi['Sentiment'].str.lower()
mosi['Dataset']='CMU-MOSI'

# Recode label value 2 as 0, then keep only the columns used below.
mosi['Label']=np.where((mosi['Label']==2) ,0,mosi['Label'])
mosi=mosi[['FileName','start','end','id','Dataset','text','Label','Sentiment','Toxic Filter']]

In [None]:
# Make FileName unique per segment: <FileName>_<id>.
# NOTE: this overwrites FileName in place, so running the cell twice appends
# the id suffix twice.
mosi['FileName']=mosi['FileName']+"_"+mosi['id'].map(str)

In [None]:
#Toxic Subset
toxic=get_toxic(mosi)

#Toxic Filtered Non-Toxic Subset
# get_nontoxic is not used here: every row with 'Toxic Filter' == 'Yes' whose
# Label is not 1 is kept, with no sampling. Note the condition is Label != 1
# rather than Label == 0 as in get_nontoxic.
nontoxic=mosi[(mosi['Toxic Filter']=='Yes') & (mosi['Label']!=1)]
nontoxic.reset_index(drop=True,inplace=True)

#Sentiment Balanced Non Toxic Filtered
# Sized relative to the number of toxic rows.
nontoxic_balanced=get_nontoxic_balanced(mosi,toxic.shape[0])

In [None]:
#Final Combined
# DataFrame.append was removed in pandas 2.0. pd.concat stacks the three
# subsets in the same order and ignore_index=True yields a clean 0..n-1 index.
final_mosi=pd.concat([toxic,nontoxic,nontoxic_balanced],ignore_index=True)
final_mosi.shape

In [None]:
final_mosi.to_csv(destination+"CMU-MOSI.csv",index=False)

In [None]:
final_mosi['Sentiment'].value_counts()

## Common Voice

In [None]:
# Load the Common Voice annotations.
common_voices=pd.read_csv(source+'Common Voice.csv')

# Align column names with the other datasets and tag the source corpus.
common_voices.rename(columns={'path':'FileName','Trigger Term':'Trigger'},inplace=True)
common_voices['Dataset']='Common Voice'

In [None]:
# 'Sentiment Value' is stored as text wrapped in parentheses; strip them and
# convert to float.
# regex=False makes the replacement literal on every pandas version ("(" and
# ")" are regex metacharacters). astype(str) lets the cell be re-run after the
# column has already been converted to float.
sentiment_text=common_voices['Sentiment Value'].astype(str)
sentiment_text=sentiment_text.str.replace("(","",regex=False)
sentiment_text=sentiment_text.str.replace(")","",regex=False)
common_voices['Sentiment Value']=sentiment_text.astype(float)

In [None]:
# Rows whose 'Sentiment Value' lies in [0.4, 0.6] are labelled "neutral";
# all other rows keep their 'Sentiment Label'. The result is lower-cased.
common_voices['Sentiment']=np.where(((common_voices['Sentiment Value']>=0.4) & (common_voices['Sentiment Value']<=0.6)),"neutral",common_voices['Sentiment Label'])
common_voices['Sentiment']=common_voices['Sentiment'].str.lower()

# Final column layout; client_id is kept for now and dropped before export.
common_voices=common_voices[['client_id','FileName','Dataset','text','Label','Trigger','Sentiment','Toxic Filter']]

In [None]:
#Toxic Subset
toxic=get_toxic(common_voices)

#Toxic Filtered Non-Toxic Subset
nontoxic=get_nontoxic(common_voices,toxic.shape[0])

#Sentiment Balanced Non Toxic Filtered
nontoxic_balanced=get_nontoxic_balanced(common_voices,toxic.shape[0])

In [None]:
#Final Combined
# DataFrame.append was removed in pandas 2.0. pd.concat stacks the three
# subsets in the same order and ignore_index=True yields a clean 0..n-1 index.
final_common_voice=pd.concat([toxic,nontoxic,nontoxic_balanced],ignore_index=True)
final_common_voice.shape

In [None]:
final_common_voice['client_id'].nunique()

In [None]:
final_common_voice.drop(['client_id'],axis=1,inplace=True)

In [None]:
final_common_voice.to_csv(destination+"Common_Voice.csv",index=False)

In [None]:
final_common_voice['Sentiment'].value_counts()

## MSP - Improv

In [None]:
# Load MSP-Improv and align the file-name column with the other datasets.
msp_improv=pd.read_csv(source+'MSP-Improv.csv')
msp_improv.rename(columns={'File Name':'FileName'},inplace=True)

# Rows whose 'Sentiment Value' lies in [0.4, 0.6] are labelled "neutral";
# all other rows keep their 'Sentiment Label'. The result is lower-cased.
msp_improv['Sentiment']=np.where(((msp_improv['Sentiment Value']>=0.4) & (msp_improv['Sentiment Value']<=0.6)),"neutral",msp_improv['Sentiment Label'])
msp_improv['Sentiment']=msp_improv['Sentiment'].str.lower()
msp_improv['Dataset']='MSP-Improv'

# Final column layout.
msp_improv=msp_improv[['FileName','Dataset','text','Label','Trigger','Sentiment','Toxic Filter']]

In [None]:
msp_improv['Label'].value_counts()

In [None]:
#Toxic Subset
toxic=get_toxic(msp_improv)

#Toxic Filtered Non-Toxic Subset
nontoxic=get_nontoxic(msp_improv,toxic.shape[0])

#Sentiment Balanced Non Toxic Filtered
nontoxic_balanced=get_nontoxic_balanced(msp_improv,toxic.shape[0])

In [None]:
#Final Combined
# DataFrame.append was removed in pandas 2.0. pd.concat stacks the three
# subsets in the same order and ignore_index=True yields a clean 0..n-1 index.
final_msp_improv=pd.concat([toxic,nontoxic,nontoxic_balanced],ignore_index=True)
final_msp_improv.shape

In [None]:
final_msp_improv.to_csv(destination+"MSP-Improv.csv",index=False)

In [None]:
final_msp_improv['Sentiment'].value_counts()

## Social IQ

In [None]:
# Load Social-IQ and align the file-name column with the other datasets.
social_iq=pd.read_csv(source+"Social-IQ.csv")
social_iq.rename(columns={'Filename':'FileName'},inplace=True)

# Rows whose 'Sentiment Value' lies in [0.4, 0.6] are labelled "neutral";
# all other rows keep their 'Sentiment Label'. The result is lower-cased.
social_iq['Sentiment']=np.where(((social_iq['Sentiment Value']>=0.4) & (social_iq['Sentiment Value']<=0.6)),"neutral",social_iq['Sentiment Label'])
social_iq['Sentiment']=social_iq['Sentiment'].str.lower()
social_iq['Dataset']='Social-IQ'

# Recode label value 2 as 0; every other label value is left unchanged.
social_iq['Label']=np.where((social_iq['Label']==2) ,0,social_iq['Label'])

In [None]:
social_iq=social_iq[['FileName','Dataset','text','start','end','Label','Sentiment','Toxic Filter']]

In [None]:
# Drop rows whose text or FileName is the literal string '#NAME?'
# (presumably a spreadsheet export artefact — TODO confirm).
social_iq=social_iq[social_iq['text']!='#NAME?']
social_iq=social_iq[social_iq['FileName']!='#NAME?']

# Re-index after the rows were removed.
social_iq.reset_index(drop=True,inplace=True)

In [None]:
social_iq['Label'].value_counts()

In [None]:
#Toxic Subset
toxic=get_toxic(social_iq)

#Toxic Filtered Non-Toxic Subset
nontoxic=get_nontoxic(social_iq,toxic.shape[0])

#Sentiment Balanced Non Toxic Filtered
nontoxic_balanced=get_nontoxic_balanced(social_iq,toxic.shape[0])

In [None]:
#Final Combined
# DataFrame.append was removed in pandas 2.0. pd.concat stacks the three
# subsets in the same order and ignore_index=True yields a clean 0..n-1 index.
final_socialiq=pd.concat([toxic,nontoxic,nontoxic_balanced],ignore_index=True)
final_socialiq.shape

In [None]:
final_socialiq.to_csv(destination+"Social-IQ.csv",index=False)

In [None]:
final_socialiq['Sentiment'].value_counts()

## VCTK

In [None]:
# Load VCTK and align the file-name column with the other datasets.
vctk=pd.read_csv(source+"VCTK.csv")
vctk.rename(columns={'filename':'FileName'},inplace=True)

# Rows whose 'Sentiment Value' lies in [0.4, 0.6] are labelled "neutral";
# all other rows keep their 'Sentiment Label'. The result is lower-cased.
vctk['Sentiment']=np.where(((vctk['Sentiment Value']>=0.4) & (vctk['Sentiment Value']<=0.6)),"neutral",vctk['Sentiment Label'])
vctk['Sentiment']=vctk['Sentiment'].str.lower()

vctk['Dataset']='VCTK'

# Recode label value 2 as 0, then keep only the columns used below.
vctk['Label']=np.where((vctk['Label']==2) ,0,vctk['Label'])
vctk=vctk[['FileName','Dataset','text','Label','Sentiment','Toxic Filter']]

In [None]:
vctk['Label'].value_counts()

In [None]:
# Split each FileName on backslashes into separate columns.
# NOTE: `speaker` is not referenced by any later cell visible in this notebook.
speaker=vctk['FileName'].str.split('\\',expand=True)

In [None]:
#Toxic Subset
toxic=get_toxic(vctk)

#Toxic Filtered Non-Toxic Subset
nontoxic=get_nontoxic(vctk,toxic.shape[0])

#Sentiment Balanced Non Toxic Filtered
nontoxic_balanced=get_nontoxic_balanced(vctk,toxic.shape[0])

In [None]:
#Final Combined
# DataFrame.append was removed in pandas 2.0. pd.concat stacks the three
# subsets in the same order and ignore_index=True yields a clean 0..n-1 index.
final_vctk=pd.concat([toxic,nontoxic,nontoxic_balanced],ignore_index=True)
final_vctk.shape

In [None]:
final_vctk.to_csv(destination+"VCTK.csv",index=False)

In [None]:
final_vctk['Sentiment'].value_counts()

## Switchboard

In [None]:
swbd=pd.read_csv(source+"Switchboard.csv")

In [None]:
swbd.rename(columns={'Filename':'FileName'},inplace=True)

In [None]:
# Rows whose 'Sentiment Value' lies in [0.4, 0.6] are labelled "neutral";
# all other rows keep their label. Note that the label column in this file is
# named 'Sentiment Labels' (plural), unlike the other datasets.
swbd['Sentiment']=np.where(((swbd['Sentiment Value']>=0.4) & (swbd['Sentiment Value']<=0.6)),"neutral",swbd['Sentiment Labels'])
swbd['Sentiment']=swbd['Sentiment'].str.lower()

swbd['Dataset']='Switchboard'

# Recode label value 2 as 0, then keep only the columns used below.
swbd['Label']=np.where((swbd['Label']==2) ,0,swbd['Label'])
swbd=swbd[['FileName','Dataset','text','Label','Trigger','Sentiment','Toxic Filter']]

In [None]:
#Toxic Subset
toxic=get_toxic(swbd)

#Toxic Filtered Non-Toxic Subset
nontoxic=get_nontoxic(swbd,toxic.shape[0])

#Sentiment Balanced Non Toxic Filtered
nontoxic_balanced=get_nontoxic_balanced(swbd,toxic.shape[0])

In [None]:
#Final Combined
# DataFrame.append was removed in pandas 2.0. pd.concat stacks the three
# subsets in the same order and ignore_index=True yields a clean 0..n-1 index.
final_swbd=pd.concat([toxic,nontoxic,nontoxic_balanced],ignore_index=True)
final_swbd.shape

In [None]:
final_swbd.to_csv(destination+"Switchboard.csv",index=False)

In [None]:
final_swbd['Sentiment'].value_counts()

## MSP Podcast

In [None]:
msp_podcast=pd.read_csv(source+"MSP-Podcast.csv")

In [None]:
msp_podcast['Split_Set'].value_counts()

In [None]:
msp_podcast.rename(columns={'File Name':'FileName'},inplace=True)

In [None]:
msp_podcast.head(2)

In [None]:
# Rows whose 'Sentiment Value' lies in [0.4, 0.6] are labelled "neutral";
# all other rows keep their 'Sentiment Label'. The result is lower-cased.
msp_podcast['Sentiment']=np.where(((msp_podcast['Sentiment Value']>=0.4) & (msp_podcast['Sentiment Value']<=0.6)),"neutral",msp_podcast['Sentiment Label'])
msp_podcast['Sentiment']=msp_podcast['Sentiment'].str.lower()

msp_podcast['Dataset']='MSP-Podcast'

# Recode label value 2 as 0, then keep only the columns used below
# (SpkrID is retained for the speaker counts in later cells).
msp_podcast['Label']=np.where((msp_podcast['Label']==2) ,0,msp_podcast['Label'])
msp_podcast=msp_podcast[['FileName','Dataset','text','Label','Trigger','Sentiment','Toxic Filter','SpkrID']]

In [None]:
msp_podcast['SpkrID'].nunique()

In [None]:
#Toxic Subset
toxic=get_toxic(msp_podcast)

#Toxic Filtered Non-Toxic Subset
nontoxic=get_nontoxic(msp_podcast,toxic.shape[0])

#Sentiment Balanced Non Toxic Filtered
nontoxic_balanced=get_nontoxic_balanced(msp_podcast,toxic.shape[0])

In [None]:
#Final Combined
# DataFrame.append was removed in pandas 2.0. pd.concat stacks the three
# subsets in the same order and ignore_index=True yields a clean 0..n-1 index.
final_podcast=pd.concat([toxic,nontoxic,nontoxic_balanced],ignore_index=True)
final_podcast.shape

In [None]:
final_podcast['Sentiment'].value_counts()

In [None]:
final_podcast['text']=final_podcast['text'].str.lower()

In [None]:
final_podcast['SpkrID'].nunique()

In [None]:
final_podcast.head(2)

In [None]:
final_podcast.shape

In [None]:
final_podcast.to_csv(destination+"MSP-Podcast.csv",index=False)

### Combining all Transcripts

In [None]:
# Collect only the per-dataset transcript CSVs. The combined export written
# later in this notebook ("DeToxy.csv") lands in the same folder and has no
# 'Label' column, so feeding it back into get_split on a re-run would fail.
# Sorting gives a stable dataset order regardless of the file system.
files=sorted(f for f in os.listdir(destination) if f.lower().endswith('.csv') and f!='DeToxy.csv')

In [None]:
# Start from an empty frame that already carries the target columns, so the
# combined result uses this column order.
combined_df=pd.DataFrame()
combined_df[['Dataset','FileName','text','label2a','Trigger','Sentiment','Toxic Filter','split']]=np.nan

In [None]:
# Split every per-dataset transcript file and stack the results.
split_frames=[]
for file_name in files:
    temp=pd.read_csv(destination+file_name)
    # filter() keeps whichever of these columns the file has (e.g. 'Trigger'
    # is not present for every dataset).
    temp=temp.filter(['Dataset','FileName','text','Label','Trigger','Sentiment','Toxic Filter'])
    split_frames.append(get_split(temp))

# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies it on every step; concatenate once instead. Only the column
# layout of combined_df is reused (iloc[0:0]), so re-running this cell does
# not stack the data on top of a previous run.
combined_df=pd.concat([combined_df.iloc[0:0]]+split_frames)

combined_df.reset_index(drop=True,inplace=True)

In [None]:
combined_df['Dataset'].value_counts()

In [None]:
# Datasets without a 'Trigger' column end up with NaN here; use 0 instead.
# Assign the result back rather than calling fillna(inplace=True) on the
# selected column, which may act on a temporary copy and leave combined_df
# unchanged.
combined_df['Trigger']=combined_df['Trigger'].fillna(0)

In [None]:
# Remove the audio/text extensions from the file names.
# regex=False forces a literal match: under the old pandas default
# (regex=True) the leading '.' matched any character, so e.g. "Xwav" inside a
# name would have been removed as well.
for extension in ('.wav','.mp3','.txt'):
    combined_df['FileName']=combined_df['FileName'].str.replace(extension,'',regex=False)

In [None]:
combined_df[combined_df['split']==0]['label2a'].value_counts()

In [None]:
#Dev and Test
# split == 1 marks dev rows and split == 2 marks test rows; each subset gets
# its own 0..n-1 index.
dev=combined_df.loc[combined_df['split'].eq(1)].reset_index(drop=True)

test=combined_df.loc[combined_df['split'].eq(2)].reset_index(drop=True)

In [None]:
# Rebuild the combined frame in train -> dev -> test order.
# NOTE(review): `final_train` was not defined in any visible cell, so a fresh
# kernel failed here. It is taken to be the unmodified train partition
# (split == 0) — confirm that no extra balancing step was intended.
final_train=combined_df[combined_df['split']==0].reset_index(drop=True)

# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame.
combined_df=pd.concat([final_train,dev,test],ignore_index=True)

In [None]:
combined_df[combined_df['split']==0]['label2a'].value_counts()

In [None]:
combined_df[combined_df['split']==1]['label2a'].value_counts()

In [None]:
combined_df[combined_df['split']==2]['label2a'].value_counts()

In [None]:
#Exporting Final Dataset
combined_df.to_csv(destination+"DeToxy.csv",index=False)

In [None]:
#To Split onto different files
# The train rows are read back from combined_df (split == 0) so this cell does
# not depend on a separately defined `final_train` variable.
# Files are written to the current working directory.
combined_df[combined_df['split']==0].to_csv("Train.csv",index=False)
dev.to_csv("valid.csv",index=False)
test.to_csv("test.csv",index=False)