## Importing Libraries

In [None]:
#Standard Library
import os

#Modeling Libraries
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

#Data Wrangling Libraries
import pandas as pd
import numpy as np

#For Timer
from tqdm import tqdm

## Civil Comments

In [None]:
#Importing Model Trained on Civil Comments
from detoxify import Detoxify

#Loading Model
#Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU so
#the notebook still runs (more slowly) on machines without CUDA.
model_toxic = Detoxify('unbiased', device='cuda' if torch.cuda.is_available() else 'cpu')

Downloading: "https://github.com/unitaryai/detoxify/releases/download/v0.3-alpha/toxic_debiased-c7548aa0.ckpt" to /root/.cache/torch/hub/checkpoints/toxic_debiased-c7548aa0.ckpt


  0%|          | 0.00/476M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

In [None]:
#Directory create_toxic reads CSV files from, and directory it writes the
#toxicity-labelled copies to
source = "../Transcribed_Transcripts/"
destination = "../Civil_Comments_Output/"

In [None]:
def create_toxic(source,destination):
    """Score every CSV in ``source`` for toxicity and save labelled copies.

    For each ``*.csv`` file in ``source``, every value of the ``text`` column is
    passed one at a time to the module-level model ``model_toxic``. A copy of
    the input frame is written to ``destination`` under the same file name with
    two added columns:

    * ``Toxicity``  - the model's ``'toxicity'`` score rounded to 0.0 or 1.0.
    * ``Predicted`` - 1 where the rounded score is above 0.5 (toxic), else 0.

    Parameters
    ----------
    source : str
        Directory containing the input CSV files (read as latin-1). Each file
        must have a ``text`` column.
    destination : str
        Directory the labelled CSV files are written to; created if missing.

    Returns
    -------
    None
        Results are written to disk and progress is printed.
    """
    #Make sure the output directory exists before any file is written
    os.makedirs(destination, exist_ok=True)

    #Listing All CSV Files (sorted, because os.listdir order is arbitrary)
    csv_files = sorted(f for f in os.listdir(source) if f.endswith('.csv'))

    #Looping Through All CSV Files
    for csv_file in csv_files:

        print("Reading File: ",csv_file)

        #Reading the Files
        #os.path.join works whether or not the directory has a trailing slash
        train = pd.read_csv(os.path.join(source,csv_file),encoding='latin-1')

        print("Starting To Predict For Toxicity in each sentence")

        #Empty cells are read back as NaN; score them as empty strings instead
        #of handing a float to the model. The saved text column is untouched.
        texts = train['text'].fillna('').astype(str)

        #Storing Prediction Outputs
        toxicity = [model_toxic.predict(text)['toxicity'] for text in tqdm(texts)]

        toxic = train.copy()

        #Labeling Toxic(1) and Non Toxic(0)
        #The stored score is rounded to 0/1 before thresholding, as in the
        #original notebook output, so existing consumers see the same values.
        toxic['Toxicity'] = pd.Series(toxicity, index=toxic.index, dtype=float).round()
        toxic['Predicted'] = np.where(toxic['Toxicity']>0.5,1,0)
        print("Found {row} toxic utterances in {file} ".format(row=int(toxic['Predicted'].sum()),file=csv_file))
        toxic.to_csv(os.path.join(destination,csv_file),index=False)

        print("Finished Creating File: ",csv_file)

In [None]:
#Run toxicity labelling on every CSV in `source`, writing results to `destination`
create_toxic(source=source, destination=destination)

Reading File:  CommonVoice_Test_transcripts.csv
Starting To Predict For Toxicity in each sentence


100%|██████████| 3045/3045 [00:49<00:00, 61.16it/s]


Found 925 toxic utterances in CommonVoice_Test_transcripts.csv 
Finished Creating File:  CommonVoice_Test_transcripts.csv
Reading File:  SwitchBoard_Test_transcripts.csv
Starting To Predict For Toxicity in each sentence


100%|██████████| 3045/3045 [00:49<00:00, 61.09it/s]


Found 950 toxic utterances in SwitchBoard_Test_transcripts.csv 
Finished Creating File:  SwitchBoard_Test_transcripts.csv
Reading File:  CommonVoice_Trigger_Test_transcripts.csv
Starting To Predict For Toxicity in each sentence


100%|██████████| 844/844 [00:13<00:00, 61.76it/s]


Found 531 toxic utterances in CommonVoice_Trigger_Test_transcripts.csv 
Finished Creating File:  CommonVoice_Trigger_Test_transcripts.csv
Reading File:  CommonVoice_Dev_transcripts.csv
Starting To Predict For Toxicity in each sentence


100%|██████████| 3107/3107 [00:50<00:00, 61.92it/s]


Found 949 toxic utterances in CommonVoice_Dev_transcripts.csv 
Finished Creating File:  CommonVoice_Dev_transcripts.csv
Reading File:  SwitchBoard_TriggerTest_transcripts.csv
Starting To Predict For Toxicity in each sentence


100%|██████████| 844/844 [00:13<00:00, 61.22it/s]


Found 531 toxic utterances in SwitchBoard_TriggerTest_transcripts.csv 
Finished Creating File:  SwitchBoard_TriggerTest_transcripts.csv
Reading File:  SwitchBoard_Dev_transcripts.csv
Starting To Predict For Toxicity in each sentence


100%|██████████| 3107/3107 [00:50<00:00, 61.74it/s]


Found 1017 toxic utterances in SwitchBoard_Dev_transcripts.csv 
Finished Creating File:  SwitchBoard_Dev_transcripts.csv
Reading File:  LibriSpeech_test_transcripts.csv
Starting To Predict For Toxicity in each sentence


100%|██████████| 3045/3045 [00:49<00:00, 61.63it/s]


Found 955 toxic utterances in LibriSpeech_test_transcripts.csv 
Finished Creating File:  LibriSpeech_test_transcripts.csv
Reading File:  LibriSpeech_trigger_transcripts.csv
Starting To Predict For Toxicity in each sentence


100%|██████████| 844/844 [00:13<00:00, 60.53it/s]


Found 514 toxic utterances in LibriSpeech_trigger_transcripts.csv 
Finished Creating File:  LibriSpeech_trigger_transcripts.csv
Reading File:  LibriSpeech_dev_transcripts.csv
Starting To Predict For Toxicity in each sentence


100%|██████████| 3107/3107 [00:53<00:00, 58.40it/s]


Found 1005 toxic utterances in LibriSpeech_dev_transcripts.csv 
Finished Creating File:  LibriSpeech_dev_transcripts.csv
Reading File:  Gold_trigger_test.csv
Starting To Predict For Toxicity in each sentence


100%|██████████| 844/844 [00:14<00:00, 57.85it/s]


Found 549 toxic utterances in Gold_trigger_test.csv 
Finished Creating File:  Gold_trigger_test.csv
Reading File:  Gold_Dev.csv
Starting To Predict For Toxicity in each sentence


100%|██████████| 3107/3107 [00:51<00:00, 60.09it/s]


Found 967 toxic utterances in Gold_Dev.csv 
Finished Creating File:  Gold_Dev.csv
Reading File:  Gold_Test.csv
Starting To Predict For Toxicity in each sentence


100%|██████████| 3045/3045 [00:48<00:00, 62.21it/s]

Found 898 toxic utterances in Gold_Test.csv 
Finished Creating File:  Gold_Test.csv





# Evaluating Results

In [None]:
#Importing Sklearn Metric Library
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [None]:
#Listing all output Files
#NOTE: `source` is re-pointed at the output directory from here on.
source='../Civil_Comments_Output/'
#os.listdir returns names in arbitrary order; sort so the report below is
#printed in the same order on every run.
csv_files = sorted(f for f in os.listdir(source) if f.endswith('.csv'))

In [None]:
#For every labelled output file, compare the 'Label' column with the
#'Predicted' column and report the micro- and macro-averaged F1 scores.
separator = "######################"
print(separator)
for j, file_name in enumerate(csv_files):

    print("Reading File: ",file_name)
    data = pd.read_csv(source+file_name)

    y_true, y_pred = data['Label'], data['Predicted']
    micro, macro = (
        f1_score(y_true, y_pred, average=averaging)
        for averaging in ('micro', 'macro')
    )

    print("Micro F1 Score:",micro)
    print("Macro F1 Score:",macro)
    print(separator)