# In [5]:

from owlready2 import get_ontology
import pandas as pd
import string

# Translation table that deletes every ASCII punctuation character from a token.
translator = str.maketrans('', '', string.punctuation)

# Ontology class name -> key of the label it switches on in the result dict.
_CLASS_TO_LABEL = {
    'ToxicLanguage': 'Toxic',
    'MedicalTerminology': 'MedicalTerminology',
    'MinorityGroup': 'MinorityGroup',
    'NonToxicLanguage': 'NonToxic',
}


def classify_sentence(sentence, ontology):
    """Flag which ontology categories are mentioned in *sentence*.

    The sentence is split on whitespace; each token has ASCII punctuation
    removed and is lower-cased, then compared for exact equality with the
    lower-cased ``name`` of every individual returned by
    ``ontology.Content.instances()``. For each matching individual, the
    names of the entries in its ``is_a`` list decide which flags are set.

    Args:
        sentence: Text to classify. Anything that is not a ``str`` (for
            example the NaN pandas yields for an empty CSV cell) is treated
            as a sentence with no matches.
        ontology: Object exposing ``Content.instances()``; each individual
            must provide ``name`` and ``is_a``.

    Returns:
        A dict with the keys ``'Toxic'``, ``'MedicalTerminology'``,
        ``'NonToxic'`` and ``'MinorityGroup'``, each 0 or 1. When no token
        matches any individual, ``'NonToxic'`` is 1 and the rest are 0.
        When a token matches an individual whose ``is_a`` entries are none
        of the four known class names, every flag stays 0.
    """
    classifications = {'Toxic': 0, 'MedicalTerminology': 0, 'NonToxic': 0, 'MinorityGroup': 0}

    # Missing values are not strings and have no .split(); label them like
    # any other sentence without a match instead of raising.
    if not isinstance(sentence, str):
        classifications['NonToxic'] = 1
        return classifications

    # Normalise each token once; a set makes the per-individual check O(1).
    tokens = {word.translate(translator).lower() for word in sentence.split()}
    if not tokens:
        classifications['NonToxic'] = 1
        return classifications

    matched = False
    # Enumerate the individuals a single time rather than once per word.
    for individual in ontology.Content.instances():
        if individual.name.lower() not in tokens:
            continue
        matched = True
        for cls in individual.is_a:
            # NOTE(review): is_a may hold entries without a ``name``
            # (e.g. anonymous class constructs) - skip those; confirm
            # against the ontology in use.
            label = _CLASS_TO_LABEL.get(getattr(cls, 'name', None))
            if label is not None:
                classifications[label] = 1

    if not matched:
        classifications['NonToxic'] = 1
    return classifications

def label_dataset(dataset_path, ontology_path):
    """Read a CSV of sentences and attach ontology-based label columns.

    The ontology at *ontology_path* is loaded with ``get_ontology(...).load()``
    and the CSV at *dataset_path* is read with ``pd.read_csv``. Four columns
    (``Toxic``, ``MedicalTerminology``, ``NonToxic``, ``MinorityGroup``) are
    added, initialised to 0, and then filled row by row with the result of
    ``classify_sentence`` applied to the row's ``text`` value.

    Args:
        dataset_path: Location of the CSV file; its rows are read through
            the ``text`` column.
        ontology_path: Location of the ontology passed to ``get_ontology``.

    Returns:
        The DataFrame read from the CSV with the four label columns set.
    """
    label_columns = ('Toxic', 'MedicalTerminology', 'NonToxic', 'MinorityGroup')

    ontology = get_ontology(ontology_path).load()
    frame = pd.read_csv(dataset_path)

    # Create every label column up front with a default of 0.
    for column in label_columns:
        frame[column] = 0

    for row_label, record in frame.iterrows():
        flags = classify_sentence(record['text'], ontology)
        # Copy each flag into the matching cell of the current row.
        for column in label_columns:
            frame.at[row_label, column] = flags[column]

    return frame

# Inputs: the ontology with its individuals and the sentences to be labelled.
ontology_path = "../ontology/toxic-language-ontology-with-individuals.owl"
dataset_path = "../data/proud_bg_sentences.csv"

# Label every sentence, then write the result without the index column.
sentences_df = label_dataset(
    dataset_path=dataset_path,
    ontology_path=ontology_path,
)
sentences_df.to_csv(path_or_buf='demo_labeled_dataset.csv', index=False)
