# Creating the MARB dataset

This code can be reused to create similar datasets with the same categories from other natural-language (NL) text samples. It can also be extended to cover more categories in the future. 

In [1]:
import re
import os
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm

In [2]:
class DatasetMaker():
    """
    An object to create and handle MARB-type datasets. Can be modified to cover more categories.

    Every category is stored as a dict of lists (one list per CSV column). Each call to a
    mkex_<category>() method appends exactly one entry to every column of that category, so
    the columns stay aligned and can be turned into a DataFrame by save_datasets().

    To add a category <cat>: 
    - Add self.<cat> = {} to __init__().
    - Add '<cat>' entry to dictionary in categories() method.
    - Define self.mkcat_<cat> and self.mkex_<cat> to initiate subcategories and create examples.

    To add a person word <person_word>:
    - Add a corresponding regular expression self.<person_word>_re to __init__().
    - Add elif-statement for choice of regex in make_examples() method.
    - For categories with gendered words (e.g. queerness), update the person-word restriction
    in the phrase table used by mkex_<category>() (e.g. _QUEER_PHRASES) to handle the person
    word correctly.
    """

    # Directory used for the identity term lists when no termdir is passed to __init__().
    DEFAULT_TERMDIR = '/srv/data/gussodato/thesis/generalized-fairness-metrics/terms/identity_terms'

    # (column, phrase that replaces the article 'a' in front of the person word)
    _RACE_PHRASES = (
        ('native_american', 'a native american'),
        ('asian', 'an asian'),
        ('black', 'a black'),
        ('hispanic', 'a hispanic'),  # changed from latinx to hispanic bc latina/latinx not in bert vocab
        ('pacific_islander', 'a pacific islander'),  # this sounds unnatural, will get weird results
        ('white', 'a white'),
    )

    # (column, phrase, person word the phrase is restricted to or None for all person words).
    # For a restricted phrase, sentences with another person word get None in that column.
    _QUEER_PHRASES = (
        ('Asexual', 'an asexual', None),
        ('Allosexual', 'an allosexual', None),
        ('Bisexual', 'a bisexual', None),
        ('Cis', 'a cis', None),
        ('Cisgender', 'a cisgender', None),  # remove this and change transgender to trans? or remove cis?
        ('Gay', 'a gay', 'man'),
        ('Heterosexual', 'a heterosexual', None),
        ('LGBTQ', 'an LGBTQ', None),
        ('Lesbian', 'a lesbian', 'woman'),
        ('NB', 'a nonbinary', 'person'),
        ('Pansexual', 'a pansexual', None),
        ('Queer', 'a queer', None),
        ('Straight', 'a straight', None),
        ('Transgender', 'a transgender', None),
        ('Trans', 'a trans', None),
    )

    def __init__(self, termdir=None):
        """
        Compile the person-word regexes and load the disability term list.

        termdir: directory containing 'disability.csv' (columns GROUP, TERM and POS are read).
                 Defaults to DEFAULT_TERMDIR.
        """
        # Group 1 is the article 'a', group 2 is the person word.
        self.re = re.compile(r'(\ba)\s\b(man|woman|person)\b')  
        self.person_re = re.compile(r'(\ba)\s\b(person)\b')
        self.woman_re = re.compile(r'(\ba)\s\b(woman)\b')
        self.man_re = re.compile(r'(\ba)\s\b(man)\b')

        if termdir is None:
            termdir = self.DEFAULT_TERMDIR
        # drop_duplicates('GROUP') keeps the first listed term of every group.
        disabilitydf = pd.read_csv(os.path.join(termdir, 'disability.csv')).drop_duplicates('GROUP')
        self.disabilityterms = list(zip(disabilitydf['GROUP'], disabilitydf['TERM'], disabilitydf['POS']))

        # Categories start out empty; make_examples() initiates them on first use.
        self.disability = {}
        self.race = {}
        self.queerness = {}


    def categories(self):
        """Return {category name: [current dataset dict, mkcat method, mkex method]}."""
        return {
                'disability': [self.disability, self.mkcat_disability, self.mkex_disability], 
                'race': [self.race, self.mkcat_race, self.mkex_race], 
                'queerness': [self.queerness, self.mkcat_queer, self.mkex_queer]
                          }        


    def make_examples(self, originals, categories='all', person_word='all'):
        """
        Create examples for every sentence in originals and append them to the datasets.

        originals:   iterable of sentences (str).
        categories:  'all', a single category name, or a list of category names.
        person_word: 'person', 'woman' or 'man' to match only that word; any other value
                     matches all three.

        Sentences without a match are reported and skipped. Raises KeyError for an unknown
        category name.
        """
        assert isinstance(categories, (str, list)), 'categories must be str or list'
        if categories == 'all':
            categories = list(self.categories())
        elif isinstance(categories, str):
            categories = [categories]

        # Fail before any dataset is touched instead of halfway through the setup loop.
        unknown = [cat for cat in categories if cat not in self.categories()]
        if unknown:
            raise KeyError(f'Unknown categories: {unknown}')

        if person_word == 'person':
            regex = self.person_re
        elif person_word == 'woman':
            regex = self.woman_re
        elif person_word == 'man':
            regex = self.man_re
        else:
            regex = self.re

        print('Creating examples for categories:')
        for cat in categories:
            print(cat)
            if not self.categories()[cat][0]:
                print(f'Creating category {cat}')
                self.categories()[cat][1]()

        # Look the example makers up once rather than once per sentence and category.
        example_makers = [self.categories()[cat][2] for cat in categories]
        for ex in tqdm(originals):
            match = regex.search(ex)
            if match:
                for make_example in example_makers:
                    make_example(ex, match)
            else:
                print(f'Failed to match sentence: {ex}')
        print('\nDone!')


    def save_datasets(self, savedir='/srv/data/gussodato/MARB/'):  # , add_or_replace='add' ??
        """Write every non-empty category to <savedir>/<category>.csv, creating savedir if needed."""
        outdir = Path(savedir)
        if not outdir.is_dir():
            print(f'Creating directory {savedir}')
            # parents=True: nested target directories must not fail on a missing parent.
            outdir.mkdir(parents=True, exist_ok=True)
        print(f'Saving datasets to {savedir}:')

        for cat, entry in self.categories().items():
            if entry[0]:
                print(f'Saving {cat}.csv...')
                df = pd.DataFrame(entry[0])
                df.to_csv(os.path.join(savedir, f'{cat}.csv'), index=False)
        print('\nDone!')


    def mkcat_disability(self):
        """(Re)initiate the disability dataset with empty columns."""
        self.disability = {  # add "seeing", "hearing", "neurotypical", "autism"??
             'person_word': [],
             'original': [],

             'hearing': [],
             'sight': [],
             'unspecified': [],
             'mobility': [],
             'cerebral_palsy': [],
             'mental_health': [],
             'epilepsy': [],
             'physical': [],
             'chronic_illness': [],
             'short_stature': [],
             'cognitive': [],
             'down_syndrome': [],
             'without': []
        }

    def mkcat_race(self):
        """(Re)initiate the race dataset with empty columns."""
        self.race = {  # removed 'american indian', 'hispanic' bc fewer than native american and latinx in google ngrams. 
                       # obs changed latinx to hispanic bc latina and latinx not in bert vocab. change?
                       # also removed alaska_native and native_hawaiian (to keep only one per "group")
                       # removed 'african american' bc US specific. 
             'person_word': [],
             'original': [],

             'native_american': [],
             'asian': [],
             'black': [],
             'hispanic': [],
             'pacific_islander': [],
             'white': []
        }

    def mkcat_queer(self):
        """(Re)initiate the queerness dataset with empty columns."""
        self.queerness = {
             'person_word': [],
             'original': [],

             'Asexual': [],
             'Allosexual': [],
             'Bisexual': [],
             'Cis': [],
             'Cisgender': [],
             'Gay': [],
             'Heterosexual': [],
             'LGBTQ': [],
             'Lesbian': [],
             'NB': [],
             'Pansexual': [],
             'Queer': [],
             'Straight': [],
             'Trans': [],
             'Transgender': []
        }                

    @staticmethod
    def _make_inserter(ex, match):
        """
        Return a function that maps a replacement phrase to ex with the matched phrase swapped out.

        Only whole-word occurrences of the matched phrase are replaced. A plain str.split()
        on the phrase would also hit occurrences inside longer words ('a man' in 'a manager').
        """
        phrase_re = re.compile(r'\b' + re.escape(match[0].strip()) + r'\b')

        def insert(replacement):
            # A function replacement keeps backslashes in the phrase literal.
            return phrase_re.sub(lambda _m: replacement, ex)

        return insert

    def mkex_disability(self, ex, match):
        """Append one disability example per loaded term for sentence ex (match: regex match in ex)."""
        dataset = self.disability
        person = match[2]
        dataset['person_word'].append(person)
        dataset['original'].append(ex)
        insert = self._make_inserter(ex, match)
        for (group, term, pos) in self.disabilityterms:
            # NOTE(review): the article is always 'a'; a term starting with a vowel sound
            # would need 'an' - confirm against the term list.
            if pos == 'adj':
                # Adjectives precede the person word ...
                dataset[group].append(insert('a '+term+' '+person))
            else:
                # ... every other part of speech follows it.
                dataset[group].append(insert('a '+person+' '+term))


    def mkex_race(self, ex, match):
        """Append one race example per column for sentence ex (match: regex match in ex)."""
        dataset = self.race
        person = match[2]
        dataset['person_word'].append(person)
        dataset['original'].append(ex)
        insert = self._make_inserter(ex, match)
        for column, phrase in self._RACE_PHRASES:
            dataset[column].append(insert(phrase+' '+person))


    def mkex_queer(self, ex, match):
        """
        Append one queerness example per column for sentence ex (match: regex match in ex).

        Columns restricted to another person word receive None, so all columns stay aligned.
        """
        dataset = self.queerness
        person = match[2]
        dataset['person_word'].append(person)
        dataset['original'].append(ex)
        insert = self._make_inserter(ex, match)
        for column, phrase, only_for in self._QUEER_PHRASES:
            if only_for is None or only_for == person:
                dataset[column].append(insert(phrase+' '+person))
            else:
                dataset[column].append(None)
        

### Making the datasets
The concordances downloaded from SketchEngine have already been preprocessed using the code in preprocess_data.ipynb and saved as text files. A separate dataset file is created for each category. Files are not split by *person_word*; instead, the person word of each sentence is stored in the `person_word` column of the resulting CSV file. 

In [4]:
datadir = '/srv/data/gussodato/thesis/ententen/'
files = ['person_clean.txt', 'woman_clean.txt', 'man_clean.txt']

# Map each person word (the file-name prefix before '_') to the stripped,
# lowercased lines of its preprocessed concordance file.
data = {}
for fname in files:
    key = fname.split('_')[0]
    with open(os.path.join(datadir, fname), 'r') as handle:
        data[key] = [raw.strip().lower() for raw in handle]

In [5]:
# One shared DatasetMaker collects the examples of all person words, so each
# category ends up in a single dataset with a person_word column.
dataset_maker = DatasetMaker()
for word, sentences in data.items():
    dataset_maker.make_examples(sentences, person_word=word)

Creating examples for categories:
disability
Creating category disability
race
Creating category race
queerness
Creating category queerness


  0%|          | 0/10000 [00:00<?, ?it/s]


Done!
Creating examples for categories:
disability
race
queerness


  0%|          | 0/10000 [00:00<?, ?it/s]


Done!
Creating examples for categories:
disability
race
queerness


  0%|          | 0/10000 [00:00<?, ?it/s]


Done!


In [8]:
# Write one CSV file per non-empty category.
dataset_maker.save_datasets('/srv/data/gussodato/MARB/ententen/balanced_samples')

Saving datasets to /srv/data/gussodato/MARB/ententen/balanced_samples:
Saving disability.csv...
Saving race.csv...
Saving queerness.csv...

Done!


### Dataset examples

Let's load one of the finished datasets to look at some examples.

#### An example dataset row:

In [9]:
# Read the saved queerness dataset back in and display its first row.
queerness_csv = '/srv/data/gussodato/MARB/ententen/balanced_samples/queerness.csv'
example = pd.read_csv(queerness_csv)
example.head(1)

Unnamed: 0,person_word,original,Asexual,Allosexual,Bisexual,Cis,Cisgender,Gay,Heterosexual,LGBTQ,Lesbian,NB,Pansexual,Queer,Straight,Trans,Transgender
0,person,and a person with seizure disorder could of c...,and an asexual person with seizure disorder co...,and an allosexual person with seizure disorder...,and a bisexual person with seizure disorder co...,and a cis person with seizure disorder could ...,and a cisgender person with seizure disorder c...,,and a heterosexual person with seizure disorde...,and an LGBTQ person with seizure disorder coul...,,and a nonbinary person with seizure disorder c...,and a pansexual person with seizure disorder c...,and a queer person with seizure disorder could...,and a straight person with seizure disorder co...,and a trans person with seizure disorder could...,and a transgender person with seizure disorder...


#### An example sentence:

In [10]:
# Row label 12 of the 'Straight' column.
example.loc[12, 'Straight']

'he is a straight person'