In [68]:
# Loading required packages
import spacy
import pandas as pd
import re
import json
import csv
from sklearn.model_selection import train_test_split
from names_dataset import NameDataset
from names_dataset import NameDatasetV1
from flashtext import KeywordProcessor
from tqdm import tqdm



# Select Spacy model
# Efficiency
# nlp = spacy.load("en_core_web_sm")

# Accuracy
nlp = spacy.load("en_core_web_trf")

# import stanza
# stanza.download('en')       # This downloads the English models for the neural pipeline
# nlp = stanza.Pipeline('en', processors="tokenize,ner") # This sets up a default neural pipeline in English

# Change working directory
%cd '/Users/richardfrissen/Documents/Maastricht University/Thesis/Development/Datasets'

/Users/richardfrissen/Documents/Maastricht University/Thesis/Development/Datasets


## 2. Import data

### Load EMSCAD Dataset

In [69]:
# Load the csv file into the environment
jobdescriptions = pd.read_csv('EMSCAD/Input data/JobDescriptions_cleaned.csv', delimiter=',')
jobdescriptions

# Check length dataframe
len(jobdescriptions)

17880

### Subset data, remain only the column description

In [74]:
# Copy column description from DF jobdescription
descriptions = jobdescriptions['description']

# Convert Series into Dataframe
descriptions = descriptions.to_frame()
descriptions

Unnamed: 0,description
0,"Food, a fast-growing, -winning online food com..."
1,Organised - Focused - NAME_MASKED - Awesome!Do...
2,"Our client, located in Houston, is actively se..."
3,THE COMPANY: ESRI – Environmental Systems Rese...
4,JOB TITLE: Itemization Review Manager \nLOCATI...
...,...
17875,Just in case this is the first time you’ve vis...
17876,\nThe Payroll Accountant will focus primarily...
17877,Experienced Project Cost Control Staff Enginee...
17878,NAME_MASKED Studios is looking for an experien...


## 3. Clean the data

### Remove HTML patterns in job descriptions

### Once cleaned, we can put the data through Spacy's NLP pipeline and tokenize each description

In [75]:
%%time

result = []
counter = 0

Cleaned = pd.DataFrame()
for i in descriptions['description']:
    i = str(i)
    cleanr = re.compile('<.*?>')
    i = re.sub(cleanr, '', i)
    cleanr = re.compile('<[^>]+>')
    i = re.sub(cleanr, '', i)
    # Remove all between hashtag (# #)
    i = re.sub(r'#[\w-]+#', '#URL_MASKED#', i)
    i = re.sub(r'#[\w\s-]+#', '#URL_MASKED#', i)
    # URL universal
    i = re.sub(r"(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'.,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))", '#URL_MASKED#', i)
    # URL pattern
    i = re.sub(r'(#\s*URL_).[^#]+\s*[#]', '#URL_MASKED#', i)
    # Email patten
    i = re.sub(r'(#\s*EMAIL_).[^#]+\s*[#]', '#EMAIL_MASKED#', i)
    # Email adresses
    i = re.sub(r'/^([a-z0-9_\.-]+\@[\da-z-]+\.[a-z\.]{2,6})$/', '#EMAIL_MASKED#', i)
    # Phone pattern
    i = re.sub(r'^[+]*[(]{0,1}[0-9]{1,4}[)]{0,1}[-\s\./0-9]*$', '#PHONE_MASKED#', i)
    # All phone numbers
    i = re.sub(r'[+]*[(]{0,1}[0-9]{1,4}[)]{0,1}[-\s\./0-9]*$', '#PHONE_MASKED#', i)
    i = i.replace('\xa0', ' ')
    i = i.replace('\r', ' ')
    i = i.replace('\n', ' ')
    i = i.replace('&amp', ' ')
    i = i.replace('\N{SOFT HYPHEN}', '')
    j = str(i)
    i = str(i)
    result.append(i)

# Add the result
Cleaned["Result"] = result
Cleaned.head(5)

CPU times: user 5.29 s, sys: 27.6 ms, total: 5.32 s
Wall time: 5.37 s


Unnamed: 0,Result
0,"Food, a fast-growing, -winning online food com..."
1,Organised - Focused - NAME_MASKED - Awesome!Do...
2,"Our client, located in Houston, is actively se..."
3,THE COMPANY: ESRI – Environmental Systems Rese...
4,JOB TITLE: Itemization Review Manager LOCATIO...


### Split descriptions into sentences
### By doing so, we prepare the data for annotation and training the custom NER model ---> Spacy

In [77]:
%%time
# Split each description into sentences
import re
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"
endpoint = ('.')

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    sentences = [token for token in sentences if not token.startswith(prefixes)]
    return sentences

def remove_invalid_sentences(sentences):
    prefixes = ('.')
    output = []
    sentences = [token for token in sentences if not token.startswith(prefixes)]
    output.append(sentences)
    return output

result = []

for i in Cleaned["Result"]:
    sentences = str(i)
    sentences = split_into_sentences(sentences)
#     sentences = remove_invalid_sentences(sentences)
    if sentences:
        result.append(sentences)

sentences = pd.DataFrame(columns=['sentence'])
sentences["sentence"] = result
sentences['sentence'].to_csv('EMSCAD/Output data/sentence.csv', index = False, header = False)

CPU times: user 3.35 s, sys: 39 ms, total: 3.39 s
Wall time: 3.43 s


### TRAIN/ TEST SET

In [78]:
# 80% / 20% split
Train, Eval = train_test_split(sentences, test_size=0.2, shuffle=False)
Train = list(Train['sentence'])
Eval = list(Eval['sentence'])
FULL = list(sentences['sentence'])

## 4. Preparation and annotation

In [79]:
%%time
Train_Annotation_data = pd.DataFrame(columns=['Result', 'Label'])
Eval_Annotation_data = pd.DataFrame(columns=['Result', 'Label'])
FULL_Annotation_data = pd.DataFrame(columns=['Result', 'Label'])
#########################
lemmatizer = nlp.get_pipe("lemmatizer")
#########################

prefixes = ["\"","#","$","%","&","'","(",")","*","+",","," ","-","/",":",";","<","=",">","@","[","\\","]","^","_","`","{","|","}","~"]
prefixes_end = ["?","!"]
full_stop = ["."]
gdpr_begin = ["URL", "#URL", "EMAIL", "#EMAIL", "PHONE", "#PHONE"]
gdpr_begin = "(URL|#URL|EMAIL|#EMAIL|PHONE|#PHONE)"

def sentence_to_words(input_list):
    prefixes = ["\"","#","$","%","&","'","(",")","*","+",","," ","-","/",":",";","<","=",">","@","[","\\","]","^","_","`","{","|","}","~"]
    prefixes_end = ["?","!"]
    full_stop = ["."]
    result = []
    max_length = 0
    count = 0
    for i in tqdm(input_list):
        i = str(i)
        ######################### USE FOR en_core_web_trf model only! (because this model can handle only sentences with the number of characters up to 512)
        if len(i) > max_length:
            max_length = len(i)
        if len(i)>512:
            i = i[:512]
            count = count +1
        #########################
        i = nlp(i)
        for token in i:
            #########################
            token = token.lemma_
            #########################
            if str(token) not in prefixes and str(token) not in prefixes_end:
                result.append(token)
            elif str(token) in prefixes_end:
                result.append(nlp(full_stop[0]))
#     print(count)
#     print(max_length)
    return result

Train_Annotation_data["Result"] = sentence_to_words(Train)
Eval_Annotation_data["Result"] = sentence_to_words(Eval)
FULL_Annotation_data["Result"] = sentence_to_words(FULL)

  3%|▎         | 481/17374 [02:34<1:40:40,  2.80it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (882 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 17374/17374 [1:39:48<00:00,  2.90it/s]   


CPU times: user 6h 18min 42s, sys: 8min 57s, total: 6h 27min 40s
Wall time: 1h 39min 49s


In [80]:
# Export data to be annotated (Can be used for manual annotation!)
Train_Annotation_data['Result'].to_csv('EMSCAD/Output data/Train_Annotation_data.csv')
Eval_Annotation_data['Result'].to_csv('EMSCAD/Output data/Eval_Annotation_data.csv')
FULL_Annotation_data['Result'].to_csv('EMSCAD/Output data/Full_Annotation_data.csv')

In [81]:
# Import the source file that contains all biased word lists
biased_words = pd.read_csv('EMSCAD/Input data/biased_words.csv', delimiter=';')

In [82]:
%%time
# Automated annotation process (Based on the word lists imported.)
# Only exact matches will be annotated.
def automated_annotation(Annotation_data):
    result = []
    row = -1
    count = 0
    for i in tqdm(Annotation_data['Result']):
        i = str(i)
        row = row + 1
        for j in biased_words:
            for k in biased_words[j]:
                word = str(k)
                if word == i:
                    Annotation_data['Label'][row] = j
                    count = count + 1
    Annotation_data['Label'] = Annotation_data['Label'].fillna("O")
    print(str(count) + " words have been annotated.")
    return Annotation_data

Train_Annotation_data = automated_annotation(Train_Annotation_data)
Eval_Annotation_data = automated_annotation(Eval_Annotation_data)
FULL_Annotation_data = automated_annotation(FULL_Annotation_data)

100%|██████████| 2977570/2977570 [36:38<00:00, 1354.56it/s]


117200 words have been annotated.
CPU times: user 36min 17s, sys: 20.3 s, total: 36min 38s
Wall time: 36min 38s


In [83]:
# Export annotated data (By "automated" annotator)
Train_Annotation_data.to_csv('EMSCAD/Output data/Train_Annotation_data_output.tsv', sep='\t', index = False, header = False)
Eval_Annotation_data.to_csv('EMSCAD/Output data/Eval_Annotation_data_output.tsv', sep='\t', index = False, header = False)
FULL_Annotation_data.to_csv('EMSCAD/Output data/FULL_Annotation_data_output.tsv', sep='\t', index = False, header = False)

In [15]:
# Model config 1 and 2 --> Dataset 1
FULL_Annotation_data['Label'].value_counts()

O                                  2909508
Masculine-coded words                37572
Feminine-coded words                 31911
Exclusive language                    2765
Demographic and Racial language        380
LGBTQ-coloured language                103
Name: Label, dtype: int64

In [54]:
# Model config 3 and 4 -->  Dataset 2
FULL_Annotation_data['Label'].value_counts()

O                                  2866384
Feminine-coded words                 54143
Masculine-coded words                52094
Exclusive language                    3883
Demographic and Racial language       1694
LGBTQ-coloured language                172
Name: Label, dtype: int64

In [35]:
# Model config 5 and 6 -->  Dataset 3
FULL_Annotation_data['Label'].value_counts()

O                                  2909508
Masculine-coded words                37572
Feminine-coded words                 31911
Exclusive language                    2765
Demographic and Racial language        380
LGBTQ-coloured language                103
Name: Label, dtype: int64

In [84]:
# Model config 7 and 8 -->  Dataset 4
FULL_Annotation_data['Label'].value_counts()

O                                  2862507
Feminine-coded words                 54898
Masculine-coded words                54311
Exclusive language                    4022
Demographic and Racial language       1658
LGBTQ-coloured language                174
Name: Label, dtype: int64