In [1]:
# Loading required packages
import spacy
import pandas as pd
import re
import json
import csv
from sklearn.model_selection import train_test_split

# Select Spacy model
# Efficiency
nlp = spacy.load("en_core_web_sm")

# Accuracy
# nlp = spacy.load("en_core_web_trf")

# Change working directory
%cd '/Users/richardfrissen/Documents/Maastricht University/Thesis/Development/Datasets'

/Users/richardfrissen/Documents/Maastricht University/Thesis/Development/Datasets


## 2. Import data

### Load EMSCAD Dataset

In [2]:
# Load the csv file into the environment
jobdescriptions = pd.read_csv('EMSCAD/Input data/JobDescriptions.csv', delimiter=',')
jobdescriptions.head(5)
len(jobdescriptions)

17880

### Subset data, remain only the column description

In [3]:
# Copy column description from DF jobdescription
descriptions = jobdescriptions['description']

# Convert Series into Dataframe
descriptions = descriptions.to_frame()
descriptions.head(5)

Unnamed: 0,description
0,"<p>Food52, a fast-growing, James Beard Award-w..."
1,<p>Organised - Focused - Vibrant - Awesome!<br...
2,"<p>Our client, located in Houston, is actively..."
3,<p><b>THE COMPANY: ESRI – Environmental System...
4,<p><b>JOB TITLE:</b> Itemization Review Manage...


## 3. Clean the data

### Remove HTML patterns in job descriptions

### Once cleaned, we can put the data through Spacy's NLP pipeline and tokenize each description

In [187]:
%%time
# Remove HTML codes based on pattern cleanr
# Remove all characters except whitespace an alphabetic characters.

result = []
Cleaned = pd.DataFrame()
for i in descriptions["description"]:
    cleanr = re.compile('<.*?>')
    i = re.sub(cleanr, '', i)
    cleanr = re.compile('<[^>]+>')
    i = re.sub(cleanr, '', i)
    i = i.replace('\xa0', ' ')
    i = i.replace('\r', ' ')
    i = i.replace('&amp', ' ')
    i = i.replace('\N{SOFT HYPHEN}', '')
    result.append(i)

# Add the result
Cleaned["Result"] = result
Cleaned.head(5)

CPU times: user 376 ms, sys: 30.3 ms, total: 406 ms
Wall time: 411 ms


Unnamed: 0,Result
0,"Food52, a fast-growing, James Beard Award-winn..."
1,Organised - Focused - Vibrant - Awesome!Do you...
2,"Our client, located in Houston, is actively se..."
3,THE COMPANY: ESRI – Environmental Systems Rese...
4,JOB TITLE: Itemization Review Manager \nLOCATI...


### Split descriptions into sentences
### By doing so, we prepare the data for annotation and training the custom NER model ---> Spacy

In [188]:
%%time
# Split each description into sentences
import re
alphabets= "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov)"
endpoint = ('.')

def split_into_sentences(text):
    text = " " + text + "  "
    text = text.replace("\n"," ")
    text = re.sub(prefixes,"\\1<prd>",text)
    text = re.sub(websites,"<prd>\\1",text)
    if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
    text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
    text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
    text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
    text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
    text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
    if "”" in text: text = text.replace(".”","”.")
    if "\"" in text: text = text.replace(".\"","\".")
    if "!" in text: text = text.replace("!\"","\"!")
    if "?" in text: text = text.replace("?\"","\"?")
    text = text.replace(".",".<stop>")
    text = text.replace("?","?<stop>")
    text = text.replace("!","!<stop>")
    text = text.replace("<prd>",".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    sentences = [token for token in sentences if not token.startswith(prefixes)]
    return sentences

def remove_invalid_sentences(sentences):
    prefixes = ('.')
    output = []
    sentences = [token for token in sentences if not token.startswith(prefixes)]
    output.append(sentences)
    return output

result = []

for i in Cleaned["Result"]:
    sentences = str(i)
    sentences = split_into_sentences(sentences)
#     sentences = remove_invalid_sentences(sentences)
    if sentences:
        result.append(sentences)

sentences = pd.DataFrame(columns=['sentence'])
sentences["sentence"] = result
sentences['sentence'].to_csv('EMSCAD/Output data/sentence.csv', index = False, header = False)

CPU times: user 3.21 s, sys: 72.5 ms, total: 3.29 s
Wall time: 3.31 s


### TRAIN/ TEST SET

In [189]:
# 80% / 20% split
Train, Eval = train_test_split(sentences, test_size=0.2, shuffle=False)
Train = list(Train['sentence'])
Eval = list(Eval['sentence'])
FULL = list(sentences['sentence'])

## 4. Preparation and annotation

In [202]:
%%time
Train_Annotation_data = pd.DataFrame(columns=['Result', 'Label'])
Eval_Annotation_data = pd.DataFrame(columns=['Result', 'Label'])
FULL_Annotation_data = pd.DataFrame(columns=['Result', 'Label'])
#########################
# lemmatizer = nlp.get_pipe("lemmatizer")
#########################

prefixes = ["\"","#","$","%","&","'","(",")","*","+",","," ","-","/",":",";","<","=",">","@","[","\\","]","^","_","`","{","|","}","~"]
prefixes_end = ["?","!"]
full_stop = ["."]

def sentence_to_words(input_list):
    prefixes = ["\"","#","$","%","&","'","(",")","*","+",","," ","-","/",":",";","<","=",">","@","[","\\","]","^","_","`","{","|","}","~"]
    prefixes_end = ["?","!"]
    full_stop = ["."]
    result = []
    max_length = 0
    count = 0
    for i in input_list:
        i = str(i)
        ######################### USE FOR en_core_web_trf model only!
#         if len(i) > max_length:
#             max_length = len(i)
#         if len(i)>512:
#             i = i[:512]
#             count = count +1
        #########################
        i = nlp(i)
        for token in i:
            #########################
#             token = token.lemma_
            #########################
            if str(token) not in prefixes and str(token) not in prefixes_end:
                result.append(token)
            elif str(token) in prefixes_end:
                result.append(nlp(full_stop[0]))
#     print(count)
#     print(max_length)
    return result

Train_Annotation_data["Result"] = sentence_to_words(Train)
Eval_Annotation_data["Result"] = sentence_to_words(Eval)
FULL_Annotation_data["Result"] = sentence_to_words(FULL)

CPU times: user 16min, sys: 1min 49s, total: 17min 50s
Wall time: 17min 59s


In [203]:
# Export data to be annotated (Can be used for manual annotation!)
Train_Annotation_data['Result'].to_csv('EMSCAD/Output data/Train_Annotation_data.csv')
Eval_Annotation_data['Result'].to_csv('EMSCAD/Output data/Eval_Annotation_data.csv')
FULL_Annotation_data['Result'].to_csv('EMSCAD/Output data/Eval_Annotation_data.csv')

In [204]:
# Import the source file that contains all biased word lists
biased_words = pd.read_csv('EMSCAD/Input data/biased_words.csv', delimiter=';')

In [205]:
%%time
# Automated annotation process (Based on the word lists imported.)
# Only exact matches will be annotated.
def automated_annotation(Annotation_data):
    result = []
    row = -1
    count = 0
    for i in Annotation_data['Result']:
        i = str(i)
        row = row + 1
        for j in biased_words:
            for k in biased_words[j]:
                word = str(k)
                if word == i:
                    Annotation_data['Label'][row] = j
                    count = count + 1
    Annotation_data['Label'] = Annotation_data['Label'].fillna("O")
    print(str(count) + " words have been annotated.")
    return Annotation_data

Train_Annotation_data = automated_annotation(Train_Annotation_data)
Eval_Annotation_data = automated_annotation(Eval_Annotation_data)
FULL_Annotation_data = automated_annotation(FULL_Annotation_data)

12649 words have been annotated.
2867 words have been annotated.
15516 words have been annotated.
CPU times: user 7min 51s, sys: 1.64 s, total: 7min 53s
Wall time: 7min 55s


In [206]:
# Export annotated data (By "automated" annotator)
Train_Annotation_data.to_csv('EMSCAD/Output data/Train_Annotation_data_output.tsv', sep='\t', index = False, header = False)
Eval_Annotation_data.to_csv('EMSCAD/Output data/Eval_Annotation_data_output.tsv', sep='\t', index = False, header = False)
FULL_Annotation_data.to_csv('EMSCAD/Output data/FULL_Annotation_data_output.tsv', sep='\t', index = False, header = False)

In [213]:
Train_Annotation_data['Label'].value_counts()

O                        2419381
Feminine-coded words        7411
Masculine-coded words       5238
Name: Label, dtype: int64

In [215]:
Eval_Annotation_data['Label'].value_counts()

O                        568705
Feminine-coded words       1671
Masculine-coded words      1196
Name: Label, dtype: int64

In [216]:
FULL_Annotation_data['Label'].value_counts()

O                        2988086
Feminine-coded words        9082
Masculine-coded words       6434
Name: Label, dtype: int64