In [2]:
# Mounting drive to Google Collab
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
# Get project directory in Google Colab
import os
os.chdir('/content/drive/MyDrive/Colab Notebooks')

In [4]:
# Print working directory to see if everything worked
!pwd

/content/drive/MyDrive/Colab Notebooks


In [5]:
# Import data frame, skip spaces and empty lines
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/annotated_job_posts.csv', encoding='unicode_escape', skipinitialspace=True, skip_blank_lines=True)
df.head(13)

Unnamed: 0,jobpostnr,nodeid,word,tag
0,1,1,realize,O
1,1,2,the,O
2,1,3,work,O
3,1,4,and,O
4,1,5,management,B-SKILL
5,1,6,of,O
6,1,7,corporate,O
7,1,8,sales,B-SKILL
8,1,9,service,O
9,1,10,css,B-SKILL


In [6]:
# We need to to give all unique words an id and we need to give all unique tags an id
from itertools import chain
# We define the following function
def get_dict_map(data,word_or_tag):
  # We initialize two empty variables, one will have the format token(word or tag) with their id, other has the format id with its token
  token2id = {}
  id2token = {}
  # If we deliver "word" to the function we add every unique word to the variable vocab.
  # Same happens to the tags if we deliver "tag" to the function.
  if word_or_tag == 'word':
    vocab = list(set(data['word'].to_list()))
  else:
    vocab = list(set(data['tag'].to_list()))

  # We fill up the two variables created above, by enumerating the vocab variable.
  id2token = {id:tok for id,tok in enumerate(vocab)}
  token2id = {tok:id for id,tok in enumerate(vocab)}

  return token2id, id2token

# We create and fill up 4 variables with the created function.
word2id, id2word = get_dict_map(df, 'word')
tag2id, id2tag = get_dict_map(df, 'tag')

In [7]:
id2tag

{0: 'I-SKILL', 1: 'B-SKILL', 2: 'O'}

In [54]:
tag2id

{'B-SKILL': 0, 'I-SKILL': 2, 'O': 1}

In [8]:
# We augment the initial data frame by adding the attributes word_id and tag_id
# With the help of the map function we can use the variables created above to create the corresponding entry in each observation.
df['Word_id'] = (df['word']).map(word2id)
df['Tag_id'] = (df['tag'].map(tag2id))
df.head

<bound method NDFrame.head of       jobpostnr  nodeid         word      tag  Word_id  Tag_id
0             1       1      realize        O      609       2
1             1       2          the        O     1884       2
2             1       3         work        O      447       2
3             1       4          and        O     1455       2
4             1       5   management  B-SKILL     1313       1
...         ...     ...          ...      ...      ...     ...
9955        100      39    solutions        O     1187       2
9956        100      40           to        O     1990       2
9957        100      41       create        O     1249       2
9958        100      42  incremental        O      781       2
9959        100      43       growth        O     1117       2

[9960 rows x 6 columns]>

In [9]:
# We fill all NaN in the data frame by using the value of the observation above.
df_fillnan = df.ffill(axis=0)
df_fillnan.head()

Unnamed: 0,jobpostnr,nodeid,word,tag,Word_id,Tag_id
0,1,1,realize,O,609,2
1,1,2,the,O,1884,2
2,1,3,work,O,447,2
3,1,4,and,O,1455,2
4,1,5,management,B-SKILL,1313,1


In [10]:
#We normalize the word_id's
def min_max_scaling(column):
  return (column - column.min()) / (column.max() - column.min())

for col in df_fillnan.columns:
  df_fillnan['Word_id'] = min_max_scaling(df_fillnan['Word_id'])

df_fillnan.head()

Unnamed: 0,jobpostnr,nodeid,word,tag,Word_id,Tag_id
0,1,1,realize,O,0.296784,2
1,1,2,the,O,0.918129,2
2,1,3,work,O,0.217836,2
3,1,4,and,O,0.709064,2
4,1,5,management,B-SKILL,0.639864,1


In [11]:
# We now group the data frame by sentence number and collect column values to get sentence, pos, tag, word-id and tag-id
final_data = df_fillnan.groupby(
['jobpostnr'], as_index=False
# We use the aggregate function for the variables that are affected by the groupby command
)['nodeid', 'word', 'tag', 'Word_id', 'Tag_id'].agg(lambda x: list(x))
# Check data
final_data.head()

  """


Unnamed: 0,jobpostnr,nodeid,word,tag,Word_id,Tag_id
0,1,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[realize, the, work, and, management, of, corp...","[O, O, O, O, B-SKILL, O, O, B-SKILL, O, B-SKIL...","[0.29678362573099415, 0.9181286549707602, 0.21...","[2, 2, 2, 2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 0, ..."
1,2,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[organize, all, shipments, in, line, with, com...","[B-SKILL, O, O, O, O, O, O, O, O, O, O, O, B-S...","[0.34746588693957114, 0.025341130604288498, 0....","[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0, 2, ..."
2,3,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[actively, promote, bank, loan, products, resp...","[O, O, B-SKILL, I-SKILL, O, O, O, B-SKILL, O, ...","[0.4220272904483431, 0.8269980506822612, 0.112...","[2, 2, 1, 0, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, ..."
3,4,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[develop, excel, based, and, sql, server, repo...","[B-SKILL, O, O, O, B-SKILL, I-SKILL, I-SKILL, ...","[0.5282651072124757, 0.23830409356725146, 0.03...","[1, 2, 2, 2, 1, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, ..."
4,5,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[develop, and, promote, marketing, strategy, o...","[O, O, O, B-SKILL, I-SKILL, O, O, O, B-SKILL, ...","[0.5282651072124757, 0.7090643274853801, 0.826...","[2, 2, 2, 1, 0, 2, 2, 2, 1, 2, 2, 2, 2, 1, 0, ..."


In [59]:
final_data.isna().sum().sum()

0

In [60]:
final_data.isnull().sum().sum()

0

In [12]:
# Take longest sentence to apply padding to the shorter sentences to make the input size of same size

# Check words and tags from final_data
print("Statements : \n", final_data['word'])
print("\n")
print("Tags :\n ", final_data['tag'])

# Getting the maximal length from all available sentences
statements_list = final_data['Word_id'].to_list()
max_len_statement = max([len(statement) for statement in statements_list])
print("\n The maximal size of all available sentences is : ", max_len_statement)

# Getting the maximal length of tags
tags_list = final_data['Tag_id'].to_list()
max_len_tag = max([len(tag) for tag in tags_list])
print("\n The maximal size of tags are : ",max_len_tag)

# In order to make all final statements of the same length we use the pad sequence function
# If the sentence is below max length, the "missing" values will be appended with the word "The" and the tag "O"
from tensorflow.keras.preprocessing.sequence import pad_sequences
pad_statements = pad_sequences(statements_list,maxlen=max_len_statement,padding='post',value=word2id['the'])
pad_tags = pad_sequences(tags_list,maxlen=max_len_tag,padding='post', value=tag2id["O"])

print("Statements after padding : \n",pad_statements)
print("Tags after padding : ",pad_tags)

Statements : 
 0     [realize, the, work, and, management, of, corp...
1     [organize, all, shipments, in, line, with, com...
2     [actively, promote, bank, loan, products, resp...
3     [develop, excel, based, and, sql, server, repo...
4     [develop, and, promote, marketing, strategy, o...
                            ...                        
95    [draw, up, formulations, and, advise, for, the...
96    [participate, in, object, oriented, analysis, ...
97    [under, the, overall, guidance, and, supervisi...
98    [supervision, of, construction, works, reporti...
99    [implement, and, monitor, marketing, programs,...
Name: word, Length: 100, dtype: object


Tags :
  0     [O, O, O, O, B-SKILL, O, O, B-SKILL, O, B-SKIL...
1     [B-SKILL, O, O, O, O, O, O, O, O, O, O, O, B-S...
2     [O, O, B-SKILL, I-SKILL, O, O, O, B-SKILL, O, ...
3     [B-SKILL, O, O, O, B-SKILL, I-SKILL, I-SKILL, ...
4     [O, O, O, B-SKILL, I-SKILL, O, O, O, B-SKILL, ...
                            ...        

In [13]:
# Converting output tags to one hot encoding (= Every tag will be represented as a binary vector - only one value per vector will be true - unmistakenly id-ing the tag)
# If we imagine the output layer of the neural network - we can imagine how each neuron represents one of the entries of the one hot encoded vector
# Neurons in the output layer will have outputs between 0 and 1
# So in this way the neural network output layer can better (because directly) compare the predicted value with the real value
from tensorflow.keras.utils import to_categorical
pad_tags = to_categorical(pad_tags)
print(pad_tags)
print("Shape is : ",pad_tags.shape)

[[[0. 0. 1.]
  [0. 0. 1.]
  [0. 0. 1.]
  ...
  [0. 0. 1.]
  [0. 0. 1.]
  [0. 0. 1.]]

 [[0. 1. 0.]
  [0. 0. 1.]
  [0. 0. 1.]
  ...
  [0. 0. 1.]
  [0. 0. 1.]
  [0. 0. 1.]]

 [[0. 0. 1.]
  [0. 0. 1.]
  [0. 1. 0.]
  ...
  [0. 0. 1.]
  [0. 0. 1.]
  [0. 0. 1.]]

 ...

 [[0. 0. 1.]
  [0. 0. 1.]
  [0. 0. 1.]
  ...
  [0. 0. 1.]
  [0. 0. 1.]
  [0. 0. 1.]]

 [[0. 1. 0.]
  [0. 0. 1.]
  [0. 1. 0.]
  ...
  [0. 0. 1.]
  [0. 0. 1.]
  [0. 0. 1.]]

 [[0. 1. 0.]
  [0. 0. 1.]
  [0. 0. 1.]
  ...
  [0. 0. 1.]
  [0. 0. 1.]
  [0. 0. 1.]]]
Shape is :  (100, 530, 3)


In [14]:
# Shape can be understood as: 100 job posts. 
# Each sentence is 530 words long (with the help of the padding applied) 
# Each word is represented by 3 different values (one hot encoded tags)

In [15]:
# Split the dataset in training (90% of data) and testing (10% of data)
# random_state=69 to reproduce results
from sklearn.model_selection import train_test_split
train_statements,test_statements,train_tags,test_tags=train_test_split(pad_statements,pad_tags,test_size=0.1,train_size=0.9, random_state=69)
print(train_statements.shape)
print(test_statements.shape)
print(train_tags.shape)
print(test_tags.shape)

(90, 530)
(10, 530)
(90, 530, 3)
(10, 530, 3)


In [16]:
# import keras functions and sequential evaluation functions
import numpy as np
#import seqeval
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from tensorflow.keras.utils import plot_model
from keras.callbacks import Callback
#from seqeval.metrics import accuracy_score
#from seqeval.metrics import classification_report
#from seqeval.metrics import f1_score

In [17]:
# We set some variables which are needed to tailor our Neural Network according to our input and output
shape = train_statements[1].shape
input_dim = len(list(set(df['word'].to_list()))) + 1
output_dim = 64
input_length = max_len_statement
output_units = len(id2tag)

In [18]:
train_statements

array([[   0,    0,    0, ..., 1884, 1884, 1884],
       [   0,    0,    0, ..., 1884, 1884, 1884],
       [   0,    0,    0, ..., 1884, 1884, 1884],
       ...,
       [   0,    0,    0, ..., 1884, 1884, 1884],
       [   0,    0,    0, ..., 1884, 1884, 1884],
       [   0,    0,    0, ..., 1884, 1884, 1884]], dtype=int32)

In [19]:
train_tags

array([[[0., 1., 0.],
        [1., 0., 0.],
        [0., 0., 1.],
        ...,
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.]],

       [[0., 0., 1.],
        [0., 1., 0.],
        [1., 0., 0.],
        ...,
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.]],

       [[0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        ...,
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.]],

       ...,

       [[0., 1., 0.],
        [0., 0., 1.],
        [0., 0., 1.],
        ...,
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.]],

       [[0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        ...,
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.]],

       [[0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        ...,
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.]]], dtype=float32)

In [92]:
import tensorflow
# We shape the input layer according to the length of the sentences
input_layer = Input(shape=(530,))
embeddings = Embedding(input_dim = input_dim,output_dim = output_dim)(input_layer)
lstm1=LSTM(units=output_dim,return_sequences=True)(embeddings)
# model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))(lstm1)
lstm2=LSTM(units=output_dim,return_sequences=True)(lstm1)
#output = Dense(units=output_units,activation='relu')(lstm1)
output = tensorflow.keras.layers.TimeDistributed(Dense(units=output_units,activation='softmax'))(lstm2)
model = tensorflow.keras.Model(inputs = input_layer,outputs=output)
# model = Sequential()

# Add Embedding layer
# model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

# Add bidirectional LSTM
# model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode = 'concat'))

# Add LSTM
# model.add(LSTM(units = output_dim, return_sequences=True, dropout = 0.5, recurrent_dropout = 0.5))

# Add timeDistributed Layer
# model.add(TimeDistributed(Dense(n_tags, activation="relu")))
model.compile(optimizer = tensorflow.keras.optimizers.RMSprop(1e-3), loss='categorical_crossentropy', metrics=[tensorflow.keras.metrics.Precision(), tensorflow.keras.metrics.Recall(), tensorflow.keras.metrics.AUC()])
# metrics=[tensorflow.keras.metrics.Precision(), tensorflow.keras.metrics.Recall(), tensorflow.keras.metrics.AUC()]
# metrics=['accuracy']
#plot_model(model, show_shapes=True)

In [71]:
train_statements

array([[  0,   0,   0, ..., 262, 262, 262],
       [  0,   0,   0, ..., 262, 262, 262],
       [  0,   0,   0, ..., 262, 262, 262],
       ...,
       [  0,   0,   0, ..., 262, 262, 262],
       [  0,   0,   0, ..., 262, 262, 262],
       [  0,   0,   0, ..., 262, 262, 262]], dtype=int32)

In [95]:
#model.fit(x=train_statements,y=train_tags,validation_data=(test_statements, test_tags), epochs=10, batch_size=5)
model.fit(train_statements,train_tags, epochs=25, batch_size=5)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f1aaa1c6d90>

In [96]:
model.evaluate(x=test_statements, y=test_tags,batch_size=1)



[0.10073335468769073,
 0.9732075333595276,
 0.9732075333595276,
 0.9961040019989014]