## Job segmentation by deep learning

In this notebook, I show how to use deep learning to classify sentences from job descriptions into different categories. 

In [66]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential
from tensorflow.keras import preprocessing
from tensorflow.keras.layers import (
    Embedding, 
    Bidirectional, 
    Dense, 
    LSTM, 
    GlobalAveragePooling1D, 
    Conv1D, 
    GlobalMaxPooling1D, 
    MultiHeadAttention, 
    LayerNormalization,
    Dropout,   
)
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

#### Parameters

`DATADIR` contains only a few sample files; the full dataset cannot be released here for data-protection reasons.

In [67]:
# --- Data pipeline ---
BATCH_SIZE = 32      # number of samples per batch
SEED = 42            # seed used when splitting the data
DATADIR = "./data/"  # root directory of the text files

# --- Text encoding ---
VOCAB_SIZE = 10_000  # total vocabulary size
MAX_LEN = 100        # maximum length of a sentence, in tokens
EMBED_SIZE = 32      # embedding size for each token

# --- Transformer ---
NUM_HEADS = 2        # number of attention heads
FF_SIZE = 32         # hidden layer size of the feed-forward network inside the transformer

#### Load data

In [68]:
# Both subsets must be created with the same split fraction and seed so that
# the training and validation files do not overlap.
split_kwargs = dict(
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    seed=SEED,
)

# 80% of the files: used to fit the models.
train_ds = preprocessing.text_dataset_from_directory(
    DATADIR, subset='training', **split_kwargs
)

# Remaining 20% of the files: used for validation only.
valid_ds = preprocessing.text_dataset_from_directory(
    DATADIR, subset='validation', **split_kwargs
)

Found 23975 files belonging to 4 classes.
Using 19180 files for training.
Found 23975 files belonging to 4 classes.
Using 4795 files for validation.


### LSTM

In [69]:
# Maps raw strings to integer token ids: at most VOCAB_SIZE distinct ids,
# and every sentence is padded/truncated to exactly MAX_LEN ids.
vectorize_layer = TextVectorization(
    output_mode='int',
    max_tokens=VOCAB_SIZE,
    output_sequence_length=MAX_LEN,
)

# Learn the vocabulary from the training texts only (labels are dropped).
vectorize_layer.adapt(train_ds.map(lambda text, _label: text))

In [70]:
# Trainable lookup table: token id -> EMBED_SIZE-dimensional vector.
# mask_zero=True marks id 0 (padding) as masked for downstream layers.
embedding_layer = Embedding(VOCAB_SIZE, EMBED_SIZE, mask_zero=True)

In [71]:
# Bidirectional-LSTM classifier:
# raw string -> token ids -> embeddings -> BiLSTM -> dense head.
layers_lstm = [
    vectorize_layer,
    embedding_layer,
    Bidirectional(LSTM(units=32)),
    Dense(units=32, activation='relu'),
    Dense(units=4, activation="softmax"),  # one probability per class
]

lstm_model = Sequential(layers=layers_lstm)

# Sparse loss: the datasets yield integer class indices, not one-hot vectors.
lstm_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-4),
    loss=keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [72]:
# Train for 5 epochs; each epoch is validated on 30 batches of valid_ds.
lstm_model.fit(
    train_ds,
    epochs=5,
    validation_data=valid_ds,
    validation_steps=30,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f568dd35880>

In [73]:
# Quick sanity check on two hand-written sentences.
samples = ["knowledge of python, R", "Fluent english"]

probs = lstm_model.predict(samples)  # class probabilities, one row per sample
labels = probs.argmax(axis=-1)       # index of the most probable class

In [74]:
# Predicted class index for each sample sentence (LSTM model).
labels

array([2, 2])

### CNN

In [75]:
# Give the CNN its own embedding layer. Reusing the shared `embedding_layer`
# instance would tie its weights to the LSTM model: the CNN would start from
# embeddings already trained by the LSTM run, and training the CNN would also
# modify the LSTM model. A dedicated layer also keeps this cell independent of
# later cells that rebind the name `embedding_layer`.
cnn_embedding_layer = Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=EMBED_SIZE,
    mask_zero=True
)

# 1D-CNN classifier:
# raw string -> token ids -> embeddings -> strided conv -> max over time -> dense head.
layers_cnn = [
    vectorize_layer,  # holds only the vocabulary (no trainable weights), safe to share
    cnn_embedding_layer,
    Conv1D(64, 5, activation="relu", strides=2),
    GlobalMaxPooling1D(),
    Dense(32, activation='relu'),
    Dense(4, activation="softmax")  # one probability per class
]

cnn_model = Sequential(layers_cnn)

# Sparse loss: the datasets yield integer class indices, not one-hot vectors.
cnn_model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(),
    optimizer=keras.optimizers.Adam(1e-4),
    metrics=['accuracy']
)

In [76]:
# Train for 5 epochs; each epoch is validated on 30 batches of valid_ds.
cnn_model.fit(
    train_ds,
    epochs=5,
    validation_data=valid_ds,
    validation_steps=30,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f56bbb50f40>

In [77]:
# Same two hand-written sentences as for the LSTM, now scored by the CNN.
samples = ["knowledge of python, R", "Fluent english"]

probs = cnn_model.predict(samples)  # class probabilities, one row per sample
labels = probs.argmax(axis=-1)      # index of the most probable class



In [78]:
# Predicted class index for each sample sentence (CNN model).
labels

array([2, 2])

### Transformer

In [79]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embed_dim: Size of the last axis of the inputs. Also used as the
            `key_dim` of the attention heads and as the output size of the
            feed-forward network, so the residual additions line up.
        num_heads: Number of attention heads.
        ff_dim: Hidden layer size of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Standard `Layer` keyword arguments (e.g. `name`, `dtype`).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim),]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs`.

        Args:
            inputs: Tensor whose last axis has size `embed_dim`.
            training: Forwarded to the dropout layers; `None` lets Keras
                decide from the current context.

        Returns:
            Tensor with the same shape as `inputs`.
        """
        # Self-attention: the inputs act as both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from config."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config


class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position table; input sequences must
            not be longer than this, since positions index into that table.
        vocab_size: Number of rows in the token table.
        embed_dim: Size of each embedding vector.
        **kwargs: Standard `Layer` keyword arguments (e.g. `name`, `dtype`).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids `x` and add the embedding of each position.

        Args:
            x: Integer tensor of token ids; the last axis is the sequence axis.

        Returns:
            Tensor of token embeddings plus position embeddings, with a
            trailing axis of size `embed_dim`.
        """
        # Use the runtime sequence length rather than the configured maxlen.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        # Position embeddings broadcast across the leading (batch) axis.
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from config."""
        config = super().get_config()
        config.update({
            "maxlen": self.maxlen,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config

In [80]:
# NOTE: this rebinds `embedding_layer`, which earlier in the notebook refers to
# a plain `Embedding`. Re-running an earlier model-building cell that reads
# `embedding_layer` after this cell would pick up this layer instead.
embedding_layer = TokenAndPositionEmbedding(
    maxlen=MAX_LEN,
    vocab_size=VOCAB_SIZE,
    embed_dim=EMBED_SIZE,
)
transformer_block = TransformerBlock(
    embed_dim=EMBED_SIZE,
    num_heads=NUM_HEADS,
    ff_dim=FF_SIZE,
)

In [81]:
# Transformer classifier:
# raw string -> token ids -> token+position embeddings -> transformer block
# -> average over the sequence -> dense head.
layers_transformer = [
    vectorize_layer,
    embedding_layer,
    transformer_block,
    GlobalAveragePooling1D(),
    Dense(units=32, activation='relu'),
    Dense(units=4, activation="softmax"),  # one probability per class
]

transformer_model = Sequential(layers=layers_transformer)

# Sparse loss: the datasets yield integer class indices, not one-hot vectors.
transformer_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-4),
    loss=keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [82]:
# Train for 5 epochs; each epoch is validated on 30 batches of valid_ds.
transformer_model.fit(
    train_ds,
    epochs=5,
    validation_data=valid_ds,
    validation_steps=30,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f56bba983d0>