##### Copyright 2020 HrFlow's AI Research Department

Licensed under the Apache License, Version 2.0 (the "License");

In [None]:
# Copyright 2020 HrFlow's AI Research Department. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

<p>
<table align="left"><td>
  <a target="_blank"  href="https://colab.research.google.com/github/Riminder/python-hrflow-api/blob/master/examples/colab/build_personalized_ai_hr_models.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab
  </a>
</td><td>
  <a target="_blank"  href="https://github.com/Riminder/python-hrflow-api/blob/master/examples/colab/build_personalized_ai_hr_models.ipynb">
    <img width=32px src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
</td><td>
  <a target="_blank"  href="https://www.hrflow.ai/book-us">
    <img width=32px src="https://gblobscdn.gitbook.com/spaces%2F-M1L6Hspq8r9LXd5_gIC%2Favatar-1586188377926.png?generation=1586188378327930&alt=media" />Get an account</a>
</td></table>
<br>
</p>

<p>
<table align="center">
<td>
  <a target="_blank"  href="https://developers.hrflow.ai/ai-layers/embedding">
    <img width=800px img src="https://lh3.googleusercontent.com/JXagdsThZxaEKwjE83-QrJXjB1r1tk2-KmdBzb94X_a238-5bNtwHuDi-PUA4_cVBkpaCie1uil6lPDNhdggpZhkgiZBYQGe4iKRRGo13XvyYgzuG9Vw_fv72LiYrg2am9MIrPnkwlQ"/>
  </a>
</td>
</table>
</p>

Advantages of HrFlow.ai Embedding API:

- **Save 90% of R&D time** spent on feature engineering

- Train with a **limited number of labels**

- Increase inference **speed by up to 26x**

- **Reduce the memory footprint** in production by up to **300x**

# Getting Started
This section sets up the environment to get access to **HrFlow Profile API** and sets up a connection to HrFlow.

In [None]:
# Machine Learning and Classification Libs
# Each package is installed with its own pip call; --quiet keeps pip's log out
# of the notebook output.
# NOTE(review): versions are not pinned, so a fresh run may pick up newer
# releases than the ones this notebook was written against.
!pip install --quiet tensorflow
!pip install --quiet matplotlib
!pip install --quiet pandas
!pip install --quiet seaborn
!pip install --quiet plotly
!pip install --quiet tqdm

# HrFlow Dependencies
# `hrflow` provides the `Hrflow` client imported in the next code cell;
# python-magic is presumably required by it for file handling - TODO confirm.
!pip install --quiet python-magic
!pip install --quiet hrflow

An **API Key** is required here. You can get your API Key at **`https://<your-subdomain>.hrflow.ai/settings/api/keys`** or ask us for a **demo API Key**.

In [None]:
import pprint
from hrflow import Hrflow
from getpass import getpass


# Credentials
# getpass reads the value without echoing it, so the secret key does not end up
# in the saved notebook output.
api_secret = getpass(prompt="Please Enter Your API Secret Key")
# Client object used for every HrFlow call in this notebook
# (parsing, indexing and embedding).
client = Hrflow(api_secret=api_secret)


# Hrflow Synchronous Source
# Key of the source the resumes are sent to. The parsing cell passes
# sync_parsing=1 and reads the parsing result straight from the response.
source_key = getpass(prompt="Please Enter a Synchronous source_key")

In [None]:
import os
import pickle
from google.colab import drive

# Mount Google Drive under /content/drive (Colab only). The dataset paths used
# below are relative ("drive/My Drive/..."), so they resolve only when the
# working directory is /content - presumably Colab's default; TODO confirm.
drive.mount('/content/drive', force_remount=True)

# 1. Building Dataset

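Dataset Folders Tree Structure:
- webinar/dataset/
    - resume/ (input: raw resume files, provided by you)
        - data_scientist/
        - executive_manager/
    - parsing/ (output of step 1.2)
        - data_scientist/
        - executive_manager/
    - profile/ (output of step 1.3)
        - data_scientist/
        - executive_manager/
    - embedding/ (output of step 1.4)
        - data_scientist/
        - executive_manager/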

## 1.1. Setting Dataset Root

In [None]:
def build_path(path):
  """Create the directory `path`, including any missing parent directories.

  Directories that already exist are left untouched, so the function is safe
  to call again when the cell is re-run.

  Args:
    path: Relative or absolute path of the directory to create.

  Raises:
    OSError: If a directory cannot be created, e.g. because a component of
      `path` already exists as a regular file or `path` is empty.
  """
  # os.makedirs copes with absolute paths and with the OS path separator.
  # Splitting on "/" by hand produced an empty first component for absolute
  # paths, which ended in os.mkdir("") and a FileNotFoundError.
  os.makedirs(path, exist_ok=True)

In [None]:
import os

# Root of the dataset on the mounted Google Drive
DATASET_ROOT = "drive/My Drive/webinar/dataset"

# Input: raw resumes. These folders are only read, never created here.
data_scientist_resume_folder = os.path.join(DATASET_ROOT, "resume", "data_scientist")
executive_manager_resume_folder = os.path.join(DATASET_ROOT, "resume", "executive_manager")

# Output of the parsing step
data_scientist_parsing_folder = os.path.join(DATASET_ROOT, "parsing", "data_scientist")
executive_manager_parsing_folder = os.path.join(DATASET_ROOT, "parsing", "executive_manager")

# Output of the profile download step
data_scientist_profile_folder = os.path.join(DATASET_ROOT, "profile", "data_scientist")
executive_manager_profile_folder = os.path.join(DATASET_ROOT, "profile", "executive_manager")

# Output of the embedding step
data_scientist_embedding_folder = os.path.join(DATASET_ROOT, "embedding", "data_scientist")
executive_manager_embedding_folder = os.path.join(DATASET_ROOT, "embedding", "executive_manager")

# Make sure every output folder exists before the pipeline writes into it
for output_folder in (
    data_scientist_parsing_folder,
    executive_manager_parsing_folder,
    data_scientist_profile_folder,
    executive_manager_profile_folder,
    data_scientist_embedding_folder,
    executive_manager_embedding_folder,
):
    build_path(output_folder)

In [None]:
# Sanity check: show the resume files found for the data-scientist class.
# os.listdir raises if the folder is missing, i.e. if the resumes were not
# uploaded to Drive or Drive is not mounted.
print(os.listdir(data_scientist_resume_folder))

## 1.2. Parsing Resumes

In [None]:
import json
from tqdm import tqdm


# (resume folder, parsing folder) for each class
resume_to_parsing = [
    (data_scientist_resume_folder, data_scientist_parsing_folder),
    (executive_manager_resume_folder, executive_manager_parsing_folder),
]

for folder, target_folder in resume_to_parsing:
    for file_name in tqdm(os.listdir(folder)):
        # Read the raw resume bytes
        with open(os.path.join(folder, file_name), "rb") as resume_file:
            profile = resume_file.read()

        # Send the file for parsing; the parsing result is read directly
        # from the response
        response = client.profile.parsing.add_file(source_key=source_key,
                                                   profile_file=profile,
                                                   sync_parsing=1)
        profile_parsing = response.get("data").get("parsing")

        # Store the parsing as JSON, named after the text before the first "."
        # of the resume file name.
        # NOTE(review): "a.b.pdf" and "a.c.pdf" both map to "a" and would
        # overwrite each other.
        target_path = os.path.join(target_folder, file_name.partition(".")[0])
        with open(target_path, "w") as parsing_file:
            json.dump(profile_parsing, parsing_file)

## 1.3. Downloading Profiles

In [None]:
import json
from tqdm import tqdm


# (parsing folder, profile folder) for each class
parsing_to_profile = [
    (data_scientist_parsing_folder, data_scientist_profile_folder),
    (executive_manager_parsing_folder, executive_manager_profile_folder),
]

for folder, target_folder in parsing_to_profile:
    for file_name in tqdm(os.listdir(folder)):
        # The saved parsing carries the key of the profile
        with open(os.path.join(folder, file_name), "r") as parsing_file:
            parsed = json.load(parsing_file)

        # Fetch the profile for that key from the same source
        response = client.profile.indexing.get(source_key=source_key,
                                               key=parsed["key"])
        profile = response.get("data")

        # Store the profile as JSON under the same base name
        target_path = os.path.join(target_folder, file_name.partition(".")[0])
        with open(target_path, "w") as profile_file:
            json.dump(profile, profile_file)

## 1.4. Computing Embeddings

In [None]:
import base64
import numpy as np

def decode_embedding(base64_string, embedding_dim=1024):
    """Decode a base64-encoded embedding payload into a 2-D float array.

    The decoded bytes are read as a flat sequence of big-endian 32-bit floats
    and reshaped into rows of `embedding_dim` values.

    Args:
        base64_string: Base64-encoded payload (str or bytes).
        embedding_dim: Number of floats per row. Defaults to 1024, the width
            that was previously hard-coded.

    Returns:
        numpy.ndarray of shape (n_rows, embedding_dim) with dtype '>f4'.
        The array is backed by the decoded bytes and is therefore read-only;
        copy it before modifying it in place.

    Raises:
        ValueError: If the payload is not valid base64, or if the number of
            decoded floats is not a multiple of `embedding_dim`.
    """
    raw_bytes = base64.b64decode(base64_string)
    flat_values = np.frombuffer(raw_bytes, dtype=np.dtype('>f4'))
    return np.reshape(flat_values, (-1, embedding_dim))

In [None]:
import pickle
from tqdm import tqdm

# This cell reads JSON profiles, so import json here instead of relying on an
# earlier cell having done it.
import json

# The loop below binds folder / target_folder / file_names itself, so no
# assignments are needed before it.
for folder, target_folder in [(data_scientist_profile_folder, data_scientist_embedding_folder), (executive_manager_profile_folder, executive_manager_embedding_folder)]:
  file_names = os.listdir(folder)
  for file_name in tqdm(file_names):
      # Loading Profile
      file_path = os.path.join(folder, file_name)
      with open(file_path, "r") as file:
          profile = json.load(file)

      # Request the embedding of the profile. The payload in "data" is a
      # base64 string that decode_embedding turns into an (n, 1024) array.
      # NOTE(review): return_sequences=True presumably asks for one vector per
      # profile element rather than a single pooled vector - confirm in the
      # HrFlow API documentation.
      response = client.document.embedding.post("profile",
                                                profile,
                                                return_sequences=True)
      profile_embedding = decode_embedding(response.get("data"))

      # Saving Embedded Result (pickled numpy array, same base name as the profile)
      target_path = os.path.join(target_folder, file_name.split(".")[0])
      with open(target_path,"wb") as file:
          pickle.dump(profile_embedding, file)

## 1.5. Dataset Generator

In [None]:
import pickle
import tensorflow as tf
import numpy as np


class Generator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches of pickled profile embeddings.

    Each sample is a pickled 2-D array of shape (sequence_length, features)
    stored on disk. Samples in a batch are zero-padded along the first axis to
    the length of the longest sample in that batch and stacked into one tensor.

    Args:
        file_paths: numpy array of paths to the pickled embeddings. It is
            indexed with an array of indices, so a plain list will not work.
        labels: numpy array of integer labels aligned with `file_paths`.
        batch_size: Maximum number of samples per batch.
    """

    def __init__(self, file_paths, labels, batch_size=2):
        self.file_paths = file_paths
        self.labels = labels
        self.batch_size = batch_size
        # Order in which samples are served; reshuffled after every epoch.
        self.indices = np.arange(len(file_paths))

    def __len__(self):
        """Return the number of batches per epoch, counting a final partial batch."""
        # Ceiling division: flooring dropped the trailing partial batch that
        # __getitem__ supports, and returned 0 whenever there were fewer
        # samples than batch_size.
        return (len(self.file_paths) + self.batch_size - 1) // self.batch_size

    def on_epoch_end(self):
        """Shuffle the sample order in place for the next epoch."""
        np.random.shuffle(self.indices)

    def __getitem__(self, index):
        """Load, pad and stack the samples of batch number `index`.

        Returns:
            A tuple (batch_profiles, batch_labels): a tensor of shape
            (batch, longest_sequence_in_batch, features) and the matching
            label tensor.
        """
        start = index * self.batch_size
        # The last batch may hold fewer than batch_size samples.
        end = min(start + self.batch_size, len(self.file_paths))
        batch_indices = self.indices[start:end]
        batch_path = self.file_paths[batch_indices]
        batch_labels = tf.constant(self.labels[batch_indices])

        batch_profiles = []
        for file_path in batch_path:
            # The files are written by this notebook; never unpickle files
            # from an untrusted origin.
            with open(file_path, "rb") as file:
                profile = pickle.load(file)
            batch_profiles.append(profile)

        # Zero-pad every sample at the end of its first axis up to the longest
        # sample in the batch so that the samples can be stacked.
        pad_length = max(element.shape[0] for element in batch_profiles)
        batch_profiles = [
            tf.pad(element, [[0, pad_length - element.shape[0]], [0, 0]])
            for element in batch_profiles
        ]
        batch_profiles = tf.stack(batch_profiles)
        return batch_profiles, batch_labels

In [None]:
# Full paths of the embedding files of each class
data_scientist_paths = [
    os.path.join(data_scientist_embedding_folder, name)
    for name in os.listdir(data_scientist_embedding_folder)
]
executive_manager_paths = [
    os.path.join(executive_manager_embedding_folder, name)
    for name in os.listdir(executive_manager_embedding_folder)
]

# Data scientists first, then executive managers; label 0 / 1 respectively
file_paths = np.array(data_scientist_paths + executive_manager_paths)
labels = np.array(
    [0 for _ in data_scientist_paths] + [1 for _ in executive_manager_paths]
)

In [None]:
# Build the training generator with its default batch size and pull the first
# batch as a smoke test.
generator = Generator(file_paths, labels)
x, y = next(iter(generator))

# Shape of the padded batch: (batch, longest sequence in the batch, features)
print(x.shape)

# 2. Machine Learning With HrFlow.ai Embeddings

## 2.1. Profile Classification and Embedding

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Masking, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# Profile Encoder Deduced From Classification Model Training
# Input: a variable-length sequence of 1024-dimensional vectors (one profile).
model_input = Input(shape=(None, 1024), name="hrflow_embedding")
# Masking marks all-zero timesteps so the LSTM skips them; the Generator
# zero-pads the shorter profiles of a batch.
masking = Masking(name="masking")(model_input)
# Reads the sequence in both directions and returns one summary vector.
bi_lstm = Bidirectional(LSTM(8), name="bi_lstm")(masking)
# 2 units with tanh: a 2-D profile embedding with both coordinates in [-1, 1],
# which is what the scatter plot at the end of the notebook displays.
dense = Dense(2, activation="tanh", name="dense")(bi_lstm)
profile_encoder = Model(inputs=[model_input], outputs=[dense], name="profile_encoder")

# Scoring from Encoded Profile
# Separate model so that scores can also be computed for arbitrary 2-D points
# (used to draw the decision boundary).
scoring_input = Input(shape=(2,))
dropout = Dropout(0.2, name="dropout")(scoring_input)
# Two-way softmax: index 0 / 1 follow the labels built earlier
# (0 = data scientist, 1 = executive manager).
softmax = Dense(2, activation='softmax', name="softmax")(dropout) 
scoring = Model(inputs=[scoring_input], outputs=[softmax], name="profile_scoring")


# Classification Model
# End-to-end model = encoder followed by scoring. Training it trains both
# sub-models, because they share their layers with it.
profile_embedding = profile_encoder(model_input)
profile_score = scoring(profile_embedding)
model = Model(inputs=[model_input], outputs=[profile_score])
# Sparse loss: the labels are integer class ids, not one-hot vectors.
model.compile(loss=SparseCategoricalCrossentropy(), 
              optimizer='nadam', 
              metrics=['accuracy'])

model.summary()

## 2.2. Training

In [None]:
# Train the end-to-end classifier for 5 epochs on batches served by the
# generator. The generator reshuffles its sample order after each epoch.
model.fit(generator, epochs=5)

## 2.3. Embeddings and Predictions

In [None]:
def line_jump(text, every_char=50):
    """Insert an HTML line break into `text` every `every_char` characters.

    Args:
        text: String to wrap. None or an empty string yields "".
        every_char: Maximum number of characters between two line breaks.

    Returns:
        The complete text, cut into chunks of at most `every_char` characters
        joined with '<br />'.

    Raises:
        ValueError: If `every_char` is not a positive integer.
    """
    if every_char <= 0:
        raise ValueError("every_char must be a positive integer")
    if not text:
        return ""
    # Stepping through the whole string keeps the trailing partial chunk,
    # which the previous loop over len(text) // every_char silently dropped.
    chunks = [text[start:start + every_char]
              for start in range(0, len(text), every_char)]
    return '<br />'.join(chunks)

In [None]:
import pickle
import json
from pandas.core.frame import DataFrame


# One list per output column, filled profile by profile
results = {column: [] for column in ("text", "x", "y", "score", "label", "prediction")}

for file_path, label in zip(file_paths, labels):
    # Load the stored embedding and add a leading batch axis of size 1
    with open(file_path, "rb") as file:
        hrflow_embedding = pickle.load(file)
    single_batch = np.expand_dims(hrflow_embedding, axis=0)

    # 2-D profile embedding from the encoder, class scores from the full model
    profile_embedding = profile_encoder.predict(single_batch)[0]
    profile_score = model.predict(single_batch)[0]
    x, y = profile_embedding

    # The profile JSON lives at the same relative path as the embedding file,
    # with the "embedding" folder replaced by "profile"
    path = file_path.split("/")
    path[-3] = "profile"
    with open("/".join(path), "r") as file:
        summary = json.load(file)["info"]["summary"]

    # Record one row; the prediction is 1 when the score of class 1 exceeds 0.5
    results["label"].append(label)
    results["x"].append(x)
    results["y"].append(y)
    results["score"].append(profile_score[1])
    results["prediction"].append(int(profile_score[1]>0.5))
    results["text"].append(line_jump(summary))

df = DataFrame(results)
df

In [None]:
import numpy as np
import plotly.graph_objects as go
import plotly.express as px


# Compute Scores for Mesh Values
# Regular grid over [-1, 1] x [-1, 1], the range of the tanh profile embedding.
xx = np.arange(-1, 1.1, 0.1)
yy = np.arange(-1, 1.1, 0.1)
# One row per y value, one column per x value; each cell holds the score of
# class 1 (executive manager) returned by the scoring model for that point.
mesh_values = np.array([scoring.predict(np.array([[x, y] for x in xx]))[:,1] for y in yy])

# Contour/Boundary Plot
data = go.Contour(x=xx, y=yy, z=mesh_values, 
                  colorscale=[[0.0, "rgb(165,0,38)"],
                              [0.1111111111111111, "rgb(215,48,39)"],
                              [0.2222222222222222, "rgb(244,109,67)"],
                              [0.3333333333333333, "rgb(253,174,97)"],
                              [0.4444444444444444, "rgb(254,224,144)"],
                              [0.5555555555555556, "rgb(224,243,248)"],
                              [0.6666666666666666, "rgb(171,217,233)"],
                              [0.7777777777777778, "rgb(116,173,209)"],
                              [0.8888888888888888, "rgb(69,117,180)"],
                              [1.0, "rgb(49,54,149)"]])
layout = {'width': 600, 'height': 600,
          'xaxis_title': 'x', 'yaxis_title': 'y', 
          'xaxis': {'range': [-1, 1]}, 'yaxis': {'range': [-1, 1]},
          'title': 'Decision Boundaries (Executive Managers in Yellow, Data Scientists in Blue)'}
fig = go.Figure(data = data, layout=layout)

# Profiles Embeddings
# hover_data must name columns that exist in df. The profile summary is stored
# in the 'text' column (there is no 'summary' column), so the previous list
# made px.scatter raise.
scatter = px.scatter(df, x='x', y='y', 
                     hover_data=['text', 'score', 'label', 'prediction'],
                     color='prediction')
# Overlay the profile points on top of the contour figure
fig.add_trace(scatter.data[0])

# Show Graph
fig.show()