In [43]:
import os
from joblib import dump, load
from transformers import pipeline, Pipeline
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import EnglishStemmer
from concurrent.futures import ThreadPoolExecutor
import re
import nltk
#TODO: zorgen dat deze niet elke keer opnieuw downloaden
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from googletrans import Translator

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Grimm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Grimm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [52]:
class Preprocessor:
    """
    This class contains the basic preprocessor pipeline that's used to extract keywords themed around skills.

    Attributes:
        __dir_skill (str): The directory path where the skill preprocessor model is stored.
        __dir_knowledge (str): The directory path where the knowledge preprocessor model is stored.
        _window_size (int): The window size used in the preprocessor pipeline.

    Methods:
        __init__: Initializes the Preprocessor class with default or user-specified parameters.
        preprocess_input: Preprocesses input and gives skills & knowledge back
    """
    def __init__(self, window_size: int = 8, preprocessor_dir_skill: str = "./preprocessor/skill/", preprocessor_dir_knowledge: str = "./preprocessor/knowledge/"):
        """
        Initializes the Preprocessor class with default or user-specified parameters.

        Args:
            window_size (int): The window size used in the preprocessor pipeline. Default is 8.
            preprocessor_dir_skill (str): The directory path where the skill preprocessor model is stored.
                Default is "./preprocessor/skill/".
            preprocessor_dir_knowledge (str): The directory path where the knowledge preprocessor model is stored.
                Default is "./preprocessor/knowledge/".
        """
        self._dir_skill = preprocessor_dir_skill
        self._dir_knowledge = preprocessor_dir_knowledge
        self._window_size = window_size
        self._skill_preprocessor = self.__load_model(dir=self._dir_skill, model_name="jjzha/jobbert_skill_extraction")
        self._knowledge_preprocessor = self.__load_model(dir=self._dir_knowledge, model_name="jjzha/jobbert_knowledge_extraction")
        
    def __load_model(self, dir:str, model_name:str):
        """
        Loads a preprocessor model from the specified directory.

        Args:
            dir (str): The directory path where the preprocessor model is stored.
            model_name (str): The name of the preprocessor model to load.

        Returns:
            object: The preprocessor model loaded from the specified directory.
        """
        if not os.path.exists(dir) or not os.listdir(dir):
            token_classifier = pipeline(model=model_name, aggregation_strategy="first")
            token_classifier.save_pretrained(dir)
        return pipeline(model=dir, task="ner")   
    
    def preprocess_parallel(self, emails):
        with ThreadPoolExecutor() as executor:
            results = list(executor.map(self.process_text, emails))

        json_data = {}

        for i, result in enumerate(results, 1):
            keywords = []
            for entity in result['entities']:
                if entity['score'] > 0.85:
                    keywords.append({'word': self.postprocess_text(entity['word']), 'score': entity['score']})
            json_data[f'mail_{i}'] = {'title': result['title'], 'keywords': keywords}

        return json_data
    
    def postprocess_text(self, text):
        lemmatizer = WordNetLemmatizer()
        stop_words_english = set(stopwords.words("english"))
        stop_words_dutch = set(stopwords.words("dutch"))
        words = re.findall(r'\b\w+\b|\s+', text)
        cleaned_words = [lemmatizer.lemmatize(word.lower()) for word in words if word.strip().isalpha() and word.lower().strip() not in stop_words_english.union(stop_words_dutch)]
        cleaned_text = ' '.join(cleaned_words)
        return cleaned_text
    
    def process_text(self, email):
        sentences = email['text_body'].split('\n')
        entities = []

        for sentence in sentences:
            output_skills = self._skill_preprocessor(sentence)
            print(output_skills)
            for result in output_skills:
                if result.get("entity_group"):
                    result["entity_type"] = "Skill"
                    del result["entity_group"]
                    entities.append(result)

            output_knowledge = self._knowledge_preprocessor(sentence)
            for result in output_knowledge:
                if result.get("entity_group"):
                    result["entity"] = "Knowledge"
                    del result["entity_group"]
                    entities.append(result)
        
        return {"title": email['subject'], "text": email['text_body'], "entities": entities}


In [49]:
email =  [{
    "item_id": 0,
    "sender": "a1d400258b5c6e3d97307b2c949ffe01fe0aa27ab02ef1c351a7bfa6e0f300a3",
    "sender_email": "6fedc8e86e6e05504fefcdce51f8f73b69f5fd104c23dc5e9dba6c64e5536ffd",
    "datetime_received": 1707207587000,
    "sensitivity": "Normal",
    "subject": "Datawarehousing Specialist (4. Expert (10+)) SWI000876 - For Swift",
    "text_body": "ENKEL RECHTSTREEKS, GEEN TUSSENPARTIJEN AUB\r\n\r\nVOORRANG VASTE MEDEWERKERS\r\n\r\n\r\n\r\nHallo collega\u2019s,\r\n\r\n\r\n\r\nVoor Swift zoeken we een Datawarehousing Specialist (4. Expert (10+)) SWI000876 die voldoet aan volgende beschrijving:\r\n\r\n\r\n\r\nUiterste reactiedatum: 16/02/2024\r\n\r\nGewenste startdatum: 01/03/2024\r\n\r\nEinddatum: 31/08/2024\r\n\r\n\r\n\r\nReferentie: SWI000876\r\n\r\nTitel: Datawarehousing Specialist (4. Expert (10+)) SWI000876\r\n\r\nLocatie: THE NETHERLANDS - ZOETERWOUDE (ENERGIEWEG 33, 2382 NC ZOETERWOUDE, NEDERLAND)\r\n\r\nStatus: Gepubliceerd\r\n\r\nType contract: Time & material\r\n\r\nCategorie: Niet van toepassing\r\n\r\nAantal personen: 1\r\n\r\nAfdeling: Human Resource (HR)\r\n\r\n\r\n\r\nOmschrijving\r\n\r\n\r\n\r\nThe project for which the candidate will be assigned is called Digital Dashboards, having the goal of building executive dashboarding for Swift. The ideal candidate will have an extensive background and expertise in MS Power BI, with both the ability to design the data model, as well as the reports and dashboards.\r\n\r\nThe candidate will join the project team and will have a key role to play, not just delivering on the scope of the project, but also training the team on that technology since it is still quite new at Swift.\r\n\r\n\r\nOpdracht informatie\r\n\r\n\r\n\r\nProjectnaam: Digital Dashboards\r\n\r\nWerkregime: Voltijds\r\n\r\n\r\n\r\nVaardigheden\r\n\r\n\r\n\r\nSPECIFIEKE VAARDIGHEDEN\r\n\r\n\r\n\r\nData Modeling: Expert (10+)\r\n\r\nETL Development: Expert (10+)\r\n\r\nMS Power Bi: Expert (10+)\r\n\r\n\r\n\r\n\r\nIndien jullie geschikte kandidaten hebben ontvang ik graag hun beschikbaarheid, CV en kostprijs.\r\n\r\n\r\n\r\nAlvast hartelijk bedankt.\r\n\r\n\r\n[signature_1929168496]\r\n\r\nChannice \r\n\r\nExecutive Assistant - Business and sales support\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nDe Cronos Groep nv\r\n\r\nVeldkant 33A, 2550 Kontich\r\n\r\n\r\n",
    "label": "BI_ENGINEER",
    "keywords": [
      "Datawarehousing Specialist",
      "MS Power BI",
      "Data Modeling",
      "ETL Development"
    ]
}]

In [54]:
preprocessor = Preprocessor()

# loads in 6s and 660ms

In [55]:
preprocessor.preprocess_parallel(email)

# classification takes 1s and 180 ms

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[]
[]
[]
[]
[{'entity': 'B', 'score': 0.9989403, 'index': 2, 'word': 'researching', 'start': 6, 'end': 17}, {'entity': 'I', 'score': 0.99871325, 'index': 3, 'word': 'potential', 'start': 18, 'end': 27}, {'entity': 'I', 'score': 0.9916442, 'index': 4, 'word': 'solutions', 'start': 28, 'end': 37}, {'entity': 'B', 'score': 0.9979285, 'index': 35, 'word': 'developing', 'start': 194, 'end': 204}, {'entity': 'I', 'score': 0.9775203, 'index': 36, 'word': 'a', 'start': 205, 'end': 206}, {'entity': 'I', 'score': 0.9771836, 'index': 37, 'word': 'Python', 'start': 207, 'end': 213}, {'entity': 'I', 'score': 0.9831904, 'index': 38, 'word': 'program', 'start': 214, 'end': 221}]
[]
[]
[]
[{'entity': 'B', 'score': 0.999605, 'index': 13, 'word': 'collect', 'start': 52, 'end': 59}, {'entity': 'I', 'score': 0.99986744, 'index': 14, 'word': 'and', 'start': 60, 'end': 63}, {'entity': 'I', 'score': 0.99993, 'index': 15, 'word': 'display', 'start': 64, 'end': 71}, {'entity': 'I', 'score': 0.9999733, 'index':

{'mail_1': {'title': 'Datawarehousing Specialist (4. Expert (10+)) SWI000876 - For Swift',
  'keywords': []}}

In [48]:
class Preprocessor:
    def __init__(self, model_dir):
        self.model_dir = model_dir
        self.token_skill_classifier = None
        self.token_knowledge_classifier = None
        self.load_models()

    def load_models(self):
        skill_model_path = os.path.join(self.model_dir, "skill_model.joblib")
        knowledge_model_path = os.path.join(self.model_dir, "knowledge_model.joblib")

        if os.path.exists(skill_model_path):
            self.token_skill_classifier = load(skill_model_path)

        if os.path.exists(knowledge_model_path):
            self.token_knowledge_classifier = load(knowledge_model_path)

        if self.token_skill_classifier is None:
            self.token_skill_classifier = pipeline(model="jjzha/jobbert_skill_extraction", aggregation_strategy="first")
            dump(self.token_skill_classifier, skill_model_path)

        if self.token_knowledge_classifier is None:
            self.token_knowledge_classifier = pipeline(model="jjzha/jobbert_knowledge_extraction", aggregation_strategy="first")
            dump(self.token_knowledge_classifier, knowledge_model_path)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Grimm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Grimm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Total time elapsed for parallel processing: 5.04219913482666s

{'mail_1': {'title': 'Initial Inquiry - Pool Project', 'keywords': [{'word': 'researching potential solution', 'score': 0.9999105}, {'word': 'developing python program', 'score': 0.99930406}, {'word': 'collect display real time data', 'score': 0.99955183}, {'word': 'implement system', 'score': 0.9999465}, {'word': 'send alert notification', 'score': 0.9980915}, {'word': 'ensuring prompt action taken', 'score': 0.98590475}, {'word': 'logging', 'score': 0.98005944}, {'word': 'create database log file', 'score': 0.99996805}, {'word': 'store historical data', 'score': 0.9902846}, {'word': 'analysis', 'score': 0.8603323}, {'word': 'develop user friendly interface', 'score': 0.99995506}, {'word': 'python', 'score': 0.8881644}, {'word': 'python', 'score': 0.96697456}]}, 'mail_2': {'title': 'Follow Up - Pool Project Details', 'keywords': [{'word': 'availability', 'score': 0.88017213}]}, 'mail_3': {'title': 'Project Clarification - 