In [49]:
import os
from transformers import pipeline, Pipeline
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import EnglishStemmer
from googletrans import Translator

In [None]:
# TODO: add privacy extraction
# TODO: add keyword checker

class Preprocessor:
    """
    Basic preprocessing pipeline that extracts skill and knowledge keywords from an e-mail.

    The e-mail body is translated to English, tokenized, stripped of stopwords and
    non-alphabetic tokens, stemmed, and then fed in overlapping windows to two
    token-classification pipelines (one for skills, one for knowledge).

    Attributes:
        _dir_skill (str): The directory path where the skill preprocessor model is stored.
        _dir_knowledge (str): The directory path where the knowledge preprocessor model is stored.
        _window_size (int): Stride of the sliding window; each window holds up to
            ``2 * _window_size`` tokens.
        _skill_preprocessor (Pipeline): Token-classification pipeline used for skills.
        _knowledge_preprocessor (Pipeline): Token-classification pipeline used for knowledge.

    Methods:
        __init__: Initializes the Preprocessor class with default or user-specified parameters.
        preprocess_input: Preprocesses input and gives skills & knowledge back
    """

    def __init__(self, window_size: int = 8, preprocessor_dir_skill: str = "./preprocessor/skill/", preprocessor_dir_knowledge: str = "./preprocessor/knowledge/"):
        """
        Initializes the Preprocessor class with default or user-specified parameters.

        Args:
            window_size (int): The window size used in the preprocessor pipeline. Default is 8.
            preprocessor_dir_skill (str): The directory path where the skill preprocessor model is stored.
                Default is "./preprocessor/skill/".
            preprocessor_dir_knowledge (str): The directory path where the knowledge preprocessor model is stored.
                Default is "./preprocessor/knowledge/".

        Raises:
            ValueError: If ``window_size`` is smaller than 1 (the sliding window
                could not advance).
        """
        if window_size < 1:
            raise ValueError("window_size must be a positive integer")
        self._dir_skill = preprocessor_dir_skill
        self._dir_knowledge = preprocessor_dir_knowledge
        self._window_size = window_size
        self._skill_preprocessor = self.__load_model(dir=self._dir_skill, model_name="jjzha/jobbert_skill_extraction")
        self._knowledge_preprocessor = self.__load_model(dir=self._dir_knowledge, model_name="jjzha/jobbert_knowledge_extraction")

    def __load_model(self, dir: str, model_name: str) -> "Pipeline":
        """
        Loads a preprocessor model from the specified directory.

        If the directory is missing or empty, the model is first fetched by name
        and saved into the directory, then loaded from there.

        Args:
            dir (str): The directory path where the preprocessor model is stored.
            model_name (str): The name of the preprocessor model to load.

        Returns:
            Pipeline: The token-classification pipeline loaded from the directory.
        """
        if not os.path.exists(dir) or not os.listdir(dir):
            token_classifier = pipeline(model=model_name, aggregation_strategy="first")
            token_classifier.save_pretrained(dir)
        # The aggregation strategy is a call-time pipeline argument, so it has to
        # be passed again when reloading from disk; without it the pipeline
        # returns individual word pieces instead of whole-word entities.
        return pipeline(model=dir, task="ner", aggregation_strategy="first")

    def __translate_to_english(self, text: str) -> str:
        """
        Translates ``text`` to English, auto-detecting the source language.

        Args:
            text (str): The text to translate.

        Returns:
            str: The translated text; empty or whitespace-only input is returned
                unchanged without calling the translation service.
        """
        if not text or not text.strip():
            return text
        # NOTE(review): some recent googletrans releases expose translate() as a
        # coroutine - confirm the pinned version is the synchronous one.
        translator = Translator()
        return translator.translate(text, src='auto', dest='en').text

    def _tokenize(self, text: str) -> list[str]:
        """
        Turns English text into lowercase, stopword-free, stemmed tokens.

        Args:
            text (str): English text.

        Returns:
            list[str]: Stemmed alphabetic tokens in their original order.
        """
        # NOTE(review): lowercasing and stemming before a transformer tagger
        # (and dropping non-alphabetic tokens such as "v2") is flagged as
        # questionable by the original author - revisit.
        stop_words = set(stopwords.words('english'))
        stemmer = EnglishStemmer()
        return [
            stemmer.stem(token)
            for token in word_tokenize(text.lower())
            if token.isalpha() and token not in stop_words
        ]

    @staticmethod
    def _sliding_windows(tokens: list[str], window_size: int) -> list[list[str]]:
        """
        Splits ``tokens`` into overlapping windows of up to ``2 * window_size`` tokens.

        Consecutive windows start ``window_size`` tokens apart, so every token is
        covered. Inputs with at most ``window_size`` tokens yield one window that
        holds all of them; an empty input yields no windows.
        """
        if not tokens:
            return []
        if len(tokens) <= window_size:
            # The stepped range below would be empty here, silently skipping
            # short e-mails.
            return [tokens]
        return [
            tokens[i - window_size:i + window_size]
            for i in range(window_size, len(tokens), window_size)
        ]

    def _extract_terms(self, extractor, tokens: list[str], term_type: str) -> list[dict[str, object]]:
        """
        Runs ``extractor`` over every token window and merges the results.

        Args:
            extractor: Callable taking a string and returning a list of dicts
                with at least the keys 'word' and 'score'.
            tokens (list[str]): Preprocessed tokens.
            term_type (str): Value stored under 'type' in every returned term.

        Returns:
            list[dict[str, object]]: One ``{'word', 'score', 'type'}`` dict per
                distinct word, in order of first appearance.
        """
        best_by_word: dict[str, dict[str, object]] = {}
        for window in self._sliding_windows(tokens, self._window_size):
            for entity in extractor(' '.join(window)):
                word, score = entity['word'], entity['score']
                known = best_by_word.get(word)
                # Windows overlap, so the same word can be reported twice;
                # keep only its highest-scoring occurrence.
                if known is None or score > known['score']:
                    best_by_word[word] = {'word': word, 'score': score, 'type': term_type}
        return list(best_by_word.values())

    def preprocess_input(self, email: dict) -> dict[str, list[dict[str, object]]]:
        """
        Preprocesses the input email to extract skills and knowledge.

        Args:
            email (dict): The input email to be preprocessed; its 'text_body'
                entry is the text that gets analysed.

        Returns:
            dict[str, list[dict[str, object]]]: A dictionary with the two keys
                'skill' and 'knowledge'. Each holds a list of dictionaries with
                the keys 'word', 'score' and 'type' ('skill' or 'knowledge').

        Raises:
            KeyError: If ``email`` has no 'text_body' entry.

        Note:
            This function performs the following preprocessing steps:
            1. Translates the email text to English.
            2. Tokenizes the text and converts it to lowercase.
            3. Removes stopwords and non-alphabetic tokens.
            4. Stems the remaining tokens.
            5. Processes the tokens using separate preprocessors for skills and knowledge.
            6. Combines the output of the preprocessors into a dictionary.

        Example:
                # Initialize the object
                preprocessor = Preprocessor()

                # Define an example email
                email = {
                    'text_body': 'This is a sample email containing information about programming and machine learning.'
                }

                # Preprocess the email
                result = preprocessor.preprocess_input(email)

                # Access the extracted skills and knowledge
                skills = result['skill']
                knowledge = result['knowledge']
        """
        text_en = self.__translate_to_english(email['text_body'])
        tokens = self._tokenize(text_en)

        return {
            'skill': self._extract_terms(self._skill_preprocessor, tokens, 'skill'),
            'knowledge': self._extract_terms(self._knowledge_preprocessor, tokens, 'knowledge'),
        }


In [None]:
# Sample request e-mail (mixed Dutch/English body) used as manual input for
# Preprocessor.preprocess_input, which reads only the 'text_body' entry.
# NOTE(review): 'sender' / 'sender_email' look like hashed identifiers and
# 'datetime_received' like an epoch timestamp in milliseconds - confirm against
# the data source. 'label' and 'keywords' are presumably the expected
# annotation for this e-mail; nothing in this file consumes them.
email1 =  {
    "item_id": 0,
    "sender": "a1d400258b5c6e3d97307b2c949ffe01fe0aa27ab02ef1c351a7bfa6e0f300a3",
    "sender_email": "6fedc8e86e6e05504fefcdce51f8f73b69f5fd104c23dc5e9dba6c64e5536ffd",
    "datetime_received": 1707207587000,
    "sensitivity": "Normal",
    "subject": "Datawarehousing Specialist (4. Expert (10+)) SWI000876 - For Swift",
    "text_body": "ENKEL RECHTSTREEKS, GEEN TUSSENPARTIJEN AUB\r\n\r\nVOORRANG VASTE MEDEWERKERS\r\n\r\n\r\n\r\nHallo collega\u2019s,\r\n\r\n\r\n\r\nVoor Swift zoeken we een Datawarehousing Specialist (4. Expert (10+)) SWI000876 die voldoet aan volgende beschrijving:\r\n\r\n\r\n\r\nUiterste reactiedatum: 16/02/2024\r\n\r\nGewenste startdatum: 01/03/2024\r\n\r\nEinddatum: 31/08/2024\r\n\r\n\r\n\r\nReferentie: SWI000876\r\n\r\nTitel: Datawarehousing Specialist (4. Expert (10+)) SWI000876\r\n\r\nLocatie: THE NETHERLANDS - ZOETERWOUDE (ENERGIEWEG 33, 2382 NC ZOETERWOUDE, NEDERLAND)\r\n\r\nStatus: Gepubliceerd\r\n\r\nType contract: Time & material\r\n\r\nCategorie: Niet van toepassing\r\n\r\nAantal personen: 1\r\n\r\nAfdeling: Human Resource (HR)\r\n\r\n\r\n\r\nOmschrijving\r\n\r\n\r\n\r\nThe project for which the candidate will be assigned is called Digital Dashboards, having the goal of building executive dashboarding for Swift. The ideal candidate will have an extensive background and expertise in MS Power BI, with both the ability to design the data model, as well as the reports and dashboards.\r\n\r\nThe candidate will join the project team and will have a key role to play, not just delivering on the scope of the project, but also training the team on that technology since it is still quite new at Swift.\r\n\r\n\r\nOpdracht informatie\r\n\r\n\r\n\r\nProjectnaam: Digital Dashboards\r\n\r\nWerkregime: Voltijds\r\n\r\n\r\n\r\nVaardigheden\r\n\r\n\r\n\r\nSPECIFIEKE VAARDIGHEDEN\r\n\r\n\r\n\r\nData Modeling: Expert (10+)\r\n\r\nETL Development: Expert (10+)\r\n\r\nMS Power Bi: Expert (10+)\r\n\r\n\r\n\r\n\r\nIndien jullie geschikte kandidaten hebben ontvang ik graag hun beschikbaarheid, CV en kostprijs.\r\n\r\n\r\n\r\nAlvast hartelijk bedankt.\r\n\r\n\r\n[signature_1929168496]\r\n\r\nChannice \r\n\r\nExecutive Assistant - Business and sales support\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nDe Cronos Groep nv\r\n\r\nVeldkant 33A, 2550 Kontich\r\n\r\n\r\n",
    "label": "BI_ENGINEER",
    "keywords": [
      "Datawarehousing Specialist",
      "MS Power BI",
      "Data Modeling",
      "ETL Development"
    ]
}

# Second sample request e-mail (Dutch body) used as manual input for
# Preprocessor.preprocess_input, which reads only the 'text_body' entry.
# NOTE(review): the 'text_body' literal used to be split over two physical
# lines between "Connect on " and "LinkedIn<...", which is a syntax error in a
# "..." string. The break is restored below as an explicit \n escape - confirm
# the exact character against the original mailbox export.
# 'label' and 'keywords' are presumably the expected annotation for this
# e-mail; nothing in this file consumes them.
email2 = {
    "item_id": 1,
    "sender": "14f33059402db7d415bfc0da4cc7790465ad377a8aa325542a61217c6b7dde1e",
    "sender_email": "71aae3a5555cf66927e64affd0e6fc6eda03ec12c193b5fa4f9c247bc60a9043",
    "datetime_received": 1701859209000,
    "sensitivity": "Normal",
    "subject": "Data engineer met functionele analyse kennis gezocht",
    "text_body": "Dag P&Cs\r\n\r\nVoor een project zoeken we een deeltijdse data engineer die voldoet aan volgende vereisten (starttijd januari, vermoedelijke einddatum einde maar 2024).  Mochten er personen zijn die hier op matchen, gelive me per kerende te contacteren:\r\n\r\n\r\n  *   Aantoonbare ervaring als Data engineer in een MS Azure omgeving (Stream Analytics, Analysis Services, Data Factory v2, Data Lake, ...) (3 jaar)\r\n  *   Aantoonbare ervaring als Functioneel Analist waarvan minstens 1 project in een industri\u00eble IoT/OT omgeving (3 jaar)\r\n  *   Aantoonbare ervaring met datamodellering (dataflows en datamodel).\r\n  *   Aantoonbare ervaring met Kafka, Databricks, Python codeertaal en Power BI.\r\n  *   Kennis van data vault modellering\r\n  *   Hogere opleiding (Master of Bachelor) met technische/engineering achtergrond of gelijkwaardig door ervaring met ICT-technologie waarvoor expertise gevraagd wordt\r\n\r\n  *   Aantoonbare ervaring met Agile / scrum werking\r\n  *   Aantoonbare ervaring met Confluent\r\n  *   Kennis van Deltalake, Parquet files en time series data\r\n\r\nMet vriendelijke groeten\r\n\r\n[A blue and white sign with a white letter c  Description automatically generated]\r\nKoen \r\nFounder & Managing Partner\r\n<tel:+324## ## ## ##>\r\nwww.convolve.be<https://safelink.com/?url=https%3A%2F%2Fwww.convolve.be%2F&data=05%7C01%##################%40email.com%7C384a086775364bd760c508dbf647afc7%7C49c3d703357947bfa8887c913fbdced9%7C0%7C0%7C638374560092107236%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata=u3sAeXowJyvSBz%2BC%2BdhMWkAfjhyKLr2NjPFcwGLcK0s%3D&reserved=0>\r\nConnect on \nLinkedIn<https://safelink.com/?url=https%3A%2F%2Fwww.linkedin.com%2Fin%2Fkoen-rutten-260bb71&data=05%7C01%##################%40email.com%7C384a086775364bd760c508dbf647afc7%7C49c3d703357947bfa8887c913fbdced9%7C0%7C0%7C638374560092107236%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata=5q9inshhDlX1Nxy8DT%2FhLqHeOK3YVvDH4e%2FQV%2BC1x8Y%3D&reserved=0>\r\n\r\n",
    "label": "DATA_ENGINEER",
    "keywords": [
      "MS Azure",
      "Stream Analytics",
      "Analysis Services",
      "Data Factory v2",
      "Data Lake",
      "Industrial IoT/OT",
      "datamodelling",
      "dataflows",
      "datamodel",
      "Kafka",
      "Databricks",
      "Python",
      "Power BI",
      "data vault modellering",
      "Agile",
      "scrum",
      "Confluent",
      "Deltalake",
      "Parquet files",
      "time series data"
    ]
}

In [None]:
# Build the extractor with its defaults (window_size=8, model directories
# "./preprocessor/skill/" and "./preprocessor/knowledge/"); the constructor
# loads both the skill and the knowledge model.
preprocessor = Preprocessor()

# loads in 6s and 660ms

In [None]:
# Run the skill/knowledge extraction on the first sample e-mail (email1).
preprocessor.preprocess_input(email1)

# classification takes 1s and 180 ms

In [None]:
# Run the skill/knowledge extraction on the second sample e-mail (email2).
preprocessor.preprocess_input(email2)