In [10]:
import os
from transformers import pipeline
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import DutchStemmer

In [11]:
# TODO: add the knowledge extraction
# TODO: add privacy extraction
# TODO: add different directories in the preprocessor 
# TODO: change the pipeline so that it detects the language of the input and uses the preprocessor for that language (supported languages: NL, DE, FR, EN)

class Preprocessor:
    """
    Basic preprocessing pipeline used to extract keywords themed around skills.

    An email body is tokenized with NLTK, stripped of Dutch stop words and
    non-alphabetic tokens, stemmed with the Dutch Snowball stemmer, and the
    resulting stems are passed to a token-classification ("ner") pipeline.
    """

    # Hugging Face model id that is downloaded when no local copy exists.
    MODEL_NAME = "jjzha/jobbert_skill_extraction"
    # Default directory in which the downloaded model is stored and reloaded from.
    PREPROCESSOR_DIR = "./preprocessor"
    # How sub-word pieces are merged back into whole words by the pipeline.
    AGGREGATION_STRATEGY = "first"

    def __init__(self, model_dir=None):
        """
        Create the preprocessor and load its model.

        Args:
            model_dir: Optional directory holding (or receiving) the saved model.
                Defaults to ``PREPROCESSOR_DIR`` when omitted.
        """
        self.__model = self.load_model(model_dir)

    def load_model(self, model_dir=None):
        """
        Load the skill-extraction pipeline from ``model_dir``.

        If the directory is missing or empty, the model is downloaded first and
        saved there, then loaded from disk.

        Args:
            model_dir: Directory of the saved model; ``None`` selects
                ``PREPROCESSOR_DIR``.

        Returns:
            The token-classification pipeline loaded from ``model_dir``.
        """
        model_dir = model_dir or self.PREPROCESSOR_DIR
        if not (os.path.isdir(model_dir) and os.listdir(model_dir)):
            downloaded = pipeline(
                model=self.MODEL_NAME,
                aggregation_strategy=self.AGGREGATION_STRATEGY,
            )
            # save_pretrained() is synchronous: the files are on disk when it
            # returns, so no polling of the directory is needed afterwards.
            downloaded.save_pretrained(model_dir)
        # The aggregation strategy is a pipeline argument, not part of the saved
        # model files, so it has to be passed again when loading from disk.
        return pipeline(
            model=model_dir,
            task="ner",
            aggregation_strategy=self.AGGREGATION_STRATEGY,
        )

    def preprocess_input(self, mail):
        """
        Extract skill keywords from the body of an email.

        Args:
            mail: Mapping with a ``"text_body"`` entry containing the email text.
                A ``None`` or empty body yields an empty result.

        Returns:
            A list of ``{'word': ..., 'score': ..., 'type': 'skill'}`` dicts, one
            per entity reported by the model for the stemmed tokens.

        Raises:
            KeyError: If ``mail`` has no ``"text_body"`` key.
        """
        body = mail["text_body"] or ""

        # Tokenize the lower-cased text using NLTK
        tokens = word_tokenize(body.lower())

        # Drop Dutch stop words and anything that is not purely alphabetic,
        # then reduce the remaining tokens to their Dutch Snowball stems.
        stop_words = set(stopwords.words("dutch"))
        stemmer = DutchStemmer()
        stems = [
            stemmer.stem(token)
            for token in tokens
            if token.isalpha() and token not in stop_words
        ]
        if not stems:
            # Nothing left to classify; skip the model call entirely.
            return []

        # Each stem is passed as a separate input.
        # NOTE(review): assumes the pipeline returns one list of entity dicts per
        # input stem (possibly empty) - confirm for the installed transformers version.
        predictions = self.__model(stems)
        return [
            {'word': item['word'], 'score': item['score'], 'type': 'skill'}
            for per_stem in predictions
            for item in per_stem
        ]


In [12]:
# Sample e-mail record used as input for Preprocessor.preprocess_input below.
# Only "text_body" is read by the preprocessor; the other fields are carried
# along as metadata.
# NOTE(review): "sender"/"sender_email" look like hex digests (anonymized
# values) and "datetime_received" looks like epoch milliseconds - confirm
# against the data source.
mail =  {
    "item_id": 0,
    "sender": "a1d400258b5c6e3d97307b2c949ffe01fe0aa27ab02ef1c351a7bfa6e0f300a3",
    "sender_email": "6fedc8e86e6e05504fefcdce51f8f73b69f5fd104c23dc5e9dba6c64e5536ffd",
    "datetime_received": 1707207587000,
    "sensitivity": "Normal",
    "subject": "Datawarehousing Specialist (4. Expert (10+)) SWI000876 - For Swift",
    # Raw mail body: mixed Dutch/English text with Windows line endings (\r\n).
    "text_body": "ENKEL RECHTSTREEKS, GEEN TUSSENPARTIJEN AUB\r\n\r\nVOORRANG VASTE MEDEWERKERS\r\n\r\n\r\n\r\nHallo collega\u2019s,\r\n\r\n\r\n\r\nVoor Swift zoeken we een Datawarehousing Specialist (4. Expert (10+)) SWI000876 die voldoet aan volgende beschrijving:\r\n\r\n\r\n\r\nUiterste reactiedatum: 16/02/2024\r\n\r\nGewenste startdatum: 01/03/2024\r\n\r\nEinddatum: 31/08/2024\r\n\r\n\r\n\r\nReferentie: SWI000876\r\n\r\nTitel: Datawarehousing Specialist (4. Expert (10+)) SWI000876\r\n\r\nLocatie: THE NETHERLANDS - ZOETERWOUDE (ENERGIEWEG 33, 2382 NC ZOETERWOUDE, NEDERLAND)\r\n\r\nStatus: Gepubliceerd\r\n\r\nType contract: Time & material\r\n\r\nCategorie: Niet van toepassing\r\n\r\nAantal personen: 1\r\n\r\nAfdeling: Human Resource (HR)\r\n\r\n\r\n\r\nOmschrijving\r\n\r\n\r\n\r\nThe project for which the candidate will be assigned is called Digital Dashboards, having the goal of building executive dashboarding for Swift. The ideal candidate will have an extensive background and expertise in MS Power BI, with both the ability to design the data model, as well as the reports and dashboards.\r\n\r\nThe candidate will join the project team and will have a key role to play, not just delivering on the scope of the project, but also training the team on that technology since it is still quite new at Swift.\r\n\r\n\r\nOpdracht informatie\r\n\r\n\r\n\r\nProjectnaam: Digital Dashboards\r\n\r\nWerkregime: Voltijds\r\n\r\n\r\n\r\nVaardigheden\r\n\r\n\r\n\r\nSPECIFIEKE VAARDIGHEDEN\r\n\r\n\r\n\r\nData Modeling: Expert (10+)\r\n\r\nETL Development: Expert (10+)\r\n\r\nMS Power Bi: Expert (10+)\r\n\r\n\r\n\r\n\r\nIndien jullie geschikte kandidaten hebben ontvang ik graag hun beschikbaarheid, CV en kostprijs.\r\n\r\n\r\n\r\nAlvast hartelijk bedankt.\r\n\r\n\r\n[signature_1929168496]\r\n\r\nChannice \r\n\r\nExecutive Assistant - Business and sales support\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nDe Cronos Groep nv\r\n\r\nVeldkant 33A, 2550 Kontich\r\n\r\n\r\n",
    "label": "BI_ENGINEER",
    # NOTE(review): presumably the reference keywords expected for this mail -
    # they are not read by the preprocessor.
    "keywords": [
      "Datawarehousing Specialist",
      "MS Power BI",
      "Data Modeling",
      "ETL Development"
    ]
}

In [13]:
# Build the preprocessor (the constructor loads the model, downloading and
# saving it first when the local model directory is missing or empty).
preprocessor = Preprocessor()
# Run the pipeline on the sample mail; `output` is a list of
# {'word', 'score', 'type'} dicts.
output = preprocessor.preprocess_input(mail)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
