In [166]:
import re
from nltk.corpus import stopwords

class Preprocessor:
    """
    Extracts known domain keywords from the text body of an email.

    The pipeline is: tokenize the body into lowercase ASCII alphanumeric
    tokens, keep only tokens that exactly equal one of the predefined
    keywords, drop English/French/Dutch stopwords, and store the sorted,
    de-duplicated result under the email's 'keywords' key.
    """

    def __init__(self):
        """
        Initialize Preprocessor class with keywords and stopwords.

        The stopword lists are read from the NLTK 'stopwords' corpus; NLTK
        raises LookupError when that corpus has not been downloaded
        (see nltk.download('stopwords')).
        """
        # A frozenset gives O(1) exact-match lookups and makes repeated
        # entries in the literal harmless.
        self.__keywords = frozenset([
            'advanced', 'alm', 'analyst', 'analytical', 'analytics', 'api', 'app', 'application',
            'architecture', 'arm', 'asset', 'automation', 'availability', 'azure', 'ba', 'bi', 'big',
            'bnppf', 'c', 'cd', 'central', 'ci', 'cloud', 'cloudtrail', 'code', 'cognos', 'container',
            'crisp', 'culture', 'customer', 'dashboard', 'data', 'databricks', 'dataiku', 'datalake',
            'datasets', 'dax', 'db', 'dbt', 'deepdives', 'delta', 'design', 'desktop', 'developer',
            'development', 'devops', 'dimensional', 'docker', 'dynamic', 'ecosystem', 'emr', 'engineer',
            'environment', 'erp', 'etl', 'etp', 'excel', 'experience', 'factory', 'flow', 'gen', 'git',
            'governance', 'handling', 'high', 'hr', 'hub', 'ict', 'infrastructure', 'innovation',
            'insight', 'integration', 'intelligence', 'interface', 'iot', 'java', 'kibana', 'lake',
            'lambda', 'language', 'large', 'layer', 'learning', 'level', 'linux', 'm', 'machine',
            'maintenance', 'management', 'manipulating', 'mart', 'master', 'medior', 'meeting',
            'microsoft', 'migration', 'model', 'modeling', 'mongo', 'need', 'neo', 'operation',
            'optimization', 'oracle', 'orchestration', 'package', 'paginated', 'panda', 'pipeline',
            'platform', 'power', 'practice', 'preparation', 'processing', 'procurement', 'product',
            'programming', 'project', 'qliksense', 'quality', 'querying', 'report', 'reporting',
            'requirement', 'roadmap', 'row', 's3', 'sa', 'safe', 'sagemaker', 'scalable', 'security',
            'semi', 'service', 'setup', 'shiny', 'signal', 'skill', 'solution', 'specialist', 'sql',
            'ssis', 'ssms', 'stack', 'strategic', 'strategy', 'stream', 'structured', 'studio',
            'synapse', 't', 'technique', 'technology', 'tool', 'toolkit', 'topdesk', 'topic',
            'transform', 'transformation', 'ux', 'value', 'vault', 'visual', 'visualisation',
            'visualization', 'visuals', 'vpc', 'warehouse', 'wifi', 'workspace', 'wph',
        ])
        self.__english_stopwords = set(stopwords.words('english'))
        self.__french_stopwords = set(stopwords.words('french'))
        self.__dutch_stopwords = set(stopwords.words('dutch'))

    def __get_tokens(self, text: str) -> list[str]:
        """
        Tokenizes the given text.

        Every character outside A-Z, a-z and 0-9 (punctuation, whitespace,
        line breaks, and also non-ASCII letters such as accented characters)
        acts as a separator. Tokens are lowercased.

        Args:
        text (str): The text to tokenize.

        Returns:
        list[str]: A list of tokens extracted from the text; empty when the
        text contains no ASCII alphanumeric characters.
        """
        # findall never yields empty strings, so empty or whitespace-only
        # input produces [] rather than [''].
        return [token.lower() for token in re.findall(r'[A-Za-z0-9]+', text)]

    def __get_interesting_tokens(self, tokens: list[str]) -> list[str]:
        """
        Filters tokens based on predefined keywords.

        A token is kept only when it is exactly equal to a keyword. A
        substring test would also accept fragments of keywords (for example
        'nc' inside 'governance') and the empty string.

        Args:
        tokens (list[str]): List of tokens to filter.

        Returns:
        list[str]: De-duplicated, alphabetically sorted list of the tokens
        that are keywords.
        """
        return sorted({token for token in tokens if token in self.__keywords})

    def __filter_stopwords(self, tokens: list[str]) -> list[str]:
        """
        Removes stopwords from the list of tokens.

        Args:
        tokens (list[str]): List of tokens to filter.

        Returns:
        list[str]: Tokens that are not English, French or Dutch stopwords,
        in their original order.
        """
        return [
            token for token in tokens
            if token not in self.__english_stopwords
            and token not in self.__french_stopwords
            and token not in self.__dutch_stopwords
        ]

    def preprocess(self, email: dict) -> dict:
        """
        Preprocesses the text of an email by tokenizing, filtering based on keywords, and removing stopwords.

        The email is modified in place: any existing 'keywords' entry is
        replaced by the extracted tokens. Pass a copy to keep the original.

        Args:
        email (dict): The email record; its 'text_body' entry is the text to
        preprocess. A missing or None 'text_body' is treated as empty text.

        Returns:
        dict: The same email object, with 'keywords' set to the preprocessed
        tokens sorted alphabetically.
        """
        # `or ''` covers both an absent key and an explicit None value.
        text = email.get('text_body') or ''
        tokens = self.__get_tokens(text)
        interesting_tokens = self.__get_interesting_tokens(tokens)
        preprocessed_tokens = self.__filter_stopwords(interesting_tokens)
        preprocessed_tokens.sort()
        email['keywords'] = preprocessed_tokens
        return email

In [167]:
# Sample email record used below to exercise Preprocessor.preprocess().
# - 'text_body' is the raw text that gets tokenized (Dutch boilerplate around
#   an English job description, with \r\n line breaks).
# - 'keywords' holds multi-word reference phrases here (presumably hand-labelled
#   — confirm); preprocess() replaces this entry on the dict it is given.
# - 'sender' / 'sender_email' look like hashed identifiers (presumably
#   anonymised — confirm).
email =  {
    "item_id": 0,
    "sender": "a1d400258b5c6e3d97307b2c949ffe01fe0aa27ab02ef1c351a7bfa6e0f300a3",
    "sender_email": "6fedc8e86e6e05504fefcdce51f8f73b69f5fd104c23dc5e9dba6c64e5536ffd",
    "datetime_received": 1707207587000,
    "sensitivity": "Normal",
    "subject": "Datawarehousing Specialist (4. Expert (10+)) SWI000876 - For Swift",
    "text_body": "ENKEL RECHTSTREEKS, GEEN TUSSENPARTIJEN AUB\r\n\r\nVOORRANG VASTE MEDEWERKERS\r\n\r\n\r\n\r\nHallo collega\u2019s,\r\n\r\n\r\n\r\nVoor Swift zoeken we een Datawarehousing Specialist (4. Expert (10+)) SWI000876 die voldoet aan volgende beschrijving:\r\n\r\n\r\n\r\nUiterste reactiedatum: 16/02/2024\r\n\r\nGewenste startdatum: 01/03/2024\r\n\r\nEinddatum: 31/08/2024\r\n\r\n\r\n\r\nReferentie: SWI000876\r\n\r\nTitel: Datawarehousing Specialist (4. Expert (10+)) SWI000876\r\n\r\nLocatie: THE NETHERLANDS - ZOETERWOUDE (ENERGIEWEG 33, 2382 NC ZOETERWOUDE, NEDERLAND)\r\n\r\nStatus: Gepubliceerd\r\n\r\nType contract: Time & material\r\n\r\nCategorie: Niet van toepassing\r\n\r\nAantal personen: 1\r\n\r\nAfdeling: Human Resource (HR)\r\n\r\n\r\n\r\nOmschrijving\r\n\r\n\r\n\r\nThe project for which the candidate will be assigned is called Digital Dashboards, having the goal of building executive dashboarding for Swift. The ideal candidate will have an extensive background and expertise in MS Power BI, with both the ability to design the data model, as well as the reports and dashboards.\r\n\r\nThe candidate will join the project team and will have a key role to play, not just delivering on the scope of the project, but also training the team on that technology since it is still quite new at Swift.\r\n\r\n\r\nOpdracht informatie\r\n\r\n\r\n\r\nProjectnaam: Digital Dashboards\r\n\r\nWerkregime: Voltijds\r\n\r\n\r\n\r\nVaardigheden\r\n\r\n\r\n\r\nSPECIFIEKE VAARDIGHEDEN\r\n\r\n\r\n\r\nData Modeling: Expert (10+)\r\n\r\nETL Development: Expert (10+)\r\n\r\nMS Power Bi: Expert (10+)\r\n\r\n\r\n\r\n\r\nIndien jullie geschikte kandidaten hebben ontvang ik graag hun beschikbaarheid, CV en kostprijs.\r\n\r\n\r\n\r\nAlvast hartelijk bedankt.\r\n\r\n\r\n[signature_1929168496]\r\n\r\nChannice \r\n\r\nExecutive Assistant - Business and sales support\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nDe Cronos Groep nv\r\n\r\nVeldkant 33A, 2550 Kontich\r\n\r\n\r\n",
    "label": "BI_ENGINEER",
    "keywords": [
      "Datawarehousing Specialist",
      "MS Power BI",
      "Data Modeling",
      "ETL Development"
    ]
}

In [168]:
# Build the preprocessor once and reuse it: the constructor reads the English,
# French and Dutch stopword lists from the NLTK corpus.
preprocessor = Preprocessor()

In [169]:
# preprocess() assigns the 'keywords' key on the dict it receives. Passing a
# shallow copy keeps the reference keywords in `email` intact, so later cells
# can still compare against them and this cell leaves no hidden state behind.
preprocessor.preprocess(dict(email))

{'item_id': 0,
 'sender': 'a1d400258b5c6e3d97307b2c949ffe01fe0aa27ab02ef1c351a7bfa6e0f300a3',
 'sender_email': '6fedc8e86e6e05504fefcdce51f8f73b69f5fd104c23dc5e9dba6c64e5536ffd',
 'datetime_received': 1707207587000,
 'sensitivity': 'Normal',
 'subject': 'Datawarehousing Specialist (4. Expert (10+)) SWI000876 - For Swift',
 'text_body': 'ENKEL RECHTSTREEKS, GEEN TUSSENPARTIJEN AUB\r\n\r\nVOORRANG VASTE MEDEWERKERS\r\n\r\n\r\n\r\nHallo collega’s,\r\n\r\n\r\n\r\nVoor Swift zoeken we een Datawarehousing Specialist (4. Expert (10+)) SWI000876 die voldoet aan volgende beschrijving:\r\n\r\n\r\n\r\nUiterste reactiedatum: 16/02/2024\r\n\r\nGewenste startdatum: 01/03/2024\r\n\r\nEinddatum: 31/08/2024\r\n\r\n\r\n\r\nReferentie: SWI000876\r\n\r\nTitel: Datawarehousing Specialist (4. Expert (10+)) SWI000876\r\n\r\nLocatie: THE NETHERLANDS - ZOETERWOUDE (ENERGIEWEG 33, 2382 NC ZOETERWOUDE, NEDERLAND)\r\n\r\nStatus: Gepubliceerd\r\n\r\nType contract: Time & material\r\n\r\nCategorie: Niet van toepass