In [306]:
import os
import re
from transformers import pipeline
from googletrans import Translator

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words

In [307]:
class Preprocessor:
    def __init__(self):
        
        self._translator = Translator()
        
        nltk.download('punkt')
        nltk.download('stopwords')
        nltk.download('wordnet')
        nltk.download('words')
        
        self.words = set(words.words('en')+stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def __load_model(self, dir:str, model_name:str):
        if not os.path.exists(dir) or not os.listdir(dir):
            token_classifier = pipeline(model=model_name, aggregation_strategy="first")
            token_classifier.save_pretrained(dir)
        return pipeline(model=dir, task="ner")

    def __regex_privacy(self,text:str) -> str:
        patterns = [
            r'\r',  # Matches carriage return
            r'\n',  # Matches newline
            r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)',  # Matches URLs
            r'^[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}$',  # Matches phone numbers
            r'[0-9]',  # Matches digits
            r'-',  # Matches hyphens
            r'[\[\](){}<>]',  # Matches brackets and parentheses
            r'[,\.;:!?\&+_\/]'  # Matches common punctuation
        ]
        for pattern in patterns:
            text = re.sub(pattern, '  ', text)
        return text

    def __clean_text(self,text:str) -> str:
        text_multi_space = text.replace(' ','_')
        text_cleaned = self.__regex_privacy(text_multi_space)
        return text_cleaned

    def __translate_to_english(self, text:str) -> str:
        translated_text = self._translator.translate(text, src='auto', dest='en').text
        return translated_text

    def __tokenize(self,text:str) -> list[str]:
        tokens = word_tokenize(text)
        lemmatized_tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
        filtered_tokens = [token.lower() for token in lemmatized_tokens if token not in self.words]
        return filtered_tokens

    def preprocess(self, text:str) -> str:
        clean_text = self.__clean_text(text)
        text_english = self.__translate_to_english(clean_text)
        clean_english_text = self.__clean_text(text_english)
        tokens = self.__tokenize(clean_english_text)
        unique_tokens = list(set(tokens))
        return unique_tokens

In [308]:
email = {
    "item_id": 0,
    "sender": "a1d400258b5c6e3d97307b2c949ffe01fe0aa27ab02ef1c351a7bfa6e0f300a3",
    "sender_email": "6fedc8e86e6e05504fefcdce51f8f73b69f5fd104c23dc5e9dba6c64e5536ffd",
    "datetime_received": 1707207587000,
    "sensitivity": "Normal",
    "subject": "Datawarehousing Specialist (4. Expert (10+)) SWI000876 - For Swift",
    "text_body": "ENKEL RECHTSTREEKS, GEEN TUSSENPARTIJEN AUB\r\n\r\nVOORRANG VASTE MEDEWERKERS\r\n\r\n\r\n\r\nHallo collega\u2019s,\r\n\r\n\r\n\r\nVoor Swift zoeken we een Datawarehousing Specialist (4. Expert (10+)) SWI000876 die voldoet aan volgende beschrijving:\r\n\r\n\r\n\r\nUiterste reactiedatum: 16/02/2024\r\n\r\nGewenste startdatum: 01/03/2024\r\n\r\nEinddatum: 31/08/2024\r\n\r\n\r\n\r\nReferentie: SWI000876\r\n\r\nTitel: Datawarehousing Specialist (4. Expert (10+)) SWI000876\r\n\r\nLocatie: THE NETHERLANDS - ZOETERWOUDE (ENERGIEWEG 33, 2382 NC ZOETERWOUDE, NEDERLAND)\r\n\r\nStatus: Gepubliceerd\r\n\r\nType contract: Time & material\r\n\r\nCategorie: Niet van toepassing\r\n\r\nAantal personen: 1\r\n\r\nAfdeling: Human Resource (HR)\r\n\r\n\r\n\r\nOmschrijving\r\n\r\n\r\n\r\nThe project for which the candidate will be assigned is called Digital Dashboards, having the goal of building executive dashboarding for Swift. The ideal candidate will have an extensive background and expertise in MS Power BI, with both the ability to design the data model, as well as the reports and dashboards.\r\n\r\nThe candidate will join the project team and will have a key role to play, not just delivering on the scope of the project, but also training the team on that technology since it is still quite new at Swift.\r\n\r\n\r\nOpdracht informatie\r\n\r\n\r\n\r\nProjectnaam: Digital Dashboards\r\n\r\nWerkregime: Voltijds\r\n\r\n\r\n\r\nVaardigheden\r\n\r\n\r\n\r\nSPECIFIEKE VAARDIGHEDEN\r\n\r\n\r\n\r\nData Modeling: Expert (10+)\r\n\r\nETL Development: Expert (10+)\r\n\r\nMS Power Bi: Expert (10+)\r\n\r\n\r\n\r\n\r\nIndien jullie geschikte kandidaten hebben ontvang ik graag hun beschikbaarheid, CV en kostprijs.\r\n\r\n\r\n\r\nAlvast hartelijk bedankt.\r\n\r\n\r\n[signature_1929168496]\r\n\r\nChannice \r\n\r\nExecutive Assistant - Business and sales support\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nDe Cronos Groep nv\r\n\r\nVeldkant 33A, 2550 Kontich\r\n\r\n\r\n",
    "label": "BI_ENGINEER",
    "keywords": [
      "Datawarehousing Specialist",
      "MS Power BI",
      "Data Modeling",
      "ETL Development"
    ]
}

In [309]:
preprocessor = Preprocessor()

[nltk_data] Downloading package punkt to /home/jurrean/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jurrean/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jurrean/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /home/jurrean/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [310]:
preprocessor.preprocess(email['text_body'])

['information',
 'title',
 'datawarehousing',
 'signature',
 'cronos',
 'kontich',
 'number',
 'assistant',
 'de',
 'energewoude',
 'executive',
 'ideal',
 'description',
 'start',
 'join',
 'development',
 'sales',
 'candidate',
 'background',
 'publishedcategory',
 'power',
 'theability',
 'name',
 'field',
 'support',
 'reaction',
 'work',
 'specific',
 'hello',
 'nederland',
 'groupnv',
 'location',
 'human',
 'dashboarding',
 'swift',
 'bi',
 'full',
 'data',
 'zoeterwoude',
 'delivering',
 'business',
 'ms',
 'assignment',
 'called',
 'an',
 'skills',
 'be',
 'project',
 'swi',
 'department',
 'non',
 'digital',
 'building',
 'have',
 'team',
 'quitenew',
 'netherlands',
 'goal',
 'regime',
 'etl',
 'cv',
 'expertise',
 'hr',
 'expert',
 'dashboards',
 'only',
 'modeling',
 'status',
 'exualive',
 'if',
 'resource',
 'having',
 'assigned',
 'excellent',
 'specialist']