# Installing dependencies and importing libraries

In [1]:
!pip install -U pip setuptools wheel -q
!pip install -U spacy -q
!python -m spacy download en_core_web_sm -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ipython 7.34.0 requires jedi>=0.16, which is not installed.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
from abc import ABC, abstractmethod
from enum import Enum, auto
from typing import List, Type, Union
import string
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import RSLPStemmer
from spacy.tokenizer import Tokenizer
from spacy.lang.xx import MultiLanguage

# Fetch the NLTK resources used by the pipeline steps below.
# 'punkt' / 'punkt_tab': sentence-splitting data used by word_tokenize. Recent NLTK
# releases load 'punkt_tab' instead of 'punkt'; requesting both keeps a fresh
# "Restart & Run All" working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
# 'stopwords': word lists read through nltk.corpus.stopwords (RemoveStopwords step).
nltk.download('stopwords')
# 'rslp': data for RSLPStemmer (Stemming step).
nltk.download('rslp')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


True

# Defining PreprocessingTextPipeline

In [5]:
class NLPLibrary(int, Enum):
    """Backend library a processing step should use.

    The members mix in ``int`` so that they compare equal to their raw values
    (``NLPLibrary.nlkt == 0``). The processing steps select a backend by comparing
    against ``0`` / ``1``; with a plain ``Enum`` those comparisons are always False,
    so passing a member instead of a bare integer would silently pick no backend.
    """

    # Original (misspelled) member name, kept so existing references keep working.
    nlkt = 0
    # Correctly spelled alias; it resolves to the very same member as `nlkt`.
    nltk = 0
    spacy = 1

class TextProcessingStep(ABC):
    """Abstract base class for one stage of a text preprocessing pipeline.

    Concrete steps implement ``execute`` (the transformation) and ``requires``
    (the step classes that must already be registered in the pipeline).
    """

    @abstractmethod
    def execute(self, data: Union[str, List[str]], library: 'NLPLibrary') -> Union[str, List[str]]:
        """Transform ``data`` and return the result.

        Args:
            data: Either raw text or a list of tokens, depending on the step.
            library: Backend selector forwarded by the pipeline on every call.
                Declared here so the abstract signature matches how the pipeline
                invokes steps and how every concrete step defines ``execute``.

        Returns:
            The transformed text or token list.
        """
        pass

    @abstractmethod
    def requires(self) -> List[Type['TextProcessingStep']]:
        """Return the step classes that must be added to the pipeline before this one."""
        pass

class PreprocessingTextPipeline():
    """Ordered chain of text processing steps sharing one backend selector.

    Steps are registered with ``add_step`` and applied in registration order by
    ``run``; each step receives the output of the previous one.
    """

    def __init__(self, nlpLibrary: NLPLibrary):
        """Create an empty pipeline that forwards ``nlpLibrary`` to every step."""
        self.nlpLibrary = nlpLibrary
        self.steps = []

    def add_step(self, step: TextProcessingStep):
        """Append ``step`` after checking that its prerequisites are registered.

        Raises:
            ValueError: If no already-registered step is an instance of one of
                the classes returned by ``step.requires()``.
        """
        for required_step in step.requires():
            for registered in self.steps:
                if isinstance(registered, required_step):
                    break
            else:
                # Loop finished without a match: the prerequisite is missing.
                raise ValueError(f"Step {step.__class__.__name__} requires {required_step.__name__} to be added first.")

        self.steps.append(step)

    def run(self, data: Union[str, List[str]]) -> Union[str, List[str]]:
        """Feed ``data`` through every registered step and return the final result."""
        result = data
        for stage in self.steps:
            result = stage.execute(result, self.nlpLibrary)
        return result

class Lowercase(TextProcessingStep):
    """Step that converts the input text to lower case."""

    def requires(self) -> List[Type[TextProcessingStep]]:
        """This step has no prerequisites."""
        return list()

    def execute(self, data: str, library: NLPLibrary) -> str:
        """Return ``data`` lower-cased; ``library`` is accepted but not used."""
        lowered = data.lower()
        return lowered

class RemoveNumbers(TextProcessingStep):
    """Step that strips every run of digits from the input text."""

    def requires(self) -> List[Type[TextProcessingStep]]:
        """This step has no prerequisites."""
        return list()

    def execute(self, data: str, library: NLPLibrary) -> str:
        """Return ``data`` with all digit runs removed; ``library`` is not used."""
        digit_runs = re.compile(r'\d+')
        return digit_runs.sub('', data)

class RemovePunctuation(TextProcessingStep):
    """Step that deletes every character listed in ``string.punctuation``."""

    def requires(self) -> List[Type[TextProcessingStep]]:
        """This step has no prerequisites."""
        return list()

    def execute(self, data: str, library: NLPLibrary) -> str:
        """Return ``data`` without punctuation characters; ``library`` is not used."""
        # A translation table that maps a code point to None deletes that character.
        deletions = {ord(symbol): None for symbol in string.punctuation}
        return data.translate(deletions)

class Tokenization(TextProcessingStep):
    """Step that splits a text string into a list of token strings."""

    def execute(self, data: str, library: NLPLibrary) -> List[str]:
        """Tokenize ``data`` with the backend selected by ``library``.

        Args:
            data: Text to tokenize.
            library: An ``NLPLibrary`` member or its raw integer value.

        Returns:
            The tokens as plain strings.

        Raises:
            ValueError: If ``library`` matches no known backend. Previously this
                case fell through and returned ``None``, which only failed later
                in a downstream step.
        """
        # Accept both the enum member and its raw value: a plain Enum member never
        # compares equal to an int, so `library == 0` alone rejects NLPLibrary.nlkt.
        if library in (NLPLibrary.nlkt, NLPLibrary.nlkt.value):
            # NOTE(review): word_tokenize is called without a language argument -
            # confirm its default is acceptable for the Portuguese input used below.
            return word_tokenize(data)
        if library in (NLPLibrary.spacy, NLPLibrary.spacy.value):
            tokens = MultiLanguage().tokenizer(data)
            return [token.text for token in tokens]
        raise ValueError(f"Unsupported NLP library: {library!r}")

    def requires(self) -> List[Type[TextProcessingStep]]:
        """This step has no prerequisites."""
        return []

class RemoveStopwords(TextProcessingStep):
    """Step that drops Portuguese stop words from a list of tokens."""

    def execute(self, data: List[str], library: NLPLibrary) -> List[str]:
        """Return the tokens of ``data`` that are not NLTK Portuguese stop words.

        Args:
            data: Tokens to filter; each one is lower-cased for the comparison only.
            library: An ``NLPLibrary`` member or its raw integer value.

        Raises:
            NotImplementedError: For any backend other than NLTK.
        """
        # Accept both the enum member and its raw value: a plain Enum member never
        # compares equal to an int, so `library == 0` alone rejects NLPLibrary.nlkt.
        if library in (NLPLibrary.nlkt, NLPLibrary.nlkt.value):
            stop_words = set(stopwords.words('portuguese'))
            return [word for word in data if word.lower() not in stop_words]
        raise NotImplementedError("Implement spaCy way")

    def requires(self) -> List[Type[TextProcessingStep]]:
        """Tokens are needed as input, so Tokenization must come first."""
        return [Tokenization]

class Stemming(TextProcessingStep):
    """Step that reduces each token to its stem using NLTK's RSLPStemmer."""

    def execute(self, data: List[str], library: NLPLibrary) -> List[str]:
        """Return the stem of every token in ``data``.

        Args:
            data: Tokens to stem.
            library: An ``NLPLibrary`` member or its raw integer value.

        Raises:
            NotImplementedError: For any backend other than NLTK.
        """
        # Accept both the enum member and its raw value: a plain Enum member never
        # compares equal to an int, so `library == 0` alone rejects NLPLibrary.nlkt.
        if library in (NLPLibrary.nlkt, NLPLibrary.nlkt.value):
            stemmer = RSLPStemmer()
            return [stemmer.stem(word) for word in data]
        raise NotImplementedError("Implement spaCy way")

    def requires(self) -> List[Type[TextProcessingStep]]:
        """Stemming runs on tokens after stop-word removal."""
        return [Tokenization, RemoveStopwords]

## Defining and running the pipeline

In [8]:
# 0 is the value of NLPLibrary.nlkt; the steps compare the selector against raw integers.
pipeline = PreprocessingTextPipeline(0)

# Registration order matters: add_step raises ValueError when a step's
# prerequisites (see each step's requires()) have not been added yet.
for step in (
    Lowercase(),
    RemoveNumbers(),
    RemovePunctuation(),
    Tokenization(),
    RemoveStopwords(),
    Stemming(),
):
    pipeline.add_step(step)

In [9]:
# Sample sentence used to exercise the pipeline end to end.
raw_text = "Bom dia! Meu nome é Jonas Brothers e eu sou do grupo 1."
# run() applies every registered step in order, feeding each step the previous output.
preprocessed_text = pipeline.run(raw_text)

In [10]:
# Show the token list produced by the last pipeline step.
print(preprocessed_text)

['bom', 'dia', 'nom', 'jon', 'broth', 'grup']
