In [None]:
from bs4 import BeautifulSoup
import re
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
class PreProcessor:
    """Clean raw text into normalized sentences and compute mean TF-IDF weights.

    Call :meth:`fit` with an iterable of raw texts; the cleaned sentences,
    their word lists and a ``{term: mean tf-idf}`` dictionary are then
    available through the ``get_*`` accessors.

    Args:
        word_min_count: Stored on the instance; not used by any method of
            this class.
        remove_stopwords: When true, English stop words (NLTK corpus) are
            dropped from every cleaned sentence.
        tokenizer: Optional object exposing ``tokenize(text)`` that splits a
            text into sentences. When ``None`` each dataset item is treated
            as a single sentence.
        stemming: When true, every word is stemmed with NLTK's Snowball
            English stemmer.
    """

    # Matches every character that is not an ASCII letter.  The range must be
    # ``A-Z``: ``A-z`` would also span ``[ \ ] ^ _`` and the backtick.
    _NON_LETTER_RE = re.compile(r'[^a-zA-Z]')

    def __init__(self, word_min_count=1, remove_stopwords=False, tokenizer=None, stemming = True):
        self.__dataset = None
        self.__cleaned_sentences = None
        self.__cleaned_sentences_word_list = None
        self.__tfidf_dict = None
        self.__word_min_count = word_min_count
        self.__remove_stopwords = remove_stopwords
        self.__tokenizer = tokenizer
        self.__stemming = stemming
        # Created lazily on first use and reused for every sentence.
        self.__stemmer = None
        self.__stopwords = None

    def get_cleaned_sentences_word_list(self):
        """Return the cleaned sentences as lists of words (``None`` before ``fit``)."""
        return self.__cleaned_sentences_word_list

    def get_cleaned_sentences(self):
        """Return the cleaned sentences as strings (``None`` before ``fit``)."""
        return self.__cleaned_sentences

    def get_tfidf_dict(self):
        """Return the ``{term: mean tf-idf weight}`` mapping (``None`` before ``fit``)."""
        return self.__tfidf_dict

    def fit(self, dataset):
        """Clean every text in ``dataset`` and compute the TF-IDF dictionary.

        Args:
            dataset: Iterable of raw text strings (may contain HTML markup).
        """
        self.__dataset = dataset
        self.__gen_cleaned_sentences()
        self.__gen_tfidf_dict()

    def __gen_cleaned_sentences(self):
        """Populate the cleaned sentence list and the matching word lists."""
        self.__cleaned_sentences = []
        self.__cleaned_sentences_word_list = []
        for text in self.__dataset:
            if self.__tokenizer is not None:
                # Split the text into sentences first, then clean each one.
                raw_sentences = self.__tokenizer.tokenize(text)
            else:
                # Without a tokenizer the whole text counts as one sentence.
                raw_sentences = [text]
            for sentence in raw_sentences:
                cleaned_sentence = self.__clean_sentence(sentence)
                self.__cleaned_sentences.append(cleaned_sentence)
                self.__cleaned_sentences_word_list.append(cleaned_sentence.split())

    def __clean_sentence(self, sentence):
        """Return ``sentence`` normalized to lower-case, space-separated words.

        Steps: strip HTML, replace non-letters with spaces, lower-case and
        tokenize, optionally stem, optionally drop stop words.
        """
        # 1- Remove HTML tags
        sentence_text = BeautifulSoup(sentence, 'lxml').get_text()

        # 2- Replace every character that is not an ASCII letter with a space
        sentence_text = self._NON_LETTER_RE.sub(' ', sentence_text)

        # 3- Lower-case everything and split into words
        words = nltk.word_tokenize(sentence_text.lower().strip())

        if self.__stemming:
            # 4- Stem the words (the stemmer is built once and cached)
            if self.__stemmer is None:
                self.__stemmer = nltk.stem.snowball.EnglishStemmer()
            stemmer = self.__stemmer
            words = [stemmer.stem(word) for word in words]

        # 5- Remove stop words
        # NOTE(review): filtering happens after stemming, so a stop word whose
        # stem differs from its listed form is not removed - confirm the
        # intended order.
        if self.__remove_stopwords:
            if self.__stopwords is None:
                self.__stopwords = set(nltk.corpus.stopwords.words('english'))
            stops = self.__stopwords
            words = [w for w in words if w not in stops]

        return ' '.join(words)

    def __gen_tfidf_dict(self):
        """Map each vocabulary term to its TF-IDF weight averaged over all sentences."""
        if not self.__cleaned_sentences:
            # Nothing to vectorize; TfidfVectorizer would raise on an empty corpus.
            self.__tfidf_dict = {}
            return

        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(self.__cleaned_sentences)
        # Column-wise mean: one averaged weight per vocabulary term.
        weights = np.asarray(X.mean(axis=0)).ravel().tolist()
        try:
            feature_names = vectorizer.get_feature_names_out()
        except AttributeError:
            # Older scikit-learn releases only provide the legacy accessor.
            feature_names = vectorizer.get_feature_names()
        self.__tfidf_dict = dict(zip(feature_names, weights))
