In [84]:
from abc import ABC, abstractmethod
import nltk
from sklearn.preprocessing import LabelBinarizer
from nltk import tokenize
nltk.download('averaged_perceptron_tagger')
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\boezi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [85]:
class TextFeatureExtractor(ABC):
    """Abstract base class for objects that extract a feature from some text"""

    @abstractmethod
    def __init__(self, text, **kwargs):
        """
        Set up the extractor from the raw text and its extractor-specific options
        Parameters:
        - text: raw input the feature is extracted from
        - **kwargs: keyword parameters whose meaning depends on the concrete extractor
        """

    @abstractmethod
    def extract(self):
        """
        Compute the feature for the text given to the constructor
        Return: an array containing the extracted features
        """

The purpose of the feature engineering is to add further features alongside the word embeddings, so that more useful information is available for each word of the document to summarize. These features will be concatenated to the word embedding of each word.

For the extraction of Tf and Idf, the default tokenization process of the `TfidfVectorizer` is not the same as that of the Keras `Tokenizer`; therefore the `TfidfVectorizer` uses a tokenization function provided as an input parameter.

In [86]:
def generate_sentence_tokenizer(corpus):
    """
    Returns a function which tokenizes a sentence according to a vocabulary built on corpus
    Parameters:
     corpus: list of documents; a single document passed as a plain string is
             also accepted and is treated as a corpus made of that one document
    Returns:
     function to compute sentence tokenization
    """
    # fit_on_texts iterates over its argument: a bare string would be consumed
    # one character at a time, so the vocabulary would contain single characters
    # instead of words. Wrap it so that the whole string counts as one document.
    if isinstance(corpus, str):
        corpus = [corpus]

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)

    def tokenize_sentence(sentence):
        """
        Returns a list of string token given a sentence
        The sentence is first converted to vocabulary indices by the fitted
        tokenizer, then every index is mapped back to its word
        Parameters:
         sentence: sentence to tokenize
        Returns:
         list of string tokens
        """
        # texts_to_sequences works on a batch: wrap the sentence, unwrap the result
        sequence = tokenizer.texts_to_sequences([sentence])[0]
        return [tokenizer.index_word[i] for i in sequence]

    return tokenize_sentence

In [87]:
class TfIDfExtractor(TextFeatureExtractor):
    """Extractor which gives, for every token of a document, its tf category and its idf value"""

    # lower bounds of the tf categories 1, 2, 3 and 4 (category 0 is below the first bound)
    _TF_THRESHOLDS = (0.2, 0.4, 0.6, 0.8)

    def __init__(self, text, **kwargs):
        """
        Parameters:
         text: raw document to extract features from
         **kwargs:
          tokenize_fun: function to use for tokenization
        """
        self.text = tokenize.sent_tokenize(text) # each sentence is considered as a document
        self.tokenizer = kwargs['tokenize_fun']
        self.vectorizer_tfidf = TfidfVectorizer(tokenizer=self.tokenizer)
        self.vectorizer_tf = TfidfVectorizer(tokenizer=self.tokenizer, use_idf=False)

    def extract(self):
        """
        Extraction of Tf and Idf features
        Returns:
         array with one row per token of the document (sentence by sentence, in
         tokenization order) and two columns: the tf category (0 to 4) of the token
         inside its sentence and the idf value of the token (continuous, not
         converted to a category). An array of shape (0, 2) is returned when the
         document does not contain any token
        """
        # Tokens are produced by the vectorizer's own analyzer (its preprocessing
        # followed by tokenize_fun), so each of them is guaranteed to be a key of
        # the vocabulary built by fit, even when tokenize_fun does not lowercase.
        analyzer = self.vectorizer_tf.build_analyzer()
        tokenized_sentences = [analyzer(sentence) for sentence in self.text]

        # the vectorizers cannot be fitted without at least one token
        if not any(len(words) > 0 for words in tokenized_sentences):
            return np.empty((0, 2))

        x_tf = self.vectorizer_tf.fit_transform(self.text)
        # get word -> index vocabulary
        vocabulary = self.vectorizer_tf.vocabulary_

        # idf computation: idf_ is indexed through the vocabulary of its own
        # vectorizer, which avoids get_feature_names() (removed from recent
        # scikit-learn releases)
        self.vectorizer_tfidf.fit(self.text)
        x_idf = self.vectorizer_tfidf.idf_
        vocabulary_idf = self.vectorizer_tfidf.vocabulary_

        # conversion of continuous values of tf into categorical ones using five categories
        x_tf_categorical = self.__convert_categorical(x_tf)

        # generation of pair of vectors containing tf category and idf for each word in the document
        tf_words = []
        idf_words = []

        for i, words in enumerate(tokenized_sentences):
            for word in words:
                idf_words.append(x_idf[vocabulary_idf[word]])
                tf_words.append(x_tf_categorical[i, vocabulary[word]])

        tf = np.reshape(tf_words, (len(tf_words),1))
        idf = np.reshape(idf_words, (len(idf_words),1))
        return np.concatenate((tf, idf), axis=1)

    def __convert_categorical(self, x):
        """
        Convert continuous tf values into categorical ones
        The categories are 0 for [0, 0.2), 1 for [0.2, 0.4), 2 for [0.4, 0.6),
        3 for [0.6, 0.8) and 4 for everything else
        Parameters:
         x: matrix of continuous values to convert (sparse or dense)
        Return:
         converted categorical matrix, dense, same shape as x
        """
        dense = x.toarray() if hasattr(x, 'toarray') else np.asarray(x)

        # every threshold reached moves the value one category up
        x_categorical = np.zeros(dense.shape)
        for threshold in self._TF_THRESHOLDS:
            x_categorical += dense >= threshold

        # values which do not belong to any range (negative or NaN) fall in the last category
        x_categorical[~(dense >= 0)] = 4
        return x_categorical

# Use case

document = """What's up fellas? So I got a patron supported that wanted me to talk about passive aggressive women like this is a category of women. All right, guys all women a passive-aggressive. All right guys, when a woman won't set a date with you or you know tell you will see or she cancels a date."""
# the corpus must be a list of documents: a bare string would be read one
# character at a time when the vocabulary is built
tokenize_sentence = generate_sentence_tokenizer([document])

extractor = TfIDfExtractor(document, tokenize_fun=tokenize_sentence)
extractor.extract()

        

array([[2.        , 1.91629073],
       [2.        , 1.91629073],
       [2.        , 1.91629073],
       [0.        , 1.91629073],
       [0.        , 1.91629073],
       [0.        , 1.91629073],
       [1.        , 1.22314355],
       [0.        , 1.91629073],
       [0.        , 1.91629073],
       [0.        , 1.91629073],
       [0.        , 1.91629073],
       [0.        , 1.91629073],
       [0.        , 1.91629073],
       [0.        , 1.91629073],
       [0.        , 1.91629073],
       [0.        , 1.51082562],
       [0.        , 1.51082562],
       [1.        , 1.51082562],
       [0.        , 1.91629073],
       [0.        , 1.91629073],
       [0.        , 1.91629073],
       [1.        , 1.22314355],
       [0.        , 1.91629073],
       [0.        , 1.91629073],
       [1.        , 1.51082562],
       [3.        , 1.51082562],
       [1.        , 1.51082562],
       [1.        , 1.51082562],
       [3.        , 1.51082562],
       [1.        , 1.51082562],
       [1.

In [88]:
class POSExtractor(TextFeatureExtractor):
    """Extractor which encodes the part-of-speech tag of every token of a document"""

    def __init__(self, text, **kwargs):
        """
        Parameters:
         text: document which extracts POS tags from
         **kwargs
          tokenize_fun: function to use for text tokenization
        """

        self.text = text
        # tag set described calling nltk.help.upenn_tagset()
        # NOTE(review): tags produced by the tagger but missing from this list are
        # presumably encoded as all-zero rows by the binarizer - confirm; adding
        # tags here changes the number of columns returned by extract()
        self.postag_set = ['CC', 'CD', 'DT', 'EX', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNS', 'PDT', 'POS',
        'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WRB']
        self.binarizer = LabelBinarizer().fit(self.postag_set)
        self.tokenize = kwargs['tokenize_fun']
        # same attribute name used by the other extractors
        self.tokenizer = self.tokenize

    def extract(self):
        """
        Extraction of the POS tag of each token
        Returns:
         array with one row per token returned by tokenize_fun and one column per
         tag of postag_set, as produced by the binarizer fitted on postag_set.
         An array with zero rows is returned when the text has no token
        """
        tokenized_text = self.tokenize(self.text)

        # the binarizer refuses an empty list of labels
        if len(tokenized_text) == 0:
            return np.zeros((0, len(self.binarizer.classes_)), dtype=int)

        pos_tags = nltk.pos_tag(tokenized_text)
        # pos_tag returns (token, tag) pairs: only the tag is encoded
        tags = [tag for _, tag in pos_tags]
        return self.binarizer.transform(tags)


document = """What's up fellas? So I got a patron supported that wanted me to talk about passive aggressive women like this is a category of women. All right, guys all women a passive-aggressive. All right guys, when a woman won't set a date with you or you know tell you will see or she cancels a date."""
# build the tokenizer in this cell, on a corpus made of this document, instead of
# relying on the one left in the namespace by a previous cell
tokenize_sentence = generate_sentence_tokenizer([document])

extractor = POSExtractor(document, tokenize_fun=tokenize_sentence)
extractor.extract()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [89]:
class NERExtractor(TextFeatureExtractor):
    """Extractor which encodes the named-entity type of every token of a document"""

    # pretrained spaCy pipeline, loaded on first use and shared by all the instances
    _nlp = None
    # labels which the model is expected to emit under a different name than the
    # one listed in `types`
    _LABEL_ALIASES = {'FAC': 'FACILITY'}

    def __init__(self, text, **kwargs):
        """
        Parameters:
         text: document which extracts entity types from
         **kwargs
          tokenize_fun: function to use for text tokenization
        """
        self.text = text
        self.tokenizer = kwargs['tokenize_fun']
        # types are not directly accessible, found at https://github.com/explosion/spaCy/blob/master/spacy/symbols.pyx
        # NOTE(review): a label emitted by the model but missing here (LAW looks
        # like one - confirm against the model's label scheme) is presumably
        # encoded as an all-zero row; adding it changes the number of columns
        self.types = ['PERSON', 'NORP', 'FACILITY', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART',
        'LANGUAGE', 'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL']
        self.types.append('NOTHING') # custom type for tokens which do not correspond to any entity
        self.binarizer = LabelBinarizer().fit(self.types)

    @classmethod
    def _load_model(cls):
        """Return the pretrained spaCy pipeline, loading it only the first time it is needed"""
        if NERExtractor._nlp is None:
            NERExtractor._nlp = spacy.load("en_core_web_sm")
        return NERExtractor._nlp

    def extract(self):
        """
        Extraction of the entity type of each token
        Every spaCy token is re-tokenized with tokenize_fun and its entity type
        is repeated once per resulting token, so that the rows follow the
        tokenization of tokenize_fun (a spaCy token which yields no token adds
        no row)
        Returns:
         array with one row per token and one column per element of `types`, as
         produced by the binarizer fitted on `types`. An array with zero rows is
         returned when no token is produced
        """
        doc = self._load_model()(self.text)
        ner_types = []
        for token in doc:
            num = len(self.tokenizer(token.text))

            if token.ent_iob_ == 'O':
                # token outside of any entity
                label = 'NOTHING'
            else:
                label = self._LABEL_ALIASES.get(token.ent_type_, token.ent_type_)
            ner_types.extend([label] * num)

        # the binarizer refuses an empty list of labels
        if not ner_types:
            return np.zeros((0, len(self.binarizer.classes_)), dtype=int)

        return self.binarizer.transform(ner_types)


document = "Apple is looking at buying U.K. startup for $1 billion Apple"
# the corpus must be a list of documents: a bare string would be read one
# character at a time when the vocabulary is built
tokenize_sentence = generate_sentence_tokenizer([document])

extractor = NERExtractor(document, tokenize_fun=tokenize_sentence)
extractor.extract()
        

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])