In [1]:
from abc import ABC, abstractmethod
import nltk
from sklearn.preprocessing import LabelBinarizer
from nltk import tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
class TextFeatureExtractor(ABC):
    """Abstract class to describe a generic extraction of a feature starting from some text"""

    @abstractmethod
    def __init__(self, text, **kwargs):
        """
        Initialize the extractor with the raw text and other parameters which depend on the extractor
        Parameters:
        - text: raw input to extract feature from
        - **kwargs: dictionary containing specific extractor parameters
        """
        pass

    @abstractmethod
    def extract(self):
        """
        Extract the feature from the input text
        Return: an array containing the extracted features
        """
        pass

The purpose of the feature engineering is trying to add other features in addition to the word embeddings to provide more useful information for each word of the document to summarize, these features will be concatenated to the word embeddings of each word

For the extraction of Tf and Idf the tokenization process is the same of the keras Tokenizer and consists of lowercasing and punctuation stripping unless differently specified

In [27]:
class TfIDfExtractor(TextFeatureExtractor):

    def __init__(self, text, **kwargs):
        """
        Parameters:
         text: raw document to extract features from
         **kwargs:
          tokenize_fun: function to use for tokenization
        """
        self.text = tokenize.sent_tokenize(text) # each sentence is considered as a document
        self.tokenizer = Tokenizer()
        self.vectorizer_tfidf = TfidfVectorizer()
        self.vectorizer_tf = TfidfVectorizer(use_idf=False)
        

    def extract(self):
        """
        Extraction of Tf and Idf features
        Returns: array containing for each word the category of tf and idf
        """
        x_tf = self.vectorizer_tf.fit_transform(self.text)
        # get word -> index vocabulary
        vocabulary = self.vectorizer_tf.vocabulary_

        print(sorted(vocabulary.items()))

        # using the same vocabulary
        self.tokenizer.word_index = vocabulary
        self.tokenizer.index_word = {idx:w for w,idx in self.tokenizer.word_index.items()}

        # idf computation
        self.vectorizer_tfidf.fit_transform(self.text)
        x_idf = self.vectorizer_tfidf.idf_
        dict_idf = dict(zip(self.vectorizer_tfidf.get_feature_names(), x_idf))

        # conversion of continuous values of tf and idf into categorical ones using five categories
        x_tf_categorical = self.__convert_categorical(x_tf)

        # generation of pair of vectors containing tf idf category for each word in the document
        tf_words = []
        idf_words = []
        
        for i,sentence in enumerate(self.text):
            print(i, sentence)
            for word in self.__tokenize_sentence(sentence):
                idf_words.append(dict_idf[word])
                tf_words.append(x_tf_categorical[i, vocabulary[word]])

        print(tf_words)

        tf = np.array(tf_words).T
        idf = np.array(idf_words).T
        return np.concatenate((tf, idf), axis=0)
                
    def __convert_categorical(self, x):
        """
        Convert continuous tf and idf value into categorical one
        Parameters:
         x: matrix of continuous values to convert
        Return:
         converted categorical matrix
        """
        shape = x.shape
        x_categorical = np.empty(shape)

        # term frequency conversion
        for i in range(shape[0]):
            for j in range(shape[1]):
                if x[i,j] >= 0 and x[i,j] < 0.2:
                    x_categorical[i,j] = 0
                elif x[i,j] >= 0.2 and x[i,j] < 0.4:
                    x_categorical[i,j] = 1
                elif x[i,j] >= 0.4 and x[i,j] < 0.6:
                    x_categorical[i,j] = 2
                elif x[i,j] >= 0.6 and x[i,j] < 0.8:
                    x_categorical[i,j] = 3
                else:
                    x_categorical[i,j] = 4
        return x_categorical


    def __tokenize_sentence(self, sentence):
        """
        Return a list of string tokens starting from a sentence
        Parameters:
         sentence: sentence to tokenize
        Return:
         list of tokens
        """

        sequences = self.tokenizer.texts_to_sequences([sentence])
        print(sequences)
        sequence = sequences[0]
        tokenized_sentence = [self.tokenizer.index_word[i] for i in sequence]
        return tokenized_sentence



document = """What's up fellas? So I got a patron supported that wanted me to talk about passive aggressive women like this is a category of women. All right, guys all women a passive-aggressive. All right guys, when a woman won't set a date with you or you know tell you will see or she cancels a date."""
extractor = TfIDfExtractor(document)
extractor.extract()

        

[('about', 0), ('aggressive', 1), ('all', 2), ('cancels', 3), ('category', 4), ('date', 5), ('fellas', 6), ('got', 7), ('guys', 8), ('is', 9), ('know', 10), ('like', 11), ('me', 12), ('of', 13), ('or', 14), ('passive', 15), ('patron', 16), ('right', 17), ('see', 18), ('set', 19), ('she', 20), ('so', 21), ('supported', 22), ('talk', 23), ('tell', 24), ('that', 25), ('this', 26), ('to', 27), ('up', 28), ('wanted', 29), ('what', 30), ('when', 31), ('will', 32), ('with', 33), ('woman', 34), ('women', 35), ('won', 36), ('you', 37)]
0 What's up fellas?
[[28, 6]]
up
fellas
1 So I got a patron supported that wanted me to talk about passive aggressive women like this is a category of women.
[[21, 7, 16, 22, 25, 29, 12, 27, 23, 0, 15, 1, 35, 11, 26, 9, 4, 13, 35]]
so
got
patron
supported
that
wanted
me
to
talk
about
passive
aggressive
women
like
this
is
category
of
women
2 All right, guys all women a passive-aggressive.
[[2, 17, 8, 2, 35, 15, 1]]
all
right
guys
all
women
passive
aggressive
3 All

array([2.        , 2.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 2.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       2.        , 3.        , 1.        , 1.        , 3.        ,
       1.        , 1.        , 1.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 1.        ,
       0.        , 2.        , 1.        , 2.        , 0.        ,
       0.        , 2.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 1.        , 1.91629073, 1.91629073,
       1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073,
       1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073,
       1.51082562, 1.51082562, 1.51082562, 1.91629073, 1.91629073,
       1.91629073, 1.91629073, 1.91629073, 1.51082562, 1.51082562,
       1.51082562, 1.51082562, 1.51082562, 1.51082562, 1.51082

In [51]:
class POSExtractor(TextFeatureExtractor):
    
    def __init__(self, text, **kwargs):
        """
        Parameters:
         text: document which extracts POS tags from
         **kwargs
          tokenize_fun: function to use for text tokenization
        """

        self.text = text
        self.postag_set = ['CC', 'CD', 'DT', 'EX', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNS', 'PDT', 'POS',
        'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WRB']
        self.binarizer = LabelBinarizer().fit(self.postag_set)
        self.tokenize = kwargs['tokenize_fun']

    def extract(self):
        
        tokenized_text = self.tokenize(self.text)
        pos_tags = nltk.pos_tag(tokenized_text)
        tags = [pair[1] for pair in pos_tags]
        return self.binarizer.transform(tags)


document = """What's up fellas? So I got a patron supported that wanted me to talk about passive aggressive women like this is a category of women. All right, guys all women a passive-aggressive. All right guys, when a woman won't set a date with you or you know tell you will see or she cancels a date."""

penn_treebank_tagset = ['CC', 'CD', 'DT', 'EX', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNS', 'PDT', 'POS',
'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WRB']

extractor = POSExtractor(document)

KeyError: 'tokenize_fun'