In [48]:
from abc import ABC, abstractmethod
import nltk
nltk.download('tagsets')
from sklearn.preprocessing import LabelBinarizer
from nltk import tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\boezi\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [11]:
class TextFeatureExtractor(ABC):
    """Common interface for objects that derive a feature from a piece of text."""

    @abstractmethod
    def __init__(self, text, **kwargs):
        """
        Store the raw text together with any extractor-specific options.

        Parameters:
        - text: raw input the feature is extracted from
        - **kwargs: extractor-specific parameters, passed by keyword
        """

    @abstractmethod
    def extract(self):
        """
        Compute the feature for the text given at construction time.

        Return: an array containing the extracted features
        """

The purpose of feature engineering here is to add further features alongside the word embeddings, so that each word of the document to be summarized carries more useful information.

In [None]:
class TfIDfExtractor(TextFeatureExtractor):
    """
    Extract, for every word of a document, a term-frequency category and an
    inverse-document-frequency weight. The document is split into sentences
    and each sentence is treated as one "document" by the vectorizers.
    """

    # Boundaries between the five term-frequency categories:
    # [0, 0.2) -> 0, [0.2, 0.4) -> 1, [0.4, 0.6) -> 2, [0.6, 0.8) -> 3, >= 0.8 -> 4
    TF_BIN_EDGES = (0.2, 0.4, 0.6, 0.8)

    def __init__(self, text, **kwargs):
        """
        Parameters:
         text: raw document to extract tf idf features from
         **kwargs
          tokenize_fun: optional function splitting one sentence into words;
                        when omitted (or None) the vectorizer's own analyzer is
                        used, so every produced word is part of the vocabulary
        """
        self.text = tokenize.sent_tokenize(text)
        self.vectorizer_tfidf = TfidfVectorizer()
        self.vectorizer_tf = TfidfVectorizer(use_idf=False)
        self.tokenize = kwargs.get('tokenize_fun')

    def extract(self):
        """
        Compute the tf category and the idf weight of every word of the document.

        Words are visited sentence by sentence, in the order produced by the
        tokenizer, so both returned lists are aligned word by word.
        A word the vectorizers did not index (possible only with a custom
        tokenize_fun, e.g. punctuation or one-letter tokens) gets tf category 0
        and idf 0.0 so that the alignment with the token sequence is kept.

        Return:
         (tf_words, idf_words): list of int categories in 0..4 and list of
         float idf weights, one entry per word
        Raises:
         ValueError (from scikit-learn) if the sentences contain no token the
         vectorizer can index
        """
        # sent_tokenize yields no sentence for an empty document and the
        # vectorizers cannot be fitted on an empty corpus
        if not self.text:
            return [], []

        x_tf = self.vectorizer_tf.fit_transform(self.text)
        # word -> column index of the tf matrix
        vocabulary = self.vectorizer_tf.vocabulary_

        # only the fitted idf weights are needed, not the tf-idf matrix itself
        self.vectorizer_tfidf.fit(self.text)
        idf = self.vectorizer_tfidf.idf_
        # word -> position in idf_; looked up through vocabulary_ rather than
        # get_feature_names(), which recent scikit-learn releases removed
        idf_index = self.vectorizer_tfidf.vocabulary_

        # conversion of the continuous tf values into five categories
        # (idf weights are returned as they are)
        x_tf_categorical = self.__convert_categorical(x_tf)

        tokenize_fun = self.tokenize or self.vectorizer_tf.build_analyzer()

        # pair of vectors containing tf category and idf weight for each word
        tf_words = []
        idf_words = []
        for i, sentence in enumerate(self.text):
            for word in tokenize_fun(sentence):
                # the vectorizers lowercase their input by default
                term = word.lower()
                if term in vocabulary and term in idf_index:
                    tf_words.append(int(x_tf_categorical[i, vocabulary[term]]))
                    idf_words.append(float(idf[idf_index[term]]))
                else:
                    tf_words.append(0)
                    idf_words.append(0.0)

        return tf_words, idf_words

    def __convert_categorical(self, x):
        """
        Convert continuous tf values into categorical ones
        Parameters:
         x: matrix (dense or scipy sparse) of continuous, non-negative values
        Return:
         dense float matrix of the same shape holding the category (0..4) of
         each value, according to TF_BIN_EDGES
        """
        # a single dense conversion replaces element-wise access to the sparse matrix
        dense = x.toarray() if hasattr(x, 'toarray') else np.asarray(x)
        # digitize returns, for each value, the number of edges it has reached
        return np.digitize(dense, self.TF_BIN_EDGES).astype(float)



# Short sample transcript used to try out the TF-IDF extractor.
document = """What's up fellas? So I got a patron supported that wanted me to talk about passive aggressive women like this is a category of women. All right, guys all women a passive-aggressive. All right guys, when a woman won't set a date with you or you know tell you will see or she cancels a date."""
# Build the extractor on the sample document and run the extraction.
extractor = TfIDfExtractor(document)
extractor.extract()

        

In [50]:
class POSExtractor(TextFeatureExtractor):
    """Extract a one-hot encoded part-of-speech tag for every token of a document"""

    def __init__(self, text, **kwargs):
        """
        Parameters:
         text: document which extracts POS tags from
         **kwargs
          tag_set: tag set to use for POS tagging (required, KeyError if missing)
          tokenize_fun: function to use for text tokenization
                        (optional, defaults to nltk.word_tokenize)
        """

        self.text = text
        self.postag_set = kwargs['tag_set']
        # one column per tag of the tag set, in the binarizer's sorted class order
        self.binarizer = LabelBinarizer().fit(self.postag_set)
        # fall back to NLTK's word tokenizer when no function (or None) is given
        self.tokenize = kwargs.get('tokenize_fun') or nltk.word_tokenize

    def extract(self):
        """
        Tokenize the text, tag each token and one-hot encode the tags.

        Return:
         matrix with one row per token and one column per tag of tag_set.
         A tag that is not part of tag_set (e.g. punctuation tags) is expected
         to produce an all-zero row - see the LabelBinarizer documentation.
        """
        tokenized_text = list(self.tokenize(self.text))

        # nothing to tag: return an empty matrix with the right number of columns
        if not tokenized_text:
            return np.zeros((0, len(self.binarizer.classes_)), dtype=int)

        # pos_tag returns (token, tag) pairs; only the tags are encoded
        pos_tags = nltk.pos_tag(tokenized_text)
        return self.binarizer.transform([tag for _, tag in pos_tags])
        



# Short sample transcript used to try out the POS extractor.
document = """What's up fellas? So I got a patron supported that wanted me to talk about passive aggressive women like this is a category of women. All right, guys all women a passive-aggressive. All right guys, when a woman won't set a date with you or you know tell you will see or she cancels a date."""

# Tags the one-hot encoding distinguishes between.
# NOTE(review): compared with the full Penn Treebank tag set this list lacks
# 'FW', 'NNPS', 'SYM' and 'WP$' - confirm whether that is intentional.
penn_treebank_tagset = ['CC', 'CD', 'DT', 'EX', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNS', 'PDT', 'POS',
'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WRB']

# The tokenizer is passed explicitly so that construction does not depend on
# a default being available.
extractor = POSExtractor(document, tag_set=penn_treebank_tagset, tokenize_fun=nltk.word_tokenize)

['CC' 'CD' 'DT' 'EX' 'IN' 'JJ' 'JJR' 'JJS' 'LS' 'MD' 'NN' 'NNP' 'NNS'
 'PDT' 'POS' 'PRP' 'PRP$' 'RB' 'RBR' 'RBS' 'RP' 'TO' 'UH' 'VB' 'VBD' 'VBG'
 'VBN' 'VBP' 'VBZ' 'WDT' 'WP' 'WRB']
