In [5]:
from sklearn.base import BaseEstimator

**BaseEstimator:** Muss get_feature_names, fit und transform implementieren

In [3]:
class LinguisticVectorizer(BaseEstimator):

    def get_feature_names(self):
        return np.array(['sent_neut', 'sent_pos', 'sent_neg',
                         'nouns', 'adjectives', 'verbs', 'adverbs',
                         'allcaps', 'exclamation', 'question'])

    def fit(self, documents, y=None):
        return self

    def _get_sentiments(self, d):
        # http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
        sent = tuple(nltk.word_tokenize(d))
        if poscache is not None:
            if d in poscache:
                tagged = poscache[d]
            else:
                poscache[d] = tagged = nltk.pos_tag(sent)
        else:
            tagged = nltk.pos_tag(sent)

        pos_vals = []
        neg_vals = []

        nouns = 0.
        adjectives = 0.
        verbs = 0.
        adverbs = 0.

        for w, t in tagged:
            p, n = 0, 0
            sent_pos_type = None
            if t.startswith("NN"):
                sent_pos_type = "n"
                nouns += 1
            elif t.startswith("JJ"):
                sent_pos_type = "a"
                adjectives += 1
            elif t.startswith("VB"):
                sent_pos_type = "v"
                verbs += 1
            elif t.startswith("RB"):
                sent_pos_type = "r"
                adverbs += 1

            if sent_pos_type is not None:
                sent_word = "%s/%s" % (sent_pos_type, w)

                if sent_word in sent_word_net:
                    p, n = sent_word_net[sent_word]

            pos_vals.append(p)
            neg_vals.append(n)

        l = len(sent)
        avg_pos_val = np.mean(pos_vals)
        avg_neg_val = np.mean(neg_vals)

        return [1 - avg_pos_val - avg_neg_val, avg_pos_val, avg_neg_val,
                nouns / l, adjectives / l, verbs / l, adverbs / l]

    def transform(self, documents):
        obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs = np.array(
            [self._get_sentiments(d) for d in documents]).T

        allcaps = []
        exclamation = []
        question = []

        for d in documents:
            allcaps.append(
                np.sum([t.isupper() for t in d.split() if len(t) > 2]))

            exclamation.append(d.count("!"))
            question.append(d.count("?"))

        result = np.array(
            [obj_val, pos_val, neg_val, nouns, adjectives, verbs, adverbs, allcaps,
             exclamation, question]).T

        return result

emo_repl = {
    # positive emoticons
    "&lt;3": " good ",
    ":d": " good ",  # :D in lower case
    ":dd": " good ",  # :DD in lower case
    "8)": " good ",
    ":-)": " good ",
    ":)": " good ",
    ";)": " good ",
    "(-:": " good ",
    "(:": " good ",

    # negative emoticons:
    ":/": " bad ",
    ":&gt;": " sad ",
    ":')": " sad ",
    ":-(": " bad ",
    ":(": " bad ",
    ":S": " bad ",
    ":-S": " bad ",
}

emo_repl_order = [k for (k_len, k) in reversed(
    sorted([(len(k), k) for k in list(emo_repl.keys())]))]

re_repl = {
    r"\br\b": "are",
    r"\bu\b": "you",
    r"\bhaha\b": "ha",
    r"\bhahaha\b": "ha",
    r"\bdon't\b": "do not",
    r"\bdoesn't\b": "does not",
    r"\bdidn't\b": "did not",
    r"\bhasn't\b": "has not",
    r"\bhaven't\b": "have not",
    r"\bhadn't\b": "had not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bcan't\b": "can not",
    r"\bcannot\b": "can not",
}

In [138]:
import nltk # language processing
import math
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import names
from nltk.tag import StanfordNERTagger
import operator

class NamedEntityVectorizer(BaseEstimator):
    
    def __init__(self, entity="PERSON", max_features=None): #max_df=1.0, min_df=1
        #self.max_df = max_df
        #self.min_df = min_df
        self.entity = entity
        self.max_features = max_features
        self.features = []
        
    def get_feature_names(self):
        return np.array(self.features)

    def _stanford_tagger(self, text):
        ttext = word_tokenize(text)
        st_tags = StanfordNERTagger('/opt/nltk_data/stanford/classifiers/english.all.3class.distsim.crf.ser.gz',
                                   '/opt/nltk_data/stanford/stanford-ner.jar', encoding='utf-8')
        tagged = st_tags.tag(ttext)  

        tagList = []
        tempTag = None
        for tag_index, tag in enumerate(tagged):   
            if tag[1] != "O":
                if tempTag is None:
                    tempTag = tag
                elif tempTag[1] == tag[1]:
                    tempTag = (tempTag[0] + " "+ tag[0], tempTag[1])
                else:
                    tagList.append(tempTag)
                    tempTag = tag
            elif tempTag is not None:
                tagList.append(tempTag)
                tempTag = None

        return(tagList)

    def _count_vocab(self, raw_documents, build_doc):
        vocabulary = {}
        doc_counter = []
        counter = {}
        for doc in raw_documents:
            tagged = self._stanford_tagger(doc)
            list_of = [chunk for chunk in tagged if chunk[1] == self.entity]
            list_set_of = list(set(list_of))
            if build_doc:
                doc_dict = {el:0 for el in self.features}
                        
            for feature in list_of:
                try:
                    if build_doc:
                        doc_dict[feature[0]] += 1
                    vocabulary[feature[0]] += 1
                except KeyError:
                    vocabulary[feature[0]] = 1
                    
            for feature in list_set_of:
                try:
                    counter[feature[0]] += 1
                except KeyError:
                    counter[feature[0]] = 1
                    
            if build_doc:        
                doc_counter.append(doc_dict)
                        
        return vocabulary, counter, doc_counter
        
    def fit(self, raw_documents, y=None):
        v, c, d = self._count_vocab(raw_documents, False)
        if self.max_features is None or len(v) > self.max_features:
            max = len(v)
        sorted_v = sorted(v.items(), key=lambda x: x[1], reverse=True)[:self.max_features]
        self.features = [p[0] for p in sorted_v]
        return self
    
    def transform(self, raw_documents):
        v, c, d = self._count_vocab(raw_documents, True)
        result = np.array([list(doc.values()) for doc in d])
        return result

    
nerv = NamedEntityVectorizer(max_features=5)

texts = ["Donald Trump is made by Hillary Trump", "Fabian Retkowski is a person made by Donald Trump.", "Hillary Clinton is not a person, but Hillary Clinton is."]
            
nerv.fit(texts)
print(nerv.get_feature_names())
print(nerv.transform(texts))

['Donald Trump' 'Hillary Clinton' 'Fabian Retkowski']
{'Donald Trump': 0, 'Fabian Retkowski': 0, 'Hillary Clinton': 0}
{'Donald Trump': 0, 'Fabian Retkowski': 0, 'Hillary Clinton': 0}
{'Donald Trump': 0, 'Fabian Retkowski': 0, 'Hillary Clinton': 0}
[[1 0 0]
 [1 1 0]
 [0 0 2]]
