In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from nltk.tokenize import word_tokenize

In [2]:
corpus = [
...     'This is the first document.',
...     'This document is the second document.',
...     'And this is the third one.',
...     'Is this the first document?',
... ]
labels = ["Positive", "Negative", "Negative", "Positive"]

In [31]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(X)
print(vectorizer.get_feature_names())

  (0, 8)	1
  (0, 3)	1
  (0, 6)	1
  (0, 2)	1
  (0, 1)	1
  (1, 8)	1
  (1, 3)	1
  (1, 6)	1
  (1, 1)	2
  (1, 5)	1
  (2, 8)	1
  (2, 3)	1
  (2, 6)	1
  (2, 0)	1
  (2, 7)	1
  (2, 4)	1
  (3, 8)	1
  (3, 3)	1
  (3, 6)	1
  (3, 2)	1
  (3, 1)	1
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [53]:
def type_token_ratio(document, regex=False):
    """
        Input: tokenize document 
        Returns: the type-token-ratio for a document.
    """
    if regex:
        all_tokens = regex_tokeniser(document)
    else:
        print("do not use regex")
        all_tokens = word_tokenize(document)
    num_of_tokens = len(all_tokens)
    unique_tokens = list(set(all_tokens))
    num_of_unique_tokens = len(unique_tokens)
    print("num_of_unique_tokens/num_of_tokens")
    return {"Type-token-ratio": num_of_unique_tokens/num_of_tokens}


In [103]:
class LexicalComplexity(BaseEstimator, TransformerMixin):

    def fit(self, x, y=None):
        return self

    def _get_features(self, doc):
        lexical_score = type_token_ratio(doc)
        print(lexical_score)
        return lexical_score
    

    def transform(self, raw_documents):
        return [self._get_features(doc) for doc in raw_documents]

In [125]:
import pandas as pd 

data = {'text': ["This is a sentence", "this is a sentence."], 'context': ["Context for sent1.", "Context for sent 2 and yeah you know Context."], "labels": ["negative", "positive"] }
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,text,context,labels
0,This is a sentence,Context for sent1.,negative
1,this is a sentence.,Context for sent 2 and yeah you know Context.,positive


In [126]:
lexical_complexity_vec = Pipeline(
    [
        ('feat', LexicalComplexity()), ('vec', DictVectorizer())
    ]
)

count_vectorizer = TfidfVectorizer()
count_vectorizer_2 = CountVectorizer()

In [107]:
from sklearn.compose import ColumnTransformer, make_column_transformer

In [127]:
col_transformer = ColumnTransformer(
                    transformers=[
                        ('vec1', count_vectorizer, 'text'), ('vec2', lexical_complexity_vec, 'context')] ,  remainder='drop',
                    n_jobs=-1,
                    sparse_threshold=0)

In [101]:
col_transformer

ColumnTransformer(n_jobs=-1, remainder='drop', sparse_threshold=0,
                  transformer_weights=None,
                  transformers=[('vec1',
                                 TfidfVectorizer(analyzer='word', binary=False,
                                                 decode_error='strict',
                                                 dtype=<class 'numpy.float64'>,
                                                 encoding='utf-8',
                                                 input='content',
                                                 lowercase=True, max_df=1.0,
                                                 max_features=None, min_df=1,
                                                 ngram_range=(1, 1), norm='l2',
                                                 preprocessor=None,
                                                 smooth_idf=True,
                                                 stop_words=None,
                                                

In [128]:
Xtrain_transformed = col_transformer.fit_transform(df)
Xtrain_transformed

array([[0.57735027, 0.57735027, 0.57735027, 1.        ],
       [0.57735027, 0.57735027, 0.57735027, 0.9       ]])

In [67]:
from sklearn.naive_bayes import MultinomialNB
Y = df['labels'].to_list()
MultinomialNB().fit(Xtrain_transformed, Y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [13]:
#compare with union 

lexical_complexity_vec = Pipeline(
    [
        ('feat', LexicalComplexity()), ('vec', DictVectorizer())
    ]
)



ex = ["This is a sentence", "this is a sentence."]
union = FeatureUnion([("vec1", count_vectorizer), ("vec2", lexical_complexity_vec)])
union.fit_transform(ex)


do not use regex
do not use regex


<2x4 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [86]:
ex = lexical_complexity_vec.fit_transform(df['context'])
ex

do not use regex
num_of_unique_tokens/num_of_tokens
do not use regex
num_of_unique_tokens/num_of_tokens


<2x1 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [33]:
df['labels'].to_list()

['negative', 'positive']

In [66]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(["This is a sentence", "this is a sentence."])
vectorizer.get_feature_names()


['is', 'sentence', 'this']

In [51]:
MultinomialNB().fit(X, [1,0])

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [110]:
lexical_complexity_vec = Pipeline(
    [
        ('feat', LexicalComplexity()), ('vec', DictVectorizer())
    ]
)

In [117]:
c = ["Context for sent1.", "Context for sent 2 and yeah you know heh."]

In [114]:
lexical_complexity_vec.fit(c)

do not use regex
num_of_unique_tokens/num_of_tokens
{'Type-token-ratio': 1.0}
do not use regex
num_of_unique_tokens/num_of_tokens
{'Type-token-ratio': 1.0}


Pipeline(memory=None,
         steps=[('feat', LexicalComplexity()),
                ('vec',
                 DictVectorizer(dtype=<class 'numpy.float64'>, separator='=',
                                sort=True, sparse=True))],
         verbose=False)

In [118]:
type_token_ratio(c[1])

do not use regex
num_of_unique_tokens/num_of_tokens


{'Type-token-ratio': 1.0}

In [121]:
num_of_tokens = len(word_tokenize(c[1]))

print(num_of_tokens)
#unique_tokens = list(set(all_tokens))
#num_of_unique_tokens = len(unique_tokens)

10


['Context', 'for', 'sent', '2', 'and', 'yeah', 'you', 'know', 'heh', '.']

In [123]:
unique_tokens = list(set(word_tokenize(c[1])))

In [124]:
unique_tokens

['sent', 'you', '2', 'heh', 'and', 'yeah', 'for', 'know', '.', 'Context']