## Feature Extraction
- One-hot Encoding
- Bag of Words
- Ngram
- TF-IDF
- Word2Vec

#### One-hot Encoding

In [18]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

In [19]:
ohe = OneHotEncoder(handle_unknown = 'ignore')

In [25]:
text = ['This is the first document.','This document is the second document.','And this is the third one.',
        'Is this the first document?']
words = [word_tokenize(sent) for sent in text]   # Important Step!!!
print(words)

[['This', 'is', 'the', 'first', 'document', '.'], ['This', 'document', 'is', 'the', 'second', 'document', '.'], ['And', 'this', 'is', 'the', 'third', 'one', '.'], ['Is', 'this', 'the', 'first', 'document', '?']]


In [44]:
tokens = []
for i in range(len(words)):
    tokens = tokens + words[i]
tokens = list(sorted(set(tokens)))            # only tokens (full tokens), set tokens(unique tokens)
print(tokens)

['.', '?', 'And', 'Is', 'This', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [28]:
vo = [[w] for w in tokens]
print(vo)

[['.'], ['?'], ['And'], ['Is'], ['This'], ['document'], ['first'], ['is'], ['one'], ['second'], ['the'], ['third'], ['this']]


In [31]:
X = ohe.fit_transform(vo).toarray() # one hot Encoding
X

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [35]:
ohe.inverse_transform([[1,0,0,0,0,0,0,0,0,0,0,0,0]])

array([['.']], dtype=object)

In [38]:
ohe.transform([['sachin']]).toarray()

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [40]:
ohe.transform([['this']]).toarray()

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

### Bag of words

In [45]:
text

['This is the first document.',
 'This document is the second document.',
 'And this is the third one.',
 'Is this the first document?']

In [49]:
# word level
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(text).toarray()
print(X)

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [50]:
vocab = vectorizer.get_feature_names()
print(vocab)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [52]:
new_word = vectorizer.transform(['This can be the first document']).toarray()
print(new_word)

[[0 1 1 0 0 0 1 0 1]]


In [54]:
# Char Level
vectorizer = CountVectorizer(analyzer = 'char') # to char lever ('a','b'etc)
X = vectorizer.fit_transform(text).toarray()
X

array([[4, 1, 0, 0, 1, 1, 2, 1, 2, 3, 1, 1, 1, 1, 3, 4, 1],
       [5, 1, 0, 0, 3, 3, 4, 0, 2, 2, 2, 3, 3, 0, 3, 4, 2],
       [5, 1, 0, 1, 0, 2, 2, 0, 3, 3, 0, 2, 1, 1, 2, 3, 0],
       [4, 0, 1, 0, 1, 1, 2, 1, 2, 3, 1, 1, 1, 1, 3, 4, 1]], dtype=int64)

In [57]:
vocab = vectorizer.get_feature_names()
list(vocab)

[' ',
 '.',
 '?',
 'a',
 'c',
 'd',
 'e',
 'f',
 'h',
 'i',
 'm',
 'n',
 'o',
 'r',
 's',
 't',
 'u']

In [59]:
new_word = vectorizer.transform(['This can be my first document']).toarray()
new_word

array([[5, 0, 0, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, 1, 2, 3, 1]], dtype=int64)

### N-gram

In [60]:
# Word level
vectorizer = CountVectorizer(ngram_range = (1,2))
X = vectorizer.fit_transform(text).toarray()
X

array([[0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0],
       [0, 0, 2, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0],
       [1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0],
       [0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1]],
      dtype=int64)

In [61]:
vocab = vectorizer.get_feature_names()
vocab

['and',
 'and this',
 'document',
 'document is',
 'first',
 'first document',
 'is',
 'is the',
 'is this',
 'one',
 'second',
 'second document',
 'the',
 'the first',
 'the second',
 'the third',
 'third',
 'third one',
 'this',
 'this document',
 'this is',
 'this the']

In [62]:
# char level
vectorizer = CountVectorizer(ngram_range = (1,2), analyzer = 'char')
X = vectorizer.fit_transform(text).toarray()
X

array([[4, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 2, 1, 0, 0, 1,
        1, 1, 2, 1, 1, 3, 1, 2, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 3, 2,
        0, 1, 4, 1, 1, 0, 2, 1, 1],
       [5, 2, 0, 1, 0, 1, 1, 1, 0, 0, 0, 3, 1, 2, 3, 1, 2, 4, 1, 0, 1, 2,
        0, 0, 2, 1, 1, 2, 0, 2, 2, 2, 3, 1, 0, 2, 3, 2, 1, 0, 0, 0, 3, 2,
        1, 0, 4, 1, 1, 0, 2, 2, 2],
       [5, 0, 0, 1, 1, 0, 3, 1, 0, 1, 1, 0, 0, 0, 2, 2, 0, 2, 1, 1, 0, 0,
        0, 0, 3, 1, 2, 3, 1, 2, 0, 0, 2, 1, 1, 0, 1, 0, 1, 1, 1, 0, 2, 2,
        0, 0, 3, 0, 0, 0, 3, 0, 0],
       [4, 1, 1, 0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 2, 1, 0, 0, 1,
        1, 1, 2, 1, 1, 3, 1, 2, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 3, 2,
        0, 1, 4, 1, 0, 1, 2, 1, 1]], dtype=int64)

In [63]:
vocab = vectorizer.get_feature_names()
vocab

[' ',
 ' d',
 ' f',
 ' i',
 ' o',
 ' s',
 ' t',
 '.',
 '?',
 'a',
 'an',
 'c',
 'co',
 'cu',
 'd',
 'd ',
 'do',
 'e',
 'e ',
 'e.',
 'ec',
 'en',
 'f',
 'fi',
 'h',
 'he',
 'hi',
 'i',
 'ir',
 'is',
 'm',
 'me',
 'n',
 'nd',
 'ne',
 'nt',
 'o',
 'oc',
 'on',
 'r',
 'rd',
 'rs',
 's',
 's ',
 'se',
 'st',
 't',
 't ',
 't.',
 't?',
 'th',
 'u',
 'um']

In [65]:
# TF-IDF
# Wordlevel
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text).toarray()
X

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

In [66]:
vocab = vectorizer.get_feature_names()
vocab

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

In [68]:
new_word = vectorizer.transform(['This can be my first document']).toarray()
new_word

array([[0.        , 0.55953044, 0.69113141, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.4574528 ]])

In [69]:
# Char level
vectorizer = TfidfVectorizer(analyzer = 'char')
X = vectorizer.fit_transform(text).toarray()
X

array([[0.47550697, 0.14540332, 0.        , 0.        , 0.14540332,
        0.11887674, 0.23775349, 0.17960203, 0.23775349, 0.35663023,
        0.14540332, 0.11887674, 0.11887674, 0.14540332, 0.35663023,
        0.47550697, 0.14540332],
       [0.44206359, 0.10814145, 0.        , 0.        , 0.32442434,
        0.26523816, 0.35365088, 0.        , 0.17682544, 0.17682544,
        0.21628289, 0.26523816, 0.26523816, 0.        , 0.26523816,
        0.35365088, 0.21628289],
       [0.57481012, 0.14061506, 0.        , 0.22030066, 0.        ,
        0.22992405, 0.22992405, 0.        , 0.34488607, 0.34488607,
        0.        , 0.22992405, 0.11496202, 0.14061506, 0.22992405,
        0.34488607, 0.        ],
       [0.46836004, 0.        , 0.2243785 , 0.        , 0.14321789,
        0.11709001, 0.23418002, 0.17690259, 0.23418002, 0.35127003,
        0.14321789, 0.11709001, 0.11709001, 0.14321789, 0.35127003,
        0.46836004, 0.14321789]])

In [70]:
vocab = vectorizer.get_feature_names()
vocab

[' ',
 '.',
 '?',
 'a',
 'c',
 'd',
 'e',
 'f',
 'h',
 'i',
 'm',
 'n',
 'o',
 'r',
 's',
 't',
 'u']

In [72]:
# char level
vectorizer = TfidfVectorizer(analyzer = 'char', ngram_range = (2,2))
X = vectorizer.fit_transform(text).toarray()
X

array([[0.18466915, 0.22810314, 0.18466915, 0.        , 0.        ,
        0.15097913, 0.        , 0.        , 0.18466915, 0.        ,
        0.18466915, 0.15097913, 0.        , 0.        , 0.18466915,
        0.22810314, 0.15097913, 0.15097913, 0.18466915, 0.30195826,
        0.18466915, 0.        , 0.        , 0.18466915, 0.18466915,
        0.        , 0.        , 0.22810314, 0.30195826, 0.        ,
        0.22810314, 0.18466915, 0.22810314, 0.        , 0.30195826,
        0.18466915],
       [0.25688446, 0.        , 0.12844223, 0.        , 0.20122957,
        0.10500994, 0.        , 0.20122957, 0.25688446, 0.1586517 ,
        0.25688446, 0.10500994, 0.        , 0.20122957, 0.25688446,
        0.        , 0.10500994, 0.10500994, 0.        , 0.21001987,
        0.25688446, 0.1586517 , 0.        , 0.25688446, 0.25688446,
        0.1586517 , 0.        , 0.        , 0.21001987, 0.20122957,
        0.        , 0.12844223, 0.1586517 , 0.        , 0.21001987,
        0.25688446],
      

In [None]:
# Char level
vectorizer = TfidfVectorizer(ngram_range=(2,2))
X = vectorizer.fit_transform(text).toarray()
X

## Word2Vec

In [6]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from gensim.models import Phrases
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import FastText



In [7]:
data = ['This is the first document.','This is the second second document.',
        'And the third one.',
        'Is this the first document?']

In [9]:
tokens = [word_tokenize(text) for text in data]
print(tokens)

[['This', 'is', 'the', 'first', 'document', '.'], ['This', 'is', 'the', 'second', 'second', 'document', '.'], ['And', 'the', 'third', 'one', '.'], ['Is', 'this', 'the', 'first', 'document', '?']]


In [15]:
model = Word2Vec(tokens, size=5,window=5,min_count=1,workers=4)

In [19]:
model.wv.vocab

{'This': <gensim.models.keyedvectors.Vocab at 0x18031ac8080>,
 'is': <gensim.models.keyedvectors.Vocab at 0x18031ac80f0>,
 'the': <gensim.models.keyedvectors.Vocab at 0x18031ac80b8>,
 'first': <gensim.models.keyedvectors.Vocab at 0x18031ac8128>,
 'document': <gensim.models.keyedvectors.Vocab at 0x18031ac8160>,
 '.': <gensim.models.keyedvectors.Vocab at 0x18031ac8198>,
 'second': <gensim.models.keyedvectors.Vocab at 0x18031ac81d0>,
 'And': <gensim.models.keyedvectors.Vocab at 0x18031ac8208>,
 'third': <gensim.models.keyedvectors.Vocab at 0x18031ac8240>,
 'one': <gensim.models.keyedvectors.Vocab at 0x18031ac8278>,
 'Is': <gensim.models.keyedvectors.Vocab at 0x18031ac82b0>,
 'this': <gensim.models.keyedvectors.Vocab at 0x18031ac82e8>,
 '?': <gensim.models.keyedvectors.Vocab at 0x18031ac8320>}

In [22]:
model.wv['This']

array([-0.05589332,  0.02646712,  0.01573865, -0.09324984, -0.00161548],
      dtype=float32)

In [23]:
model['This']

  """Entry point for launching an IPython kernel.


array([-0.05589332,  0.02646712,  0.01573865, -0.09324984, -0.00161548],
      dtype=float32)

In [24]:
model.save('Word2Vec.model')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [26]:
model = Word2Vec.load('Word2Vec.model')

In [27]:
model.wv.similar_by_vector(model.wv['This'])

[('This', 1.0),
 ('this', 0.5043507218360901),
 ('one', 0.4586126506328583),
 ('Is', 0.26038751006126404),
 ('?', 0.12596315145492554),
 ('the', -0.04200264811515808),
 ('document', -0.1248597502708435),
 ('first', -0.1417647898197174),
 ('third', -0.29639196395874023),
 ('is', -0.37558913230895996)]

In [28]:
model.wv.similar_by_word('This')

[('this', 0.5043507218360901),
 ('one', 0.4586126208305359),
 ('Is', 0.26038751006126404),
 ('?', 0.12596315145492554),
 ('the', -0.04200264811515808),
 ('document', -0.1248597651720047),
 ('first', -0.1417647749185562),
 ('third', -0.29639193415641785),
 ('is', -0.37558913230895996),
 ('And', -0.41193658113479614)]

In [30]:
bigram_transformer = Phrases(tokens)
print(bigram_transformer.vocab)

defaultdict(<class 'int'>, {b'This': 2, b'is': 2, b'This_is': 2, b'the': 4, b'is_the': 2, b'first': 2, b'the_first': 2, b'document': 3, b'first_document': 2, b'.': 3, b'document_.': 2, b'second': 2, b'the_second': 1, b'second_second': 1, b'second_document': 1, b'And': 1, b'And_the': 1, b'third': 1, b'the_third': 1, b'one': 1, b'third_one': 1, b'one_.': 1, b'Is': 1, b'this': 1, b'Is_this': 1, b'this_the': 1, b'?': 1, b'document_?': 1})


In [31]:
model = Word2Vec(bigram_transformer[tokens], min_count=1)



In [41]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokens)]
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

In [42]:
model.infer_vector(["this is"])

array([ 0.02028376,  0.07223367, -0.0845509 ,  0.04373696, -0.09653375],
      dtype=float32)

In [43]:
model.infer_vector(["this"])

array([ 0.04542537, -0.06584745,  0.00271636, -0.00229087,  0.07842097],
      dtype=float32)

In [44]:
model.wv.vocab

{'This': <gensim.models.keyedvectors.Vocab at 0x18031b46828>,
 'is': <gensim.models.keyedvectors.Vocab at 0x18031b46860>,
 'the': <gensim.models.keyedvectors.Vocab at 0x18031b46898>,
 'first': <gensim.models.keyedvectors.Vocab at 0x18031b468d0>,
 'document': <gensim.models.keyedvectors.Vocab at 0x18031b46908>,
 '.': <gensim.models.keyedvectors.Vocab at 0x18031b46940>,
 'second': <gensim.models.keyedvectors.Vocab at 0x18031b46978>,
 'And': <gensim.models.keyedvectors.Vocab at 0x18031b469b0>,
 'third': <gensim.models.keyedvectors.Vocab at 0x18031b469e8>,
 'one': <gensim.models.keyedvectors.Vocab at 0x18031b46a20>,
 'Is': <gensim.models.keyedvectors.Vocab at 0x18031b46a58>,
 'this': <gensim.models.keyedvectors.Vocab at 0x18031b46a90>,
 '?': <gensim.models.keyedvectors.Vocab at 0x18031b46ac8>}

In [48]:
model = FastText(size=4, window=3, min_count=1) 
model.build_vocab(sentences=tokens)
model.train(sentences=tokens, total_examples=len(tokens), epochs=10)

In [49]:
model.wv["This"]

array([-0.02406601,  0.02835182,  0.01417441,  0.0065731 ], dtype=float32)