In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Toy corpus: four short documents reused by every example in this notebook.
corpus = [
    'Text of first document.',
    'Text of the second document made longer.',
    'Number three.',
    'This is number four.',
]

# Bag-of-words vectorizer with scikit-learn's default settings.
vec = CountVectorizer()

In [4]:
# Learn the vocabulary from `corpus` and build the document-term count matrix in one step.
X = vec.fit_transform(corpus)

In [6]:
# Vocabulary terms, in the column order of the count matrix.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns a NumPy array, so
# `.tolist()` keeps the displayed result a plain Python list.
vec.get_feature_names_out().tolist()

['document',
 'first',
 'four',
 'is',
 'longer',
 'made',
 'number',
 'of',
 'second',
 'text',
 'the',
 'this',
 'three']

In [8]:
# NOTE: a cell only displays its last expression, so this dense view of X is built but not shown.
X.toarray()
# Count how often each vocabulary word occurs in the unseen text 'A new document'.
# Words that are not in the fitted vocabulary contribute nothing to the vector.
vec.transform(['A new document']).toarray()

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

### tf-idf
tf(term, document) = number of times the term occurs in the document / total number of words in the document

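idf(term) = log( total number of documents / number of documents in which the term occurs ). Rare terms get a high idf, while a term that occurs in every document gets 0. The base of the logarithm only rescales the values; scikit-learn's TfidfTransformer uses the natural log and adds 1 to the result.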

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer

In [18]:
# smooth_idf=False: do not add one to the document frequencies when computing idf.
tfidfTrans = TfidfTransformer(smooth_idf=False)

In [22]:
# Dimensions of the count matrix: (number of documents, number of vocabulary terms).
X.shape

(4, 13)

In [20]:
# Re-weight the raw term counts into tf-idf scores.
# The sparse count matrix is passed as-is: TfidfTransformer accepts sparse input,
# so converting it to a dense array with .toarray() first only wastes memory.
tfidf = tfidfTrans.fit_transform(X)

In [23]:
# Dense view of the tf-idf weights: one row per document, one column per vocabulary term.
tfidf.toarray()

array([[0.44782471, 0.63115694, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.44782471, 0.        , 0.44782471,
        0.        , 0.        , 0.        ],
       [0.30226172, 0.        , 0.        , 0.        , 0.4260028 ,
        0.4260028 , 0.        , 0.30226172, 0.4260028 , 0.30226172,
        0.4260028 , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.57866699, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.81556393],
       [0.        , 0.        , 0.53426056, 0.53426056, 0.        ,
        0.        , 0.37907384, 0.        , 0.        , 0.        ,
        0.        , 0.53426056, 0.        ]])

## Word Embeddings

### Parameter learning methods:

#### 1. One word context:
It takes one word as input and outputs a word. 

Dimensions (V = vocabulary size, N = hidden-layer size): input is a V x 1 one-hot vector; input->hidden weights: V x N; hidden->output weights: N x V => output dimensions: V x 1

Corresponding to the input word, it outputs the probability of each word in the vocabulary to be the context word

#### 2. Multi Word Context:
It takes multiple context words as input and outputs the word most likely to appear in that context (the target word).

Dimensions: Input: C [V X 1] vectors corresponding to C context words; input->hidden layer weights: V x N and the rest is the same as in One Word Context

The second point of difference is in the expression for calculating the hidden layer.

In One word context: h = transpose(W). X

In Multi Word CBOW: h = ( transpose(W) * (X1 + X2 + .... XC) ) / C

#### 3. Skip Gram:
It is the reverse of Multi Word Context, i.e. given a target word it outputs C context words.

Dimensions: Input: V X 1; input->hidden weights: V X N; hidden->output weights: N X V (shared by all C outputs); output: C [V X 1] vectors

In [28]:
# Word2Vec with gensim, trained from scratch on the toy corpus (no pre-trained model is loaded):
from gensim.models import word2vec

In [29]:
# Whitespace-tokenise every document; str.split() leaves case and punctuation untouched.
splitSen = list(map(str.split, corpus))

In [35]:
# Inspect the tokenised sentences that will be fed to Word2Vec.
splitSen

[['Text', 'of', 'first', 'document.'],
 ['Text', 'of', 'the', 'second', 'document', 'made', 'longer.'],
 ['Number', 'three.'],
 ['This', 'is', 'number', 'four.']]

In [39]:
# Train Word2Vec on the tokenised toy corpus.
# sg=1 selects the skip-gram architecture this section describes; gensim's
# default (sg=0) would train a CBOW model instead.
# min_count=1 keeps every token, since each one occurs only once or twice here.
model = word2vec.Word2Vec(splitSen, min_count=1, sg=1)

In [41]:
# The cell was saved as the incomplete expression `model.`, which is a SyntaxError.
# NOTE(review): `batch_words` is the Word2Vec attribute whose gensim default (10000)
# matches the recorded output — confirm this is the attribute that was intended.
model.batch_words

10000