In [1]:
import nltk
import string
from collections import defaultdict

### Frequency
- Vocabulary: the set of unique words in the corpus.
- Vector representation: each row is a document and each column is a word in the vocabulary.
- For each document, count how often each vocabulary word occurs.
- Grammar and the relative position of words are ignored (bag-of-words).

In [2]:
def tokenize(text):
    """Lazily yield the lowercased, stemmed word tokens of ``text``.

    Parameters
    ----------
    text : str
        Raw text of a single document.

    Yields
    ------
    str
        English Snowball stem of every token that does not consist purely
        of punctuation characters.

    Notes
    -----
    This is a generator function: nothing runs until the result is iterated,
    and the result can be consumed only once. Wrap the call in ``list(...)``
    when the tokens are needed more than once.
    """
    stem = nltk.stem.SnowballStemmer('english')
    text = text.lower()

    for token in nltk.word_tokenize(text):
        # Check character by character: ``token in string.punctuation`` is a
        # substring test, so multi-character punctuation tokens such as
        # '...' or '--' would not be filtered out by it.
        if all(char in string.punctuation for char in token):
            continue
        yield stem.stem(token)

In [3]:
# Toy corpus used by the vectorization examples: one string per document.
corpus = ['The elephant sneezed at the sight of potatoes',
          'Bats can see via echolocation. See the bat sight sneeze!',
          'Wondering, she opend the door to the studio.']

def freq_vectorize(doc):
    """Count how many times each token produced by ``tokenize`` occurs in ``doc``.

    Returns a ``defaultdict(int)`` mapping token -> occurrence count, with
    keys in order of first appearance.
    """
    counts = defaultdict(int)
    for stemmed in tokenize(doc):
        counts[stemmed] = counts[stemmed] + 1
    return counts

# Build one frequency table per document, then print the first three with
# their labels.
vectors = [freq_vectorize(document) for document in corpus]
for position, label in enumerate(("Corpus 01: ", "Corpus 02: ", "Corpus 03: ")):
    print(label, vectors[position])

Corpus 01:  defaultdict(<class 'int'>, {'the': 2, 'eleph': 1, 'sneez': 1, 'at': 1, 'sight': 1, 'of': 1, 'potato': 1})
Corpus 02:  defaultdict(<class 'int'>, {'bat': 2, 'can': 1, 'see': 2, 'via': 1, 'echoloc': 1, 'the': 1, 'sight': 1, 'sneez': 1})
Corpus 03:  defaultdict(<class 'int'>, {'wonder': 1, 'she': 1, 'opend': 1, 'the': 2, 'door': 1, 'to': 1, 'studio': 1})


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary and build the document-term count matrix in one step.
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(corpus)

# Emit the vocabulary, a blank line, then the dense count matrix; every
# argument ends up on its own line because of the newline separator.
print(
    'Words list:',
    vectorizer.get_feature_names_out(),
    '',
    'Frequency Vectors:',
    vectors.todense(),
    sep='\n',
)


Words list:
['at' 'bat' 'bats' 'can' 'door' 'echolocation' 'elephant' 'of' 'opend'
 'potatoes' 'see' 'she' 'sight' 'sneeze' 'sneezed' 'studio' 'the' 'to'
 'via' 'wondering']

Frequency Vectors:
[[1 0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 2 0 0 0]
 [0 1 1 1 0 1 0 0 0 0 2 0 1 1 0 0 1 0 1 0]
 [0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 1 2 1 0 1]]


### One-hot Encoding
- Effective for very small documents that do not contain many repeated elements.
- Records only whether a word is present; word frequency (and hence importance by frequency) is ignored.
- Useful for artificial neural networks whose activation functions expect inputs in a fixed discrete range such as [0, 1] or [-1, 1].

- Represents similarity and difference at the document level.

In [5]:
# Tokenize into a separate, fully materialized variable. ``tokenize`` is a
# generator function, so storing its bare results would keep single-use
# iterators, and rebinding ``corpus`` would discard the raw documents and
# prevent this cell from being run again.
tokenized_corpus = [list(tokenize(doc)) for doc in corpus]

def onehot_vectorize(doc):
    """Map every distinct token in ``doc`` to ``True``.

    Parameters
    ----------
    doc : iterable of str
        Tokens of a single document.

    Returns
    -------
    dict
        ``{token: True}`` for each distinct token, in order of first
        appearance. Repeated tokens collapse into one key, so frequency
        information is dropped.
    """
    return {
        token: True
        for token in doc
    }

# A list (rather than a lazy ``map``) stays usable after it has been printed.
onehot_vectors = [onehot_vectorize(doc) for doc in tokenized_corpus]
print(onehot_vectors)

[{'the': True, 'eleph': True, 'sneez': True, 'at': True, 'sight': True, 'of': True, 'potato': True}, {'bat': True, 'can': True, 'see': True, 'via': True, 'echoloc': True, 'the': True, 'sight': True, 'sneez': True}, {'wonder': True, 'she': True, 'opend': True, 'the': True, 'door': True, 'to': True, 'studio': True}]


In [14]:
from sklearn.preprocessing import Binarizer

corpus = [
    'The elephant sneezed at the sight of potatoes',
    'Bats can see via echolocation. See the bat sight sneeze!',
    'Wondering, she opend the door to the studio.',
]

# Count term occurrences, densify the sparse result, then binarize it so
# that only presence/absence of each term remains.
freq = CountVectorizer()
onehot = Binarizer()
corpus = onehot.fit_transform(freq.fit_transform(corpus).toarray())

corpus

array([[1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1]],
      dtype=int64)

### Term Frequency-Inverse Document Frequency (TF-IDF)
- Considers the relative frequency or rarity of a token in a document compared with its frequency in the other documents.
- Rare words can provide important insight.
- Gives a higher score to terms that are highly relevant to a specific document.
- Computed on a per-term basis.
- Measured by $\text{tfidf}(t, d) = \text{tf}(t, d) \times \text{idf}(t)$, where:

- TF (term frequency): the number of times term $t$ occurs in document $d$.
- IDF (inverse document frequency): $\text{idf}(t) = \log(N / \text{df}(t))$, where $N$ is the number of documents and $\text{df}(t)$ is the number of documents that contain term $t$.

- Stop words receive lower weights because they appear in most documents.
- Widely used in bag-of-words models.

- Why do we use the log?
    - It dampens the ratio, acting as a normalization so that very rare terms do not dominate the score.
    - The more documents a word appears in, the lower its IDF.
      -> The larger the denominator, the lower the IDF -> the word is not rare.

In [31]:
from nltk.text import TextCollection

corpus = ['The elephant sneezed at the sight of potatoes',
          'Bats can see via echolocation. See the bat sight sneeze!',
          'Wondering, she opend the door to the studio.']

def tfidf_vectorize(corpus):
    """Lazily yield one ``{term: tf-idf score}`` dict per document.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents; each one is tokenized with ``tokenize``.

    Yields
    ------
    dict
        Maps each token of the document to the value returned by
        ``TextCollection.tf_idf`` for that token and document.
    """
    # Materialize the tokens: ``tokenize`` returns generators, which are
    # exhausted after a single pass, but every document is read several times
    # here (handed to TextCollection, iterated below, and passed to tf_idf).
    tokenized = [list(tokenize(doc)) for doc in corpus]
    texts = TextCollection(tokenized)

    for doc in tokenized:
        yield {
            term: texts.tf_idf(term, doc)
            for term in doc
        }

# ``tfidf`` is a lazy generator: nothing is computed until it is iterated,
# e.g. with ``list(tfidf)``; displaying it only shows the generator object.
tfidf = tfidf_vectorize(corpus)
tfidf

<generator object tfidf_vectorize at 0x000001E04C66EDC0>

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'The elephant sneezed at the sight of potatoes',
    'Bats can see via echolocation. See the bat sight sneeze!',
    'Wondering, she opend the door to the studio.',
]

# Fit the TF-IDF vectorizer and rebind ``corpus`` to the resulting
# document-term matrix.
tfidf = TfidfVectorizer()
corpus = tfidf.fit_transform(corpus)

# Emit the vocabulary, a spacer line, then the dense matrix; every argument
# ends up on its own line because of the newline separator.
print(
    'Words list: ',
    tfidf.get_feature_names_out(),
    ' ',
    'Vector',
    corpus.toarray(),
    sep='\n',
)


Words list: 
['at' 'bat' 'bats' 'can' 'door' 'echolocation' 'elephant' 'of' 'opend'
 'potatoes' 'see' 'she' 'sight' 'sneeze' 'sneezed' 'studio' 'the' 'to'
 'via' 'wondering']
 
Vector
[[0.37867627 0.         0.         0.         0.         0.
  0.37867627 0.37867627 0.         0.37867627 0.         0.
  0.28799306 0.         0.37867627 0.         0.44730461 0.
  0.         0.        ]
 [0.         0.30251368 0.30251368 0.30251368 0.         0.30251368
  0.         0.         0.         0.         0.60502736 0.
  0.23006945 0.30251368 0.         0.         0.17866945 0.
  0.30251368 0.        ]
 [0.         0.         0.         0.         0.36772387 0.
  0.         0.         0.36772387 0.         0.         0.36772387
  0.         0.         0.         0.36772387 0.43436728 0.36772387
  0.         0.36772387]]


### Distributed Representation

- Used with neural networks.
- Context-based, continuous encoding of term similarity.
- Training is computationally intensive, so pre-trained models are commonly used.

Word2vec
- Implements a word embedding model that enables us to create these kinds of distributed representations.
- Trains word representations based on either a continuous bag-of-words (CBOW) or a skip-gram model.