In [19]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

# One-hot encoding of a categorical feature. As the printed matrix shows, the
# output columns follow the sorted feature names, not the order in which the
# instances are listed.
onehot_encoder = DictVectorizer()
instances = [
    {'city': 'New York'},
    {'city': 'San Francisco'},
    {'city': 'Chapel Hill'},
]

print(onehot_encoder.fit_transform(instances).toarray())

# Bag-of-words model: one row per document, one column per vocabulary term.
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game',
    'I ate a sandwich'
]

vectorizer = CountVectorizer()
# Use toarray() (a plain ndarray) rather than todense() (an np.matrix):
# np.matrix is deprecated and recent scikit-learn releases refuse it as input.
vectors = vectorizer.fit_transform(corpus).toarray()
print(vectors)
print(vectorizer.vocabulary_)
# euclidean_distances expects 2-D inputs, so each document is passed as a
# one-row slice (vectors[i:i + 1]) instead of a 1-D row (vectors[i]).
print('Distance between 1st and 2nd documents: {}'.format(
    euclidean_distances(vectors[0:1], vectors[1:2])))
print('Distance between 1st and 3rd documents: {}'.format(
    euclidean_distances(vectors[0:1], vectors[2:3])))
print('Distance between 3rd and 2nd documents: {}'.format(
    euclidean_distances(vectors[1:2], vectors[2:3])))


# Stop-word filtering shrinks the vocabulary before any weighting is applied.
# The two adjacent string literals below are concatenated by Python, so
# corpus2 holds a single document.
corpus2 = [
      'The dog ate a sandwich, '
      'the wizard transfigured a sandwich, and I ate a sandwich'
]
vectorizer2 = CountVectorizer(stop_words='english')
# Raw term frequencies, e.g. dog = 1 and sandwich = 3.
print(vectorizer2.fit_transform(corpus2).toarray())
print(vectorizer2.vocabulary_)

# Extending bag-of-words with TF-IDF weights.
# NOTE(review): this fits on `corpus` (the three-document list above), not on
# `corpus2` -- confirm that is the intended input.
vectorizer = TfidfVectorizer(stop_words='english')
print(vectorizer.fit_transform(corpus).toarray())

# Space-efficient feature vectorizing with the hashing trick: transform() is
# called directly, without a prior fit() and without building a vocabulary.
corpus = ['the', 'ate', 'bacon', 'cat']
vectorizer3 = HashingVectorizer(n_features=6)
print(vectorizer3.transform(corpus).toarray())




[[0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]
[[0 1 1 0 1 0 1 0 0 1]
 [0 1 1 1 0 1 0 0 1 0]
 [1 0 0 0 0 0 0 1 0 0]]
{u'duke': 2, u'basketball': 1, u'lost': 5, u'played': 6, u'in': 4, u'game': 3, u'sandwich': 7, u'unc': 9, u'ate': 0, u'the': 8}
Distance between 1st and 2nd documents: [[2.44948974]]
Distance between 1st and 3rd documents: [[2.64575131]]
Distance between 3rd and 2nd documents: [[2.64575131]]
[[2 1 3 1 1]]
{u'sandwich': 2, u'wizard': 4, u'dog': 1, u'transfigured': 3, u'ate': 0}
[[0.         0.42804604 0.42804604 0.         0.         0.5628291
  0.         0.5628291 ]
 [0.         0.42804604 0.42804604 0.5628291  0.5628291  0.
  0.         0.        ]
 [0.70710678 0.         0.         0.         0.         0.
  0.70710678 0.        ]]
[[-1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0.  0.]]
