In [21]:
from sklearn.feature_extraction.text import CountVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog."]
# create the transform
vectorizer = CountVectorizer()
# tokenize and build vocab
vectorizer.fit(text)
# summarize
print(vectorizer.vocabulary_) # vocabulary: which word is represented by which number? -> id of word
# encode document
vector = vectorizer.transform(text) # create encoded vector: the index is the id of the word, only "the" (id=index=7) appears twice
# summarize encoded vector
print(vector.shape)
print(type(vector))
print(vector.toarray())

{u'brown': 0, u'lazy': 4, u'jumped': 3, u'over': 5, u'fox': 2, u'dog': 1, u'quick': 6, u'the': 7}
(1, 8)
<class 'scipy.sparse.csr.csr_matrix'>
[[1 1 1 1 1 1 1 2]]


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
# list of text documents
# Term Frequency: This summarizes how often a given word appears within a document.
# Inverse Document Frequency: This downscales words that appear a lot across documents.
text = ["The quick brown fox jumped over the lazy dog.","The dog.","The fox"]
# create the transform
vectorizer = TfidfVectorizer()
# tokenize and build vocabulary
vectorizer.fit(text)
# summarize
print(vectorizer.vocabulary_)
print(vectorizer.idf_) # idf=inverse document frequencies: the most frequent word "the" is assigned the lowest frequency, 1
# encode document (as a sparse matrix, scores are normalized)
vector = vectorizer.transform(text)
# summarize encoded vector
print(vector.shape) # shape = [n_samples, n_features], will be (3,8) here
print(vector.toarray())

{u'brown': 0, u'lazy': 4, u'jumped': 3, u'over': 5, u'fox': 2, u'dog': 1, u'quick': 6, u'the': 7}
[ 1.69314718  1.28768207  1.28768207  1.69314718  1.69314718  1.69314718
  1.69314718  1.        ]
(3, 8)
[[ 0.36388646  0.27674503  0.27674503  0.36388646  0.36388646  0.36388646
   0.36388646  0.42983441]
 [ 0.          0.78980693  0.          0.          0.          0.          0.
   0.61335554]
 [ 0.          0.          0.78980693  0.          0.          0.          0.
   0.61335554]]


In [49]:
from sklearn.feature_extraction.text import CountVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog.","The dog.","The fox"]
# create the transform, tokenize and build vocab
cvectorizer = CountVectorizer().fit(text)
cvector = cvectorizer.transform(text) # create encoded vector: the index is the id of the word, only "the" (id=index=7) appears twice

In [51]:
from sklearn.feature_extraction.text import TfidfTransformer
# list of text documents
# Term Frequency: This summarizes how often a given word appears within a document.
# Inverse Document Frequency: This downscales words that appear a lot across documents.
# create the transform, tokenize and build vocabulary
tvectorizer = TfidfTransformer(use_idf=False).fit(cvector)
tvector = tvectorizer.transform(cvector)

In [52]:
# NB learning
from sklearn.naive_bayes import MultinomialNB # naive Bayes classification
import numpy as np
clf = MultinomialNB().fit(tvector, np.array([0,1,2]))

In [55]:
# check results of training
docs_new = ["lazy dog", "dog","fox"]
X_new_counts = cvectorizer.transform(docs_new)
X_new_tf = tvectorizer.transform(X_new_counts)

predicted = clf.predict(X_new_tf)

print(predicted)

#for doc, category in zip(docs_new, predicted):
#    print('%r => %s' % (doc, np.array([0,1,2]))

[1 1 2]


In [3]:
from sklearn.feature_extraction.text import HashingVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog."]
# create the transform
vectorizer = HashingVectorizer(n_features=20) # one way hash of words to convert them to integers
# encode document (downside is that the hash is a one-way function so there is no way to convert the encoding back to a word)
vector = vectorizer.transform(text)
# summarize encoded vector
print(vector.shape)
print(vector.toarray())

(1, 20)
[[ 0.          0.          0.          0.          0.          0.33333333
   0.         -0.33333333  0.33333333  0.          0.          0.33333333
   0.          0.          0.         -0.33333333  0.          0.
  -0.66666667  0.        ]]


In [2]:
# one-hot encoding of 'hello world'
from numpy import argmax
# define input string
data = 'hello world'
print(data)
# define universe of possible input values
alphabet = 'abcdefghijklmnopqrstuvwxyz '
# define a mapping of chars to integers
char_to_int = dict((c, i) for i, c in enumerate(alphabet)) # seasons = ['Spring',... -> [(0, 'Spring'),...
int_to_char = dict((i, c) for i, c in enumerate(alphabet))
# integer encode input data
integer_encoded = [char_to_int[char] for char in data]
print(integer_encoded)
# one hot encode
onehot_encoded = list()
for value in integer_encoded:
	letter = [0 for _ in range(len(alphabet))]
	letter[value] = 1
	onehot_encoded.append(letter)
print(onehot_encoded)
# invert encoding
inverted = int_to_char[argmax(onehot_encoded[0])]
print(inverted)

hello world
[7, 4, 11, 11, 14, 26, 22, 14, 17, 11, 3]
[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
h


In [2]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])

OneHotEncoder(categorical_features='all', dtype=<type 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [22]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
wdict = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
dictarr = np.asarray(wdict.values()).reshape(-1, 1)
enc = OneHotEncoder()
enc.fit(dictarr)
enc.transform([[2]]).toarray()

array([[ 0.,  0.,  1.,  0.]])

In [19]:
# vectorize text with skip-grams in scikit-learn by passing the skip gram tokens as the vocabulary
# to CountVectorizer will not work -> example vectorizer that produces 1-skip-2-grams
from toolz import itertoolz, compose
from toolz.curried import map as cmap, sliding_window, pluck
from sklearn.feature_extraction.text import CountVectorizer

# pluck: plucking “fields” from an iterable of values e.g. pluck(objects, 'age') -> [30, 56, 56]
#        or list(pluck([0, 1], [[1, 2, 3], [4, 5, 7]])) -> [(1, 2), (4, 5)]
# sliding_window creates a sliding window: list(sliding_window(2, [1, 2, 3, 4])) -> [(1, 2), (2, 3), (3, 4)]
# map: apply function to every item of iterable and return a list of the results
# curried form of map: map(func,[[1,2],[3,4]]) can be written as map(func)([[1,2],[3,4]])

class SkipGramVectorizer(CountVectorizer):
    def build_analyzer(self):    
        preprocess = self.build_preprocessor() # Return a function to preprocess the text before tokenization
        stop_words = self.get_stop_words() # Build or fetch the effective stop words (words that are filtered out) list
        tokenize = self.build_tokenizer() # Return a function that splits a string into a sequence of tokens
        return lambda doc: self._word_skip_grams( # lambda-functions: anonymous functions not bound to a name
                compose(tokenize, preprocess, self.decode)(doc), # compose: ompose functions to operate in series
                stop_words)

    def _word_skip_grams(self, tokens, stop_words=None):
        # handle stop words
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]
        print list(sliding_window(3, tokens))
        return compose(cmap(' '.join), pluck([0, 2]), sliding_window(3))(tokens) # str.join(sequence)

In [20]:
text = ['the rain in Spain falls mainly on the plain']

vect = SkipGramVectorizer()
vect.fit(text) # Learn a vocabulary dictionary of all tokens in the raw documents
vect.get_feature_names() # Array mapping from feature integer indices to feature name

[(u'the', u'rain', u'in'), (u'rain', u'in', u'spain'), (u'in', u'spain', u'falls'), (u'spain', u'falls', u'mainly'), (u'falls', u'mainly', u'on'), (u'mainly', u'on', u'the'), (u'on', u'the', u'plain')]


[u'falls on',
 u'in falls',
 u'mainly the',
 u'on plain',
 u'rain spain',
 u'spain mainly',
 u'the in']

In [24]:
tokens = [2,3,4,5,6]
stop_words = [0,3]
print [w for w in tokens if w not in stop_words]
separator = "#-#"
sequence = ("a", "b", "c")
print separator.join(sequence)

[2, 4, 5, 6]
a#-#b#-#c


In [7]:
# vectorize text with skip-grams in scikit-learn by passing the skip gram tokens as the vocabulary
# to CountVectorizer will not work -> example vectorizer that produces 1-skip-nsize-grams
from toolz import itertoolz, compose
from toolz.curried import map as cmap, sliding_window, pluck
from sklearn.feature_extraction.text import CountVectorizer

# pluck: plucking “fields” from an iterable of values e.g. pluck(objects, 'age') -> [30, 56, 56]
#        or list(pluck([0, 1], [[1, 2, 3], [4, 5, 7]])) -> [(1, 2), (4, 5)]
# sliding_window creates a sliding window: list(sliding_window(2, [1, 2, 3, 4])) -> [(1, 2), (2, 3), (3, 4)]
# map: apply function to every item of iterable and return a list of the results
# curried form of map: map(func,[[1,2],[3,4]]) can be written as map(func)([[1,2],[3,4]])

nsize = 5

class SkipGramVectorizer(CountVectorizer):
    def build_analyzer(self):
        preprocess = self.build_preprocessor() # Return a function to preprocess the text before tokenization
        stop_words = self.get_stop_words() # Build or fetch the effective stop words (words that are filtered out) list
        tokenize = self.build_tokenizer() # Return a function that splits a string into a sequence of tokens
        return lambda doc: self._word_skip_grams( # lambda-functions: anonymous functions not bound to a name
                compose(tokenize, preprocess, self.decode)(doc), # compose: ompose functions to operate in series
                stop_words)

    def _word_skip_grams(self, tokens, stop_words=None):
        # handle stop words
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]
        # print list(compose(cmap(' '.join), pluck(range(0,nsize)), sliding_window(nsize))(tokens))
        return compose(cmap(' '.join), pluck(range(0,nsize)), sliding_window(nsize))(tokens) # str.join(sequence)

In [8]:
text = ['the rain in Spain falls mainly on the plain with pain where hardly any rain stains the']

vect = SkipGramVectorizer()
vect.fit(text) # Learn a vocabulary dictionary of all tokens in the raw documents
vect.get_feature_names() # Array mapping from feature integer indices to feature name

[u'falls mainly on the plain',
 u'hardly any rain stains the',
 u'in spain falls mainly on',
 u'mainly on the plain with',
 u'on the plain with pain',
 u'pain where hardly any rain',
 u'plain with pain where hardly',
 u'rain in spain falls mainly',
 u'spain falls mainly on the',
 u'the plain with pain where',
 u'the rain in spain falls',
 u'where hardly any rain stains',
 u'with pain where hardly any']

In [9]:
# build a dictonary from a text as input to a one-hot encoder
# the number of unique words == the vocabulary == dimension V
from toolz import itertoolz

text = 'the rain in Spain falls mainly on the plain with pain where hardly any rain stains the'
textlist = text.split(" ")
myDict = {}
i = 0
for word in textlist:
    # print word, i
    newEntry = {word.lower(): i}
    if not myDict.has_key(word):
        myDict.update(newEntry)
    i = i + 1

print myDict


{'on': 6, 'pain': 10, 'stains': 15, 'plain': 8, 'mainly': 5, 'rain': 1, 'falls': 4, 'where': 11, 'hardly': 12, 'in': 2, 'the': 0, 'with': 9, 'any': 13, 'spain': 3}


In [10]:
# one-hot encode the dictionary
import numpy as np
from sklearn.preprocessing import OneHotEncoder

dictarr = np.asarray(myDict.values()).reshape(-1, 1)
enc = OneHotEncoder()
enc.fit(dictarr)
enc.transform([[8]]).toarray()

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
         0.]])

In [16]:
# one-hot encode the 1-skip-ngrams from SkipGramVectorizer: map words from vocabulary (dict myDict)

from toolz import itertoolz

text = 'the rain in Spain falls mainly on the plain with pain where hardly any rain stains the'

ovecm = []
for ovec in vect.get_feature_names():
    # print ovec
    ovecd = {}
    for oword in ovec.split(" "):
        # print (oword, myDict[oword])
        ovecd[oword] = myDict[oword]
    ovecm.append(ovecd)

print ovecm

[{u'on': 6, u'the': 0, u'plain': 8, u'mainly': 5, u'falls': 4}, {u'stains': 15, u'the': 0, u'hardly': 12, u'any': 13, u'rain': 1}, {u'on': 6, u'falls': 4, u'mainly': 5, u'spain': 3, u'in': 2}, {u'with': 9, u'on': 6, u'the': 0, u'mainly': 5, u'plain': 8}, {u'on': 6, u'the': 0, u'with': 9, u'pain': 10, u'plain': 8}, {u'where': 11, u'pain': 10, u'any': 13, u'rain': 1, u'hardly': 12}, {u'plain': 8, u'with': 9, u'where': 11, u'hardly': 12, u'pain': 10}, {u'falls': 4, u'mainly': 5, u'spain': 3, u'rain': 1, u'in': 2}, {u'on': 6, u'the': 0, u'mainly': 5, u'spain': 3, u'falls': 4}, {u'plain': 8, u'the': 0, u'with': 9, u'where': 11, u'pain': 10}, {u'the': 0, u'falls': 4, u'spain': 3, u'rain': 1, u'in': 2}, {u'stains': 15, u'hardly': 12, u'where': 11, u'any': 13, u'rain': 1}, {u'any': 13, u'pain': 10, u'with': 9, u'where': 11, u'hardly': 12}]


In [22]:
# now one-hot encode the words mapped from vocabulary

import numpy as np
from sklearn.preprocessing import OneHotEncoder
for wdict in ovecm:
    # print np.asarray(wdict.values()).reshape(-1, 1)
    dictarr = np.asarray(wdict.values()).reshape(-1, 1) # extract values from dict (.values), as array and transform (reshape)
    enc = OneHotEncoder() # Encode categorical integer features using a one-hot aka one-of-K scheme
    # The output will be a sparse matrix where each column corresponds to one possible value of one feature
    enc.fit(dictarr) # Fit OneHotEncoder to dictarr
    print enc.transform([[0]]).toarray()

[[ 1.  0.  0.  0.  0.]]
[[ 1.  0.  0.  0.  0.]]
[[ 0.  0.  0.  0.  0.]]
[[ 1.  0.  0.  0.  0.]]
[[ 1.  0.  0.  0.  0.]]
[[ 0.  0.  0.  0.  0.]]
[[ 0.  0.  0.  0.  0.]]
[[ 0.  0.  0.  0.  0.]]
[[ 1.  0.  0.  0.  0.]]
[[ 1.  0.  0.  0.  0.]]
[[ 1.  0.  0.  0.  0.]]
[[ 0.  0.  0.  0.  0.]]
[[ 0.  0.  0.  0.  0.]]


In [12]:
# skip-gram learning example

import numpy as np

wi = np.array([[-0.094, -0.44, 0.31], [-0.491, -0.23, 0.065], [0.07, 0.17, -0.36], [0.1, 0.46, 0.08], [-0.23, -0.15, -0.04], [0.41, -0.19, -0.44], [0.18, 0.09, 0.28], [-0.05, 0.49, 0.26]])
wo = np.array([[0.02, 0.48, 0.43, 0.37, -0.36, -0.12, 0.27, -0.35], [-0.37, 0.42, -0.26, -0.15, 0.03, 0.35, -0.14, 0.13], [0.42, 0.36, 0.47, -0.02, -0.42, -0.44, 0.27, -0.45]])
xk = np.array([[0, 1, 0, 0, 0, 0, 0, 0]])
ht = np.dot(xk, wi)
u0 = np.dot(ht, wo)

#print u0
yk = np.exp(u0) / np.dot(np.exp(u0), np.exp(u0).transpose())
#print yk

# backpropagation (following Xin Rong's paper here)
tc = np.array([[0, 1, 0, 0, 0, 0, 0, 0]]) # truth
ej = yk - tc # error
etha = 0.01

# Update equation for hidden→output weights
wo = wo - etha * (np.transpose(ht) * ej)
# Update equation for input→hidden weights
ehi = np.dot(ej, np.transpose(wo)) # dE/dhi
wi = wi - etha * ehi

# next cycles
i=0
while i<200:
    ht = np.dot(xk, wi)
    u0 = np.dot(ht, wo)
    yk = np.exp(u0) / np.dot(np.exp(u0), np.exp(u0).transpose())
    ej = yk - tc # error
    #print ej
    #ei = np.sum(ej) sum has to be over context, here C=1
    wo = wo - etha * (np.transpose(ht) * ej)
    ehi = np.dot(ej, np.transpose(wo)) # dE/dhi
    wi = wi - etha * ehi
    i += 1

#print '%.2f' % yk
np.set_printoptions(precision=2)
print yk

[[ 0.02  0.14  0.02  0.02  0.01  0.01  0.02  0.01]]
