In [21]:
from sklearn.feature_extraction.text import CountVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog."]
# create the transform
vectorizer = CountVectorizer()
# tokenize and build vocab
vectorizer.fit(text)
# summarize
print(vectorizer.vocabulary_) # vocabulary: which word is represented by which number? -> id of word
# encode document
vector = vectorizer.transform(text) # create encoded vector: the index is the id of the word, only "the" (id=index=7) appears twice
# summarize encoded vector
print(vector.shape)
print(type(vector))
print(vector.toarray())

{u'brown': 0, u'lazy': 4, u'jumped': 3, u'over': 5, u'fox': 2, u'dog': 1, u'quick': 6, u'the': 7}
(1, 8)
<class 'scipy.sparse.csr.csr_matrix'>
[[1 1 1 1 1 1 1 2]]


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
# list of text documents
# Term Frequency: This summarizes how often a given word appears within a document.
# Inverse Document Frequency: This downscales words that appear a lot across documents.
text = ["The quick brown fox jumped over the lazy dog.","The dog.","The fox"]
# create the transform
vectorizer = TfidfVectorizer()
# tokenize and build vocabulary
vectorizer.fit(text)
# summarize
print(vectorizer.vocabulary_)
print(vectorizer.idf_) # idf=inverse document frequencies: the most frequent word "the" is assigned the lowest frequency, 1
# encode document (as a sparse matrix, scores are normalized)
vector = vectorizer.transform(text)
# summarize encoded vector
print(vector.shape) # shape = [n_samples, n_features], will be (3,8) here
print(vector.toarray())

{u'brown': 0, u'lazy': 4, u'jumped': 3, u'over': 5, u'fox': 2, u'dog': 1, u'quick': 6, u'the': 7}
[ 1.69314718  1.28768207  1.28768207  1.69314718  1.69314718  1.69314718
  1.69314718  1.        ]
(3, 8)
[[ 0.36388646  0.27674503  0.27674503  0.36388646  0.36388646  0.36388646
   0.36388646  0.42983441]
 [ 0.          0.78980693  0.          0.          0.          0.          0.
   0.61335554]
 [ 0.          0.          0.78980693  0.          0.          0.          0.
   0.61335554]]


In [49]:
from sklearn.feature_extraction.text import CountVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog.","The dog.","The fox"]
# create the transform, tokenize and build vocab
cvectorizer = CountVectorizer().fit(text)
cvector = cvectorizer.transform(text) # create encoded vector: the index is the id of the word, only "the" (id=index=7) appears twice

In [51]:
from sklearn.feature_extraction.text import TfidfTransformer
# list of text documents
# Term Frequency: This summarizes how often a given word appears within a document.
# Inverse Document Frequency: This downscales words that appear a lot across documents.
# create the transform, tokenize and build vocabulary
tvectorizer = TfidfTransformer(use_idf=False).fit(cvector)
tvector = tvectorizer.transform(cvector)

In [52]:
# NB learning
from sklearn.naive_bayes import MultinomialNB # naive Bayes classification
import numpy as np
clf = MultinomialNB().fit(tvector, np.array([0,1,2]))

In [55]:
# check results of training
docs_new = ["lazy dog", "dog","fox"]
X_new_counts = cvectorizer.transform(docs_new)
X_new_tf = tvectorizer.transform(X_new_counts)

predicted = clf.predict(X_new_tf)

print(predicted)

#for doc, category in zip(docs_new, predicted):
#    print('%r => %s' % (doc, np.array([0,1,2]))

[1 1 2]


In [3]:
from sklearn.feature_extraction.text import HashingVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog."]
# create the transform
vectorizer = HashingVectorizer(n_features=20) # one way hash of words to convert them to integers
# encode document (downside is that the hash is a one-way function so there is no way to convert the encoding back to a word)
vector = vectorizer.transform(text)
# summarize encoded vector
print(vector.shape)
print(vector.toarray())

(1, 20)
[[ 0.          0.          0.          0.          0.          0.33333333
   0.         -0.33333333  0.33333333  0.          0.          0.33333333
   0.          0.          0.         -0.33333333  0.          0.
  -0.66666667  0.        ]]


In [2]:
# one-hot encoding of 'hello world'
from numpy import argmax
# define input string
data = 'hello world'
print(data)
# define universe of possible input values
alphabet = 'abcdefghijklmnopqrstuvwxyz '
# define a mapping of chars to integers
char_to_int = dict((c, i) for i, c in enumerate(alphabet)) # seasons = ['Spring',... -> [(0, 'Spring'),...
int_to_char = dict((i, c) for i, c in enumerate(alphabet))
# integer encode input data
integer_encoded = [char_to_int[char] for char in data]
print(integer_encoded)
# one hot encode
onehot_encoded = list()
for value in integer_encoded:
	letter = [0 for _ in range(len(alphabet))]
	letter[value] = 1
	onehot_encoded.append(letter)
print(onehot_encoded)
# invert encoding
inverted = int_to_char[argmax(onehot_encoded[0])]
print(inverted)

hello world
[7, 4, 11, 11, 14, 26, 22, 14, 17, 11, 3]
[[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
h


In [2]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])

OneHotEncoder(categorical_features='all', dtype=<type 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [22]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
wdict = {'a': 0, 'b': 1, 'c': 2, 'd': 3}
dictarr = np.asarray(wdict.values()).reshape(-1, 1)
enc = OneHotEncoder()
enc.fit(dictarr)
enc.transform([[2]]).toarray()

array([[ 0.,  0.,  1.,  0.]])

In [19]:
# vectorize text with skip-grams in scikit-learn by passing the skip gram tokens as the vocabulary
# to CountVectorizer will not work -> example vectorizer that produces 1-skip-2-grams
from toolz import itertoolz, compose
from toolz.curried import map as cmap, sliding_window, pluck
from sklearn.feature_extraction.text import CountVectorizer

# pluck: plucking “fields” from an iterable of values e.g. pluck(objects, 'age') -> [30, 56, 56]
#        or list(pluck([0, 1], [[1, 2, 3], [4, 5, 7]])) -> [(1, 2), (4, 5)]
# sliding_window creates a sliding window: list(sliding_window(2, [1, 2, 3, 4])) -> [(1, 2), (2, 3), (3, 4)]
# map: apply function to every item of iterable and return a list of the results
# curried form of map: map(func,[[1,2],[3,4]]) can be written as map(func)([[1,2],[3,4]])

class SkipGramVectorizer(CountVectorizer):
    def build_analyzer(self):    
        preprocess = self.build_preprocessor() # Return a function to preprocess the text before tokenization
        stop_words = self.get_stop_words() # Build or fetch the effective stop words (words that are filtered out) list
        tokenize = self.build_tokenizer() # Return a function that splits a string into a sequence of tokens
        return lambda doc: self._word_skip_grams( # lambda-functions: anonymous functions not bound to a name
                compose(tokenize, preprocess, self.decode)(doc), # compose: ompose functions to operate in series
                stop_words)

    def _word_skip_grams(self, tokens, stop_words=None):
        # handle stop words
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]
        print list(sliding_window(3, tokens))
        return compose(cmap(' '.join), pluck([0, 2]), sliding_window(3))(tokens) # str.join(sequence)

In [20]:
text = ['the rain in Spain falls mainly on the plain']

vect = SkipGramVectorizer()
vect.fit(text) # Learn a vocabulary dictionary of all tokens in the raw documents
vect.get_feature_names() # Array mapping from feature integer indices to feature name

[(u'the', u'rain', u'in'), (u'rain', u'in', u'spain'), (u'in', u'spain', u'falls'), (u'spain', u'falls', u'mainly'), (u'falls', u'mainly', u'on'), (u'mainly', u'on', u'the'), (u'on', u'the', u'plain')]


[u'falls on',
 u'in falls',
 u'mainly the',
 u'on plain',
 u'rain spain',
 u'spain mainly',
 u'the in']

In [24]:
tokens = [2,3,4,5,6]
stop_words = [0,3]
print [w for w in tokens if w not in stop_words]
separator = "#-#"
sequence = ("a", "b", "c")
print separator.join(sequence)

[2, 4, 5, 6]
a#-#b#-#c


In [7]:
# vectorize text with skip-grams in scikit-learn by passing the skip gram tokens as the vocabulary
# to CountVectorizer will not work -> example vectorizer that produces 1-skip-nsize-grams
from toolz import itertoolz, compose
from toolz.curried import map as cmap, sliding_window, pluck
from sklearn.feature_extraction.text import CountVectorizer

# pluck: plucking “fields” from an iterable of values e.g. pluck(objects, 'age') -> [30, 56, 56]
#        or list(pluck([0, 1], [[1, 2, 3], [4, 5, 7]])) -> [(1, 2), (4, 5)]
# sliding_window creates a sliding window: list(sliding_window(2, [1, 2, 3, 4])) -> [(1, 2), (2, 3), (3, 4)]
# map: apply function to every item of iterable and return a list of the results
# curried form of map: map(func,[[1,2],[3,4]]) can be written as map(func)([[1,2],[3,4]])

nsize = 5

class SkipGramVectorizer(CountVectorizer):
    def build_analyzer(self):
        preprocess = self.build_preprocessor() # Return a function to preprocess the text before tokenization
        stop_words = self.get_stop_words() # Build or fetch the effective stop words (words that are filtered out) list
        tokenize = self.build_tokenizer() # Return a function that splits a string into a sequence of tokens
        return lambda doc: self._word_skip_grams( # lambda-functions: anonymous functions not bound to a name
                compose(tokenize, preprocess, self.decode)(doc), # compose: ompose functions to operate in series
                stop_words)

    def _word_skip_grams(self, tokens, stop_words=None):
        # handle stop words
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]
        # print list(compose(cmap(' '.join), pluck(range(0,nsize)), sliding_window(nsize))(tokens))
        return compose(cmap(' '.join), pluck(range(0,nsize)), sliding_window(nsize))(tokens) # str.join(sequence)

In [8]:
text = ['the rain in Spain falls mainly on the plain with pain where hardly any rain stains the']

vect = SkipGramVectorizer()
vect.fit(text) # Learn a vocabulary dictionary of all tokens in the raw documents
vect.get_feature_names() # Array mapping from feature integer indices to feature name

[u'falls mainly on the plain',
 u'hardly any rain stains the',
 u'in spain falls mainly on',
 u'mainly on the plain with',
 u'on the plain with pain',
 u'pain where hardly any rain',
 u'plain with pain where hardly',
 u'rain in spain falls mainly',
 u'spain falls mainly on the',
 u'the plain with pain where',
 u'the rain in spain falls',
 u'where hardly any rain stains',
 u'with pain where hardly any']

In [1]:
# build a dictonary from a text as input to a one-hot encoder
# the number of unique words == the vocabulary == dimension V
from toolz import itertoolz

text = 'the rain in Spain falls mainly on the plain with pain where hardly any rain stains the'
textlist = text.split(" ")
myDict = {}
i = 0
for word in textlist:
    # print word, i
    newEntry = {word.lower(): i}
    if not myDict.has_key(word):
        myDict.update(newEntry)
    i = i + 1

print myDict


{'on': 6, 'pain': 10, 'stains': 15, 'plain': 8, 'mainly': 5, 'rain': 1, 'falls': 4, 'where': 11, 'hardly': 12, 'in': 2, 'the': 0, 'with': 9, 'any': 13, 'spain': 3}


In [2]:
# one-hot encode the dictionary
import numpy as np
from sklearn.preprocessing import OneHotEncoder

dictarr = np.asarray(myDict.values()).reshape(-1, 1)
enc = OneHotEncoder()
enc.fit(dictarr)
enc.transform([[8]]).toarray()

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
         0.]])

In [16]:
# one-hot encode the 1-skip-ngrams from SkipGramVectorizer: map words from vocabulary (dict myDict)

from toolz import itertoolz

text = 'the rain in Spain falls mainly on the plain with pain where hardly any rain stains the'

ovecm = []
for ovec in vect.get_feature_names():
    # print ovec
    ovecd = {}
    for oword in ovec.split(" "):
        # print (oword, myDict[oword])
        ovecd[oword] = myDict[oword]
    ovecm.append(ovecd)

print ovecm

[{u'on': 6, u'the': 0, u'plain': 8, u'mainly': 5, u'falls': 4}, {u'stains': 15, u'the': 0, u'hardly': 12, u'any': 13, u'rain': 1}, {u'on': 6, u'falls': 4, u'mainly': 5, u'spain': 3, u'in': 2}, {u'with': 9, u'on': 6, u'the': 0, u'mainly': 5, u'plain': 8}, {u'on': 6, u'the': 0, u'with': 9, u'pain': 10, u'plain': 8}, {u'where': 11, u'pain': 10, u'any': 13, u'rain': 1, u'hardly': 12}, {u'plain': 8, u'with': 9, u'where': 11, u'hardly': 12, u'pain': 10}, {u'falls': 4, u'mainly': 5, u'spain': 3, u'rain': 1, u'in': 2}, {u'on': 6, u'the': 0, u'mainly': 5, u'spain': 3, u'falls': 4}, {u'plain': 8, u'the': 0, u'with': 9, u'where': 11, u'pain': 10}, {u'the': 0, u'falls': 4, u'spain': 3, u'rain': 1, u'in': 2}, {u'stains': 15, u'hardly': 12, u'where': 11, u'any': 13, u'rain': 1}, {u'any': 13, u'pain': 10, u'with': 9, u'where': 11, u'hardly': 12}]


In [22]:
# now one-hot encode the words mapped from vocabulary

import numpy as np
from sklearn.preprocessing import OneHotEncoder
for wdict in ovecm:
    # print np.asarray(wdict.values()).reshape(-1, 1)
    dictarr = np.asarray(wdict.values()).reshape(-1, 1) # extract values from dict (.values), as array and transform (reshape)
    enc = OneHotEncoder() # Encode categorical integer features using a one-hot aka one-of-K scheme
    # The output will be a sparse matrix where each column corresponds to one possible value of one feature
    enc.fit(dictarr) # Fit OneHotEncoder to dictarr
    print enc.transform([[0]]).toarray()

[[ 1.  0.  0.  0.  0.]]
[[ 1.  0.  0.  0.  0.]]
[[ 0.  0.  0.  0.  0.]]
[[ 1.  0.  0.  0.  0.]]
[[ 1.  0.  0.  0.  0.]]
[[ 0.  0.  0.  0.  0.]]
[[ 0.  0.  0.  0.  0.]]
[[ 0.  0.  0.  0.  0.]]
[[ 1.  0.  0.  0.  0.]]
[[ 1.  0.  0.  0.  0.]]
[[ 1.  0.  0.  0.  0.]]
[[ 0.  0.  0.  0.  0.]]
[[ 0.  0.  0.  0.  0.]]


In [1]:
# skip-gram learning example
# model structure
# xk -> wi -> ht -> wo -> yk : tc

import numpy as np

wi = np.array([[-0.094, -0.44, 0.31], [-0.491, -0.23, 0.065], [0.07, 0.17, -0.36], [0.1, 0.46, 0.08], [-0.23, -0.15, -0.04], [0.41, -0.19, -0.44], [0.18, 0.09, 0.28], [-0.05, 0.49, 0.26]])
wo = np.array([[0.02, 0.48, 0.43, 0.37, -0.36, -0.12, 0.27, -0.35], [-0.37, 0.42, -0.26, -0.15, 0.03, 0.35, -0.14, 0.13], [0.42, 0.36, 0.47, -0.02, -0.42, -0.44, 0.27, -0.45]])
xk = np.array([[0, 1, 0, 0, 0, 0, 0, 0]])
ht = np.dot(xk, wi)
u0 = np.dot(ht, wo)

#print u0
yk = np.exp(u0) / np.dot(np.exp(u0), np.exp(u0).transpose())
#print yk

# backpropagation (following Xin Rong's paper here)
tc = np.array([[0, 1, 0, 0, 0, 0, 0, 0]]) # truth
ej = yk - tc # error
etha = 0.01

# Update equation for hidden→output weights
wo = wo - etha * (np.transpose(ht) * ej)
# Update equation for input→hidden weights
ehi = np.dot(ej, np.transpose(wo)) # dE/dhi
wi = wi - etha * ehi

# next cycles
i=0
while i<200:
    ht = np.dot(xk, wi)
    u0 = np.dot(ht, wo)
    yk = np.exp(u0) / np.dot(np.exp(u0), np.exp(u0).transpose())
    ej = yk - tc # error
    #print ej
    #ei = np.sum(ej) sum has to be over context, here C=1
    wo = wo - etha * (np.transpose(ht) * ej)
    ehi = np.dot(ej, np.transpose(wo)) # dE/dhi
    wi = wi - etha * ehi
    i += 1

#print '%.2f' % yk
np.set_printoptions(precision=2)
print yk

[[ 0.02  0.14  0.02  0.02  0.01  0.01  0.02  0.01]]


In [1]:
import math

def sig(x, ds):
    if ds:
        return sig(x, False) * (1.0 - sig(x, False))
    return 1.0 / (1.0 + math.exp(-x))

print sig(2.0, True)

0.104993585404


In [46]:
# negative sampling weight updates (based on above example)
# model structure
# xk -> wi -> ht -> wo -> yk : tc
import numpy as np

def sig(x, ds):
    if ds:
        return sig(x, False) * (1.0 - sig(x, False))
    return 1.0 / (1.0 + np.exp(-x))

# negative sample: all the words that train to 0: P(wi)=f(wi^3/4)/sum(f(wi)^3/4)
# for now manual
wneg = [0, 0, 0, 0, 1, 1, 1, 1] # truth is the 2nd word, 4 words in the negative sample

# vocabulary of V=8, N=3 hidden nodes: wi(VxN), wo(NxV)
wi = np.array([[-0.094, -0.44, 0.31], [-0.491, -0.23, 0.065], [0.07, 0.17, -0.36], [0.1, 0.46, 0.08], [-0.23, -0.15, -0.04], [0.41, -0.19, -0.44], [0.18, 0.09, 0.28], [-0.05, 0.49, 0.26]])
wo = np.array([[0.02, 0.48, 0.43, 0.37, -0.36, -0.12, 0.27, -0.35], [-0.37, 0.42, -0.26, -0.15, 0.03, 0.35, -0.14, 0.13], [0.42, 0.36, 0.47, -0.02, -0.42, -0.44, 0.27, -0.45]])
xk = np.array([[0, 1, 0, 0, 0, 0, 0, 0]])
ht = np.dot(xk, wi)
u0 = np.dot(ht, wo)

yk = np.exp(u0) / np.dot(np.exp(u0), np.exp(u0).transpose())

# backpropagation (following Xin Rong's paper here)
tc = np.array([[0, 1, 0, 0, 0, 0, 0, 0]]) # truth
ej = yk - tc # error
etha = 0.01

# "label" of the word: tj=1 if wj in positive sample, t=0 otherwise
tj = [0, 1, 0, 0, 0, 0, 0, 0] # this is equal to truth tc

# Update equation for hidden→output weights
# todo here: only for wo and wneg -> need to restrict
wo = wo - etha * (sig(wo * np.transpose(ht), False) - tj) * np.transpose(ht)
# Update equation for input→hidden weights
ehi = (sig(wo * np.transpose(ht), False) - tj) * wo
wi = wi - etha * np.transpose(ehi)

#print yk

# next cycles
i=0
while i<400:
    ht = np.dot(xk, wi)
    u0 = np.dot(ht, wo)
    yk = np.exp(u0) / np.dot(np.exp(u0), np.exp(u0).transpose())
    ej = yk - tc # error
    wo = wo - etha * (sig(wo * np.transpose(ht), False) - tj) * np.transpose(ht)
    ehi = (sig(wo * np.transpose(ht), False) - tj) * wo
    wi = wi - etha * np.transpose(ehi)
    i += 1

print yk

[[ 0.02  0.2   0.02  0.01  0.01  0.01  0.02  0.01]]


In [6]:
# word training set threshold
import math

# Run-length encoding of a list
def rlencode(lst):
    lstlst = []
    sublst = []
    previtem = lst[0]
    for item in lst:
        if item != previtem:
            lstlst.append(sublst)
            sublst = []
        sublst.append(item)
        previtem = item
    lstlst.append(sublst)
    lst = []    
    for item in lstlst:
        lst.append([len(item), item[0]])
    return lst

# threshold probability
def thresprob(f, t):
    if f > 0.0:
        return 1.0 - math.sqrt(t / f)
    else:
        reurn -1.0

text = 'this test text the the was not this test some others some the never the never'
textlist = text.split(" ")
textlist.sort()
wordfreqs = rlencode(textlist)
for elem in wordfreqs:
    print('{0}: {1:.3f}'.format(elem[1], thresprob(elem[0], 0.1)))

never: 0.776
not: 0.684
others: 0.684
some: 0.776
test: 0.776
text: 0.684
the: 0.842
this: 0.776
was: 0.684


In [20]:
# read a large text file, line by line and build vocabulary
import os
import math
from sklearn.feature_extraction.text import TfidfVectorizer
import operator # to sort dict

def read1k():
    return f.read(1024)

def process_data(chunk, text):
    #print('processed: {0}'.format(ctr))
    text.append(unicode(chunk, errors='ignore')) # 'utf8' codec can't decode byte 0xc3
    
if __name__=="__main__":
    
    # read corpus
    os.chdir('C:\Users\Bernie\Documents\ML4D\lrgtxt0')
    f = open('todinvenedig.txt')
    text = []
    for piece in iter(read1k, ''):
        process_data(piece, text)
    
    # get stopwords
    f = open('gerstopw0.txt')
    stptext = []
    for piece in iter(read1k, ''):
        process_data(piece, stptext)
    stopwrds = []
    for elem in stptext:
        stopwrds += elem.split('\n')
    #print stopwrds
    
    # create the transform
    vectorizer = TfidfVectorizer(analyzer='word', stop_words=stopwrds)
    # tokenize and build vocabulary
    vectorizer.fit(text)
    # summarize
    #print(vectorizer.vocabulary_)
    print('size of vocabulary: {0}'.format(len(vectorizer.vocabulary_)))
    print(vectorizer.idf_) # idf=inverse document frequencies: the most frequent word "the" is assigned the lowest frequency, 1
    # encode document (as a sparse matrix, scores are normalized)
    vector = vectorizer.transform(text)
    # summarize encoded vector
    print(vector.shape) # shape = [n_samples, n_features]
    print(vector.toarray())
    
    
    # print the idf for the vocabulary, sort dict: sorted_x = sorted(x.items(), key=operator.itemgetter(1))
    idf = vectorizer.idf_
    featureidfs = sorted(dict(zip(vectorizer.get_feature_names(), idf)).items(), key=operator.itemgetter(1))
    favrg = 0.0
    for f in featureidfs:
        favrg += f[1]
    #print featureidfs
    favrg /= float(len(featureidfs))
    print favrg
    fstdev = 0.0
    for f in featureidfs:
        fstdev += (f[1] - favrg) * (f[1] - favrg)
    fstdev = math.sqrt(fstdev / float(len(featureidfs) - 1))
    print fstdev
    
    # delete all entries from featureidfs that have very low scores (2 std dev)
    #flimit = favrg - 2.0 * fstdev
    #for f in featureidfs:
        #if f[1] < flimit:
            #print f

size of vocabulary: 7116
[ 5.44265126  5.44265126  5.44265126 ...,  5.44265126  5.44265126
  5.44265126]
(169, 7116)
[[ 0.12712286  0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]]
5.23157147097
0.414852073285


In [24]:
# test reading stopword file
import os

def read1l():
    return f.read(1024)

def process_data(chunk, text):
    text.append(unicode(chunk, errors='replace')) # 'utf8' codec can't decode byte 0xc3

if __name__=="__main__":
    
    os.chdir('C:\Users\Bernie\Documents\ML4D\lrgtxt0')
    f = open('gerstopw0.txt')
    text = []
    
    for piece in iter(read1k, ''):
        process_data(piece, text)
    
    #print text
    
    stopwrds = []
    
    for elem in text:
        stopwrds.append(elem.split('\n'))
    
    print stopwrds

[[u'aber', u'alle', u'allem', u'allen', u'aller', u'alles', u'als', u'also', u'am', u'an', u'ander', u'andere', u'anderem', u'anderen', u'anderer', u'anderes', u'anderm', u'andern', u'anderr', u'anders', u'auch', u'auf', u'aus', u'bei', u'bin', u'bis', u'bist', u'da', u'damit', u'dann', u'der', u'den', u'des', u'dem', u'die', u'das', u'da\ufffd\ufffd', u'dass', u'derselbe', u'derselben', u'denselben', u'desselben', u'demselben', u'dieselbe', u'dieselben', u'dasselbe', u'dazu', u'dein', u'deine', u'deinem', u'deinen', u'deiner', u'deines', u'denn', u'derer', u'dessen', u'dich', u'dir', u'du', u'dies', u'diese', u'diesem', u'diesen', u'dieser', u'dieses', u'doch', u'dort', u'durch', u'ein', u'eine', u'einem', u'einen', u'einer', u'eines', u'einig', u'einige', u'einigem', u'einigen', u'einiger', u'einiges', u'einmal', u'er', u'ihn', u'ihm', u'es', u'etwas', u'euer', u'eure', u'eurem', u'euren', u'eurer', u'eures', u'f\ufffd\ufffdr', u'gegen', u'gewesen', u'hab', u'habe', u'haben', u'hat',

In [38]:
# size of vocabulary about 7000: test init weight matrices
import numpy as np

wi = np.random.rand(7000,3) - 0.5
print wi[2,2]

-0.494480916111


In [2]:
# Pointwise Mutual Information(PMI)
from collections import Counter # implements specialized container datatypes
from math import log

def gen_bigrams(data, window_size=5):
    for idx in range(len(data)):
        window = data[idx: idx + window_size]
        if len(window) < 2:
            break
        w = window[0]
        for next_word in window[1:]:
            yield (w, next_word) # like return but returns a generator (a one-time iterator)
            

def construct_vocab(data):
    vocab = Counter()
    for (w1, w2) in gen_bigrams(data, window_size=5): # count 1gram & 2gram
        vocab.update([w1, w2, (w1, w2)])
    return vocab
        

def calc_pmi(vocab):
    det = sum(vocab.values())
    for (w1, w2) in filter(lambda el: isinstance(el, tuple), vocab):
        p_a, p_b = float(vocab[w1]), float(vocab[w2])
        p_ab = float(vocab[(w1, w2)])
        
        yield (w1, w2, log((det * p_ab) / (p_a * p_b), 2))
    

corpus = ["a", "b", "c", "d", "e", "b", "g", "a", "h"]
vocab = construct_vocab(corpus)

#print vocab

#for i in gen_bigrams(corpus):
    #print i

for (w1, w2, pmi) in calc_pmi(vocab):
    print("{}_{}: {:.3f}".format(w1, w2, pmi))

b_c: 0.115
c_g: 1.115
d_e: 0.478
b_h: 0.700
a_d: 0.308
a_b: -0.469
d_b: -0.107
e_g: 0.700
b_a: -0.469
e_b: -0.300
a_h: 1.115
b_d: -0.107
b_e: -0.300
c_e: 0.700
g_h: 1.700
b_g: 0.115
d_g: 0.893
g_a: 0.531
e_h: 1.285
c_d: 0.893
e_a: 0.115
b_b: -0.885
a_c: 0.531
d_a: 0.308
a_e: 0.115
c_b: 0.115


In [24]:
a = [('a',1), ('b',2)]
for k in a:
    # v is the list of grades for student k
    #avgDict[k] = sum(v)/ float(len(v))
    print k[1]
print 'xxxxxxxxxxxxxxxxxxxxxx'
mygenerator = (x*x for x in range(3))
for i in mygenerator:
    print(i)

1
2
xxxxxxxxxxxxxxxxxxxxxx
0
1
4


In [23]:
# phrase extraction

def valinlst(lst, val):
    for elem in lst:
        if elem == val:
            return True
    return False

a = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
win2 = 2
i = 0
phrase = []
phrases = []
for elem in a:
    if i > -win2-1 and i < len(a)-win2:
        for j in range(i-win2,i+win2+1):
            phrase.append(a[j])
        phrases.append(phrase)
        phrase = []
    i += 1
print phrases
# PMI = log(p(phrase)/tt p(w)) w from phrase
for elem in phrases:
    print valinlst(elem, 'a')

[['g', 'h', 'a', 'b', 'c'], ['h', 'a', 'b', 'c', 'd'], ['a', 'b', 'c', 'd', 'e'], ['b', 'c', 'd', 'e', 'f'], ['c', 'd', 'e', 'f', 'g'], ['d', 'e', 'f', 'g', 'h']]
True
True
True
False
False
False


In [23]:
# co-occurance matrix and PMI (example from vector semantics Stanford NLP)
import numpy as np

cooccf = np.matrix([[0., 0., .05, 0., .05], [0., 0., .05, 0., .05], [0.11, 0.05, 0., 0.05, 0.], [0.05, 0.32, 0., 0.21, 0.]])

lplcsm = 0.0 # Laplace smoothing between 0.1 to 3.0

pistar = (np.sum(cooccf, axis=1) + lplcsm) / (np.sum(cooccf) + lplcsm) # words
#print pistar
pstarj = (np.sum(cooccf, axis=0) + lplcsm) / (np.sum(cooccf) + lplcsm) # contexts
#print pstarj
pij = cooccf / np.sum(cooccf)
ppmi = np.maximum(np.log2(pij / pistar / pstarj), 0.)

print ppmi

alpha = 0.75 # Levy et al. (2015)
palpha = np.power(np.count_nonzero(cooccf, axis=0) + lplcsm, alpha) / np.power(np.count_nonzero(cooccf) + lplcsm, alpha)
ppmia = np.maximum(np.log2(pij / pistar / palpha), 0.)

print ppmia

[[ 0.          0.          2.30742853  0.          2.30742853]
 [ 0.          0.          2.30742853  0.          2.30742853]
 [ 1.69647082  0.          0.          0.          0.        ]
 [ 0.          0.56192226  0.          0.46325333  0.        ]]
[[ 0.          0.          0.74144607  0.          0.74144607]
 [ 0.          0.          0.74144607  0.          0.74144607]
 [ 0.80856027  0.          0.          0.          0.        ]
 [ 0.          0.88346508  0.          0.2757825   0.        ]]


  del sys.path[0]


In [2]:
# co-occurance test large files
import os
import math
import re # regex
import numpy as np
#import scipy.sparse as sp

def read1k():
    return f.read(1024)

def process_data(chunk, text):
    #print('processed: {0}'.format(ctr))
    text.append(unicode(chunk, errors='ignore')) # 'utf8' codec can't decode byte 0xc3
    
def valinlst(val, lst): # checks if value is in a list
    for elem in lst:
        if elem == val:
            #print('valinlst elem: {0} val: {1}'.format(elem, val))
            #re.match( val, elem, re.I)
            return True
    return False

def findphrases(corpus, win2): # returns all phrases from corpus for given window as a list
    i = 0
    phrase = []
    phrases = []
    for elem in corpus:
        if i > -win2-1 and i < len(corpus)-win2:
            for j in range(i-win2,i+win2+1): # for skip-gram we would need to omit the i-value
                phrase.append(corpus[j])
            phrases.append(phrase)
            phrase = []
        #i += 1
        i += ( 2 * win2 + 1 )
    return phrases

def rmsword(corpus, stopwords): # remove stopwords from corpus
    i = 0
    for elem in corpus:
        for sword in stopwords:
            if elem == sword:
                while True:
                    try:
                        corpus.remove(elem) # this throws an error if elem not in corpus (might have been removed already)
                        i += 1
                    except:
                        break
    return i # returns number of stopwords removed

if __name__=="__main__":
    # read corpus
    os.chdir('C:\Users\Bernie\Documents\ML4D\lrgtxt0')
    #f = open('todinvenedig.txt')
    #f = open('todinvenedigshrt.txt') # shorter version for tests
    f = open('vecsemtst0.txt') # test file with fixed similarities
    text = []
    for piece in iter(read1k, ''):
        process_data(piece, text)
    
    corpus = []
    for elem in text:
        corpus += elem.split() # splits on all whitespaces
    corpus = [t.lower() for t in corpus] # convert to lower case
    
    # remove special characters
    corpus = [re.sub(r'[^\w]', ' ', t) for t in corpus]
    
    #print corpus
    
    text = []
    # get stopwords
    f = open('gerstopw0.txt')
    for piece in iter(read1k, ''):
        process_data(piece, text)
    stopwrds = []
    for elem in text:
        stopwrds += elem.split()
    stopwrds = [t.lower() for t in stopwrds]
    #print stopwrds
    
    #print('length of corpus: {0}'.format(len(corpus)))
    print rmsword(corpus, stopwrds) # remove stopwords from corpus
    #print('length of corpus: {0}'.format(len(corpus)))
    
    # fill words set (unique!) from corpus
    words = set() # words as set: each entry unique
    for elem in corpus:
        words.add(elem)
        
    print('number of words in corpus (after stopwords removed): {0}'.format(len(words)))
    
    phrases = findphrases(corpus, 1) # extract phrases from corpus (window +/- the given size)
    #print phrases
    print('number of phrases: {0}'.format(len(phrases)))
    
    wordcont = np.zeros((len(words),len(phrases)))
    k = 1.5 # Laplacian smoothing
    
    # fill word-context (or word-phrase) matrix
    i = 0 # word index
    wordlist = []
    for word in words:
        j = 0 # phrease index
        wordlist.append(word) # to allow for index-access later
        for phrase in phrases:
            if valinlst(word, phrase):
                wordcont[i][j] += 1 + k # k for Laplacian smoothing
                #if wordcont[i][j] >= 1.0:
                #    print('wordcont[{0}][{1}]: {2} word: {3} phrease: {4}'.format(i,j,wordcont[i][j],word,phrase))
            j += 1
        i += 1
    
    #print sp.issparse(wordcont)
    #print wordcont
    #print np.nonzero(wordcont)
    #print("word set: {0} word list: {1}".format(next(iter(words)), wordlist[0]))
    
    # calculate pointwise mutual information
    fijsum = 0.0
    pistar = []
    pstarj = []
    for i in range(len(words)):
        pistar.append(0.0)
        for j in range(len(phrases)):
            if i is 0:
                pstarj.append(0.0)
            fijsum += wordcont[i][j]
            pistar[i] += wordcont[i][j]
            pstarj[j] += wordcont[i][j]
    
    #print fijsum
    #print pistar
    #print pstarj
    
    ppmi = np.zeros((len(words),len(phrases)))
    for i in range(len(words)):
        for j in range(len(phrases)):
            if wordcont[i][j] > 0.0:
                ppmi[i][j] = max(math.log((wordcont[i][j]*fijsum)/pistar[i]/pstarj[j], 2.0), 0.0)
    
    #print ppmi

0
number of words in corpus (after stopwords removed): 10
number of phrases: 70


In [3]:
# read out ppmi-matrix and display words and context/phrases
import numpy as np

def getwordi(words, indx): # access words set
    i = 0
    for elem in words:
        if i == indx:
            return elem
        i += 1

print('ppmi mean: {0} and stdev: {1}'.format(np.mean(ppmi), np.std(ppmi)))
print('ppmi max: {0} and min: {1}'.format(np.amax(ppmi), np.amin(ppmi)))

isigword = [] # index array for significant words
isiphrs = [] # index array for significant words

wcounter = 0
for i in range(len(wordlist)):
    for j in range(len(phrases)):
        if ppmi[i][j] > 0.3: # note limit
            #print('ppmi = {0} for word: {1} and phrase: {2}'.format(ppmi[i][j], wordlist[i], phrases[j]))
            isigword.append(i)
            isiphrs.append(j)
            wcounter += 1
            
print('wcounter - no. of ppmi above limit: {0}'.format(wcounter))

#for i in range(len(isigword)):
for i in range(20):
    print('ppmi {0:.2f} for word: {1} and phrase: {2} idx {3} , {4}'.format(ppmi[isigword[i]][isiphrs[i]], wordlist[isigword[i]], phrases[isiphrs[i]], isigword[i], isiphrs[i]))

ppmi mean: 0.39103800535 and stdev: 0.856071298175
ppmi max: 5.533978572 and min: 0.0
wcounter - no. of ppmi above limit: 139
ppmi 0.63 for word: fool and phrase: [u'fool', u'like', u'night'] idx 0 , 1
ppmi 1.21 for word: fool and phrase: [u'fool', u'like', u'fool'] idx 0 , 30
ppmi 1.21 for word: fool and phrase: [u'like', u'fool', u'like'] idx 0 , 31
ppmi 1.21 for word: fool and phrase: [u'fool', u'like', u'fool'] idx 0 , 32
ppmi 1.21 for word: fool and phrase: [u'like', u'fool', u'like'] idx 0 , 33
ppmi 1.21 for word: fool and phrase: [u'fool', u'like', u'fool'] idx 0 , 34
ppmi 1.21 for word: fool and phrase: [u'like', u'fool', u'like'] idx 0 , 35
ppmi 1.21 for word: fool and phrase: [u'fool', u'like', u'fool'] idx 0 , 36
ppmi 1.21 for word: fool and phrase: [u'like', u'fool', u'like'] idx 0 , 37
ppmi 1.21 for word: fool and phrase: [u'fool', u'like', u'fool'] idx 0 , 38
ppmi 1.21 for word: fool and phrase: [u'like', u'fool', u'like'] idx 0 , 39
ppmi 1.21 for word: fool and phrase: [

In [20]:
# measure similarity between words for given context/phrases
import numpy as np

def cosine(wordv, wordw): # cosine similarity for two word vectors
    sumv = 0.
    sumw = 0.
    sumvw = 0.
    for elemv, elemw in zip(wordv, wordw):
        #print('for: {0} {1}'.format(elemv, elemw))
        sumv += elemv * elemv
        sumw += elemw * elemw
        sumvw += elemv * elemw
    sumv = math.sqrt(sumv)
    sumw = math.sqrt(sumw)
    #print('{0} {1}'.format(sumv, sumw))
    if sumv > 0. and sumw > 0.:
        return sumvw / sumv / sumw
    else: return -1.

if __name__=="__main__":
    #for i in range(len(isigword)):
        #for j in range(i+1,len(isiphrs)):
    for i in range(10):
        for j in range(i+1,10):
            thiscosine = cosine(ppmi[isigword[i]], ppmi[isigword[j]])
            if thiscosine > 0.1 and wordlist[isigword[i]] != wordlist[isigword[j]]:
                print('cosine {0} and {1}: {2:.3f}'.format(wordlist[isigword[i]], wordlist[isigword[j]], thiscosine))

cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.68

cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.6

cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and fo: 0.142
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry and soldier: 0.168
cosine henry 

cosine caesar and battle: 0.622
cosine caesar and battle: 0.622
cosine caesar and battle: 0.622
cosine caesar and battle: 0.622
cosine caesar and battle: 0.622
cosine caesar and battle: 0.622
cosine caesar and battle: 0.622
cosine caesar and night: 0.235
cosine caesar and night: 0.235
cosine caesar and night: 0.235
cosine caesar and night: 0.235
cosine caesar and night: 0.235
cosine caesar and night: 0.235
cosine caesar and night: 0.235
cosine caesar and night: 0.235
cosine caesar and night: 0.235
cosine caesar and night: 0.235
cosine caesar and night: 0.235
cosine caesar and night: 0.235
cosine caesar and night: 0.235
cosine caesar and night: 0.235
cosine caesar and night: 0.235
cosine caesar and night: 0.235
cosine caesar and night: 0.235
cosine caesar and battle: 0.622
cosine caesar and battle: 0.622
cosine caesar and battle: 0.622
cosine caesar and battle: 0.622
cosine caesar and battle: 0.622
cosine caesar and battle: 0.622
cosine caesar and battle: 0.622
cosine caesar and battle:

In [4]:
# Dense vectors - applying SVD
import numpy as np
from scipy import linalg

W, S, C = linalg.svd(ppmi, overwrite_a=True, full_matrices=False)
Sc = np.diag(S)

#print W.shape, Sc.shape, C.shape
#np.allclose(ppmi, np.dot(W, np.dot(Sc, C)))
#np.dot(W, np.dot(Sc, C))

# need to reduce matrices to e.g. 50
k = 50 # number of singular values we want to keep
Cred = C[:k, :]
Sred = Sc[:k, :k]
Wred = W[:, :k]
ppmi_red = np.dot(Wred, np.dot(Sred, Cred))

print('ppmi_red mean: {0} and stdev: {1}'.format(np.mean(ppmi_red), np.std(ppmi_red)))
print('ppmi_red max: {0} and min: {1}'.format(np.amax(ppmi_red), np.amin(ppmi_red)))

ppmi_red mean: 0.39103800535 and stdev: 0.856071298175
ppmi_red max: 5.533978572 and min: -3.94305785178e-15


In [24]:
# measure similarity between words for given context/phrases
import numpy as np

if __name__=="__main__":
    
    isigword = [] # index array for significant words
    isiphrs = [] # index array for significant words

    wcounter = 0
    for i in range(len(wordlist)):
        for j in range(len(phrases)):
            if ppmi_red[i][j] > 1.0: # note limit
                isigword.append(i)
                isiphrs.append(j)
                wcounter += 1
    
    print wcounter
    
    #for i in range(len(isigword)):
        #for j in range(i+1,len(isiphrs)):
    for i in range(10):
        for j in range(i+1,10):
            thiscosine = cosine(ppmi_red[isigword[i]], ppmi_red[isigword[j]])
            if thiscosine > 0.3and wordlist[isigword[i]] != wordlist[isigword[j]]:
                print('cosine {0} and {1}: {2:.3f}'.format(wordlist[isigword[i]], wordlist[isigword[j]], thiscosine))

135
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 

cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and like: 0.617
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.687
cosine fool and henry: 0.68

cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine soldier and night: 0.750
cosine s

In [104]:
# regular weight updates
# model structure
# xk -> wi -> ht -> wo -> yk : tc
import os
import math
import re # regex
import numpy as np
from toolz import itertoolz
#from sklearn.preprocessing import OneHotEncoder

def read1k():
    return f.read(1024)

def process_data(chunk, text):
    #print('processed: {0}'.format(ctr))
    text.append(unicode(chunk, errors='ignore')) # 'utf8' codec can't decode byte 0xc3
    
def onehotvec(key, vec, dim): # returns a one-hot vector of dimension dim, all 0 but for 1 at vec
    retvec = []
    retvec.append(key)
    for i in range(dim):
        if i == vec:
            retvec.append(1)
        else:
            retvec.append(0)
    return retvec

def onehotenc(dic): # one-hot encodes a dictionary
    ohotret = []
    dlen = len(dic)
    i=0
    for key in dic:
        #ohotret.append(onehotvec(dic[key], dlen)) # does not work as dict keys are not guaranteed to be continuous
        ohotret.append(onehotvec(dic[key], i, dlen))
        i += 1
    return ohotret

def get1hot(dic, dichot, word): # get a 1-hot encoded vector for word from vocabulary dic and its 1-hot version dichot
    keyval = dic[word]
    for dicvec in dichot:
        if dicvec[0] == keyval:
            return dicvec[1:] # return all but the first entry as vector
    return []

def skipgram(corpus, window): # returns skip-grams for given window size from corpus (center word included)
    if window > 5:
        return []
    cpanel = []
    for i in range(window+1,len(corpus)-window+1):
        skipg = []
        for j in range(i-window-1,i+window):
            skipg.append(corpus[j])
        cpanel.append(skipg)
    return cpanel

def sig(x, ds): # sigmoid
    if ds:
        return sig(x, False) * (1.0 - sig(x, False))
    return 1.0 / (1.0 + np.exp(-x))

def relu(x, ds): # linear rectifier (approx. version)
    if ds:
        return 1.0 / (1.0 + np.exp(-x))
    return np.log(1.0 + np.exp(x))

if __name__=="__main__":
    # read corpus from file
    os.chdir('C:\Users\Bernie\Documents\ML4D\lrgtxt0')
    f = open('vecsemtst0.txt') # test file with fixed similarities
    #f = open('todinvenedigshrt.txt')
    text = []
    for piece in iter(read1k, ''):
        process_data(piece, text)
    
    corpus = []
    for elem in text:
        corpus += elem.split() # splits on all whitespaces
    corpus = [t.lower() for t in corpus] # convert to lower case
    
    # remove special characters
    corpus = [re.sub(r'[^\w]', ' ', t) for t in corpus]

    # build dictionary
    myDict = {}
    i = 0
    wordlist = []
    for word in corpus:
        # print word, i
        newEntry = {word.lower(): i}
        if not myDict.has_key(word):
            myDict.update(newEntry)
            wordlist.append(word)
        i = i + 1

    #print myDict

    # one-hot encode the dictionary
    #dictarr = np.asarray(myDict.values()).reshape(-1, 1)
    #enc = OneHotEncoder()
    #enc.fit(dictarr)
    #enc.transform([[8]]).toarray()
    myDic1hot = onehotenc(myDict)
    
    # simple example to illustrate learning: window size = 3
    # vocabulary [A,B,C,D,E,F] represented at indices [0,1,2,3,4,5]
    # two contexts: C0: [A,B,C] and C1: [D,E,F]
    # input word wi = B
    # for C0: [0,1,0,0,0,0] -> truth [1,0,0,0,0,0] and [0,0,1,0,0,0] (wi=B is in context, A and C are related to B)
    # for C1: [0,1,0,0,0,0] -> truth [0,0,0,0,0,0] and [0,0,0,0,0,0] (wi=B is not in context, D and F are unrelated to B)
    # input word wi = E
    # for C1: [0,0,0,0,1,0] -> truth [0,0,0,1,0,0] and [0,0,0,0,0,1] (wi=E is in context, D and F are related to E)

    # negative sample: all the words that train to 0: P(wi)=f(wi^3/4)/sum(f(wi)^3/4)
    
    # build all possible contexts
    winsize = 1
    contexts = skipgram(corpus, winsize)
    
    V = len(myDict)
    N = 4
    # initialize weight matrices (random -0.5 to 0.5)
    # vocabulary of V=8, N=3 hidden nodes: wi(VxN), wo(NxV)
    wi = np.random.rand(V,N) - np.full((V,N), 0.5) # inputs to hidden
    wo = np.random.rand(N,V) - np.full((N,V), 0.5) # hidden to outputs
    
    #print('wi before: {0}'.format(wi))

    for context in contexts: # loop over context
        inpword = context[winsize] # middle word of current context
        # access vocabulary dictionary an get 1-hot encoded vector
        inpvec = get1hot(myDict, myDic1hot, inpword)
        xk = np.array(inpvec) # input vector for training
        #print('for input {0} and context word {1} in context {2}'.format(inpword, cword, context))
        # for non-central context words run the training (word-vector entry for context word = 1)
        ht = np.dot(xk, wi)
        u0 = np.dot(ht, wo)

        yk = np.exp(u0 + 1.0) / ( np.dot(np.exp(u0), np.exp(u0).transpose()) + 1.0) # added + 1.0 to prevent overflow

        # backpropagation (following Xin Rong's paper here)       
        ej = np.zeros(V)
        i = 0
        for cword in context: # loop over current context, cword is the truth
            # access vocabulary dictionary an get 1-hot encoded vector
            #if i != winsize: # input word not in context (or is it ???)
            cwordvec = get1hot(myDict, myDic1hot, cword)
            ej += yk - cwordvec # error summed over context
            i += 1
        #print('ej {0}'.format(ej))
        etha = 0.03

        # Update equation for hidden→output weights
        wo -= etha * (np.transpose([ht]) * ej)
        # Update equation for input→hidden weights
        ehi = np.dot(ej, np.transpose(wo)) # dE/dhi
        wi -= etha * ehi

    #print('wi after: {0}'.format(wi))
    
    # check results
    #print myDict
    #print('cwordvec for: {0} is: {1}'.format('erkennen',get1hot(myDict, myDic1hot, 'erkennen')))
    cwordvec = get1hot(myDict, myDic1hot, 't2')
    ht = np.dot(xk, wi)
    u0 = np.dot(ht, wo)
    yk = np.exp(u0) / np.dot(np.exp(u0), np.exp(u0).transpose())
    i = 0
    for elem in yk:
        if elem > 0.0:
            print('yk for {0} is {1:.4f}'.format(wordlist[i],elem))
        i += 1

yk for a is 0.1962
yk for t0 is 0.1036
yk for b is 2.7945
yk for t1 is 0.0951
yk for c is 0.1983
yk for t2 is 1.5698


In [91]:
# negative sampling weight updates
# model structure
# xk -> wi -> ht -> wo -> yk : tc
import os
import math
import re # regex
import numpy as np
from toolz import itertoolz
from random import randint
#from sklearn.preprocessing import OneHotEncoder

def read1k():
    return f.read(1024)

def process_data(chunk, text):
    #print('processed: {0}'.format(ctr))
    text.append(unicode(chunk, errors='ignore')) # 'utf8' codec can't decode byte 0xc3
    
def onehotvec(key, vec, dim): # returns a one-hot vector of dimension dim, all 0 but for 1 at vec
    retvec = []
    retvec.append(key)
    for i in range(dim):
        if i == vec:
            retvec.append(1)
        else:
            retvec.append(0)
    return retvec

def onehotenc(dic): # one-hot encodes a dictionary
    ohotret = []
    dlen = len(dic)
    i=0
    for key in dic:
        #ohotret.append(onehotvec(dic[key], dlen)) # does not work as dict keys are not guaranteed to be continuous
        ohotret.append(onehotvec(dic[key], i, dlen))
        i += 1
    return ohotret

def get1hot(dic, dichot, word): # get a 1-hot encoded vector for word from vocabulary dic and its 1-hot version dichot
    keyval = dic[word]
    for dicvec in dichot:
        if dicvec[0] == keyval:
            return dicvec[1:] # return all but the first entry as vector
    return np.zeros(len(dic))

def skipgram(corpus, window): # returns skip-grams for given window size from corpus (center word included)
    if window > 5:
        return []
    cpanel = []
    for i in range(window+1,len(corpus)-window+1):
        skipg = []
        for j in range(i-window-1,i+window):
            skipg.append(corpus[j])
        cpanel.append(skipg)
    return cpanel

def noncword(w, cword, contexts): # return a random word from contexts not equal to the current word cword
    rndctx = contexts[randint(0, len(contexts)-1)]
    while cword == rndctx[w]:
         rndctx = contexts[randint(0, len(contexts)-1)]
    return rndctx[w+1]

def sig(x, ds): # sigmoid
    if ds:
        return sig(x, False) * (1.0 - sig(x, False))
    return 1.0 / (1.0 + np.exp(-x))

def relu(x, ds): # linear rectifier (approx. version)
    if ds:
        return 1.0 / (1.0 + np.exp(-x))
    return np.log(1.0 + np.exp(x))

if __name__=="__main__":
    # read corpus from file
    os.chdir('C:\Users\Bernie\Documents\ML4D\lrgtxt0')
    f = open('vecsemtst0.txt') # test file with fixed similarities
    #f = open('todinvenedigshrt.txt')
    text = []
    for piece in iter(read1k, ''):
        process_data(piece, text)
    
    corpus = []
    for elem in text:
        corpus += elem.split() # splits on all whitespaces
    corpus = [t.lower() for t in corpus] # convert to lower case
    
    # remove special characters
    corpus = [re.sub(r'[^\w]', ' ', t) for t in corpus]

    # build dictionary
    myDict = {}
    i = 0
    wordlist = []
    for word in corpus:
        # print word, i
        newEntry = {word.lower(): i}
        if not myDict.has_key(word):
            myDict.update(newEntry)
            wordlist.append(word)
        i = i + 1

    #print myDict

    # one-hot encode the dictionary
    #dictarr = np.asarray(myDict.values()).reshape(-1, 1)
    #enc = OneHotEncoder()
    #enc.fit(dictarr)
    #enc.transform([[8]]).toarray()
    myDic1hot = onehotenc(myDict)
    
    # simple example to illustrate learning: window size = 3
    # vocabulary [A,B,C,D,E,F] represented at indices [0,1,2,3,4,5]
    # two contexts: C0: [A,B,C] and C1: [D,E,F]
    # input word wi = B
    # for C0: [0,1,0,0,0,0] -> truth [1,0,0,0,0,0] and [0,0,1,0,0,0] (wi=B is in context, A and C are related to B)
    # for C1: [0,1,0,0,0,0] -> truth [0,0,0,0,0,0] and [0,0,0,0,0,0] (wi=B is not in context, D and F are unrelated to B)
    # input word wi = E
    # for C1: [0,0,0,0,1,0] -> truth [0,0,0,1,0,0] and [0,0,0,0,0,1] (wi=E is in context, D and F are related to E)

    # negative sample: all the words that train to 0: P(wi)=f(wi^3/4)/sum(f(wi)^3/4)
    
    # build all possible contexts
    winsize = 1
    contexts = skipgram(corpus, winsize)
    
    V = len(myDict)
    N = 5
    # initialize weight matrices (random -0.5 to 0.5)
    # vocabulary of V=8, N=3 hidden nodes: wi(VxN), wo(NxV)
    wi = np.random.rand(V,N) - np.full((V,N), 0.5) # inputs to hidden
    wo = np.random.rand(N,V) - np.full((N,V), 0.5) # hidden to outputs
    
    #print('wi before: {0}'.format(wi))
    
    for context in contexts: # loop over contexts, each context a collection of words
        
        # for negative samples need to add negative words not in context
        j=0
        while j<5: # 3 iterations, 1 positive sampe, 4 negative samples
            if j == 0:
                inpword = context[winsize] # middle word of current context
            else:
                inpword = noncword(winsize, context[winsize], contexts) # a word not in the current context (neg. sample)
            # access vocabulary dictionary an get 1-hot encoded vector
            inpvec = get1hot(myDict, myDic1hot, inpword)
            if j == 0:
                truth = inpvec
            else:
                truth = np.zeros(len(myDict))
            xk = np.array(inpvec) # input vector for training
            # for non-central context words run the training (word-vector entry for context word = 1)

            ht = np.dot(xk, wi)
            #u0 = np.dot(ht, wo)

            #yk = np.exp(u0) / np.dot(np.exp(u0), np.exp(u0).transpose())

            # backpropagation (following Xin Rong's paper here)      
            #ej = np.zeros(V)
            #for cword in context: # loop over current context, cword is the truth
                # access vocabulary dictionary an get 1-hot encoded vector
                #cwordvec = get1hot(myDict, myDic1hot, cword)
                #ej += yk - cwordvec # error summed over context
            etha = 0.025

            # "label" of the word: tj=1 if wj in positive sample, t=0 otherwise
            tj = truth # this is equal to truth tc

            # Update equation for hidden→output weights
            # todo here: only for wo and wneg -> need to restrict
            #print np.multiply((sig(np.dot(np.transpose(wo), ht), False) - tj), ht[:, np.newaxis])
            wo = wo - etha * np.multiply((sig(np.dot(np.transpose(wo), ht), False) - tj), ht[:, np.newaxis])
            # Update equation for input→hidden weights
            ehi = (sig(np.dot(np.transpose(wo), ht), False) - tj) * wo
            wi = wi - etha * np.transpose(ehi)

            j += 1

    #print('wi after: {0}'.format(wi))
    
    # check results
    #print myDict
    #print('cwordvec for: {0} is: {1}'.format('erkennen',get1hot(myDict, myDic1hot, 'erkennen')))
    cwordvec = get1hot(myDict, myDic1hot, 't2')
    ht = np.dot(xk, wi)
    u0 = np.dot(ht, wo)
    yk = np.exp(u0) / np.dot(np.exp(u0), np.exp(u0).transpose())
    i = 0
    for elem in yk:
        if elem > 0.:
            print('yk for {0} is {1:.2f}'.format(wordlist[i],elem))
        i += 1

yk for a is 20.62
yk for t0 is 26.15
yk for b is 46.24
yk for t1 is 8.20
yk for c is 11.25
yk for t2 is 10.69


In [87]:
# scikit learn weight updates
# model structure
# xk -> wi -> ht -> wo -> yk : tc
import os
import math
import re # regex
import numpy as np
from toolz import itertoolz
from sklearn.neural_network import MLPClassifier

def read1k():
    return f.read(1024)

def process_data(chunk, text):
    #print('processed: {0}'.format(ctr))
    text.append(unicode(chunk, errors='ignore')) # 'utf8' codec can't decode byte 0xc3
    
def onehotvec(key, vec, dim): # returns a one-hot vector of dimension dim, all 0 but for 1 at vec
    retvec = []
    retvec.append(key)
    for i in range(dim):
        if i == vec:
            retvec.append(1)
        else:
            retvec.append(0)
    return retvec

def onehotenc(dic): # one-hot encodes a dictionary
    ohotret = []
    dlen = len(dic)
    i=0
    for key in dic:
        #ohotret.append(onehotvec(dic[key], dlen)) # does not work as dict keys are not guaranteed to be continuous
        ohotret.append(onehotvec(dic[key], i, dlen))
        i += 1
    return ohotret

def get1hot(dic, dichot, word): # get a 1-hot encoded vector for word from vocabulary dic and its 1-hot version dichot
    keyval = dic[word]
    for dicvec in dichot:
        if dicvec[0] == keyval:
            return dicvec[1:] # return all but the first entry as vector
    return []

def skipgram(corpus, window): # returns skip-grams for given window size from corpus (center word included)
    if window > 5:
        return []
    cpanel = []
    for i in range(window+1,len(corpus)-window+1):
        skipg = []
        for j in range(i-window-1,i+window):
            skipg.append(corpus[j])
        cpanel.append(skipg)
    return cpanel

def sig(x, ds): # sigmoid
    if ds:
        return sig(x, False) * (1.0 - sig(x, False))
    return 1.0 / (1.0 + np.exp(-x))

def relu(x, ds): # linear rectifier (approx. version)
    if ds:
        return 1.0 / (1.0 + np.exp(-x))
    return np.log(1.0 + np.exp(x))

if __name__=="__main__":
    # read corpus from file
    os.chdir('C:\Users\Bernie\Documents\ML4D\lrgtxt0')
    f = open('vecsemtst0.txt') # test file with fixed similarities
    #f = open('todinvenedigshrt.txt')
    text = []
    for piece in iter(read1k, ''):
        process_data(piece, text)
    
    corpus = []
    for elem in text:
        corpus += elem.split() # splits on all whitespaces
    corpus = [t.lower() for t in corpus] # convert to lower case
    
    # remove special characters
    corpus = [re.sub(r'[^\w]', ' ', t) for t in corpus]

    # build dictionary
    myDict = {}
    i = 0
    wordlist = []
    for word in corpus:
        # print word, i
        newEntry = {word.lower(): i}
        if not myDict.has_key(word):
            myDict.update(newEntry)
            wordlist.append(word)
        i = i + 1

    #print myDict

    # one-hot encode the dictionary
    #dictarr = np.asarray(myDict.values()).reshape(-1, 1)
    #enc = OneHotEncoder()
    #enc.fit(dictarr)
    #enc.transform([[8]]).toarray()
    myDic1hot = onehotenc(myDict)
    
    # build all possible contexts
    winsize = 1
    contexts = skipgram(corpus, winsize)
    
    V = len(myDict)
    N = 5
    
    X = []
    y = []

    for context in contexts: # loop over context
        inpword = context[winsize] # middle word of current context
        # access vocabulary dictionary an get 1-hot encoded vector
        inpvec = get1hot(myDict, myDic1hot, inpword)
        
        i = 0
        for cword in context: # loop over current context, cword is the truth
            # access vocabulary dictionary an get 1-hot encoded vector
            if i != winsize: # input word not in context
                cwordvec = get1hot(myDict, myDic1hot, cword)
                X.append(cwordvec) # target
                y.append(inpvec)
            i += 1
        
    # train model
    #print('shape X: {0} shape y: {1}'.format(len(X), len(y)))
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(N,), random_state=1)
    
    #X = np.array(X).reshape((len(X), 1))
    clf.fit(X, y)
    # check results
    cwordvec = get1hot(myDict, myDic1hot, 't0')
    myresults = clf.predict([cwordvec])
    i = 0
    for elem in myresults[0]:
        print('myresults for {0} is {1:.2f}'.format(wordlist[i], elem))
        i += 1

myresults for a is 1.00
myresults for t0 is 0.00
myresults for b is 0.00
myresults for t1 is 0.00
myresults for c is 0.00
myresults for t2 is 0.00


In [1]:
# import modules and set up logging
from gensim.models import word2vec
import logging
import os

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# load up unzipped corpus from http://mattmahoney.net/dc/text8.zip
os.chdir('C:\Users\Bernie\Documents\ML4D\lrgtxt0')
sentences = word2vec.Text8Corpus('vecsemtst0.txt')
# train the skip-gram model; default window=5
model = word2vec.Word2Vec(sentences, size=200)
# ... and some hours later... just as advertised...
#print model.most_similar(positive=['a', 'b'], negative=['c'], topn=1)
 
# pickle the entire model to disk, so we can load&resume training later
#model.save('todvenshrt0.model')
# store the learned weights, in a format the original C tool understands
#model.save_word2vec_format('todvenshrt0.model.bin', binary=True)
# or, import word weights created by the (faster) C word2vec
# this way, you can switch between the C/Python toolkits easily
#model = word2vec.Word2Vec.load_word2vec_format('todvenshrt0.bin', binary=True)
 
# "boy" is to "father" as "girl" is to ...?
#print model.most_similar(['a', 't0'], ['b'], topn=3)
    
# which word doesn't go with the others?
#print model.doesnt_match("a b c".split())

print model.most_similar(['t1'], topn=3)

2018-01-07 12:07:40,540 : INFO : collecting all words and their counts
2018-01-07 12:07:40,543 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-01-07 12:07:40,545 : INFO : collected 6 word types from a corpus of 232 raw words and 1 sentences
2018-01-07 12:07:40,546 : INFO : Loading a fresh vocabulary
2018-01-07 12:07:40,549 : INFO : min_count=5 retains 6 unique words (100% of original 6, drops 0)
2018-01-07 12:07:40,549 : INFO : min_count=5 leaves 232 word corpus (100% of original 232, drops 0)
2018-01-07 12:07:40,552 : INFO : deleting the raw counts dictionary of 6 items
2018-01-07 12:07:40,553 : INFO : sample=0.001 downsamples 6 most-common words
2018-01-07 12:07:40,555 : INFO : downsampling leaves estimated 18 word corpus (8.0% of prior 232)
2018-01-07 12:07:40,558 : INFO : estimated required memory for 6 words and 200 dimensions: 12600 bytes
2018-01-07 12:07:40,559 : INFO : resetting layer weights
2018-01-07 12:07:40,562 : INFO : training model with 3

[(u'b', 0.2965014576911926), (u'c', 0.25921857357025146), (u't2', 0.2152470052242279)]
